diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts
index 4986b80b..a156098f 100644
--- a/docs/.vuepress/notes/en/mm_guide.ts
+++ b/docs/.vuepress/notes/en/mm_guide.ts
@@ -22,6 +22,12 @@ export const MMGuide: ThemeNote = defineNoteConfig({
       prefix: 'image_understanding',
       items: [
         'install_image_understanding',
+        'context_vqa',
+        'image_gcot',
+        'vision_mct_reasoning_pipeline',
+        'image_region_caption_pipeline',
+        'image_scale_caption_pipeline',
+        'image_visual_only_mcq_pipeline',
       ],
     },
     {
@@ -35,7 +41,8 @@ export const MMGuide: ThemeNote = defineNoteConfig({
         'video_clip_and_filter',
         'video_qa',
         'video_cotqa',
-        'video_longvideo_cotqa_api'
+        'video_longvideo_cotqa_api',
+        'multirole_videoqa_pipeline'
       ],
     },
     {
diff --git a/docs/.vuepress/notes/en/mm_operators.ts b/docs/.vuepress/notes/en/mm_operators.ts
index af463fac..a7be6236 100644
--- a/docs/.vuepress/notes/en/mm_operators.ts
+++ b/docs/.vuepress/notes/en/mm_operators.ts
@@ -21,27 +21,71 @@ export const MMOperators: ThemeNote = defineNoteConfig({
       icon: 'carbon:idea',
       prefix: 'image_understanding',
       items: [
-        'install_image_understanding',
-        'generate/image_caption',
-        'generate/image_qa',
-        'generate/image_pers_qa',
-        'generate/multimodal_math',
-        'generate/vision_mct_reasoning',
-        'generate/image_region_caption',
-        'generate/image_scale_caption',
-        'generate/image_gcot',
-        'generate/image_skvqa',
-        'generate/image_caprl',
-        'eval/clip_image_text_evaluator',
-        'eval/longclip_image_text_evaluator',
-        'eval/vqa_score_image_text_evaluator',
-        'filter/cat_filter',
-        'filter/clip_filter',
-        'filter/complexity_filter',
-        'filter/deduplication_filter',
-        'filter/image_aesthetic_filter',
-        'filter/sensitive_filter',
-        'filter/text_image_diversity_filter'
+        {
+          text: 'install',
+          collapsed: false,
+          prefix: '',
+          items: ['install_image_understanding'],
+        },
+        {
+          text: "generate",
+          collapsed: false,
+          prefix: 'generate/',
+          items: [
+            'image_caption',
+            'image_qa',
+            'image_pers_qa',
+            'multimodal_math',
+            "prompt_templated_vqa_generator",
+            "fix_prompted_vqa_generator",
+            "prompted_vqa_generator",
+            "batch_vqa_generator",
+            "visual_reasoning_generator",
+            "vlm_bbox_generator",
+            "image_bbox_generator"
+            // 'vision_mct_reasoning',
+            // 'image_region_caption',
+            // 'image_scale_caption',
+            // 'image_gcot',
+            // 'image_caprl',
+            // 'multirole_videoqa',
+          ]
+        },
+        {
+          text: "eval",
+          collapsed: false,
+          prefix: 'eval/',
+          items: [
+            'image_clip_evaluator',
+            'image_longclip_evaluator',
+            'image_vqa_evaluator',
+          ]
+        },
+        {
+          text: "filter",
+          collapsed: false,
+          prefix: 'filter/',
+          items: [
+            'image_aesthetic_filter',
+            'image_cat_filter',
+            'image_clip_filter',
+            'image_complexity_filter',
+            'image_consistency_filter',
+            'image_deduplication_filter',
+            'image_diversity_filter',
+            'image_sensitive_filter',
+          ]
+        },
+        {
+          text: "refine",
+          collapsed: false,
+          prefix: 'refine/',
+          items: [
+            'visual_dependency_refiner',
+            'visual_grounding_refiner',
+            'wiki_qa_refiner',
+          ]
+        }
       ],
     },
     {
diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts
index 7c9f6e2e..47399ba1 100644
--- a/docs/.vuepress/notes/zh/mm_guide.ts
+++ b/docs/.vuepress/notes/zh/mm_guide.ts
@@ -23,6 +23,12 @@ export const MMGuide: ThemeNote = defineNoteConfig({
       prefix: 'image_understanding',
       items: [
         'install_image_understanding',
+        'context_vqa',
+        'image_gcot',
+        'vision_mct_reasoning_pipeline',
+        'image_region_caption_pipeline',
+        'image_scale_caption_pipeline',
+        'image_visual_only_mcq_pipeline',
       ],
     },
     {
@@ -36,7 +42,8 @@ export const MMGuide: ThemeNote = defineNoteConfig({
         'video_clip_and_filter',
         'video_qa',
         'video_cotqa',
-        'video_longvideo_cotqa_api'
+        'video_longvideo_cotqa_api',
+        'multirole_videoqa_pipeline'
       ],
     },
     {
diff --git a/docs/.vuepress/notes/zh/mm_operators.ts b/docs/.vuepress/notes/zh/mm_operators.ts
index 77c6fbf0..28097da1 100644
--- a/docs/.vuepress/notes/zh/mm_operators.ts
+++ b/docs/.vuepress/notes/zh/mm_operators.ts
@@ -22,27 +22,71 @@ export const MMOperators: ThemeNote = defineNoteConfig({
       icon: 'carbon:idea',
       prefix: 'image_understanding',
       items: [
-        'install_image_understanding',
-        'generate/image_caption',
-        'generate/image_qa',
-        'generate/image_pers_qa',
-        'generate/multimodal_math',
-        'generate/vision_mct_reasoning',
-        'generate/image_region_caption',
-        'generate/image_scale_caption',
-        'generate/image_gcot',
-        'generate/image_skvqa',
-        'generate/image_caprl',
-        'eval/clip_image_text_evaluator',
-        'eval/longclip_image_text_evaluator',
-        'eval/vqa_score_image_text_evaluator',
-        'filter/cat_filter',
-        'filter/clip_filter',
-        'filter/complexity_filter',
-        'filter/deduplication_filter',
-        'filter/image_aesthetic_filter',
-        'filter/sensitive_filter',
-        'filter/text_image_diversity_filter'
+        {
+          text: '安装',
+          collapsed: false,
+          prefix: '',
+          items: ['install_image_understanding'],
+        },
+        {
+          text: "generate",
+          collapsed: false,
+          prefix: 'generate/',
+          items: [
+            'image_caption',
+            'image_qa',
+            'image_pers_qa',
+            'multimodal_math',
+            'prompt_templated_vqa_generator',
+            'fix_prompted_vqa_generator',
+            "prompted_vqa_generator",
+            "batch_vqa_generator",
+            "visual_reasoning_generator",
+            "vlm_bbox_generator",
+            "image_bbox_generator"
+            // 'vision_mct_reasoning',
+            // 'image_region_caption',
+            // 'image_scale_caption',
+            // 'image_gcot',
+            // 'image_caprl',
+            // 'multirole_videoqa',
+          ]
+        },
+        {
+          text: "eval",
+          collapsed: false,
+          prefix: 'eval/',
+          items: [
+            'image_clip_evaluator',
+            'image_longclip_evaluator',
+            'image_vqa_evaluator',
+          ]
+        },
+        {
+          text: "filter",
+          collapsed: false,
+          prefix: 'filter/',
+          items: [
+            'image_aesthetic_filter',
+            'image_cat_filter',
+            'image_clip_filter',
+            'image_complexity_filter',
+            'image_consistency_filter',
+            'image_deduplication_filter',
+            'image_diversity_filter',
+            'image_sensitive_filter',
+          ]
+        },
+        {
+          text: "refine",
+          collapsed: false,
+          prefix: 'refine/',
+          items: [
+            'visual_dependency_refiner',
+            'visual_grounding_refiner',
+            'wiki_qa_refiner',
+          ]
+        }
       ],
     },
     {
diff --git a/docs/en/notes/mm_guide/image_understanding/context_vqa.md b/docs/en/notes/mm_guide/image_understanding/context_vqa.md
new file mode 100644
index 00000000..8d228fe7
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/context_vqa.md
@@ -0,0 +1,297 @@
+---
+title: ContextVQA Multimodal Question Answering Data Generation Pipeline
+icon: mdi:image-text
+createTime: 2025/06/16 14:30:00
+permalink: /en/mm_guide/contextvqa_pipeline/
+---
+## 1. Overview
+
+The **ContextVQA Generation Pipeline** is designed to automatically generate **Context-based Visual Question Answering (VQA)** data from images. This pipeline utilizes a Vision-Language Model (VLM) to generate a Wikipedia-style article and Q&A pairs related to an image, and subsequently parses them into structured data.
+
+We support the following application scenarios:
+
+* **Knowledge-based VQA Synthesis**: Constructing datasets requiring reasoning over external knowledge.
+* **Multimodal RAG Construction**: Generating high-quality data for Retrieval-Augmented Generation training.
+* **Visual Reasoning Tasks**: Creating data where questions point to the image, but answers are derived from the text context.
+
+The main process of the pipeline includes:
+
+1. **Data Loading**: Reading data files containing image paths.
+2. **Context & QA Generation**: Using a VLM to analyze the image and generate a Wiki-style article with Q&A pairs.
+3. **Refining & Structuring**: Parsing the raw generated text into a structured `{context, qas}` format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_context_vqa
+cd run_context_vqa
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code provided in the "Pipeline Example" section below as `context_vqa_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+This pipeline is configured entirely through CLI arguments, so the model path and input file can be specified directly on the command line (see Step 4). First, make sure the dependencies are installed:
+
+```bash
+# Ensure dependencies are installed
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python context_vqa_pipeline.py \
+    --model_path "Qwen/Qwen2.5-VL-3B-Instruct" \
+    --images_file "path/to/your/images.jsonl" \
+    --cache_path "./cache_local"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data for this process includes the following fields:
+
+* **image**: Path to the image file (local path or URL).
+* **id** (optional): Unique identifier for the data entry.
+
+Data is managed via `FileStorage`, which supports resuming from checkpoints.
+
+**Input Data Example**:
+
+```json
+[
+  {
+    "id": 1,
+    "image": "./images/landmark.jpg"
+  },
+  {
+    "id": 2,
+    "image": "./images/animal.jpg"
+  }
+]
+
+```
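+
+If you do not already have such a file, it can be created with a few lines of Python. This is a minimal sketch; the image paths are placeholders for your own data:
+
+```python
+import json
+
+# Placeholder records; point "image" at your own files.
+records = [
+    {"id": 1, "image": "./images/landmark.jpg"},
+    {"id": 2, "image": "./images/animal.jpg"},
+]
+
+with open("images.jsonl", "w", encoding="utf-8") as f:
+    for rec in records:
+        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+```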
+
+### 2. **Core Operator Logic**
+
+The pipeline chains two core operators to complete the task:
+
+#### A. **FixPromptedVQAGenerator (Context Generation)**
+
+This operator uses the VLM model to generate raw text based on a predefined Prompt template.
+
+**Functionality:**
+
+* Generates a Wikipedia-style article based on the image.
+* Generates Q&A pairs based on the article.
+* **Prompt Constraints**: Questions refer to the image but avoid naming objects directly; answers must come from the text and not be objects in the image; answers must be concise.
+
+**Model Serving Configuration**:
+
+```python
+self.serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path=model_path,
+    hf_cache_dir=hf_cache_dir,
+    vllm_tensor_parallel_size=1,
+    vllm_temperature=0.7,  # Maintains some creativity
+    vllm_top_p=0.9,
+    vllm_max_tokens=512,
+)
+
+```
+
+**Operator Execution**:
+
+```python
+self.vqa_generator.run(
+    storage=self.storage.step(),
+    input_image_key="image",
+    output_answer_key="vqa"  # Outputs raw generated text
+)
+
+```
+
+#### B. **WikiQARefiner (Refining)**
+
+This operator cleans and converts the unstructured text generated by the VLM into a standard format.
+
+**Functionality:**
+
+* Cleans Markdown formatting and excess whitespace.
+* Separates the article content (Context) from the Q&A pairs (QAs).
+
+**Operator Execution**:
+
+```python
+self.refiner.run(
+    storage=self.storage.step(),
+    input_key="vqa",          # Input raw text from previous step
+    output_key="context_vqa"  # Output final structured data
+)
+
+```
+
+### 3. **Output Data**
+
+Finally, the output data generated by the pipeline will contain:
+
+* **image**: Original image path.
+* **vqa**: Raw text generated by the VLM (intermediate result).
+* **context_vqa**: Structured final result containing `context` (article) and `qas` (list of Q&A).
+
+**Output Data Example**:
+
+```json
+{
+  "id": 1,
+  "image": "./images/landmark.jpg",
+  "context_vqa": {
+    "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France...",
+    "qas": [
+      {
+        "question": "In which city is this structure located?",
+        "answer": "Paris"
+      },
+      {
+        "question": "What material is the tower primarily constructed from?",
+        "answer": "wrought-iron"
+      }
+    ]
+  }
+}
+
+```
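+
+Downstream code can consume the structured field directly. A minimal sketch for flattening the Q&A pairs, assuming the pipeline's jsonl cache as input (the file name is a placeholder):
+
+```python
+import json
+
+# Placeholder path; use the cache file written by the pipeline.
+with open("./cache_local/context_vqa_step2.jsonl", encoding="utf-8") as f:
+    for line in f:
+        row = json.loads(line)
+        context = row["context_vqa"]["context"]
+        for qa in row["context_vqa"]["qas"]:
+            print(qa["question"], "->", qa["answer"])
+```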
+ """ + ) + + # Refine and structure results + self.refiner = WikiQARefiner() + + # ------------------------------------------------------------------ # + def forward(self): + input_image_key = "image" + output_answer_key = "vqa" + output_wiki_key = "context_vqa" + + # Step 1: Generate raw text + self.vqa_generator.run( + storage=self.storage.step(), + input_image_key=input_image_key, + output_answer_key=output_answer_key + ) + + # Step 2: Parse into structured data + self.refiner.run( + storage=self.storage.step(), + input_key=output_answer_key, + output_key=output_wiki_key + ) + +# ---------------------------- CLI Entry -------------------------------- # +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow") + + parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") + parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") + parser.add_argument("--download_dir", default="./ckpt") + parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") + + parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") + parser.add_argument("--cache_path", default="./cache_local") + parser.add_argument("--file_name_prefix", default="context_vqa") + parser.add_argument("--cache_type", default="jsonl") + + args = parser.parse_args() + + pipe = ContextVQAPipeline( + model_path=args.model_path, + hf_cache_dir=args.hf_cache_dir, + download_dir=args.download_dir, + device=args.device, + first_entry_file=args.images_file, + cache_path=args.cache_path, + file_name_prefix=args.file_name_prefix, + cache_type=args.cache_type, + ) + pipe.forward() + +``` \ No newline at end of file diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md new file mode 100644 index 00000000..85a41201 --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md @@ -0,0 +1,340 @@ +--- +title: Image Grounded CoT (GCoT) Pipeline +icon: mdi:image-text +createTime: 2026/01/11 20:44:55 +permalink: /en/mm_guide/image_gcot/ +--- +## 1. Overview + +The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data. + +Unlike traditional methods, this pipeline uses a **Single VLM (e.g., Qwen2.5-VL)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient. + +We support the following application scenarios: + +* **Enhanced Multimodal Data Construction**: Adding interpretability and grounding annotations to VQA datasets. +* **Complex Scene Understanding**: Generating detailed reasoning steps containing object coordinates. +* **Model Reasoning Training**: Building data to train models to be "grounded" and reduce hallucinations. + +The main process of the pipeline includes: + +1. **CoT Generation**: The model generates step-by-step reasoning text and extracts key nouns. +2. **Keyword Parsing**: Cleaning and extracting keywords to be grounded from the generated text. +3. **Visual Grounding**: The model generates bounding boxes (BBoxes) for the extracted keywords. +4. 
**Information Injection**: Injecting BBox coordinates back into the reasoning text to form the final GCoT. + +--- + +## 2. Quick Start + +### Step 1: Create a Working Directory + +```bash +mkdir run_gcot +cd run_gcot + +``` + +### Step 2: Prepare the Script + +Save the code in the "Pipeline Example" section below as `image_gcot_pipeline.py`. + +### Step 3: Configure Parameters + +Ensure you have a VLM model capable of grounding (e.g., Qwen2.5-VL-7B-Instruct). + +```bash +# Install dependencies +pip install open-dataflow vllm + +``` + +### Step 4: Run + +```bash +python image_gcot_pipeline.py \ + --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ + --input_file "data/image_qa.jsonl" + +``` + +--- + +## 3. Data Flow & Logic + +### 1. **Input Data** + +The input data for this process typically consists of standard VQA data: + +* **image**: Path to the image file. +* **question**: Question about the image. +* **answer**: Standard answer to the question (used to assist CoT generation). + +**Input Data Example**: + +```json +{ + "image": "./images/cat_dog.jpg", + "question": "Is the cat looking at the dog?", + "answer": "Yes" +} + +``` + +### 2. **Core Operator Logic** + +This pipeline combines multiple fine-grained operators to achieve complex GCoT generation logic: + +#### A. **CoT Generation (PromptTemplatedVQAGenerator)** + +Uses a predefined `GCOT_PROMPT_TEMPLATE` to guide the model to generate "Step-by-step Reasoning" and a "Keyword List". + +* **Prompt Strategy**: Asks the model to output in the format `Step 1: ...`, `Step 2: ...`, `Keywords: ...`. +* **Output**: Raw string containing reasoning text and keywords. + +#### B. **Text Cleaning & Extraction (FunctionalRefiner)** + +Uses custom functions to parse the output from the previous step: + +* `extract_clean_cot_logic`: Strips the keyword section, keeping pure CoT text. +* `extract_keywords_logic`: Parses the content after `Keywords:` to generate a Python List. + +#### C. **Visual Grounding (VLMBBoxGenerator)** + +Calls the VLM's grounding capability to generate bounding boxes for each extracted keyword. + +* **Input**: Image + List of Keywords. +* **Output**: Dictionary mapping keywords to bounding box coordinates. + +#### D. **Coordinate Injection (FunctionalRefiner)** + +Uses the `inject_bboxes_logic` function to intelligently insert the generated BBox coordinates back into the original CoT text after the corresponding words. + +### 3. **Output Data** + +Finally, the output data generated by the pipeline will contain the following key fields: + +* **raw_cot_output**: Raw text generated by the model. +* **cleaned_cot**: Cleaned reasoning text. +* **bbox_mapping**: Mapping of keywords to their coordinates. +* **gcot**: Final result, reasoning chain containing coordinate information. + +**Output Data Example (gcot field)**: + +```text +Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left. +Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right. +Step 3: Observe their gaze. The cat is facing the dog. +Answer: Yes + +``` + +--- + +## 4. Pipeline Example + +Below is the complete `ImageGCoTPipeline` code implementation. 
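+
+To make the injection step concrete, here is a toy run of `inject_bboxes_logic` (defined in the pipeline code below); the coordinates are invented for illustration:
+
+```python
+cot = "Step 1: Locate the cat. It sits on the left.\nAnswer: Yes"
+bbox_map = {"cat": ["[200, 300, 400, 500]"]}
+
+# The box string is appended after the first keyword occurrence,
+# but only in the text before the 'Answer:' section:
+# "Step 1: Locate the cat [200, 300, 400, 500]. It sits on the left.\nAnswer: Yes"
+print(inject_bboxes_logic(cot, bbox_map))
+```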
+
+### 3. **Output Data**
+
+Finally, the output data generated by the pipeline will contain the following key fields:
+
+* **raw_cot_output**: Raw text generated by the model.
+* **cleaned_cot**: Cleaned reasoning text.
+* **bbox_mapping**: Mapping of keywords to their coordinates.
+* **gcot**: Final result, the reasoning chain containing coordinate information.
+
+**Output Data Example (gcot field)**:
+
+```text
+Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left.
+Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right.
+Step 3: Observe their gaze. The cat is facing the dog.
+Answer: Yes
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageGCoTPipeline` code implementation.
+
+```python
+import re
+from typing import List, Dict, Any
+import argparse
+import torch
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+
+# Prompt template that forces the model to output reasoning steps and keywords
+GCOT_PROMPT_TEMPLATE = (
+    "Question: {question}\n"
+    "Answer: {answer}\n\n"
+    "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains "
+    "how to arrive at this answer based on the image.\n"
+    "Then, extract key nouns and objects mentioned in your reasoning that are "
+    "visible in the image and can be spatially located.\n\n"
+    "Format:\n"
+    "Step 1: ...\n"
+    "Step 2: ...\n"
+    "Answer: {answer}\n"
+    "Keywords: object1, object2\n"
+)
+
+DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
+
+# ----------------- Helper logic functions ----------------- #
+
+def _parse_base(text: str) -> Dict[str, Any]:
+    """Base parsing logic: separate the CoT text from the Keywords line"""
+    if not text: return {"cot": "", "keywords": []}
+    lines = text.split('\n')
+    cot_lines = []
+    keywords = []
+    for line in lines:
+        if line.strip().lower().startswith('keywords:'):
+            keyword_str = line.split(':', 1)[-1].strip()
+            # Simple keyword tokenization
+            raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
+            keywords = [k for k in raw_kws if k]
+        else:
+            cot_lines.append(line)
+    return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
+
+def extract_clean_cot_logic(text: str) -> str:
+    return _parse_base(text)["cot"]
+
+def extract_keywords_logic(text: str) -> List[str]:
+    return _parse_base(text)["keywords"]
+
+def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
+    """Inject BBoxes back into the CoT text"""
+    if not cot_text or not bbox_map: return cot_text
+    # Match longer keywords first to avoid substring mismatches
+    sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
+    result_text = cot_text
+    replaced = set()
+
+    for keyword in sorted_keywords:
+        if keyword in replaced: continue
+        # Simple strategy: only inject before 'Answer:' to avoid corrupting the answer section
+        answer_pos = result_text.find('Answer:')
+        search_limit = answer_pos if answer_pos != -1 else len(result_text)
+
+        # Case-insensitive search
+        pos = result_text.lower().find(keyword.lower(), 0, search_limit)
+        if pos == -1: continue
+
+        boxes = bbox_map[keyword]  # List[str]
+        box_str = "".join(boxes)
+        # Replace: keep the original word and append the box
+        replacement = f"{keyword} {box_str}"
+
+        result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
+        replaced.add(keyword)
+    return result_text
+
+# ----------------- Pipeline definition ----------------- #
+
+class ImageGCoTPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        first_entry_file: str,
+        cache_path: str = "./cache_gcot",
+        file_name_prefix: str = "gcot",
+        # Key configuration
+        question_key: str = "question",
+        answer_key: str = "answer",
+        image_key: str = "image",
+        output_key: str = "gcot",
+        vllm_max_tokens: int = 512
+    ):
+        # 1. Storage initialization
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        # 2. Model serving (a single model)
+        self.vlm_serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_max_tokens=vllm_max_tokens
+        )
+
+        self.keys = {
+            "q": question_key,
+            "a": answer_key,
+            "img": image_key,
+            "raw_cot": "raw_cot_output",
+            "clean_cot": "cleaned_cot",
+            "keywords": "extracted_keywords",
+            "bbox_map": "bbox_mapping",
+            "final": output_key
+        }
+
+        # 3. Operator chain configuration
+
+        # Step A: Generate CoT and keywords
+        self.op_gen_cot = PromptTemplatedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt="You are a helpful assistant.",
+            prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
+        )
+
+        # Step B: Parse and clean the CoT
+        self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
+
+        # Step C: Parse the keywords
+        self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
+
+        # Step D: Generate BBoxes (grounding)
+        self.op_bbox_gen = VLMBBoxGenerator(
+            serving=self.vlm_serving,
+            prompt_template=DEFAULT_BBOX_PROMPT
+        )
+
+        # Step E: Inject BBoxes into the CoT
+        self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating CoT...")
+        self.op_gen_cot.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_cot"],
+            question=self.keys["q"],
+            answer=self.keys["a"]
+        )
+
+        print(">>> [Pipeline] Step 2: Parsing Outputs...")
+        self.op_extract_cot.run(
+            self.storage.step(),
+            output_key=self.keys["clean_cot"],
+            text=self.keys["raw_cot"]
+        )
+        self.op_extract_kws.run(
+            self.storage.step(),
+            output_key=self.keys["keywords"],
+            text=self.keys["raw_cot"]
+        )
+
+        print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...")
+        self.op_bbox_gen.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            input_kws_key=self.keys["keywords"],
+            output_key=self.keys["bbox_map"]
+        )
+
+        print(">>> [Pipeline] Step 4: Injecting GCoT...")
+        self.op_inject.run(
+            self.storage.step(),
+            output_key=self.keys["final"],
+            cot_text=self.keys["clean_cot"],
+            bbox_map=self.keys["bbox_map"]
+        )
+
+        print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+
+    args = parser.parse_args()
+
+    pipe = ImageGCoTPipeline(
+        model_path=args.model_path,
+        first_entry_file=args.input_file
+    )
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
new file mode 100644
index 00000000..a033dcf1
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -0,0 +1,276 @@
+---
+title: Image Region Captioning Pipeline
+createTime: 2026/01/11 22:04:27
+icon: mdi:image-text
+permalink: /en/mm_guide/image_region_caption_pipeline/
+---
+## 1. Overview
+
+The **Image Region Captioning Pipeline** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them.
+
+This pipeline supports processing **pre-defined Bounding Box** data, visualizing these boxes, and then feeding them into a VLM for caption generation.
+
+We support the following application scenarios:
+
+* **Dense Captioning**: Generating descriptions for multiple objects within a single image.
+* **Fine-grained Image Understanding**: Focusing on local details rather than global descriptions.
+* **Dataset Augmentation**: Constructing image-text pair datasets that include localization information.
+
+The main process of the pipeline includes:
+
+1. **Data Loading**: Reading source data containing image paths and bounding box information.
+2. **BBox Processing & Visualization**: Processing input bounding boxes and generating a version of the image with visual markers (e.g., drawn boxes).
+3. **Region Caption Generation**: Using a VLM to generate text descriptions based on the marked images or specific regions.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_region_caption
+cd run_region_caption
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `region_caption_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the input file (jsonl) contains the `image` and `bbox` fields, and install the dependencies:
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python region_caption_pipeline.py \
+    --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+    --first_entry_file "data/region_captions.jsonl" \
+    --output_jsonl_path "data/results.jsonl"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data typically contains the image path and a list of corresponding bounding boxes:
+
+* **image**: Path to the image file.
+* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` or `[[x1, y1, x2, y2], ...]` format (depending on configuration).
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/kitchen.jpg",
+  "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline chains two core operators to complete the task (a standalone usage sketch follows this section):
+
+#### A. **ImageBboxGenerator**
+
+This operator handles the vision-level tasks.
+
+* **Input**: Raw image + `bbox` data.
+* **Functionality**: Reads bounding boxes and draws them onto the image (visualization) or preprocesses them according to configuration.
+* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls parameters like `max_boxes` and visualization options (`draw_visualization`).
+* **Output**: Generates a new image path containing visual markers (`image_with_bbox`).
+
+#### B. **PromptedVQAGenerator**
+
+This operator is responsible for generating text using the VLM.
+
+* **Input**: The `image_with_bbox` generated in the previous step.
+* **Functionality**: The VLM receives the marked image and generates descriptions for the corresponding regions based on prompts.
+* **Output**: Region description text.
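+
+If you only need the localization/visualization stage, the first operator can also be run on its own. A minimal sketch using the same classes as the full pipeline below (all paths are placeholders):
+
+```python
+from dataflow.operators.core_vision.generate.image_bbox_generator import (
+    ImageBboxGenerator,
+    ExistingBBoxDataGenConfig,
+)
+from dataflow.utils.storage import FileStorage
+
+cfg = ExistingBBoxDataGenConfig(
+    max_boxes=10,
+    input_jsonl_path="data/region_captions.jsonl",
+    output_jsonl_path="data/image_with_bbox.jsonl",
+)
+storage = FileStorage(
+    first_entry_file_name="data/region_captions.jsonl",
+    cache_path="./cache",
+    file_name_prefix="bbox_only",
+    cache_type="jsonl",
+)
+
+# Draw the provided boxes and record the visualized image path.
+ImageBboxGenerator(config=cfg).run(
+    storage=storage.step(),
+    input_image_key="image",
+    input_bbox_key="bbox",
+    output_key="image_with_bbox",
+)
+```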
+
+### 3. **Output Data**
+
+The final output data will contain the processed image path and the generated descriptions:
+
+* **image_with_bbox**: Path to the image with drawn boxes.
+* **mdvp_record**: List of generated region descriptions.
+
+**Output Data Example**:
+
+```json
+{
+  "image": "./images/kitchen.jpg",
+  "image_with_bbox": "./images/kitchen_visualized.jpg",
+  "mdvp_record": [
+    "A wooden chair located near the table.",
+    "A white refrigerator in the background."
+  ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageRegionCaptioningPipeline` code implementation.
+
+```python
+import argparse
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+from dataflow.operators.core_vision.generate.image_bbox_generator import (
+    ImageBboxGenerator,
+    ExistingBBoxDataGenConfig
+)
+from dataflow.operators.core_vision.generate.prompted_vqa_generator import (
+    PromptedVQAGenerator
+)
+from dataflow.utils.storage import FileStorage
+
+
+class ImageRegionCaptioningPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
+        device: str = "cuda",
+        # Storage & Paths
+        first_entry_file: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
+        cache_path: str = "./dataflow/example/cache",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        image_with_bbox_path: str = 'image_with_bbox',  # Key for intermediate image
+        output_key: str = "mdvp_record",
+        # BBox Config
+        max_boxes: int = 10,
+        input_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
+        output_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl",
+        output_image_with_bbox_path: str = "./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl",
+        draw_visualization: bool = True
+    ):
+        # 1. Initialize storage
+        # Storage for the BBox generation stage
+        self.bbox_storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+
+        # 2. Configure the BBox generator
+        self.cfg = ExistingBBoxDataGenConfig(
+            max_boxes=max_boxes,
+            input_jsonl_path=input_jsonl_path,
+            output_jsonl_path=output_image_with_bbox_path,
+        )
+
+        # 3. Initialize storage for the captioning stage
+        # Note: this picks up the output path of the previous step
+        self.caption_storage = FileStorage(
+            first_entry_file_name=output_image_with_bbox_path,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+
+        # 4. Initialize the VLM serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_top_p=0.9,
+            vllm_max_tokens=512,
+        )
+
+        # 5. Initialize the core operators
+        self.bbox_generator = ImageBboxGenerator(config=self.cfg)
+        self.caption_generator = PromptedVQAGenerator(serving=self.serving)
+
+        self.input_image_key = input_image_key
+        self.input_bbox_key = input_bbox_key
+        self.output_key = output_key
+        self.image_with_bbox_path = image_with_bbox_path
+
+    def forward(self):
+        # Step 1: Generate images with visualized BBoxes
+        print(">>> [Pipeline] Step 1: Processing BBoxes and Visualizing...")
+        self.bbox_generator.run(
+            storage=self.bbox_storage.step(),
+            input_image_key=self.input_image_key,
+            input_bbox_key=self.input_bbox_key,
+            output_key=self.image_with_bbox_path,
+        )
+
+        # Step 2: Generate captions from the visualized images
+        print(">>> [Pipeline] Step 2: Generating Region Captions...")
+        self.caption_generator.run(
+            storage=self.caption_storage.step(),
+            input_image_key='image_with_bbox'  # Use the boxed images produced in the previous step
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Image region captioning with DataFlow")
+
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
+    parser.add_argument("--download_dir", default="./ckpt/models")
+    parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
+
+    parser.add_argument("--first_entry_file", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
+    parser.add_argument("--cache_path", default="./dataflow/example/cache")
+    parser.add_argument("--file_name_prefix", default="region_caption")
+    parser.add_argument("--cache_type", default="jsonl")
+
+    parser.add_argument("--input_image_key", default="image")
+    parser.add_argument("--input_bbox_key", default="bbox")
+    parser.add_argument("--output_key", default="mdvp_record")
+
+    parser.add_argument("--max_boxes", type=int, default=10)
+    parser.add_argument("--input_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
+    parser.add_argument("--output_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl")
+    parser.add_argument("--output_image_with_bbox_path", default="./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl")
+    parser.add_argument("--draw_visualization", type=bool, default=True)
+
+    args = parser.parse_args()
+
+    pipe = ImageRegionCaptioningPipeline(
+        model_path=args.model_path,
+        hf_cache_dir=args.hf_cache_dir,
+        download_dir=args.download_dir,
+        device=args.device,
+        first_entry_file=args.first_entry_file,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        cache_type=args.cache_type,
+        input_image_key=args.input_image_key,
+        input_bbox_key=args.input_bbox_key,
+        output_key=args.output_key,
+        max_boxes=args.max_boxes,
+        input_jsonl_path=args.input_jsonl_path,
+        output_image_with_bbox_path=args.output_image_with_bbox_path,
+        draw_visualization=args.draw_visualization
+    )
+    pipe.forward()
+
+```
+
diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
new file mode 100644
index 00000000..8dc770ae
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
@@ -0,0 +1,367 @@
+---
+title: ScaleCap High-Density Captioning Pipeline
+createTime: 2026/01/11 22:08:57
+icon: mdi:image-text
+permalink: /en/mm_guide/image_scale_caption_pipeline/
+---
+## 1. Overview
+
+The **ScaleCap High-Density Captioning Pipeline** implements an advanced **"Generate-Verify-Expand-Fuse"** paradigm for image captioning. This pipeline is designed to generate captions of **extremely high information density** with **minimal hallucinations**, making it ideal for scenarios requiring deep understanding of image details.
+
+Based on the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*, this method progressively mines object and position details through multi-turn dialogue and visual self-verification (Visual Grounding), filtering out hallucinations along the way.
+
+We support the following application scenarios:
+
+* **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions.
+* **Fine-Grained Image Retrieval**: Providing index text rich in detail.
+* **Accessibility/Blind Assistance**: Generating "What You See Is What You Get" (WYSIWYG) detailed narrations.
+
+The main process of the pipeline includes:
+
+1. **Initial Caption Generation**: The VLM generates a baseline description.
+2. **Visual Debiasing**: Splitting the description into sentences and verifying each sentence against visual evidence (Visual Grounding).
+3. **Detail Expansion**: Generating follow-up questions about object attributes and positions based on the verified "Golden Sentences".
+4. **Answering & Re-verification**: The VLM answers the questions and performs another round of visual grounding to filter incorrect details.
+5. **Final Fusion**: Merging all verified information into a coherent, long description.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_scalecap
+cd run_scalecap
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `scalecap_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the VLM model path (e.g., Qwen2.5-VL) is correct, and install the dependencies:
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python scalecap_pipeline.py \
+    --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+    --input_jsonl "data/images.jsonl" \
+    --output_key "final_caption"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data requires only the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/complex_scene.jpg"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline is a complex orchestration of multiple atomic operators:
+
+#### A. **Initial Generation (PromptedVQAGenerator)**
+
+* **Function**: Generates a preliminary description (`init_caption`) of the image using a basic prompt.
+
+#### B. **Visual Debiasing (VisualGroundingRefiner)**
+
+* **Function**: The core anti-hallucination mechanism of ScaleCap.
+* **Logic**:
+  1. Uses `split_sentences` to break the draft into single sentences.
+  2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?".
+  3. Keeps only sentences where the answer is "Yes", forming the **"Golden Sentences"**.
+
+#### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)**
+
+* **Function**: Generates targeted follow-up questions based on the Golden Sentences using LLM capabilities.
+* **Logic**: The model generates text like "Describe more details about the [Object]", which is then automatically expanded into **Object Detail** and **Positional Relation** questions via `parse_questions_logic`.
+
+#### D. **Batch Answering & Refiltering (BatchVQAGenerator & Refiner)**
+
+* **Function**: Mining deep image information.
+* **Logic**:
+  1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a batch.
+  2. Uses `VisualGroundingRefiner` again to check whether these new details are accurate.
+  3. Retains the reliable details (`final_details`).
+
+#### E. **Final Fusion (PromptTemplatedQAGenerator)**
+
+* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent text.
+* **Output**: `final_caption`.
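+
+Both verification passes share the same shape: fill a yes/no template per candidate sentence and keep only the confirmed ones. The following stand-in sketch mirrors what `VisualGroundingRefiner` does conceptually (`ask_vlm` is a hypothetical callable, not a DataFlow API):
+
+```python
+from typing import Callable, List
+
+def keep_grounded(sentences: List[str], ask_vlm: Callable[[str], str]) -> List[str]:
+    """Keep only sentences the VLM confirms with a 'yes' (stand-in logic)."""
+    template = (
+        "Given the image, is the description '{text}' directly supported "
+        "by visual evidence? Answer strictly yes or no."
+    )
+    return [
+        s for s in sentences
+        if ask_vlm(template.format(text=s)).strip().lower().startswith("yes")
+    ]
+```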
+
+### 3. **Output Data**
+
+The output data records the entire pipeline process, facilitating debugging and analysis:
+
+* **init_caption**: Raw generated draft.
+* **golden_sentences**: List of sentences that passed the first check.
+* **q_list**: List of generated follow-up questions.
+* **final_details**: Detailed answers that passed the second check.
+* **final_caption**: The final high-density description.
+
+**Output Data Example**:
+
+```json
+{
+  "image": "./images/complex_scene.jpg",
+  "init_caption": "A dog sitting on a bench.",
+  "golden_sentences": ["A dog is sitting on a wooden bench."],
+  "q_list": ["Describe more details about the dog.", "Describe position of the bench."],
+  "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
+  "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..."
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageScaleCaptionPipeline` code implementation.
+
+```python
+import re
+import argparse
+from typing import Callable, Any, List
+
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.prompts.image import ImageScaleCaptionPrompt
+from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
+from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+
+# ----------------- Helper logic functions ----------------- #
+# NOTE: minimal sketches; the exact implementations are assumptions and can
+# be swapped for more robust splitting/parsing.
+
+def split_sentences(text: str) -> List[str]:
+    """Split a caption draft into sentences on ., !, ? boundaries (assumed implementation)."""
+    if not text:
+        return []
+    parts = re.split(r"(?<=[.!?])\s+", text.strip())
+    return [p.strip() for p in parts if p.strip()]
+
+def join_list(data: List[str]) -> str:
+    """Join a list of strings into a newline-separated block."""
+    return "\n".join(data or [])
+
+def parse_questions_logic(text: str) -> List[str]:
+    """Parse the LLM's question-generation output into a flat question list
+    (assumed format: one question per line)."""
+    if not text:
+        return []
+    return [ln.strip(" -*") for ln in text.splitlines() if ln.strip()]
+
+
+class ImageScaleCaptionPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
+        device: str = "cuda",
+        # Storage params
+        first_entry_file: str = "images.jsonl",
+        cache_path: str = "./cache_scalecap",
+        file_name_prefix: str = "scalecap",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_caption",
+        # VLLM Config
+        vllm_tensor_parallel_size: int = 1,
+        vllm_temperature: float = 0.7,
+        vllm_top_p: float = 0.9,
+        vllm_max_tokens: int = 512,
+    ):
+        # 1. Storage
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type,
+        )
+
+        # 2. Serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
+            vllm_tensor_parallel_size=vllm_tensor_parallel_size,
+            vllm_temperature=vllm_temperature,
+            vllm_top_p=vllm_top_p,
+            vllm_max_tokens=vllm_max_tokens,
+        )
+
+        # 3. Prompts
+        self.prompts_db = ImageScaleCaptionPrompt().build_prompt()
+
+        # 4. Keys
+        self.input_image_key = input_image_key
+        self.output_key = output_key
+
+        # ================== Operator Initialization ==================
+
+        # --- Step A: Generate Init Caption ---
+        self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+        self.gen_init_caption = PromptedVQAGenerator(
+            serving=self.serving,
+            system_prompt="You are a helpful assistant."
+        )
+
+        # --- Step B: Refine Golden Sentences ---
+        self.refine_split = FunctionalRefiner(func=split_sentences)
+        # Visual self-check (keep sentences answered 'Yes')
+        self.refine_golden = VisualGroundingRefiner(
+            serving=self.serving,
+            prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no."
+        )
+
+        # --- Step C: Generate Questions ---
+        self.refine_join = FunctionalRefiner(func=join_list)
+        tpl_q = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_1"],
+            join_list_with="\n"
+        )
+        self.gen_questions_text = PromptTemplatedQAGenerator(
+            serving=self.serving,
+            prompt_template=tpl_q
+        )
+        self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
+
+        # --- Step D: Generate Answers ---
+        self.gen_answers = BatchVQAGenerator(serving=self.serving)
+        self.refine_answers = VisualGroundingRefiner(
+            serving=self.serving,
+            prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
+        )
+
+        # --- Step E: Integrate Final Caption ---
+        tpl_final = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_4"],
+            join_list_with="\n"
+        )
+        self.gen_final_caption = PromptTemplatedQAGenerator(
+            serving=self.serving,
+            prompt_template=tpl_final
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 0: Preparing Prompts...")
+        self.refine_const_prompt.run(
+            self.storage.step(),
+            output_key="init_prompt"
+        )
+
+        print(">>> [Pipeline] Step 1: Generating Initial Caption...")
+        self.gen_init_caption.run(
+            self.storage.step(),
+            input_prompt_key="init_prompt",
+            input_image_key=self.input_image_key,
+            output_answer_key="init_caption"
+        )
+
+        print(">>> [Pipeline] Step 2: Refining Golden Sentences...")
+        self.refine_split.run(
+            self.storage.step(),
+            output_key="sentences",
+            text="init_caption"
+        )
+        self.refine_golden.run(
+            self.storage.step(),
+            input_list_key="sentences",
+            input_image_key=self.input_image_key,
+            output_key="golden_sentences"
+        )
+
+        print(">>> [Pipeline] Step 3: Generating Details Questions...")
+        self.refine_join.run(
+            self.storage.step(),
+            output_key="golden_str",
+            data="golden_sentences"
+        )
+        self.gen_questions_text.run(
+            self.storage.step(),
+            output_answer_key="raw_q_text",
+            sentence="golden_str"
+        )
+        self.refine_parse_qs.run(
+            self.storage.step(),
+            output_key="q_list",
+            text="raw_q_text"
+        )
+
+        print(">>> [Pipeline] Step 4: Generating & Filtering Answers...")
+        self.gen_answers.run(
+            self.storage.step(),
+            input_prompts_key="q_list",
+            input_image_key=self.input_image_key,
+            output_key="raw_answers"
+        )
+        self.refine_answers.run(
+            self.storage.step(),
+            input_list_key="raw_answers",
+            input_image_key=self.input_image_key,
+            output_key="final_details"
+        )
+
+        print(">>> [Pipeline] Step 5: Integrating Final Caption...")
+        self.refine_join.run(
+            self.storage.step(),
+            output_key="details_str",
+            data="final_details"
+        )
+        self.gen_final_caption.run(
+            self.storage.step(),
+            output_answer_key=self.output_key,
+            context="golden_str",
+            object_info="details_str",
+            position_info="details_str"
+        )
+
+        print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline")
+
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
+    parser.add_argument("--download_dir", default="./ckpt/models")
+    parser.add_argument("--device", default="cuda")
+
+    parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
+    parser.add_argument("--cache_path", default="./cache_scalecap_results")
+    parser.add_argument("--file_name_prefix", default="scalecap")
+    parser.add_argument("--input_image_key", default="image")
+    parser.add_argument("--output_key", default="final_caption")
+
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--max_tokens", type=int, default=1024)
+
+    args = parser.parse_args()
+
+    pipe = ImageScaleCaptionPipeline(
+        model_path=args.model_path,
+        hf_cache_dir=args.hf_cache_dir,
+        download_dir=args.download_dir,
+        device=args.device,
+        first_entry_file=args.input_jsonl,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        input_image_key=args.input_image_key,
+        output_key=args.output_key,
+        vllm_tensor_parallel_size=args.tp,
+        vllm_max_tokens=args.max_tokens
+    )
+
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
new file mode 100644
index 00000000..cc3806af
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
@@ -0,0 +1,325 @@
+---
+title: Visual-Only MCQ Pipeline
+createTime: 2026/01/11 22:13:45
+icon: mdi:image-text
+permalink: /en/mm_guide/image_visual_only_mcq_pipeline/
+---
+## 1. Overview
+
+The **Visual-Only MCQ Pipeline** is a core component of the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple Choice Questions (MCQs) that satisfy **strict visual dependency**: the model must "see" the image to answer correctly; answering from the text alone (guessing or common sense) is not possible.
+
+This pipeline uses a **Generate-Parse-Verify** three-step method, leveraging **Option Rotation** and **Blind Tests** to rigorously filter out hallucinations and overly simple questions. The generated questions serve as a robust reward signal for Reinforcement Learning.
+
+The main process includes:
+
+1. **MCQ Generation**: The VLM generates raw QA pairs based on the image.
+2. **Structured Parsing**: Using regex logic to parse the text into standard question/option structures.
+3. **Visual Dependency Verification**:
+   * **Rotation Test**: Shuffling options multiple times to eliminate positional bias.
+   * **Dual Filtering**: Requiring high "Visual Accuracy" and low "Text-only Accuracy".
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create Working Directory
+
+```bash
+mkdir run_vis_mcq
+cd run_vis_mcq
+
+```
+
+### Step 2: Prepare Script
+
+Save the code in the "Pipeline Example" section below as `visual_mcq_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Filtering thresholds are controlled via the CLI; for example, the Step 4 command below requires 100% visual accuracy and at most 25% blind accuracy. First install the dependencies:
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python visual_mcq_pipeline.py \
+    --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+    --input_file "data/captions.jsonl" \
+    --rotate_num 4 \
+    --pass_vis 1.0 \
+    --pass_txt 0.25
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input only requires the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/sample_01.jpg"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline chains three key operators:
+
+#### A. **FixPromptedVQAGenerator (Raw Generation)**
+
+* **Function**: Uses CapRL predefined Prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to generate 5 MCQs at once.
+* **Output**: Unstructured text block containing multiple `#### Question` blocks and options.
+
+#### B. **FunctionalRefiner (Regex Parsing)**
+
+* **Logic Function**: `parse_mcq_text_logic`
+* **Function**: Extracts questions, options (A-F), and correct answers from the raw text using regex.
+* **Output**: Structured MCQ list (`parsed_mcq_list`).
+
+#### C. **VisualDependencyRefiner (Dependency Verification)**
+
+This is the core filter. It performs N inferences (N = `rotate_num`) for each question:
+
+1. **Option Rotation**: Randomly shuffles options (e.g., moving the answer from A to C) to prevent the model from cheating by "always picking A".
+2. **Visual Pass**: Input Image + Question. Records the model's accuracy.
+3. **Textual Pass**: Input Question only (no image). Records the model's blind guessing accuracy.
+4. **Filtering Criteria** (see the sketch below):
+   * Keep the question IF AND ONLY IF: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
+   * *Example*: If a question can be answered correctly without the image (high text accuracy), it tests common sense rather than vision, so it is **discarded**.
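+
+The dual-filtering rule reduces to a single predicate over the two measured accuracies. A minimal sketch mirroring the thresholds passed to `VisualDependencyRefiner` below:
+
+```python
+def keeps_question(visual_acc: float, text_acc: float,
+                   pass_visual_min: float = 1.0,
+                   pass_textual_max: float = 0.25) -> bool:
+    """A question survives only if it is reliably answered WITH the image
+    and near chance level WITHOUT it."""
+    return visual_acc >= pass_visual_min and text_acc <= pass_textual_max
+
+# rotate_num=4: 4/4 correct with the image, 0/4 without -> kept.
+assert keeps_question(1.0, 0.0)
+# Answerable blind (common-sense question) -> discarded.
+assert not keeps_question(1.0, 0.75)
+```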
+
+### 3. **Output Data**
+
+The output data (`final_mcqs`) contains only questions that passed the rigorous verification. These questions possess high quality and visual relevance.
+
+**Output Data Example**:
+
+```json
+{
+  "image": "./images/sample_01.jpg",
+  "final_mcqs": [
+    {
+      "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
+      "answer": "A",
+      "stats": {
+        "visual_acc": 1.0,  # 4/4 correct with image
+        "text_acc": 0.0     # 0/4 correct without image
+      }
+    }
+  ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisualOnlyMCQPipeline` code implementation.
+
+```python
+import argparse
+import re
+from typing import List, Dict, Any
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.image import ImageCaprlPrompt
+
+# Regex parsing logic
+_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
+_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
+_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
+
+def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
+    """Parse the raw text generated by the VLM into a list of structured dicts"""
+    if not mcq_text or not isinstance(mcq_text, str): return []
+
+    indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
+    if not indices: return []
+    indices.append(len(mcq_text))
+    blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)]
+
+    parsed = []
+    for block in blocks:
+        lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()]
+        q_title_m = _Q_BLOCK_SPLIT.search(block)
+        if not q_title_m: continue
+
+        q_title = q_title_m.group(1).strip()
+        options = {}
+        ans_letter, ans_text = None, None
+
+        for ln in lines:
+            m_opt = _OPT_LINE_RE.match(ln)
+            if m_opt:
+                options[m_opt.group(1)] = m_opt.group(2).strip()
+                continue
+            m_ans = _ANS_LINE_RE.match(ln)
+            if m_ans:
+                ans_letter = m_ans.group(1).upper()
+                ans_text = m_ans.group(2).strip()
+                break
+
+        if options and ans_letter and ans_letter in options:
+            q_lines = [q_title]
+            for lbl in ["A", "B", "C", "D", "E", "F"]:
+                if lbl in options:
+                    q_lines.append(f" - {lbl}) {options[lbl]}")
+
+            parsed.append({
+                "question": "\n".join(q_lines),
+                "question_title": q_title,
+                "options": options,
+                "answer": ans_letter,
+                "answer_text": ans_text
+            })
+
+    if expected > 0:
+        parsed = parsed[:expected]
+
+    uniq = []
+    seen = set()
+    for it in parsed:
+        key = (it["question_title"], it["answer"])
+        if key not in seen:
+            seen.add(key)
+            uniq.append(it)
+    return uniq
+
+
+class VisualOnlyMCQPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        first_entry_file: str,
+        cache_path: str = "./cache_mcq",
+        file_name_prefix: str = "vis_mcq",
+        # Config
+        rotate_num: int = 4,
+        pass_visual_min: float = 1.0,
+        pass_textual_max: float = 0.25,
+        add_none_above: bool = True,
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_mcqs",
+        # VLLM
+        device: str = "cuda",
+        vllm_max_tokens: int = 2048
+    ):
+        # 1. Initialize storage
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        # 2. Initialize the VLM serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.1,  # Low temperature for stable output formatting
+            vllm_max_tokens=vllm_max_tokens
+        )
+
+        # Key configuration
+        self.keys = {
+            "img": input_image_key,
+            "raw_text": "raw_mcq_text",
+            "parsed_list": "parsed_mcq_list",
+            "final": output_key
+        }
+
+        # Load the prompt library
+        self.prompts_db = ImageCaprlPrompt().build_prompt()
+
+        # ================== Operator initialization ==================
+
+        # Operator 1: Generate raw MCQ text
+        self.op_gen_raw = FixPromptedVQAGenerator(
+            serving=self.serving,
+            system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
+            user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
+        )
+
+        # Operator 2: Parse text into structured data
+        self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
+
+        # Operator 3: Visual dependency verification (core filter)
+        # Includes option rotation and the text-only (no-image) check
+        self.op_verify = VisualDependencyRefiner(
+            serving=self.serving,
+            instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
+            rotate_num=rotate_num,
+            pass_visual_min=pass_visual_min,
+            pass_textual_max=pass_textual_max,
+            add_none_above_visual=add_none_above
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...")
+        self.op_gen_raw.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_text"]
+        )
+
+        print(">>> [Pipeline] Step 2: Parsing MCQs...")
+        self.op_parse.run(
+            self.storage.step(),
+            output_key=self.keys["parsed_list"],
+            mcq_text=self.keys["raw_text"],
+            expected=5
+        )
+
+        print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...")
+        self.op_verify.run(
+            self.storage.step(),
+            input_list_key=self.keys["parsed_list"],
+            input_image_key=self.keys["img"],
+            output_key=self.keys["final"]
+        )
+
+        print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--rotate_num", type=int, default=4)
+    parser.add_argument("--pass_vis", type=float, default=1.0)
+    parser.add_argument("--pass_txt", type=float, default=0.25)
+
+    args = parser.parse_args()
+
+    pipe = VisualOnlyMCQPipeline(
+        model_path=args.model_path,
+        first_entry_file=args.input_file,
+        rotate_num=args.rotate_num,
+        pass_visual_min=args.pass_vis,
+        pass_textual_max=args.pass_txt
+    )
+    pipe.forward()
+
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
new file mode 100644
index 00000000..0904691e
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
@@ -0,0 +1,230 @@
+---
+title: Vision MCTS Reasoning Pipeline
+icon: mdi:image-text
+createTime: 2026/01/11 21:59:59
+permalink: /en/mm_guide/vision_mct_reasoning_pipeline/
+---
+
+## 1. Overview
+
+The **Vision MCTS Reasoning Pipeline** is designed to construct high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two types of data sources: existing Monte Carlo Tree Search (MCTS) trajectory data, or direct generation of new reasoning chains using a VLM.
+
+This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**, converting complex tree-search processes into a linearized `......` format that models can learn from.
+
+We support the following application scenarios:
+
+* **MCTS Data Extraction**: Converting high-value paths (Rollouts) from search trees into linear training data.
+* **Hybrid Data Construction**: Automatically falling back to VLM-based CoT generation for samples without search trees.
+* **Spatial Reasoning Enhancement**: Supporting the generation of spatial reasoning chains containing explicit coordinates (Bounding Boxes).
+
+The main process of the pipeline includes:
+
+1. **MCTS Tree Parsing**: Parsing the search tree structure in the input data to extract successful reasoning paths.
+2. **Visual Reasoning Generation (Fallback)**: Using a VLM to regenerate reasoning chains for samples where the tree structure is missing or parsing fails.
+3. **Data Standardization**: Outputting reasoning chain data in a unified format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_mcts_reasoning
+cd run_mcts_reasoning
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `vision_mcts_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the input file (jsonl) contains the `tree` field (for extraction) or just `question`/`image` (for generation).
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python vision_mcts_pipeline.py \
+  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+  --input_file "data/mcts_trajectories.jsonl" \
+  --prompt_type "spatial"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input data typically comes from MCTS search logs or unlabelled image-text pairs:
+
+* **image**: Path to the image.
+* **question**: Visual question.
+* **tree** (optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions.
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/puzzle.jpg",
+  "question": "What is the next step to solve this?",
+  "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } }
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+The pipeline employs an **"Extract First, Fallback to Generate"** hybrid strategy:
+
+#### A. **MCTSTreeRefiner**
+
+This operator is responsible for processing the `tree` field. It traverses the tree structure and filters for the best paths from root to leaf based on node Q-values.
+
+* **Input**: `tree` object.
+* **Functionality**: Linearizes tree paths, filtering out low-value or incomplete search branches.
+* **Output**: List of extracted reasoning chains (`mcts_chains`).
+
+#### B. **VisualReasoningGenerator**
+
+This operator is the "Generation Engine" of the pipeline. It takes the extraction results from the previous step as input.
+
+* **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`); see the sketch after this list.
+  * If MCTS parsing was successful (chains exist), it reuses them directly without running inference (saving compute).
+  * If MCTS chains are empty (tree missing or parsing failed), it calls the VLM to generate reasoning chains from scratch based on the `prompt_type`.
+* **Prompt Type**: Supports modes like `spatial` (spatial coordinate reasoning), `logical` (logical reasoning), etc.
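+
+A minimal sketch of that per-sample decision (with `generate_with_vlm` as a hypothetical stand-in for the VLM call, not the operator's actual internals):
+
+```python
+from typing import Callable, List, Optional
+
+def resolve_chains(
+    existing_chains: Optional[List[str]],
+    question: str,
+    image: str,
+    generate_with_vlm: Callable[[str, str], List[str]],
+) -> List[str]:
+    # Reuse chains recovered from the MCTS tree when present (zero inference cost)
+    if existing_chains:
+        return existing_chains
+    # Otherwise fall back to generating reasoning chains with the VLM
+    return generate_with_vlm(question, image)
+```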
+
+### 3. **Output Data**
+
+The final output data (`final_reasoning_chains`) will contain high-quality Chain-of-Thought data ready for SFT training.
+
+**Output Example**:
+
+```json
+{
+  "image": "./images/puzzle.jpg",
+  "final_reasoning_chains": [
+    "First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...Move Red Block"
+  ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisionMCTSReasoningPipeline` code implementation.
+
+```python
+import argparse
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+# Import the atomic operators
+from dataflow.operators.core_text import MCTSTreeRefiner
+from dataflow.operators.core_vision import VisualReasoningGenerator
+
+class VisionMCTSReasoningPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        # Storage
+        first_entry_file: str,
+        cache_path: str = "./cache_mcts",
+        file_name_prefix: str = "mcts_reason",
+        # Config
+        prompt_type: str = "spatial",
+        max_samples_per_file: int = 10000,
+        # Keys
+        input_question_key: str = "question",
+        input_image_key: str = "image",
+        input_tree_key: str = "tree",
+        output_key: str = "final_reasoning_chains",
+        # VLLM
+        vllm_max_tokens: int = 1024
+    ):
+        # 1. Storage initialization
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        # 2. Model serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_max_tokens=vllm_max_tokens
+        )
+
+        self.keys = {
+            "q": input_question_key,
+            "img": input_image_key,
+            "tree": input_tree_key,
+            "mcts_chains": "mcts_extracted_chains",  # intermediate result
+            "final": output_key
+        }
+
+        # ================== Operators ==================
+
+        # Operator 1: MCTS tree -> chains (extractor)
+        # Flattens the tree structure into linear chains
+        self.op_mcts_refine = MCTSTreeRefiner(
+            max_chains_per_sample=max_samples_per_file
+        )
+
+        # Operator 2: VLM -> chains (generator / fallback)
+        # If MCTS extraction fails, generate with the VLM; if it succeeds, skip generation
+        self.op_vlm_gen = VisualReasoningGenerator(
+            serving=self.serving,
+            prompt_type=prompt_type
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...")
+        self.op_mcts_refine.run(
+            self.storage.step(),
+            input_tree_key=self.keys["tree"],
+            output_key=self.keys["mcts_chains"]
+        )
+
+        print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
+        # Note: input_existing_chains_key implements the hybrid/fallback logic
+        self.op_vlm_gen.run(
+            self.storage.step(),
+            input_question_key=self.keys["q"],
+            input_image_key=self.keys["img"],
+            input_existing_chains_key=self.keys["mcts_chains"],
+            output_key=self.keys["final"]
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--prompt_type", default="spatial")
+    args = parser.parse_args()
+
+    pipe = VisionMCTSReasoningPipeline(
+        model_path=args.model_path,
+        first_entry_file=args.input_file,
+        prompt_type=args.prompt_type
+    )
+    pipe.forward()
+
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md b/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
new file mode 100644
index 00000000..bbe78d17
--- /dev/null
+++ b/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
@@ -0,0 +1,288 @@
+---
+title: Multi-Role Video QA Pipeline
+createTime: 2026/01/11 22:15:28
+icon: mdi:image-text
+permalink: /en/mm_guide/multirole_videoqa_pipeline/
+---
+## 1. Overview
+
+The **Multi-Role Video QA Pipeline** leverages Multimodal Large Models (VLMs) and a Multi-Agent collaboration mechanism to automatically generate high-quality, deep Question-Answer (QA) pairs from long videos or advertising footage.
+
+Unlike standard single-pass generation, this pipeline introduces a **Multi-Agent Iterative Refinement** phase. It first generates initial QA pairs, then refines them through multiple rounds of interaction that simulate different agent roles (e.g., Questioner, Checker, Polisher), and finally outputs logically consistent, accurate QA data.
+
+We support the following application scenarios:
+
+* **Ad Video Understanding**: Extracting key selling points, emotional tone, and narrative logic from ads.
+* **Complex Video Reasoning**: Constructing deep QA datasets requiring reasoning across different time segments.
+* **Long Video Summarization & QA**: Handling video data containing rich Metadata (`Meta`) and multiple Clips (`Clips`).
+
+The main process of the pipeline includes:
+
+1. **Initial Generation**: Generates baseline QA pairs based on video metadata and clips.
+2. **Multi-Agent Refinement**: Critiques, corrects, and optimizes QA pairs through multiple iterations (default 3 rounds).
+3. **Final Generation**: Cleans the data and outputs the final QA set in a standard format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_video_qa
+cd run_video_qa
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `multirole_videoqa_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the input data contains `Meta` and `Clips` fields.
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python multirole_videoqa_pipeline.py \
+  --model_path "/path/to/Qwen2.5-VL-7B-Instruct" \
+  --images_file "data/adsQA.jsonl" \
+  --card_id "0"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input data is typically pre-processed video data containing global metadata and segment information:
+
+* **Meta**: Global description, title, or background info of the video.
+* **Clips**: List of video clips, where each clip contains audio text, frame image paths, and clip descriptions.
+
+**Input Data Example**:
+
+```json
+{
+  "Meta": "A commercial for a new sports car featuring dynamic driving scenes.",
+  "Clips": [
+    {
+      "Audio_Text": "Experience the speed.",
+      "Frames_Images": ["./frames/001.jpg", "./frames/002.jpg"],
+      "Description": "Car accelerating on a highway."
+    },
+    {
+      "Audio_Text": "Safety meets luxury.",
+      "Frames_Images": ["./frames/003.jpg"],
+      "Description": "Interior shot showing leather seats."
+    }
+  ]
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline executes through a chain of three specialized operators:
+
+#### A. **MultiroleVideoQAInitialGenerator**
+
+* **Function**: Acts as the "Draft Author", reading `Meta` and `Clips` to generate the first version of QA pairs using the VLM.
+* **Output**: A DataFrame containing preliminary QAs.
+
+#### B. **MultiroleVideoQAMultiAgentGenerator**
+
+* **Function**: Acts as the "Editorial Team", polishing the draft.
+* **Mechanism**: Configured with `max_iterations` (e.g., 3 rounds). During these rounds, the model may simulate different roles (e.g., a reviewer pointing out errors, a polisher improving wording) to progressively enhance QA quality; see the sketch after this list.
+* **Input**: Initial DataFrame.
+* **Output**: Intermediate DataFrame after multiple rounds of correction.
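+
+Conceptually, the refinement stage behaves like the following critique-and-revise loop (a simplified sketch; the role prompts and the `ask` callable are illustrative stand-ins, not the operator's actual internals):
+
+```python
+from typing import Callable
+
+def refine_qa(draft: str, ask: Callable[[str], str], max_iterations: int = 3) -> str:
+    """Alternate a reviewer role that critiques the draft QA pairs
+    and a polisher role that rewrites them."""
+    for _ in range(max_iterations):
+        critique = ask(f"[Reviewer] List factual or logical problems in these QA pairs:\n{draft}")
+        if "no problems" in critique.lower():
+            break  # stop early once the reviewer is satisfied
+        draft = ask(f"[Polisher] Rewrite the QA pairs to fix the problems below:\n{critique}\n---\n{draft}")
+    return draft
+```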
+#### C. **MultiroleVideoQAFinalGenerator**
+
+* **Function**: Acts as the "Publisher", responsible for final formatting and cleaning.
+* **Output**: Standardized `QA` list.
+
+### 3. **Output Data**
+
+The output data adds a high-quality QA list to the original fields:
+
+* **QA**: List of generated QA pairs, including labels (e.g., question type), question text, and answer text.
+
+**Output Data Example**:
+
+```json
+{
+  "Meta": "...",
+  "Clips": [...],
+  "QA": [
+    {
+      "Label": "Feature Extraction",
+      "Question": "What specific features of the car are highlighted in the interior shots?",
+      "Answer": "The video highlights the luxury leather seats and the advanced dashboard interface."
+    },
+    {
+      "Label": "Narrative Analysis",
+      "Question": "How does the audio complement the visual transition?",
+      "Answer": "The narration 'Experience speed' coincides with the acceleration scene, reinforcing the dynamic visual."
+    }
+  ]
+}
+
+```
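+
+Once the pipeline finishes, the cached jsonl can be flattened into chat-style SFT samples. A minimal sketch (the input path is a placeholder for whichever final cache file your run produces):
+
+```python
+import json
+
+sft_samples = []
+with open("qa_output.jsonl") as f:  # placeholder: point this at your final cache file
+    for line in f:
+        row = json.loads(line)
+        for qa in row.get("QA", []):
+            sft_samples.append({
+                "messages": [
+                    {"role": "user", "content": qa["Question"]},
+                    {"role": "assistant", "content": qa["Answer"]},
+                ]
+            })
+print(f"Built {len(sft_samples)} SFT samples")
+```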
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `MultiRoleVideoQAPipeline` code implementation.
+
+```python
+import argparse
+import os
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import (
+    MultiroleVideoQAInitialGenerator,
+    MultiroleVideoQAMultiAgentGenerator,
+    MultiroleVideoQAFinalGenerator
+)
+
+try:
+    import torch
+    # Set the multiprocessing start method to spawn to avoid CUDA initialization conflicts
+    if 'spawn' not in torch.multiprocessing.get_all_start_methods():
+        torch.multiprocessing.set_start_method('spawn', force=True)
+except ImportError:
+    pass
+
+
+class MultiRoleVideoQAPipeline():
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt",
+        first_entry_file: str = "./dataflow/example/ads_QA/adsQA.jsonl",
+        cache_path: str = "./cache_local",
+        file_name_prefix: str = "dataflow_cache_step",
+        cache_type: str = "jsonl",
+        # Keys Configuration
+        Meta_key: str = "Meta",
+        clips_key: str = "Clips",
+        output_key: str = "QA"
+    ):
+        # 1. Storage initialization
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type,
+        )
+
+        # Force vLLM's multiprocessing method
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "spawn"
+
+        # 2. VLM serving initialization
+        self.llm_serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_top_p=0.9,
+            vllm_max_tokens=6000,  # Video QA usually needs a long context
+        )
+
+        # 3. Initialize the operator chain
+        # Stage 1: initial generation
+        self.initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving=self.llm_serving)
+
+        # Stage 2: multi-agent iterative refinement (the key differentiator)
+        self.multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(
+            llm_serving=self.llm_serving,
+            max_iterations=3
+        )
+
+        # Stage 3: final formatting
+        self.final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving=self.llm_serving)
+
+        self.input_meta_key = Meta_key
+        self.input_clips_key = clips_key
+        self.output_key = output_key
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Initial QA Generation...")
+        init_df = self.initial_QA_generation.run(
+            storage=self.storage.step(),
+            input_meta_key=self.input_meta_key,
+            input_clips_key=self.input_clips_key,
+            output_key=self.output_key
+        )
+
+        print(">>> [Pipeline] Step 2: Multi-Agent Refinement (3 iterations)...")
+        # Note: this operator takes the previous stage's DataFrame (init_df) as input
+        middle_df = self.multiAgent_QA_generation.run(
+            df=init_df,
+            input_meta_key=self.input_meta_key,
+            input_clips_key=self.input_clips_key,
+            output_key=self.output_key
+        )
+
+        print(">>> [Pipeline] Step 3: Finalizing QA Pairs...")
+        self.final_QA_generation.run(
+            storage=self.storage,
+            df=middle_df,
+            input_meta_key=self.input_meta_key,
+            input_clips_key=self.input_clips_key,
+            output_key=self.output_key
+        )
+        print(">>> [Pipeline] Done.")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Batch video QA generation with DataFlow (Single GPU)")
+
+    parser.add_argument("--model_path", default="../../Models/Qwen2.5-VL-7B-Instruct",
+                        help="Path to the local model or HuggingFace repo ID.")
+    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface",
+                        help="HuggingFace cache directory.")
+    parser.add_argument("--download_dir", default="./ckpt",
+                        help="Local directory for downloading models.")
+
+    parser.add_argument("--card_id", type=str, default="0",
+                        help="The single CUDA device ID to use (e.g., '0' or '1').")
+
+    parser.add_argument("--images_file", default="./dataflow/example/ads_QA/adsQA.jsonl",
+                        help="Path to the first entry file for DataFlow.")
+    parser.add_argument("--cache_path", default="./cache_local",
+                        help="Directory for caching DataFlow steps.")
+    parser.add_argument("--file_name_prefix", default="caption",
+                        help="Prefix for cache file names.")
+    parser.add_argument("--cache_type", default="jsonl",
+                        help="Type of cache file (e.g., jsonl).")
+
+    args = parser.parse_args()
+
+    os.environ['CUDA_VISIBLE_DEVICES'] = args.card_id.replace(' ', '')
+
+    pipe = MultiRoleVideoQAPipeline(
+        model_path=args.model_path,
+        hf_cache_dir=args.hf_cache_dir,
+        download_dir=args.download_dir,
+        first_entry_file=args.images_file,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        cache_type=args.cache_type,
+    )
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_operators/image_understanding/eval/clip_image_text_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/clip_image_text_evaluator.md
deleted file mode 100644
index 6a998fef..00000000
--- a/docs/en/notes/mm_operators/image_understanding/eval/clip_image_text_evaluator.md
+++ /dev/null
@@ -1,107 +0,0 @@
----
-title: clip_image_text_evaluator
-createTime: 2025/10/15 19:56:33
-icon: material-symbols-light:image
-permalink: /en/mm_operators/eval/clip_image_text_evaluator/
----
-## 📘 Overview
-`CLIPEvaluator` computes the **image-text alignment score** using **CLIP**, with scores ranging from `[0,1]`.
-Internally, it encodes both the image and text using the CLIP model, performs L2 normalization, -then calculates cosine similarity and linearly maps it to `[0,1]` via `(cos + 1) / 2`. - - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32", - device: str = None -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32"` | Local path or Hugging Face Model ID for the CLIP model. Loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). | -| `device` | `str \| None` | `None` | The inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image_path", - text_key: str = "text", - output_key: str = "clip_score" -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | `"image_path"` | The column name containing the image path. | -| `text_key` | `str` | `"text"` | The column name containing the text input. | -| `output_key` | `str` | `"clip_score"` | The column name for storing the output alignment score (range `[0,1]`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import CLIPEvaluator - -# 1) Prepare FileStorage (must contain image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/clip_input.jsonl", - cache_path="./cache_local", - file_name_prefix="clip_eval", - cache_type="jsonl" -) - -# 2) Initialize the operator (can also use HF model ID, e.g. "openai/clip-vit-base-patch32") -evaluator = CLIPEvaluator( - model_name="/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32", - device=None # automatically chooses cuda/cpu -) - -# 3) Execute evaluation -cols = evaluator.run( - storage=storage.step(), - image_key="image_path", - text_key="text", - output_key="clip_score" -) -print(cols) # ["clip_score"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The input image path. | -| `text` / specified `text_key` | `string` | The input text. | -| `clip_score` / specified `output_key` | `float` | The image-text alignment score (range `[0,1]`). | - - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car." 
-} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car.", - "clip_score": 0.642 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/image_clip_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/image_clip_evaluator.md new file mode 100644 index 00000000..4476615c --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/eval/image_clip_evaluator.md @@ -0,0 +1,108 @@ +--- +title: ImageCLIPEvaluator +createTime: 2025/10/15 19:56:33 +# icon: material-symbols-light:image +permalink: /en/mm_operators/eval/image_clip_evaluator/ +--- +## 📘 Overview +`ImageCLIPEvaluator` computes an **image–text alignment score** based on **CLIP**, with scores ranging in `[0, 1]`. +Internally, it encodes the image and the text with CLIP → normalizes the embeddings → computes cosine similarity and linearly maps it to `[0, 1]` via `(cos + 1) / 2`. + + + + +## ```__init__``` +```python +def __init__( + self, + model_name: str = "openai/clip-vit-base-patch32", + device: str = None +): + ... +``` + + +## `init` Parameters +| Parameter | Type | Default | Description | +| :----------- | :------------ | :------------------------------- | :---------- | +| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | Local path or Hugging Face Model ID of the CLIP model; loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). | +| `device` | `str \| None` | `None` | Inference device; when `None`, the operator automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text", + output_key: str = "clip_score" +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :-------------- | :---------- | +| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | +| `input_image_key`| `str` | `"image_path"` | Column name of the input image path. | +| `input_text_key` | `str` | `"text"` | Column name of the input text. | +| `output_key` | `str` | `"clip_score"` | Column name for the output alignment score (range `[0, 1]`). | + + + +## 🧠 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.core_vision import ImageCLIPEvaluator + +# 1) Prepare FileStorage (must contain at least image_path and text columns) +storage = FileStorage( + first_entry_file_name="./dataflow/example/test_image_eval/test_image_eval.jsonl", + cache_path="./cache_local", + file_name_prefix="clip_eval", + cache_type="jsonl" +) + +# 2) Initialize the operator (can also use an HF model ID such as "openai/clip-vit-base-patch32") +evaluator = ImageCLIPEvaluator( + model_name="openai/clip-vit-base-patch32", + device=None # automatically selects cuda/cpu +) + +# 3) Run evaluation +cols = evaluator.run( + storage=storage.step(), + input_image_key="image_path", + input_text_key="text", + output_key="clip_score" +) +print(cols) # ["clip_score"] +``` + +### 🧾 Default Output Format +| Field name | Type | Default | Description | +| :------------------------------------------- | :------- | :------ | :---------- | +| `image_path` (or column given by `input_image_key`) | `string` | — | Input image path. | +| `text` (or column given by `input_text_key`) | `string` | — | Input text. 
| +| `clip_score` (or `output_key`) | `float` | — | Image–text alignment score in the range `[0, 1]`. | + + + +Example Input: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car." +} +``` + +Example Output: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car.", + "clip_score": 0.642 +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/image_longclip_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/image_longclip_evaluator.md new file mode 100644 index 00000000..1f38a452 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/eval/image_longclip_evaluator.md @@ -0,0 +1,114 @@ +--- +title: ImageLongCLIPEvaluator +createTime: 2025/10/15 14:30:52 +# icon: material-symbols-light:image +permalink: /en/mm_operators/eval/image_longclip_evaluator/ +--- +## 📘 Overview + +`ImageLongCLIPEvaluator` computes an **alignment score between images and long-form text** using **LongCLIP**, with scores in the range `[0, 1]`. +Compared with standard CLIP, LongCLIP supports substantially longer textual context (in this implementation, the default is `context_length = 248`), which makes it suitable for paragraph-level description matching and alignment evaluation. + +The internal pipeline is analogous to CLIP: it encodes the image and the text separately, applies L2 normalization to both embeddings, computes cosine similarity, and then maps the similarity to the interval `[0, 1]` via `(cos + 1) / 2`. + + + + +## ```__init__``` +```python +def __init__( + self, + model_name: str = "BeichenZhang/LongCLIP-L-336px", + device: str = None, +): + ... +``` + + +## `__init__` Parameters +| Parameter | Type | Default | Description | +| :----------- | :------------ | :------------------------------ | :---------- | +| `model_name` | `str` | `"BeichenZhang/LongCLIP-L-336px"` | LongCLIP checkpoint spec. If it is a **directory path**, the operator will search for files ending with `.pt`, `.bin` or `.ckpt` and automatically pick one checkpoint to load; if it is a **file path**, that file is used directly as the checkpoint. | +| `device` | `str \| None` | `None` | Inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | + + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text", + output_key: str = "longclip_score", +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :---------------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the DataFrame inside the operator. | +| `input_image_key`| `str` | `"image_path"` | Name of the input image column corresponding to file paths in the DataFrame. | +| `input_text_key` | `str` | `"text"` | Name of the input long-text column corresponding to paragraph-level descriptions in the DataFrame. | +| `output_key` | `str` | `"longclip_score"`| Name of the output score column (range `[0, 1]`) used to store LongCLIP image–text alignment scores. 
|
+
+
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageLongCLIPEvaluator
+
+# 1) Prepare FileStorage (must contain at least image_path and text columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_eval/test_image_eval.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="longclip_eval",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the LongCLIP evaluator (model_name can be a directory or a specific checkpoint file)
+evaluator = ImageLongCLIPEvaluator(
+    model_name="BeichenZhang/LongCLIP-L-336px",
+    device=None  # automatically selects cuda/cpu
+)
+
+# 3) Run evaluation: adds longclip_score ∈ [0, 1] for each row
+cols = evaluator.run(
+    storage=storage.step(),
+    input_image_key="image_path",
+    input_text_key="text",
+    output_key="longclip_score"
+)
+print(cols)  # ["longclip_score"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :------------------------------------------- | :------- | :------ | :---------- |
+| `image_path` (or the column given by `input_image_key`) | `string` | — | Input image path. |
+| `text` (or the column given by `input_text_key`) | `string` | — | Input long-text description. |
+| `longclip_score` (or `output_key`) | `float` | — | Long-text image–text alignment score in the range `[0, 1]`. |
+
+
+
+
+Example Input:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "The image shows a man and a woman in what appears to be a car."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "The image shows a man and a woman in what appears to be a car.",
+    "longclip_score": 0.642
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/eval/image_vqa_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/image_vqa_evaluator.md
new file mode 100644
index 00000000..59f82843
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/eval/image_vqa_evaluator.md
@@ -0,0 +1,111 @@
+---
+title: ImageVQAScoreEvaluator
+createTime: 2025/10/15 14:52:29
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/eval/image_vqa_evaluator/
+---
+## 📘 Overview
+`ImageVQAScoreEvaluator` leverages a **BLIP visual question answering (VQA) model** to compute a **Yes-probability score** that quantifies whether an image is aligned with a given textual description, with values in the interval `[0, 1]`.
+The core idea is as follows: the textual description is wrapped into an English interrogative prompt of the form *“Does this image match the description?”*, using `"yes"` and `"no"` as candidate answers.
+The model is then queried twice with `"yes"` and `"no"` as labels, and their respective losses are converted into relative probabilities. The normalized probability assigned to `"yes"` is taken as the image–text consistency score.
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "Salesforce/blip-vqa-base",
+    device: str = None,
+    local_only: bool = True,
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :----------- | :------------ | :---------------------------- | :---------- |
+| `model_name` | `str` | `"Salesforce/blip-vqa-base"` | Hugging Face Model ID or local path of the BLIP VQA model; loaded via `BlipProcessor` / `BlipForQuestionAnswering`. |
+| `device` | `str \| None` | `None` | Inference device.
When `None`, the operator automatically selects `"cuda"` if available; otherwise it falls back to `"cpu"`. | +| `local_only` | `bool` | `True` | Whether to load model weights strictly from local files. When `True`, the model is loaded with `local_files_only=True` (recommended for offline or restricted-network environments). | + + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text", + output_key: str = "vqa_score" +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :-------------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading and writing the DataFrame. | +| `input_image_key`| `str` | `"image_path"` | Name of the column containing image paths. | +| `input_text_key` | `str` | `"text"` | Name of the column containing textual descriptions (which will be wrapped into English questions). | +| `output_key` | `str` | `"vqa_score"` | Name of the output field storing the VQA score (range `[0, 1]`), representing the model’s probability that “the image matches the description”. | + + + +## 🧠 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.core_vision import ImageVQAScoreEvaluator + +# 1) Prepare FileStorage (must contain at least image_path and text columns) +storage = FileStorage( + first_entry_file_name="./dataflow/example/test_image_eval/test_image_eval.jsonl", + cache_path="./cache_local", + file_name_prefix="vqa_eval", + cache_type="jsonl" +) + +# 2) Initialize the VQA-based evaluator (can be pointed to a local model path) +evaluator = ImageVQAScoreEvaluator( + model_name="Salesforce/blip-vqa-base", + device=None, # automatically selects cuda/cpu + local_only=True # load from local files only (recommended offline) +) + +# 3) Run evaluation: adds vqa_score ∈ [0, 1] for each row +cols = evaluator.run( + storage=storage.step(), + input_image_key="image_path", + input_text_key="text", + output_key="vqa_score" +) +print(cols) # ["vqa_score"] +``` + +### 🧾 Default Output Format +| Field name | Type | Default | Description | +| :--------------------------------------------- | :------- | :------ | :---------- | +| `image_path` (or the column given by `input_image_key`) | `string` | — | Input image path. | +| `text` (or the column given by `input_text_key`) | `string` | — | Input textual description. | +| `vqa_score` (or `output_key`) | `float` | — | Yes-probability produced by BLIP VQA for the question “Does this image match the description?”, in the range `[0, 1]`. | + + + +Example Input: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car." 
+} +``` + +Example Output: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car.", + "vqa_score": 0.774 +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/longclip_image_text_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/longclip_image_text_evaluator.md deleted file mode 100644 index d554083f..00000000 --- a/docs/en/notes/mm_operators/image_understanding/eval/longclip_image_text_evaluator.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -title: longclip_image_text_evaluator -createTime: 2025/10/15 19:56:29 -icon: material-symbols-light:image -permalink: /en/mm_operators/eval/longclip_image_text_evaluator/ ---- -## 📘 Overview -`LongCLIPEvaluator` computes **image–long-text alignment scores** using **LongCLIP**, producing scores in the range `[0,1]`. -Compared to the standard CLIP model, LongCLIP supports longer text contexts (default `context_length=248`), -making it ideal for paragraph-level description evaluation and alignment tasks. - - - -## ```__init__``` -```python -def __init__( - self, - ckpt_path: str = "/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt", - device: str = None, -): - ... -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32"` | Local path or Hugging Face Model ID for the CLIP model. Loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). | -| `device` | `str \| None` | `None` | The inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image_path", - text_key: str = "text", - output_key: str = "clip_score" -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | `"image_path"` | The column name containing the image path. | -| `text_key` | `str` | `"text"` | The column name containing the text input. | -| `output_key` | `str` | `"clip_score"` | The column name for storing the output alignment score (range `[0,1]`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import LongCLIPEvaluator - -# 1) Prepare FileStorage (must include image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/longclip_input.jsonl", - cache_path="./cache_local", - file_name_prefix="longclip_eval", - cache_type="jsonl" -) - -# 2) Initialize LongCLIP evaluator (replace ckpt_path with your checkpoint) -evaluator = LongCLIPEvaluator( - ckpt_path="/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt", - device=None # auto-selects cuda/cpu -) - -# 3) Run evaluation — adds a new column 'longclip_score' ∈ [0,1] -cols = evaluator.run( - storage=storage.step(), - image_key="image_path", - text_key="text", - output_key="longclip_score" -) -print(cols) # ["longclip_score"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The input image path. | -| `text` / specified `text_key` | `string` | The input text. 
| -| `clip_score` / specified `output_key` | `float` | The image-text alignment score (range `[0,1]`). | - - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car.", - "clip_score": 0.642 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/vqa_score_image_text_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/vqa_score_image_text_evaluator.md deleted file mode 100644 index a88ba543..00000000 --- a/docs/en/notes/mm_operators/image_understanding/eval/vqa_score_image_text_evaluator.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -title: vqa_score_image_text_evaluator -createTime: 2025/10/15 19:56:31 -icon: material-symbols-light:image -permalink: /en/mm_operators/eval/vqa_score_image_text_evaluator/ ---- -## 📘 Overview -`VQAScoreEvaluator` uses **BLIP VQA** to transform the question *“Does this image match the description?”* into a **Yes/No probability score** ranging from `[0,1]`. -- Constructed question: `Does this image match the description: {text}? Answer yes or no.` -- Forward pass twice with labels `"yes"` and `"no"` to obtain respective losses `L_yes` and `L_no`. -- Compute normalized probabilities. - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base", - device: str = None, - local_only: bool = True, -): - ... -``` - - -## `__init__` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base"` | Path or Hugging Face Model ID of the BLIP VQA model. Loaded via `BlipProcessor` and `BlipForQuestionAnswering`. | -| `device` | `str \| None` | `None` | Inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | -| `local_only` | `bool` | `True` | If `True`, load model weights only from local files (`local_files_only=True`). | - - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image_path", - text_key: str = "text", - output_key: str = "clip_score" -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing data. | -| `image_key` | `str` | `"image_path"` | Column name containing the input image path. | -| `text_key` | `str` | `"text"` | Column name containing the text description. | -| `output_key` | `str` | `"vqa_score"` | Column name where the computed Yes-probability score (range `[0,1]`) is stored. 
| - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import VQAScoreEvaluator - -# 1) Prepare FileStorage (must include image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/vqa_input.jsonl", - cache_path="./cache_local", - file_name_prefix="vqa_score", - cache_type="jsonl" -) - -# 2) Initialize the evaluator (you can replace with your own model path or HF model ID) -evaluator = VQAScoreEvaluator( - model_name="/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base", - device=None, # auto-select cuda/cpu - local_only=True # load from local weights only -) - -# 3) Run evaluation (adds a column 'vqa_score') -cols = evaluator.run( - storage=storage.step(), - image_key="image_path", - text_key="text", - output_key="vqa_score" -) -print(cols) # ["vqa_score"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The input image path. | -| `text` / specified `text_key` | `string` | The input text description. | -| `vqa_score` / specified `output_key` | `float` | The BLIP-predicted probability that the image matches the text (Yes probability, range `[0,1]`). | - - - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car.", - "vqa_score": 0.774 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/cat_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/cat_filter.md deleted file mode 100644 index c1e542f5..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/cat_filter.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -title: cat_filter -createTime: 2025/10/15 19:56:39 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/cat_filter/ ---- -## 📘 Overview -`CatFilter` (Caption-as-Teacher) is a **joint filtering operator** based on **textual complexity** and **OCR overlap rate**. -For each `caption`, it performs the following operations: -1. Uses **Stanza** for dependency parsing to extract subject-verb-object triples and assess **semantic complexity**. -2. Requires the sentence to **contain at least one verb** (actional requirement). -3. Applies **OCR** on the associated image and computes the **Jaccard overlap** between OCR text and `caption`; captions with excessive overlap are considered direct OCR copies and are filtered out. - -A sample is retained only if it meets **all three conditions**. - - -## ```__init__``` -```python -def __init__( - self, - min_triples: int = 2, - ocr_overlap_threshold: float = 0.2 -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `min_triples` | `int` | `2` | The minimum number of dependency triples (subject-verb-object or argument structures) required to determine sufficient caption complexity. | -| `ocr_overlap_threshold` | `float` | `0.2` | The maximum allowed Jaccard overlap between OCR text and caption. If overlap **≥** this threshold, the sample is considered an OCR copy and is filtered out. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str, - caption_key: str -): - ... 
-``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing. | -| `image_key` | `str` | — | The column name for image paths (e.g., `"image_path"`). | -| `caption_key` | `str` | — | The column name for caption text (e.g., `"caption"`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import CatFilter - -# 1) Prepare FileStorage (must contain image_path and caption columns) -storage = FileStorage( - first_entry_file_name="data/cat_input.jsonl", - cache_path="./cache_local", - file_name_prefix="cat_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator -filt = CatFilter( - min_triples=2, # minimum complexity threshold - ocr_overlap_threshold=0.2 # maximum allowed OCR overlap -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - image_key="image_path", - caption_key="caption" -) -print(cols) # ["image_path", "caption"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` | `string` | The retained image path after filtering. | -| `caption` | `string` | The retained caption text that meets all three conditions: complexity ≥ `min_triples`, contains verbs, and OCR overlap < `ocr_overlap_threshold`. | - - -Example Input: -```jsonl - "caption": "SALE SALE SALE 50% OFF" - -{ - "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "caption": "A bride smiles while the groom points ahead inside a car, their hands resting together on the seat." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/clip_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/clip_filter.md deleted file mode 100644 index 2b7d2a01..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/clip_filter.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: clip_filter -createTime: 2025/10/15 19:56:41 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/clip_filter/ ---- -## 📘 Overview -`ClipFilter` is a **consistency filtering operator** based on **CLIP similarity**. -For each sample, it computes the cosine similarity between the normalized image and text embeddings (mapped to the `[0,1]` range). -Samples with similarity **≥ `threshold`** are retained, while others are filtered out. - - - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "../ckpt/clip-vit-base-patch32", - device: str = None -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"../ckpt/clip-vit-base-patch32"` | The local path or Hugging Face Model ID for the CLIP model. Internally loaded using `CLIPProcessor` and `CLIPModel` (`use_safetensors=True`, `weights_only=False`). | -| `device` | `str \| None` | `None` | The inference device; if `None`, automatically selects `"cuda"` when available, otherwise falls back to `"cpu"`. | - - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image", - caption_key: str = "caption", - threshold: float = 0.25 -): - ... 
-``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | `"image"` | The column name containing the image path. | -| `caption_key` | `str` | `"caption"` | The column name containing the text description. | -| `threshold` | `float` | `0.25` | The minimum CLIP similarity required to retain a sample; samples below this value will be filtered out. | - - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import ClipFilter - -# 1) Prepare FileStorage (must contain "image" and "caption" columns) -storage = FileStorage( - first_entry_file_name="data/clip_filter_input.jsonl", - cache_path="./cache_local", - file_name_prefix="clip_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator (can use local or Hugging Face model) -flt = ClipFilter( - model_name="../ckpt/clip-vit-base-patch32", # or "openai/clip-vit-base-patch32" - device=None # auto-select cuda/cpu -) - -# 3) Execute filtering (retains only samples with similarity ≥ 0.25) -cols = flt.run( - storage=storage.step(), - image_key="image", - caption_key="caption", - threshold=0.25 -) -print(cols) # ["image", "caption"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image` | `string` | The retained image path after filtering. | -| `caption` | `string` | The retained caption text whose image-text similarity is **≥ `threshold`**. | - -Example Input: -```jsonl -{ - "image": "1.png", - "caption": "A bride and groom smiling in a car." -} -{ - "image": "2.jpg", - "caption": "A red bus driving across a snowy mountain road at night." -} -``` - -Example Output: -```jsonl -{ - "image": "1.png", - "caption": "A bride and groom smiling in a car." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/complexity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/complexity_filter.md deleted file mode 100644 index 77e25b6d..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/complexity_filter.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -title: complexity_filter -createTime: 2025/10/15 19:56:44 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/complexity_filter/ ---- -## 📘 Overview -`ComplexityFilter` is a **text filtering operator** based on **Natural Language Inference (NLI)**. It evaluates whether a caption covers multiple visual capability elements (such as color, shape, action recognition, counting, and spatial relationships), thereby determining its **capability richness**. -For each caption, the operator constructs hypothesis sentences (template: `"The following text describes {}."`) and uses an MNLI model to calculate the **entailment** probability. A sample is retained if the number of matched capability elements reaches the threshold (`min_k`); otherwise, it is filtered out. - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "../ckpt/bart-large-mnli", - threshold: float = 0.4, - min_k: int = 2, - device: str = None -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"../ckpt/bart-large-mnli"` | The local path or Hugging Face Model ID for the NLI model. 
Internally loaded using `AutoTokenizer` and `AutoModelForSequenceClassification` (`local_files_only=True`, `use_safetensors=True`, `weights_only=False`). | -| `threshold` | `float` | `0.4` | The minimum entailment probability threshold for a capability element to be considered “hit.” Higher values imply stricter filtering. | -| `min_k` | `int` | `2` | The minimum number of capability elements that must be hit; samples below this threshold are filtered out. | -| `device` | `str \| None` | `None` | The inference device; if `None`, automatically selects `"cuda"` when available, otherwise falls back to `"cpu"`. | - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - caption_key: str -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing. | -| `caption_key` | `str` | — | The name of the column containing the captions to be evaluated (e.g., `"caption"`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import ComplexityFilter - -# 1) Prepare FileStorage (must contain a caption column) -storage = FileStorage( - first_entry_file_name="data/complexity_input.jsonl", - cache_path="./cache_local", - file_name_prefix="complexity_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator (can use local or HF model) -filt = ComplexityFilter( - model_name="../ckpt/bart-large-mnli", # or "facebook/bart-large-mnli" - threshold=0.4, # entailment probability threshold - min_k=2, # minimum number of matched elements - device=None # auto-select cuda/cpu -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - caption_key="caption" -) -print(cols) # ["caption"] -``` - -#### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `caption` | `string` | The filtered caption text; only samples with the number of matched capability elements `≥ min_k` are retained. | - -Example Input: -```jsonl -{ - "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk." -} -{ - "caption": "SALE SALE SALE 50% OFF" -} -{ - "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella." -} - -``` - -Example Output: -```jsonl -{ - "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk." -} -{ - "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/deduplication_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/deduplication_filter.md deleted file mode 100644 index d99f63ce..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/deduplication_filter.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -title: deduplication_filter -createTime: 2025/10/15 19:56:47 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/deduplication_filter/ ---- -## 📘 Overview -`DeduplicateFilter` is a **duplicate removal operator** based on **CLIP image embedding similarity**. -It extracts CLIP feature vectors for all images in a dataset and computes pairwise cosine similarity. -For any image pair with similarity **≥ `threshold`**, the operator keeps the **first** image and removes the **later duplicates**. 
-Additionally, it records the **maximum similarity score** for each retained image (stored in the column `output_score_key`, default: `max_similarity`) for auditing purposes. - - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "openai/clip-vit-base-patch32", - threshold: float = 0.90, - batch_size: int = 32, - device: str = "cuda" if torch.cuda.is_available() else "cpu" -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | The CLIP model used to extract image embeddings (Hugging Face Model ID or local path). | -| `threshold` | `float` | `0.90` | The deduplication threshold; if cosine similarity between two images **≥** this value, the later image is considered a duplicate and removed. | -| `batch_size` | `int` | `32` | The batch size for CLIP inference; higher values increase throughput but also memory usage. | -| `device` | `str` | `"cuda"` if available, otherwise `"cpu"` | The device used for model inference. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - input_image_key: str = "image", - output_score_key: str = "max_similarity" -) -> None: - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `input_image_key` | `str` | `"image"` | The column name containing image paths or objects that can be parsed by `_load_image`. | -| `output_score_key` | `str` | `"max_similarity"` | The name of the column storing each image’s maximum similarity with all others. | - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import DeduplicateFilter - -# 1) Prepare FileStorage (must contain an "image" column) -storage = FileStorage( - first_entry_file_name="data/dedup_input.jsonl", - cache_path="./cache_local", - file_name_prefix="image_dedup", - cache_type="jsonl" -) - -# 2) Initialize the operator -dedup = DeduplicateFilter( - model_name="openai/clip-vit-base-patch32", - threshold=0.90, - batch_size=32, - device="cuda" # or "cpu" -) - -# 3) Execute deduplication -dedup.run( - storage=storage.step(), - input_image_key="image", # image column - output_score_key="max_similarity" # column to record max similarity -) -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image` (or the column specified by `input_image_key`) | `string/any` | The retained image entries after deduplication. | -| `max_similarity` (or the column specified by `output_score_key`) | `float` | The maximum similarity score of this image compared to all others (for audit; duplicate rows are excluded from output). 
| - -Example Input: -```jsonl -{ - "image": "a.jpg" -} -{ - "image": "b.jpg" -} -{ - "image": "a_copy.jpg" -} -``` - -Example Output: -```jsonl -{ - "image": "a.jpg", - "max_similarity": 0.96 -} -{ - "image": "b.jpg", - "max_similarity": 0.12 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md index a4d9e470..0cfd39aa 100644 --- a/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md @@ -1,13 +1,18 @@ --- -title: image_aesthetic_filter -createTime: 2025/10/15 19:56:51 -icon: material-symbols-light:image +title: ImageAestheticFilter +createTime: 2025/10/15 15:45:04 +# icon: material-symbols-light:image permalink: /en/mm_operators/filter/image_aesthetic_filter/ --- ## 📘 Overview -`ImageAestheticFilter` is a **basic image aesthetic filtering operator** designed to quickly remove low-quality images. -It evaluates four grayscale-based metrics for each image: **sharpness** (Laplacian variance), **brightness** (mean), **contrast** (standard deviation), and **extreme pixel ratio** (proportion of near-black or near-white pixels). -A sample is retained only if all four metrics meet the defined thresholds. +`ImageAestheticFilter` performs **basic quality and aesthetic filtering** over input images by jointly evaluating: + +- Sharpness (degree of blur) +- Global brightness (overly dark / overly bright) +- Contrast (whether the image appears flat and washed-out) +- Proportions of near-black / near-white pixels (whether the image is almost entirely black or white) + +The operator is intended to remove **low-quality images** that are blurry, strongly mis-exposed, or almost uniform in color, thereby providing cleaner inputs for subsequent detection, recognition, retrieval, or generation tasks. @@ -20,18 +25,20 @@ def __init__( contrast_thresh: float = 40.0, max_black_ratio: float = 0.90, max_white_ratio: float = 0.90 -) +): + ... ``` ## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `blur_thresh` | `float` | `150.0` | Threshold for image sharpness (Laplacian variance); higher values indicate stricter sharpness requirements. | -| `brightness_range` | `tuple[float, float]` | `(30, 230)` | Allowed average brightness range (grayscale values 0–255). | -| `contrast_thresh` | `float` | `40.0` | Minimum required contrast level (grayscale standard deviation); higher means more contrast required. | -| `max_black_ratio` | `float` | `0.90` | Upper limit for near-black pixel ratio (<10); exceeding this suggests extreme darkness or large occluded areas. | -| `max_white_ratio` | `float` | `0.90` | Upper limit for near-white pixel ratio (>245); exceeding this indicates overexposure or excessive white areas. | +| Parameter | Type | Default | Description | +| :----------------- | :---------------------- | :-------------------- | :---------- | +| `blur_thresh` | `float` | `150.0` | Sharpness threshold based on the variance of the Laplacian. Higher values correspond to sharper images; images with values below this threshold are treated as blurry. | +| `brightness_range` | `tuple[float, float]` | `(30, 230)` | Admissible range of global brightness (mean grayscale intensity). Images with mean intensity below the lower bound are considered too dark; those above the upper bound are considered too bright. 
Only images whose mean lies within this interval are regarded as properly exposed. | +| `contrast_thresh` | `float` | `40.0` | Contrast threshold based on the standard deviation of the grayscale image. Values below this threshold indicate insufficient contrast (visually “flat” or washed-out images). | +| `max_black_ratio` | `float` | `0.90` | Maximum allowed proportion of **near-black pixels** (`gray < 10`). Images exceeding this ratio are treated as almost entirely black. | +| `max_white_ratio` | `float` | `0.90` | Maximum allowed proportion of **near-white pixels** (`gray > 245`). Images exceeding this ratio are treated as almost entirely white. | + ## `run` @@ -39,16 +46,16 @@ def __init__( def run( self, storage: DataFlowStorage, - image_key: str + input_image_key: str = "image_path", ): ... ``` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | — | The column name containing the image path (e.g., `"image_path"`). | +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :-------------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the DataFrame. | +| `input_image_key`| `str` | `"image_path"` | Name of the column containing image paths. | @@ -58,36 +65,36 @@ Parameters from dataflow.utils.storage import FileStorage from dataflow.operators.core_vision import ImageAestheticFilter -# 1) Prepare FileStorage (must contain an image_path column) +# 1) Prepare FileStorage (must contain at least an image_path column) storage = FileStorage( - first_entry_file_name="data/aesthetic_input.jsonl", + first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl", cache_path="./cache_local", - file_name_prefix="img_aesthetic", + file_name_prefix="aes_filter", cache_type="jsonl" ) -# 2) Initialize the operator (thresholds can be adjusted as needed) -flt = ImageAestheticFilter( +# 2) Initialize the aesthetic filter (thresholds can be tuned as needed) +aes_filter = ImageAestheticFilter( blur_thresh=150.0, brightness_range=(30, 230), contrast_thresh=40.0, max_black_ratio=0.90, - max_white_ratio=0.90 + max_white_ratio=0.90, ) -# 3) Execute filtering -cols = flt.run( +# 3) Run filtering: only images passing the quality checks are retained +cols = aes_filter.run( storage=storage.step(), - image_key="image_path" + input_image_key="image_path", ) print(cols) # ["image_path"] ``` ### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` | `string` | The retained image path after filtering. | -| `quality` | `bool` | The image quality flag; only samples with `quality=True` are kept in the output. | +| Field name | Type | Default | Description | +| :---------------------------------------------- | :-------- | :------ | :---------- | +| `image_path` (or the column specified by `input_image_key`) | `string` | — | Input image path. | +| `quality` | `boolean` | — | Indicates whether the image passes the aesthetic/quality filter. Only rows with `quality == true` are preserved in the final output. 
| Example Input: diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_cat_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_cat_filter.md new file mode 100644 index 00000000..9db6ad96 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_cat_filter.md @@ -0,0 +1,120 @@ +--- +title: ImageCatFilter +createTime: 2025/10/15 15:00:00 +# icon: material-symbols-light:image +permalink: /en/mm_operators/filter/image_cat_filter/ +--- +## 📘 Overview +`ImageCatFilter` is a caption-quality filtering operator inspired by the **Caption-as-Teacher** paradigm. It combines a **BART-large-MNLI natural language inference (NLI) model** with optional **Tesseract OCR**, and applies a three-stage criterion—**semantic complexity**, **action description**, and **OCR-style transcription**—to image–text pairs. The operator is designed to retain only captions that are semantically rich and genuinely describe the visual content of the corresponding image. + + +## ```__init__``` +```python +def __init__( + self, + model_name: str = "facebook/bart-large-mnli", + complexity_thresh: float = 0.4, + min_caps: int = 2, + action_thresh: float = 0.4, + ocr_overlap_threshold: float = 0.2, + ocr_nli_thresh: float = 0.6, + device: str | None = None, +): + ... +``` + + +## `init` Parameters +| Parameter | Type | Default | Description | +| :--------------------- | :---------------- | :---------------------------- | :---------- | +| `model_name` | `str` | `"facebook/bart-large-mnli"` | Name or local path of the pretrained NLI model. Loaded via `AutoTokenizer` and `AutoModelForSequenceClassification`. | +| `complexity_thresh` | `float` | `0.4` | Entailment probability threshold used when matching the caption against a set of “capability hypotheses”. Entailment scores above this threshold indicate that the corresponding capability is covered by the caption. | +| `min_caps` | `int` | `2` | Minimum number of capability hypotheses that must be supported by the caption (e.g., actions, interactions, scene details) for it to be considered sufficiently complex. | +| `action_thresh` | `float` | `0.4` | Entailment probability threshold for the `ACTION_HYPOTHESIS` (“The caption clearly describes an action happening in the scene.”). Captions below this threshold are considered to lack adequate action description. | +| `ocr_overlap_threshold`| `float` | `0.2` | Jaccard-overlap threshold between OCR tokens and caption tokens. Only when this overlap is high will the operator further check via NLI whether the caption is primarily an OCR transcription. | +| `ocr_nli_thresh` | `float` | `0.6` | Entailment probability threshold for `OCR_ONLY_HYPOTHESIS` (“The caption mainly transcribes the visible text in the image instead of describing the visual scene.”). Samples with high overlap and entailment above this threshold are treated as OCR transcriptions and removed. | +| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. | + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_caption_key: str = "caption", +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :------------------- | :---------------- | :------------ | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the DataFrame. 
|
+| `input_image_key` | `str` | `"image"` | Name of the column containing image paths. |
+| `input_caption_key` | `str` | `"caption"` | Name of the column containing the English image descriptions. |
+
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageCatFilter
+
+# 1) Prepare FileStorage (must contain at least `image` and `caption` columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="cat_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the CatFilter operator (complexity and OCR-related thresholds can be tuned)
+cat_filter = ImageCatFilter(
+    model_name="facebook/bart-large-mnli",
+    complexity_thresh=0.4,
+    min_caps=2,
+    action_thresh=0.4,
+    ocr_overlap_threshold=0.2,
+    ocr_nli_thresh=0.6,
+    device=None  # automatically selects cuda/cpu
+)
+
+# 3) Run filtering: retain only captions that are semantically complex, action-descriptive,
+#    and not mere OCR transcriptions
+cols = cat_filter.run(
+    storage=storage.step(),
+    input_image_key="image",
+    input_caption_key="caption",
+)
+print(cols)  # ["image", "caption"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :------------------------------------------ | :------- | :------ | :---------- |
+| `image` (or the column specified by `input_image_key`) | `string` | — | Input image path. |
+| `caption` (or the column specified by `input_caption_key`)| `string` | — | Input English image description. |
+
+
+Example Input:
+```jsonl
+{
+    "image": "1.png",
+    "caption": "A bride smiles while the groom points ahead inside a car, their hands resting together on the seat."
+}
+{
+    "image": "2.jpg",
+    "caption": "SALE SALE SALE 50% OFF"
+}
+
+```
+
+Example Output:
+```jsonl
+{
+    "image": "1.png",
+    "caption": "A bride smiles while the groom points ahead inside a car, their hands resting together on the seat."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_clip_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_clip_filter.md
new file mode 100644
index 00000000..30e508cc
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_clip_filter.md
@@ -0,0 +1,112 @@
+---
+title: ImageClipFilter
+createTime: 2025/10/15 15:48:32
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_clip_filter/
+---
+## 📘 Overview
+`ImageClipFilter` is an image–text consistency operator built on a pretrained **CLIP** model.
+It computes the **semantic similarity** between an image and its accompanying textual description, and then filters image–text pairs according to a user-specified similarity threshold.
+Pairs whose similarity score falls below the threshold are discarded as semantically inconsistent.
+
+
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "openai/clip-vit-base-patch32",
+    device: str = None
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :----------- | :------------ | :-------------------------------- | :---------- |
+| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | Local path or Hugging Face Model ID of the CLIP model. Internally loaded via `CLIPProcessor` / `CLIPModel` with `use_safetensors=True` and `weights_only=False`. |
+| `device` | `str \| None` | `None` | Inference device.
If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. | + + + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_caption_key: str = "caption", + threshold: float = 0.25 +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :------------------- | :---------------- | :----------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading and writing the DataFrame. | +| `input_image_key` | `str` | `"image"` | Column name containing image paths. | +| `input_caption_key` | `str` | `"caption"` | Column name containing the textual description (caption). | +| `threshold` | `float` | `0.25` | Minimum CLIP similarity threshold; only image–text pairs with similarity `≥ threshold` are retained. | + + + + +## 🧠 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.core_vision import ImageClipFilter + +# 1) Prepare FileStorage (must contain at least `image` and `caption` columns) +storage = FileStorage( + first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl", + cache_path="./cache_local", + file_name_prefix="image_clip_filter", + cache_type="jsonl" +) + +# 2) Initialize the operator (model_name can be an HF model ID such as "openai/clip-vit-base-patch32") +filt = ImageClipFilter( + model_name="openai/clip-vit-base-patch32", + device=None # automatically select cuda/cpu +) + +# 3) Run filtering: keep only image–text pairs with CLIP similarity ≥ 0.25 +cols = filt.run( + storage=storage.step(), + input_image_key="image", + input_caption_key="caption", + threshold=0.25 +) +print(cols) # ["image", "caption"] +``` + +### 🧾 Default Output Format +| Field | Type | Default | Description | +| :-------- | :------ | :------ | :---------- | +| `image` | `string`| — | Image path for retained samples. | +| `caption` | `string`| — | Textual description of retained samples (for which CLIP similarity ≥ `threshold`). | + +Example Input: +```jsonl +{ + "image": "1.png", + "caption": "A bride and groom smiling in a car." +} +{ + "image": "2.jpg", + "caption": "A red bus driving across a snowy mountain road at night." +} +``` + +Example Output: +```jsonl +{ + "image": "1.png", + "caption": "A bride and groom smiling in a car." +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_complexity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_complexity_filter.md new file mode 100644 index 00000000..a0a8633a --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_complexity_filter.md @@ -0,0 +1,114 @@ +--- +title: ComplexityFilter +createTime: 2025/10/15 16:10:28 +# icon: material-symbols-light:image +permalink: /en/mm_operators/filter/complexity_filter/ +--- +## 📘 Overview +`ComplexityFilter` is an **NLI-based (Natural Language Inference)** text filtering operator designed to evaluate whether a caption simultaneously covers multiple **visual capability dimensions** (e.g., color, shape, action recognition, counting, spatial relations). +The operator thereby estimates the **capability richness** of a caption. + +For each caption, the operator constructs a set of hypothesis sentences using a common template, e.g. +`"The following text describes {}."` +An MNLI-style model is then used to compute the **entailment probability** for each capability hypothesis. 
+If the number of “hit” capabilities (those whose entailment probability exceeds `threshold`) reaches at least `min_k`, the sample is retained; otherwise it is filtered out.
+
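+To make this criterion concrete, the sketch below shows one way to score a caption against capability hypotheses with an MNLI model. The capability list and the softmax over all three MNLI classes (`[contradiction, neutral, entailment]` for `facebook/bart-large-mnli`) are illustrative assumptions, not the operator's exact internals:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
+model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
+
+CAPABILITIES = ["colors", "actions", "counting", "spatial relations"]  # hypothetical set
+TEMPLATE = "The following text describes {}."
+
+def capability_hits(caption: str, threshold: float = 0.4) -> int:
+    hits = 0
+    for cap in CAPABILITIES:
+        # Premise = caption, hypothesis = capability statement
+        inputs = tokenizer(caption, TEMPLATE.format(cap),
+                           return_tensors="pt", truncation=True)
+        with torch.no_grad():
+            logits = model(**inputs).logits  # [contradiction, neutral, entailment]
+        entail_prob = logits.softmax(dim=-1)[0, 2].item()
+        if entail_prob >= threshold:
+            hits += 1
+    return hits
+
+# Keep the sample only if capability_hits(caption) >= min_k
+```
+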
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "facebook/bart-large-mnli",
+    threshold: float = 0.4,
+    min_k: int = 2,
+    device: str = None
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :----------- | :------------ | :-------------------------- | :---------- |
+| `model_name` | `str` | `"facebook/bart-large-mnli"` | Local path or Hugging Face Model ID of the NLI model. Internally loaded using `AutoTokenizer` / `AutoModelForSequenceClassification` with `local_files_only=True`, `use_safetensors=True`, and `weights_only=False`. |
+| `threshold` | `float` | `0.4` | Minimum entailment probability required to mark a single capability as “hit”. Higher values yield stricter filtering. |
+| `min_k` | `int` | `2` | Minimum number of capability dimensions that must be hit. Captions with fewer than `min_k` hits are discarded. |
+| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. |
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_caption_key: str = "caption"
+):
+    ...
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :----------------- | :---------------- | :----------- | :---------- |
+| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading and writing the DataFrame. |
+| `input_caption_key`| `str` | `"caption"` | Name of the text column to be evaluated, usually the image description (caption) field. |
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ComplexityFilter
+
+# 1) Prepare FileStorage (must contain at least a `caption` column)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="complexity_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator (can use a local or HF model)
+filt = ComplexityFilter(
+    model_name="facebook/bart-large-mnli",  # or "../ckpt/bart-large-mnli"
+    threshold=0.4,  # entailment probability threshold
+    min_k=2,        # require at least 2 capability hits
+    device=None     # automatically select cuda/cpu
+)
+
+# 3) Run filtering
+cols = filt.run(
+    storage=storage.step(),
+    input_caption_key="caption"
+)
+print(cols)  # ["caption"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :--------------------------------------------- | :------ | :------ | :---------- |
+| `caption` (or the column specified by `input_caption_key`) | `string` | — | Caption text retained after filtering; only samples with a number of capability hits `≥ min_k` are kept. |
+
+
+Example Input:
+```jsonl
+{
+    "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk."
+}
+{
+    "caption": "SALE SALE SALE 50% OFF"
+}
+{
+    "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella."
+}
+
+```
+
+Example Output:
+```jsonl
+{
+    "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk."
+}
+{
+    "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_consistency_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_consistency_filter.md
new file mode 100644
index 00000000..a22e2513
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_consistency_filter.md
@@ -0,0 +1,120 @@
+---
+title: ImageConsistencyFilter
+createTime: 2025/10/15 15:48:32
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_consistency_filter/
+---
+## 📘 Overview
+`ImageConsistencyFilter` is an **NLI-based (Natural Language Inference)** consistency filtering operator.
+It evaluates whether, for the same image, the triplet
+**(caption, question, answer)** is semantically coherent; that is, whether the **answer can be logically inferred from caption + question**.
+
+Internally, the operator treats `caption + question` as the **premise** and `answer` as the **hypothesis**, and then uses the `bart-large-mnli` model to compute the **entailment probability**.
+If this probability falls below `threshold`, the sample is deemed semantically inconsistent and is filtered out.
+
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "facebook/bart-large-mnli",
+    threshold: float = 0.35,
+    device: str = None
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :---------- | :------------ | :--------------------------- | :---------- |
+| `model_name` | `str` | `"facebook/bart-large-mnli"` | Local path or Hugging Face Model ID for the NLI model. Internally loaded via `AutoTokenizer` / `AutoModelForSequenceClassification` with `local_files_only=True`, `use_safetensors=True`, and `weights_only=False`. |
+| `threshold` | `float` | `0.35` | Entailment probability threshold. If the entailment probability for **caption + question → answer** is below this value, the sample is treated as semantically inconsistent and discarded. Higher values result in stricter filtering. |
+| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. |
+
+
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_caption_key: str = "caption",
+    input_question_key: str = "question",
+    input_answer_key: str = "answer",
+):
+    ...
+
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :------------------- | :---------------- | :----------- | :---------- |
+| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading from and writing to the DataFrame. |
+| `input_caption_key` | `str` | `"caption"` | Column name of the caption text, typically the natural-language description of the image. |
+| `input_question_key` | `str` | `"question"` | Column name of the question text associated with the image. |
+| `input_answer_key` | `str` | `"answer"` | Column name of the answer text, representing the response to the question. |
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageConsistencyFilter
+
+# 1) Prepare FileStorage (must contain at least caption / question / answer)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="image_consistency_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator (can use a local or HF model)
+filt = ImageConsistencyFilter(
+    model_name="facebook/bart-large-mnli",  # or a local path "../ckpt/bart-large-mnli"
+    threshold=0.35,  # entailment probability threshold
+    device=None      # automatically select cuda/cpu
+)
+
+# 3) Run filtering
+cols = filt.run(
+    storage=storage.step(),
+    input_caption_key="caption",
+    input_question_key="question",
+    input_answer_key="answer"
+)
+print(cols)  # ["caption", "question", "answer"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :----------------------------------------------- | :------ | :------ | :---------- |
+| `caption` (or the column specified by `input_caption_key`) | `string` | — | Caption text retained after filtering. |
+| `question` (or the column specified by `input_question_key`) | `string` | — | Question text that, together with the caption, is deemed to entail the answer. |
+| `answer` (or the column specified by `input_answer_key`) | `string` | — | Answer text whose entailment probability from caption + question is `≥ threshold` under the NLI model. |
+
+
+Example Input:
+```jsonl
+{
+    "caption": "A groom in a black tuxedo sits in a car next to his smiling bride.",
+    "question": "Where are the couple sitting?",
+    "answer": "They are sitting inside a car."
+}
+{
+    "caption": "A groom in a black tuxedo sits in a car next to his smiling bride.",
+    "question": "What color is the sky in this picture?",
+    "answer": "The sky is green with purple stripes."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "caption": "A groom in a black tuxedo sits in a car next to his smiling bride.",
+    "question": "Where are the couple sitting?",
+    "answer": "They are sitting inside a car."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_deduplication_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_deduplication_filter.md
new file mode 100644
index 00000000..1be6a002
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_deduplication_filter.md
@@ -0,0 +1,118 @@
+---
+title: ImageDeduplicateFilter
+createTime: 2025/10/15 19:56:47
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_deduplication_filter/
+---
+## 📘 Overview
+`ImageDeduplicateFilter` is an **image-level deduplication operator** built upon **CLIP-based image embeddings**.
+The operator encodes all images in a dataset into CLIP feature vectors and computes pairwise cosine similarities.
+For any pair of images whose similarity is **greater than or equal to `threshold`**, the operator **retains the first occurrence** and
+marks subsequent ones as near-duplicate samples to be removed.
+
+In addition, for every retained image, the operator records its **maximum cosine similarity** with all other images in the column
+specified by `output_score_key` (by default, `max_similarity`). This value can be used for downstream quality control, auditing,
+or further manual examination of near-duplicate content.
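+
+Conceptually, the keep-first pass looks like the following minimal sketch. It is an illustrative approximation built on `transformers`' `CLIPModel` / `CLIPProcessor`; the operator's actual batching and `_load_image` handling may differ:
+
+```python
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+
+def dedup_keep_first(paths, threshold=0.90):
+    images = [Image.open(p).convert("RGB") for p in paths]
+    inputs = processor(images=images, return_tensors="pt")
+    with torch.no_grad():
+        emb = model.get_image_features(**inputs)
+    emb = emb / emb.norm(dim=-1, keepdim=True)  # L2-normalize
+    sim = emb @ emb.T                           # pairwise cosine similarity
+    keep, removed = [], set()
+    for i in range(len(paths)):
+        if i in removed:
+            continue
+        keep.append(i)                          # first occurrence is kept
+        for j in range(i + 1, len(paths)):
+            if sim[i, j] >= threshold:
+                removed.add(j)                  # later occurrence is the duplicate
+    return [paths[i] for i in keep]
+```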
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "openai/clip-vit-base-patch32",
+    threshold: float = 0.90,
+    batch_size: int = 32,
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+):
+    ...
+```
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :------------- | :----- | :---------------------------------------- | :---------- |
+| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | Identifier or local path of the CLIP model used to extract image embeddings (Hugging Face Model ID or local checkpoint directory). |
+| `threshold` | `float`| `0.90` | Deduplication similarity threshold. If the cosine similarity between two image embeddings is **greater than or equal to** this value, the later image in the sequence is treated as a near-duplicate and removed. |
+| `batch_size` | `int` | `32` | Batch size used during CLIP inference. Larger batch sizes improve throughput but increase GPU/CPU memory consumption. |
+| `device` | `str` | `"cuda"` if available, otherwise `"cpu"` | Computational device used for CLIP inference. The operator automatically defaults to GPU when available; otherwise, it falls back to CPU execution. |
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_image_key: str = "image",
+    output_score_key: str = "max_similarity"
+) -> None:
+    ...
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. |
+| `input_image_key` | `str` | `"image"` | The column name containing image paths or objects that can be parsed by `_load_image`. |
+| `output_score_key` | `str` | `"max_similarity"` | The name of the column storing each image’s maximum similarity with all others. |
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageDeduplicateFilter
+
+# 1) Prepare FileStorage (must contain an "image" column)
+storage = FileStorage(
+    first_entry_file_name="data/dedup_input.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="image_dedup",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator
+dedup = ImageDeduplicateFilter(
+    model_name="openai/clip-vit-base-patch32",
+    threshold=0.90,
+    batch_size=32,
+    device="cuda"  # or "cpu"
+)
+
+# 3) Execute deduplication (run() returns None; results are written back to storage)
+dedup.run(
+    storage=storage.step(),
+    input_image_key="image",            # image column
+    output_score_key="max_similarity"   # column to record max similarity
+)
+```
+
+### 🧾 Default Output Format
+| Field | Type | Description |
+| :--------------------------------------------- | :----------- | :---------- |
+| `image` (or the column specified by `input_image_key`) | `string/any` | The retained image entries after deduplication; near-duplicate images are removed according to the similarity threshold. |
+| `max_similarity` (or the column specified by `output_score_key`) | `float` | Maximum cosine similarity between this image and all other images in the dataset (for auditing and analysis; removed duplicates are not present in the final output).
| + + +Example Input: +```jsonl +{ + "image": "a.jpg" +} +{ + "image": "b.jpg" +} +{ + "image": "a_copy.jpg" +} +``` + +Example Output: +```jsonl +{ + "image": "a.jpg", + "max_similarity": 0.96 +} +{ + "image": "b.jpg", + "max_similarity": 0.12 +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_diversity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_diversity_filter.md new file mode 100644 index 00000000..887639e6 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_diversity_filter.md @@ -0,0 +1,126 @@ +--- +title: ImageDiversityFilter +createTime: 2025/10/15 19:57:00 +# icon: material-symbols-light:image +permalink: /en/mm_operators/filter/image_diversity_filter/ +--- +## 📘 Overview +`ImageDiversityFilter` is a joint **text–image deduplication operator** designed to preserve **content diversity** when cleaning multimodal datasets. +It relies on two complementary signals: + +1. **Text side**: estimates similarity between the current caption and previously retained captions using **TF–IDF with cosine similarity**. +2. **Image side**: measures visual redundancy using **perceptual hash (pHash) Hamming distance** over images. + +A sample is retained **only if** it is sufficiently novel **both** in text and image space; otherwise, it is treated as a near-duplicate and filtered out. + +This dual-view strategy avoids failure modes that occur when only one modality is considered (e.g., different images with nearly identical text, or vice versa), and helps construct **de-duplicated, semantically diverse** multimodal corpora. + + + + + +## ```__init__``` +```python +def __init__( + self, + text_thresh: float = 0.8, + hash_size: int = 8, + img_dist_thresh: int = 5 +): + ... +``` + + +## `init` Parameters +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `text_thresh` | `float` | `0.8` | Text uniqueness threshold. The maximum cosine similarity with the most recent corpus (managed by the internal `TextDuplicateFilter`) must be **< this value** to be considered unique. | +| `hash_size` | `int` | `8` | Hash size used for perceptual hashing (pHash). Larger values capture finer visual details but require more computation and memory (used by `ImageDuplicateFilter`). | +| `img_dist_thresh` | `int` | `5` | Image uniqueness threshold. The minimum Hamming distance with the most recent image hashes must be **> this value** to be considered unique. | + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text" +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :------------ | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object containing the multimodal table to be de-duplicated. | +| `input_image_key`| `str` | `"image_path"`| Name of the image column. Entries should be image paths (or other disk-resident locations that `PIL` can open). | +| `input_text_key` | `str` | `"text"` | Name of the text column, typically a caption or description field used for computing TF–IDF similarity. 
|
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageDiversityFilter
+
+# 1) Prepare FileStorage (must contain at least "image_path" and "text" columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="image_diversity_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator
+filt = ImageDiversityFilter(
+    text_thresh=0.8,    # text similarity ceiling; kept only if max similarity < this value (lower → stricter)
+    hash_size=8,        # perceptual hash size
+    img_dist_thresh=5   # minimum Hamming distance threshold (higher → require larger visual difference)
+)
+
+# 3) Run filtering
+cols = filt.run(
+    storage=storage.step(),
+    input_image_key="image_path",
+    input_text_key="text"
+)
+print(cols)  # ["image_path", "text"]
+```
+
+
+### 🧾 Default Output Format
+
+| Field | Type | Default | Description |
+| :-------------------------------------------- | :------- | :------ | :---------- |
+| `image_path` (or the column specified by `input_image_key`) | `string` | — | Image paths retained after filtering; only rows whose text and image are both sufficiently dissimilar from historical samples are kept. |
+| `text` (or the column specified by `input_text_key`) | `string` | — | Text descriptions paired with the retained images, guaranteed not to be overly similar to previously kept texts in TF–IDF space. |
+
+
+Example Input:
+```jsonl
+{
+    "image_path": "a.jpg",
+    "text": "A cat sitting on a wooden chair."
+}
+{
+    "image_path": "a_dup.jpg",
+    "text": "A cat sits on a wooden chair."
+}
+{
+    "image_path": "b.jpg",
+    "text": "A bus driving through a snowy mountain pass at night."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "image_path": "a.jpg",
+    "text": "A cat sitting on a wooden chair."
+}
+{
+    "image_path": "b.jpg",
+    "text": "A bus driving through a snowy mountain pass at night."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_sensitive_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_sensitive_filter.md
new file mode 100644
index 00000000..fee0c87c
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_sensitive_filter.md
@@ -0,0 +1,119 @@
+---
+title: ImageSensitiveFilter
+createTime: 2025/10/15 15:31:35
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_sensitive_filter/
+---
+## 📘 Overview
+`ImageSensitiveFilter` is a **multi-label safety filtering operator** built on top of the **BART Large MNLI** zero-shot natural language inference model.
+It evaluates multiple text fields associated with an image and automatically identifies and filters samples containing the following **high-risk content categories**:
+
+- Sexual content (pornography, nudity, etc.)
+- Violence or physical harm
+- Suicide or self-harm
+- Hate speech
+- Harassment or insults
+- Threats or intimidation
+
+Unlike traditional keyword-based blacklists, this operator leverages NLI-style reasoning between **input text** and **natural-language risk descriptions** to decide whether sensitive content is present. This design is more **flexible and extensible**, and is particularly suitable for safety- and compliance-critical cleaning of multimodal datasets.
+
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "facebook/bart-large-mnli",
+    threshold: float = 0.5,
+    device: str | None = None,
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :---------- | :---------------- | :--------------------------- | :---------- |
+| `model_name` | `str` | `"facebook/bart-large-mnli"` | Local path or Hugging Face Model ID of the NLI model. Internally loaded via `AutoTokenizer` / `AutoModelForSequenceClassification` (`local_files_only=True`, `use_safetensors=True`, `weights_only=False`). |
+| `threshold` | `float` | `0.5` | **Entailment probability threshold** for risk categories. If the entailment probability for any risk label is `≥ threshold`, the corresponding text is deemed *unsafe* and the sample is removed. Lower values make the filter more aggressive; higher values flag fewer samples. |
+| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. |
+
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_image_key: str,
+    input_text_keys: list
+):
+    ...
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :--------------- | :---------------- | :------ | :---------- |
+| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the underlying DataFrame. |
+| `input_image_key` | `str` | — | Name of the column containing image paths (e.g., `"image_path"`). Used only to check path existence; no visual inference is performed. |
+| `input_text_keys` | `list[str]` | — | List of text column names to be evaluated for safety (e.g., `["caption", "question", "answer"]`). Each of these fields is scored against all risk labels. |
+
+
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageSensitiveFilter
+
+# 1) Prepare FileStorage (must contain at least "image_path" and "text" columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="imgtext_sensitive_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator (using either a local or HF model)
+filt = ImageSensitiveFilter(
+    model_name="facebook/bart-large-mnli",  # or a local checkpoint path
+    threshold=0.5,  # risk decision threshold
+    device=None     # automatically choose cuda/cpu
+)
+
+# 3) Run filtering: jointly check the image path and all listed text fields for sensitive content
+cols = filt.run(
+    storage=storage.step(),
+    input_image_key="image_path",
+    input_text_keys=["text"]  # or ["caption", "question", "answer"]
+)
+print(cols)  # ["image_path", "text"]
+```
+
+### 🧾 Default Output Format
+
+| Field | Type | Default | Description |
+| :----------------------------------------- | :------ | :------ | :---------- |
+| column specified by `input_image_key` | `string`| — | Original image-path column; after filtering, only rows that pass the safety check are retained. |
+| columns specified by `input_text_keys` | `string`| — | Original text columns (caption / question / answer, etc.); after filtering, only rows whose texts are all judged safe are retained. |
+
+
+Example Input:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "A bride and groom smiling in a car."
+}
+{
+    "image_path": "2.jpg",
+    "text": "Some abusive or hateful phrase here."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "A bride and groom smiling in a car."
+} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/sensitive_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/sensitive_filter.md deleted file mode 100644 index 4fab238a..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/sensitive_filter.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -title: sensitive_filter -createTime: 2025/10/15 19:56:56 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/sensitive_filter/ ---- -## 📘 Overview -`SensitiveFilter` is a **sensitive content filtering operator** that combines **image NSFW classification** and **text toxicity/hate speech detection** to remove unsafe samples. - -- **Image side**: Uses an image classification model to detect labels such as `{porn, hentai, sexy, nsfw}`; if the confidence score **≥ `img_thresh`**, the image is marked unsafe. -- **Text side**: Uses a text classification model to detect labels such as `{toxic, offensive, hate, obscene, threat, sexual_explicit, identity_attack}`; if the confidence score **≥ `txt_thresh`**, the text is marked unsafe. -A sample is filtered out if **either** the image **or any text field** is classified as unsafe. - - -## ```__init__``` -```python -def __init__( - self, - img_model_name="../ckpt/nsfw_image_detection", - txt_model_name="../ckpt/toxic-bert", - img_thresh=0.5, - txt_thresh=0.5, -): - ... -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `img_model_name` | `str` | `"../ckpt/nsfw_image_detection"` | Local path or Hugging Face Model ID for the image-sensitive detection model. Internally loaded using `AutoImageProcessor` and `AutoModelForImageClassification` (`use_safetensors=True`, `weights_only=False`). | -| `txt_model_name` | `str` | `"../ckpt/toxic-bert"` | Local path or Hugging Face Model ID for the text-toxicity detection model. Internally loaded using `AutoTokenizer` and `AutoModelForSequenceClassification` (`use_safetensors=True`, `weights_only=False`). | -| `img_thresh` | `float` | `0.5` | The image sensitivity threshold; if the image matches `{porn, hentai, sexy, nsfw}` with score **≥** this value, it is classified as unsafe. | -| `txt_thresh` | `float` | `0.5` | The text sensitivity threshold; if any text field matches `{toxic, offensive, hate, obscene, threat, sexual_explicit, identity_attack}` with score **≥** this value, it is classified as unsafe. | - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str, - text_keys: list -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | — | The column name containing the image path (e.g., `"image_path"`). | -| `text_keys` | `list[str]` | — | A list of text column names to be checked (e.g., `["caption", "question", "answer"]`). 
| - - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import SensitiveFilter - -# 1) Prepare FileStorage (must contain image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/sensitive_input.jsonl", - cache_path="./cache_local", - file_name_prefix="sensitive_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator (you can also use HF model IDs like -# img_model_name="Falconsai/nsfw_image_detection", -# txt_model_name="unitary/toxic-bert") -filt = SensitiveFilter( - img_model_name="../ckpt/nsfw_image_detection", - txt_model_name="../ckpt/toxic-bert", - img_thresh=0.5, - txt_thresh=0.5, -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - image_key="image_path", - text_keys=["text"] # or ["caption", "question", "answer"] -) -print(cols) # ["image_path", "text"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The retained image path after filtering. | -| Each field in `text_keys` | `string` | The retained text fields; only samples where both the image and all texts are safe are included in the output. | - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "A bride and groom smiling in a car." -} -{ - "image_path": "2.jpg", - "text": "Some abusive or hateful phrase here." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "A bride and groom smiling in a car." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/text_image_diversity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/text_image_diversity_filter.md deleted file mode 100644 index de3cebe6..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/text_image_diversity_filter.md +++ /dev/null @@ -1,118 +0,0 @@ ---- -title: text_image_diversity_filter -createTime: 2025/10/15 19:57:00 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/text_image_diversity_filter/ ---- -## 📘 Overview -`TextImageDiversityFilter` is a **joint text-image deduplication operator** that ensures both textual and visual diversity in a dataset. - -- **Text side:** Uses **TF-IDF + cosine similarity** to compute the maximum similarity between the current text and the historical corpus. A sample is considered *textually unique* if the similarity is below `text_thresh`. -- **Image side:** Uses **perceptual hash (pHash)** and computes the Hamming distance. A sample is considered *visually unique* if the distance is greater than `img_dist_thresh`. - -A sample is retained only if **both the text and image are unique**; otherwise, it is filtered out. - - - -## ```__init__``` -```python -def __init__( - self, - text_thresh: float = 0.8, - hash_size: int = 8, - img_dist_thresh: int = 5 -): - ... -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `text_thresh` | `float` | `0.8` | Text uniqueness threshold. The maximum cosine similarity with the most recent corpus (managed by the internal `TextDuplicateFilter`) must be **< this value** to be considered unique. | -| `hash_size` | `int` | `8` | Hash size used for perceptual hashing (pHash). Larger values capture finer visual details but require more computation and memory (used by `ImageDuplicateFilter`). | -| `img_dist_thresh` | `int` | `5` | Image uniqueness threshold. 
The minimum Hamming distance with the most recent image hashes must be **> this value** to be considered unique. | - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str, - te - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | — | The column name containing the image path (e.g., `"image_path"`). | -| `text_key` | `str` | — | The column name containing the text content (e.g., `"text"` or `"caption"`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import TextImageDiversityFilter - -# 1) Prepare FileStorage (must contain image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/ti_diversity_input.jsonl", - cache_path="./cache_local", - file_name_prefix="ti_diversity", - cache_type="jsonl" -) - -# 2) Initialize the operator -filt = TextImageDiversityFilter( - text_thresh=0.8, # Text uniqueness threshold (lower = looser) - hash_size=8, # pHash size - img_dist_thresh=5 # Image uniqueness threshold (higher = stricter) -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - image_key="image_path", - text_key="text" -) -print(cols) # ["image_path", "text"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The retained image path after deduplication. | -| `text` / specified `text_key` | `string` | The retained text content after deduplication. | - -Example Input: -```jsonl -{ - "image_path": "a.jpg", - "text": "A cat sitting on a wooden chair." -} -{ - "image_path": "a_dup.jpg", - "text": "A cat sits on a wooden chair." -} -{ - "image_path": "b.jpg", - "text": "A bus driving through a snowy mountain pass at night." -} -``` - -Example Output: -```jsonl -{ - "image_path": "a.jpg", - "text": "A cat sitting on a wooden chair." -} -{ - "image_path": "b.jpg", - "text": "A bus driving through a snowy mountain pass at night." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/batch_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/batch_vqa_generator.md new file mode 100644 index 00000000..287ed23b --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/generate/batch_vqa_generator.md @@ -0,0 +1,115 @@ +--- +title: BatchVQAGenerator +createTime: 2026/01/11 21:54:10 +permalink: /en/mm_operators/generate/batch_vqa_generator/ +--- +## 📘 Overview + +`BatchVQAGenerator` is a **Batch Visual Question Answering Operator**. + +It is designed for **"One Image, Many Questions"** scenarios. The input consists of a single image and a list of questions (e.g., ["What color?", "How many?", "What action?"]). The operator automatically pairs the image with each question in the list, constructs a batch request, and generates answers in parallel. + +This mechanism is highly efficient for dense captioning, multi-perspective image analysis, or attribute-based Q&A tasks. + +## 🏗️ `__init__` Function + +```python +def __init__( + self, + serving: LLMServingABC, + system_prompt: str = "You are a helpful assistant." 
+): + +``` + +### 🧾 Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (must support VLM multimodal inputs). | +| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. | + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_prompts_key: str, + input_image_key: str, + output_key: str +): + ... + +``` + +Executes the main logic: + +1. **Data Loading** +Reads the image path (`input_image_key`) and the list of questions (`input_prompts_key`) from the DataFrame. +2. **Broadcasting & Batch Construction** +For each row: +* Retrieves the single image path. +* Iterates through every question `q` in the list. +* Constructs a standard multimodal message `[Image, Text(q)]` for each question. +* Packages all Q&A requests for that single image into one Batch. + + +3. **Parallel Inference** +Calls `serving.generate_from_input` to generate answers for all questions related to that image simultaneously using GPU parallelism. +4. **Save Results** +Writes the list of generated answers (in the same order as the question list) to the `output_key` column. + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | DataFlow storage object. | +| `input_prompts_key` | `str` | N/A | Column name containing the **list of questions** (`List[str]`). | +| `input_image_key` | `str` | N/A | Column name containing the **single image** path. | +| `output_key` | `str` | N/A | Output column name for the list of answers (`List[str]`). | + +## 🧩 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServing +from dataflow.operators.generate import BatchVQAGenerator + +# 1) Initialize Model +serving = LLMServing(model_path="Qwen/Qwen2.5-VL-7B-Instruct") + +# 2) Initialize Operator +generator = BatchVQAGenerator( + serving=serving, + system_prompt="Answer briefly." +) + +# 3) Prepare Data (jsonl) +# Format: {"image": "scene.jpg", "questions": ["Weather?", "Object count?", "Action?"]} +storage = FileStorage(file_name_prefix="dense_captioning") +storage.step() + +# 4) Execute Batch VQA +generator.run( + storage=storage, + input_prompts_key="questions", + input_image_key="image", + output_key="answers" +) + +``` + +### 🧾 Output Format + +The `output_key` column contains a list of strings corresponding to the input question list. + +**Example Input DataFrame:** +| image | questions | +| :--- | :--- | +| `"park.jpg"` | `["Weather?", "Count?", "Action?"]` | + +**Example Output DataFrame:** +| image | questions | answers | +| :--- | :--- | :--- | +| `"park.jpg"` | `["Weather?", "Count?", "Action?"]` | `["Sunny", "3 people", "Running"]` | \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/fix_prompted_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/fix_prompted_vqa_generator.md new file mode 100644 index 00000000..8440c5c0 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/generate/fix_prompted_vqa_generator.md @@ -0,0 +1,126 @@ +--- +title: FixPromptedVQAGenerator +createTime: 2026/01/11 21:31:49 +permalink: /en/mm_operators/fix_prompted_vqa_generator/ +--- +## 📘 Overview + +`FixPromptedVQAGenerator` is a **Fixed-Prompt Multimodal VQA Operator**. + +It is designed to execute the **same** instruction task on a batch of images or videos. 
Unlike dynamic templating operators, this operator accepts a static `user_prompt` (e.g., "Please caption this image") during initialization and applies it uniformly to every media sample in the input DataFrame. + +Use Cases: + +* Batch Image/Video Captioning. +* Uniform VQA queries across a dataset (e.g., "Is there any violence in this image?"). + +## 🏗️ `__init__` Function + +```python +def __init__( + self, + serving: LLMServingABC, + system_prompt: str = "You are a helpful assistant.", + user_prompt: str = "Please caption the media in detail." +): + +``` + +### 🧾 Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (must support multimodal inputs). | +| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. | +| `user_prompt` | `str` | `"Please caption..."` | **Core Parameter**. The user instruction (Prompt) applied uniformly to all input samples. | + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_video_key: str = "video", + output_answer_key: str = "answer", +): + ... + +``` + +Executes the main logic: + +1. **Read Data** +Reads the DataFrame from `storage`. +2. **Input Construction** +* Checks for and reads the `input_image_key` or `input_video_key` column. +* Constructs the input message for each media file, combining the fixed `system_prompt`, the media file itself, and the fixed `user_prompt`. + + +3. **Batch Inference** +* Packages the constructed prompts and media data into a batch. +* Calls `serving.generate_from_input` to execute parallel inference. + + +4. **Save Results** +* Writes the text generated by the model into the `output_answer_key` column. +* Updates and saves the DataFrame. + + + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | DataFlow storage object. | +| `input_image_key` | `str` | `"image"` | Column name for image paths (mutually exclusive with video_key). | +| `input_video_key` | `str` | `"video"` | Column name for video paths (mutually exclusive with image_key). | +| `output_answer_key` | `str` | `"answer"` | Column name for the generated output. | + +## 🧩 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServing +from dataflow.operators.generate import FixPromptedVQAGenerator + +# 1) Initialize Model +serving = LLMServing(model_path="Qwen/Qwen2.5-VL-3B-Instruct") + +# 2) Initialize Operator: Set a fixed prompt +# Example: Generate detailed descriptions for a batch of images +generator = FixPromptedVQAGenerator( + serving=serving, + system_prompt="You are a helpful visual assistant.", + user_prompt="Describe the content of this image in detail, including objects, colors, and spatial relationships." 
+)
+
+# 3) Prepare Data
+storage = FileStorage(
+    file_name_prefix="image_captioning_task",
+    cache_path="./cache_data"
+)
+storage.step()
+
+# 4) Execute Generation
+generator.run(
+    storage=storage,
+    input_image_key="image_path",
+    output_answer_key="detailed_caption"
+)
+
+```
+
+### 🧾 Input/Output Example
+
+**Input DataFrame Row:**
+| image_path |
+| :--- |
+| `"/data/cat.jpg"` |
+| `"/data/dog.png"` |
+
+**Output DataFrame Row:**
+| image_path | detailed_caption |
+| :--- | :--- |
+| `"/data/cat.jpg"` | `"A black and white cat sitting on a sofa..."` |
+| `"/data/dog.png"` | `"A golden retriever running on the grass..."` |
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
new file mode 100644
index 00000000..e930c005
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
@@ -0,0 +1,147 @@
+---
+title: ImageBboxGenerator
+createTime: 2026/01/11 21:49:37
+permalink: /en/mm_operators/generate/image_bbox_generator/
+---
+## 📘 Overview
+
+`ImageBboxGenerator` is an **Image Region Annotation & Prompt Preparation Operator**.
+
+It is primarily used for data preprocessing in multimodal tasks (such as Grounding Caption). It handles raw data containing image paths, normalizes Regions of Interest (RoI), visualizes them, and generates structured Prompts for subsequent VLM inference.
+
+Key Capabilities:
+
+1. **Dual BBox Acquisition**:
+* **Existing Mode**: Reads existing BBox coordinates directly from the input data.
+* **Auto-Extraction Mode**: If no BBox is provided, automatically extracts salient object regions using OpenCV (Edge Detection + Contour Fitting); a sketch of this extraction step is shown after this list.
+
+
+2. **Coordinate Normalization**: Converts pixel coordinates into normalized coordinates (0-1 range) compliant with VLM input standards.
+3. **Visualization Enhancement**: Generates images with numbered, colored bounding boxes to help the model understand "Region N" references.
+4. **Prompt Construction**: Automatically generates prompts containing region count information (e.g., "Describe the content of each marked region...").
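+
+As a rough illustration of the Auto-Extraction Mode, the sketch below mimics the described OpenCV pipeline (adaptive thresholding → morphology → contours). The helper name and parameter values are assumptions; the operator's real `extract_boxes_from_image` (including its NMS step) may differ:
+
+```python
+import cv2
+
+def extract_boxes_sketch(image_path: str, max_boxes: int = 10):
+    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+    # Adaptive threshold -> binary mask that is robust to uneven lighting
+    mask = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                 cv2.THRESH_BINARY_INV, 11, 2)
+    # Morphological closing to merge fragmented edges into solid regions
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
+    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
+    # External contours -> axis-aligned bounding rectangles [x, y, w, h]
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    boxes = [cv2.boundingRect(c) for c in contours]
+    # Keep the largest regions (the operator additionally applies NMS)
+    boxes.sort(key=lambda b: b[2] * b[3], reverse=True)
+    return boxes[:max_boxes]
+```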
+* **Type B (Without BBox)**: Calls `extract_boxes_from_image` to extract object contours via adaptive thresholding and morphology, applying NMS (Non-Maximum Suppression) to remove duplicates. + + +3. **Normalization & Visualization** +* **Normalization**: Converts `[x, y, w, h]` to normalized `[x1, y1, x2, y2]` format, truncating or padding with `0.0` to match `max_boxes`. +* **Visualization**: Draws green rectangles and numeric labels on the original image, saving the result to `storage.cache_path`. + + +4. **Prompt Generation** +* Generates a fixed template prompt based on the valid box count: +> "Describe the content of each marked region in the image. There are {N} regions: \ to \." + + + + +5. **Result Export** +* Writes the complete record containing raw info, normalized BBoxes, visualization paths, and the Prompt to `config.output_jsonl_path`. + + + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | Storage object, mainly used to provide the `cache_path`. | +| `input_image_key` | `str` | `"image"` | Field name for image paths in the input JSONL. | +| `input_bbox_key` | `str` | `"bbox"` | Field name for BBox data in the input JSONL. | +| `output_key` | `str` | `"mdvp_record"` | (Reserved) Key name for the output record. | + +## 🧩 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig + +# 1) Configure Parameters +config = ExistingBBoxDataGenConfig( + max_boxes=5, + input_jsonl_path="./data/raw_images.jsonl", + output_jsonl_path="./data/processed_with_prompts.jsonl" +) + +# 2) Initialize Operator +# Note: This operator is for data prep and does not require a Serving instance +generator = ImageBboxGenerator(config=config) + +# 3) Prepare Storage (Only for providing cache path) +storage = FileStorage( + cache_path="./cache_vis_images", + file_name_prefix="bbox_gen" +) + +# 4) Execute Processing +# Automatically reads from config input, writes to config output +generator.run( + storage=storage, + input_image_key="image_path", + input_bbox_key="ground_truth_bbox" # Will auto-extract if this column is missing +) + +``` + +### 🧾 Output Data Format (Output JSONL) + +Each line in the `output_jsonl_path` file contains: + +```json +{ + "image": "/data/raw/cat.jpg", + "type": "without_bbox", // or "with_bbox" + "bbox": [[100, 200, 50, 60], ...], // Raw pixel coords [x, y, w, h] + "normalized_bbox": [ + [0.1, 0.2, 0.15, 0.26], + [0.0, 0.0, 0.0, 0.0] // Zero-padded + ], + "result_file": "./cache_vis_images", + "image_with_bbox": "./cache_vis_images/1_bbox_vis.jpg", // Path to visualized image + "valid_bboxes_num": 1, + "prompt": "Describe the content of each marked region in the image. There are 1 regions: \ to \." +} + +``` diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md b/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md index 9649a5c2..c8e82317 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md @@ -64,7 +64,7 @@ class CapRLMCQGenerate(OperatorABC): def run( self, storage: DataFlowStorage, - image_key: str = "image", + input_image_key: str = "image", output_key: str = "cap_rl_qa", ): ... 
@@ -79,7 +79,7 @@ Reads images, generates MCQs, parses, rotates, validates under both conditions, | Parameter | Type | Default | Description | | ---------- | ----------------- | ------------- | ------------------------------------------ | | storage | `DataFlowStorage` | required | IO and caching. | -| image_key | str | `"image"` | Input field (image path). | +| input_image_key | str | `"image"` | Input field (image path). | | output_key | str | `"cap_rl_qa"` | Output field to store the full stats JSON. | --- @@ -111,7 +111,7 @@ cfg = CapRLMCQConfig( ) op = CapRLMCQGenerate(serving, cfg) -op.run(storage=storage.step(), image_key="image", output_key="cap_rl_qa") +op.run(storage=storage.step(), input_image_key="image", output_key="cap_rl_qa") ``` --- diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md b/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md index 746397f1..0171f15a 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md @@ -1,15 +1,19 @@ --- -title: Image Caption Generation +title: ImageCaptionGenerator createTime: 2025/10/15 15:00:00 -icon: material-symbols-light:image +# icon: material-symbols-light:image permalink: /en/mm_operators/generate/image_caption/ --- ## 📘 Overview -`ImageCaptionGenerate` is an operator designed to **automatically generate image captions using large vision-language models (VLMs)**. -Given input images, it constructs prompts to guide the model in producing high-quality scene or object descriptions. -This is suitable for multimodal annotation, dataset construction, and image-text matching tasks. +`ImageCaptionGenerator` is an operator designed to **automatically generate image captions using large vision-language models (VLMs)**. +Given input images, it constructs prompts to guide the model in producing high-quality scene or object descriptions. This is suitable for multimodal annotation, dataset construction, and image-text matching tasks. + +**Features:** +* Supports batch processing of multiple images. +* Generates high-quality descriptions using VLMs like Qwen. +* Automatically handles image input and prompt construction. --- @@ -25,11 +29,11 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :------------------------------------------------------------ | -| `llm_serving` | `LLMServingABC` | - | Model serving object used to call VLM for generating captions | +| Parameter | Type | Default | Description | +| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | +| `llm_serving` | `LLMServingABC` | - | **Model Serving Object** used to call the VLM for caption generation | ---- +----- ## ⚡ `run` Function @@ -37,35 +41,35 @@ def __init__( def run( self, storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "caption" + input_modal_key: str = "image", + output_key: str = "output" ): ... ``` The `run` function executes the main caption generation workflow: -read image paths → construct prompts → call the model → generate text captions → write results to output. +read image paths → **validate DataFrame** → construct prompts → call the model → generate text captions → write results to output. 
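+
+As a rough illustration of the **validate DataFrame** step: the input column must exist and hold usable image paths before any prompts are built. The sketch below is hypothetical (the helper name `validate_media_column` and the exact checks are illustrative, not the operator's internal API):
+
+```python
+import os
+import pandas as pd
+
+def validate_media_column(df: pd.DataFrame, input_modal_key: str = "image") -> None:
+    # The input column must be present, and each cell is expected to be a
+    # list of image paths pointing at readable files.
+    if input_modal_key not in df.columns:
+        raise KeyError(f"missing input column: {input_modal_key}")
+    for paths in df[input_modal_key]:
+        for p in paths:
+            if not os.path.isfile(p):
+                raise FileNotFoundError(f"image not found: {p}")
+```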
## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :---------- | :----------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `multi_modal_key` | `str` | `"image"` | Multimodal input field name | -| `output_key` | `str` | `"caption"` | Output field name for captions | +| Parameter | Type | Default | Description | +| :---------------- | :---------------- | :---------- | :---------------------------------------------------- | +| `storage` | `DataFlowStorage` | - | Dataflow storage object | +| `input_modal_key` | `str` | `"image"` | **Multimodal Input Field Name** (e.g., image paths) | +| `output_key` | `str` | `"output"` | **Model Output Field Name** (the generated description text) | ---- +----- ## 🧠 Example Usage ```python from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import ImageCaptionGenerate +from dataflow.operators.core_vision import ImageCaptionGenerator # Step 1: Launch local model service serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, @@ -74,25 +78,23 @@ serving = LocalModelVLMServing_vllm( # Step 2: Prepare input data storage = FileStorage( - first_entry_file_name="data/example_caption.jsonl", + first_entry_file_name="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl", cache_path="./cache_local", - file_name_prefix="caption", + file_name_prefix="dataflow_cache_step", cache_type="jsonl", - media_key="image", - media_type="image" ) -storage.step() +storage.step() # Load data # Step 3: Initialize and run the operator -generator = ImageCaptionGenerate(serving) +generator = ImageCaptionGenerator(serving) generator.run( storage=storage, - multi_modal_key="image", - output_key="caption" + input_modal_key="image", + output_key="caption" # Explicitly specifying output field as "caption" in the example ) ``` ---- +----- ## 🧾 Default Output Format @@ -101,7 +103,7 @@ generator.run( | `image` | `List[str]` | Input image paths | | `caption` | `str` | Generated image caption text | ---- +----- ### 📥 Example Input diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md b/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md index 389d835d..6bf2dfab 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md @@ -54,9 +54,9 @@ def __init__( def run( self, storage: DataFlowStorage, - question_key: str = "question", - answer_key: str = "answer", - image_key: str = "image", + input_question_key: str = "question", + input_answer_key: str = "answer", + input_image_key: str = "image", output_key: str = "gcot", save_intermediate: bool = True, qwen_unload_callback = None @@ -74,9 +74,9 @@ Executes the complete GCoT generation pipeline: | Parameter | Type | Default | Description | | :--------------------- | :---------------- | :----------- | :---------------------------------------------------- | | `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `question_key` | `str` | `"question"` | Input question field name | -| `answer_key` | `str` | `"answer"` | Input answer field name | -| `image_key` | `str` | `"image"` | Input image field name | +| 
`input_question_key` | `str` | `"question"` | Input question field name | +| `input_answer_key` | `str` | `"answer"` | Input answer field name | +| `input_image_key` | `str` | `"image"` | Input image field name | | `output_key` | `str` | `"gcot"` | Output GCoT field name | | `save_intermediate` | `bool` | `True` | Whether to save intermediate results and visualizations | | `qwen_unload_callback` | `Callable` | `None` | Callback function to unload Qwen model (for memory management) | @@ -124,9 +124,9 @@ def unload_qwen(): gcot_generator.run( storage=storage, - question_key="question", - answer_key="answer", - image_key="image", + input_question_key="question", + input_answer_key="answer", + input_image_key="image", output_key="gcot", save_intermediate=True, qwen_unload_callback=unload_qwen diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md b/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md index 968f0b4b..4b965947 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md @@ -1,20 +1,30 @@ --- -title: Personalized Image QA Generation +title: PersQAGenerator createTime: 2025/10/15 18:20:00 -icon: material-symbols-light:quiz +# icon: material-symbols-light:quiz permalink: /en/mm_operators/generate/image_pers_qa/ --- ## 📘 Overview -`PersQAGenerate` is an operator for **generating personalized image question-answer pairs using large vision-language models (VLMs)**. -It can: -- Automatically assign name tags to main characters in images (e.g., ``); -- Randomly select suitable questions from predefined templates; -- Guide the model to start answers with the character's name; -- Output structured QA pairs suitable for multimodal QA dataset construction and evaluation of character understanding. +`PersQAGenerator` is an operator designed to **generate personalized image Question-Answer (QA) pairs based on large vision-language models (VLMs)**. +It performs the following steps: ---- + * Automatically assigns a name tag to the main character in the image (hardcoded as `` in the implementation). + * Randomly selects an appropriate question from predefined templates. + * Guides the VLM to start the answer with the character's name tag. + * Outputs structured QA pairs, suitable for multimodal QA dataset construction and character role understanding evaluation. + +**Features:** + + * Supports generating personalized QA for specific characters in images. + * Automatically assigns name tags (e.g., ``) to the main subject. + * Randomly selects relevant questions from predefined templates. + * Requires the model to start answers with the main character's name tag. + * Supports batch processing of multiple images. + * Output includes the complete Question-Answer pair in the format: `Question: ..., Answer: ...`. 
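+
+Because the operator emits each QA pair as a single string, a small post-processing step is often needed to split it into fields. A minimal sketch (the `split_pers_qa` helper and its splitting logic are our assumption about the `Question: ..., Answer: ...` format, not part of the operator):
+
+```python
+def split_pers_qa(text: str) -> dict:
+    # Expects "Question: <q>, Answer: <a>"; returns {} if the format differs.
+    q_tag, a_tag = "Question:", "Answer:"
+    if q_tag not in text or a_tag not in text:
+        return {}
+    question, answer = text.split(a_tag, 1)
+    question = question.replace(q_tag, "", 1).strip().rstrip(",")
+    return {"question": question, "answer": answer.strip()}
+
+print(split_pers_qa("Question: Where is she?, Answer: is in a cafe."))
+# -> {'question': 'Where is she?', 'answer': 'is in a cafe.'}
+```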
+ +----- ## 🏗️ `__init__` Function @@ -28,11 +38,11 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :------------------------------------------------------------ | -| `llm_serving` | `LLMServingABC` | - | Model serving object used to call VLM for generating QA pairs | +| Parameter | Type | Default | Description | +| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | +| `llm_serving` | `LLMServingABC` | - | **Model Serving Object** used to call the VLM for QA generation | ---- +----- ## ⚡ `run` Function @@ -40,71 +50,68 @@ def __init__( def run( self, storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "pers_qa" + input_modal_key: str = "image", + output_key: str = "output" ): ... ``` -The `run` function executes the main QA generation workflow: -read image paths → construct questions and prompts → call the model → return structured QA results. +The `run` function executes the main QA generation logic: read image paths → construct questions and prompts → call the model → return structured QA results. ## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :---------- | :----------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `multi_modal_key` | `str` | `"image"` | Multimodal input field name | -| `output_key` | `str` | `"pers_qa"` | Output field name for QA pairs | +| Parameter | Type | Default | Description | +| :---------------- | :---------------- | :---------- | :------------------------------------------------------------------- | +| `storage` | `DataFlowStorage` | - | Dataflow storage object | +| `input_modal_key` | `str` | `"image"` | **Multimodal Input Field Name** (image path) | +| `output_key` | `str` | `"output"` | **Model Output Field Name** (personalized QA text, defaults to `output`) | ---- +----- ## 🧠 Example Usage ```python from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import PersQAGenerate +from dataflow.operators.core_vision import PersQAGenerator # Step 1: Launch local model service serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, vllm_max_tokens=512 ) -# Step 2: Prepare storage +# Step 2: Set up storage storage = FileStorage( - first_entry_file_name="data/example.jsonl", + first_entry_file_name="dataflow/example/Image2TextPipeline/test_image2caption.jsonl", cache_path="./cache_local", file_name_prefix="pers_qa", cache_type="jsonl", - media_key="image", - media_type="image" ) storage.step() # Step 3: Initialize and run the operator -generator = PersQAGenerate(serving) +generator = PersQAGenerator(serving) generator.run( storage=storage, - multi_modal_key="image", + input_modal_key="image", output_key="pers_qa" ) ``` ---- +----- ## 🧾 Default Output Format -| Field | Type | Description | -| :-------- | :---------- | :----------------------------------------------------------------------- | -| `image` | `List[str]` | Input image paths | -| `pers_qa` | `str` | Generated personalized QA text in the format `Question: ... 
Answer: ...` | +| Field | Type | Description | +| :-------- | :---------- | :------------------------------------------------------------------- | +| `image` | `List[str]` | Input image paths | +| `pers_qa` | `str` | Generated personalized QA pair text, format: `Question: ..., Answer: ...` | ---- +----- ### 📥 Example Input @@ -116,6 +123,8 @@ generator.run( ### 📤 Example Output ```jsonl -{"image": ["./test/example1.jpg"], "pers_qa": "Question: What is she doing? Answer: is smiling at the camera."} -{"image": ["./test/example2.jpg"], "pers_qa": "Question: Where is she? Answer: is in a cafe."} -``` \ No newline at end of file +{"image": ["./test/example1.jpg"], "pers_qa": "Question: is doing what?, Answer: is smiling at the camera."} +{"image": ["./test/example2.jpg"], "pers_qa": "Question: Where is ?, Answer: is in a cafe."} +``` + +> **Tips:** Using a stronger Multimodal Large Language Model (MLLM) can ensure more accurate format generation. \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md b/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md index a76dde2a..db71f0f9 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md @@ -1,16 +1,23 @@ --- -title: Image Question-Answer Generation +title: ImageQAGenerator createTime: 2025/10/15 16:00:00 -icon: material-symbols-light:quiz +# icon: material-symbols-light:quiz permalink: /en/mm_operators/generate/image_qa/ --- ## 📘 Overview -`ImageQAGenerate` is an operator for **automatically generating question-answer pairs (Visual QA) based on image content**. -It can intelligently ask relevant questions about the image scene and generate reference answers. This is suitable for multimodal QA dataset construction, retrieval augmentation, and image-text matching enhancement. +`ImageQAGenerator` is an operator designed to **automatically generate Question-Answer (QA) pairs based on image content (Visual QA)**. +It intelligently proposes relevant questions based on the image scene and generates corresponding reference answers. ---- +**Features:** + + * Supports batch processing of multiple images. + * Automatically generates relevant QA pairs using Vision-Language Models (VLMs). + * Applicable for Visual QA dataset construction and model training. + * Automatically handles image input and QA prompt construction. + +----- ## 🏗️ `__init__` Function @@ -24,11 +31,11 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :-------------------------------------------------------------------------- | -| `llm_serving` | `LLMServingABC` | - | Model serving object used to call a vision-language model for QA generation | +| Parameter | Type | Default | Description | +| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | +| `llm_serving` | `LLMServingABC` | - | **Model Serving Object** used to call the VLM for QA generation | ---- +----- ## ⚡ `run` Function @@ -36,34 +43,34 @@ def __init__( def run( self, storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "qa_pairs" + input_modal_key: str = "image", + output_key: str = "output" ): ... ``` -The `run` function executes the main QA generation workflow: generates multiple QA pairs for the input images and writes them to the output file. 
+The `run` function executes the main operator logic: read image paths → **validate DataFrame** → construct prompts → call the model → generate Visual QA pairs and write them to the output file. ## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :----------- | :----------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `multi_modal_key` | `str` | `"image"` | Multimodal input field name | -| `output_key` | `str` | `"qa_pairs"` | Output field name for QA pairs | +| Parameter | Type | Default | Description | +| :---------------- | :---------------- | :---------- | :------------------------------------------------------------------------ | +| `storage` | `DataFlowStorage` | - | Dataflow storage object | +| `input_modal_key` | `str` | `"image"` | **Multimodal Input Field Name** (e.g., image paths) | +| `output_key` | `str` | `"output"` | **Output QA Field Name** (defaults to `output`, can be customized) | ---- +----- ## 🧠 Example Usage ```python from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import ImageQAGenerate +from dataflow.operators.core_vision import ImageQAGenerator # Step 1: Launch local model service serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, @@ -72,34 +79,34 @@ serving = LocalModelVLMServing_vllm( # Step 2: Prepare input data storage = FileStorage( - first_entry_file_name="data/example_qa.jsonl", + first_entry_file_name="dataflow/example/Image2TextPipeline/test_image2qa.jsonl", cache_path="./cache_local", file_name_prefix="imageqa", cache_type="jsonl", - media_key="image", - media_type="image" ) -storage.step() +storage.step() # Load data # Step 3: Initialize and run the operator -qa_generator = ImageQAGenerate(serving) +qa_generator = ImageQAGenerator(serving) qa_generator.run( storage=storage, - multi_modal_key="image", - output_key="qa_pairs" + input_modal_key="image", + output_key="qa_pairs" # Explicitly specifying output field as "qa_pairs" in the example ) ``` ---- +----- ## 🧾 Default Output Format -| Field | Type | Description | -| :--------- | :--------------------- | :----------------------------------------------------------------- | -| `image` | `List[str]` | Input image paths | -| `qa_pairs` | `List[Dict[str, str]]` | Generated QA pairs, each containing `question` and `answer` fields | +| Field | Type | Description | +| :--------- | :--------------------- | :------------------------------------------------------------------- | +| `image` | `List[str]` | Input image paths | +| `qa_pairs` | `List[Dict[str, str]]` | Generated QA pairs, containing `question` and `answer` fields | ---- +> **Note:** The raw output from the model (`output_key`) is typically a single string containing all QA pairs. A subsequent operator (like `JsonParser`) is usually required to structure this output into the `List[Dict[str, str]]` format shown here. 
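+
+For reference, a minimal standalone parse of that raw string might look as follows (a sketch assuming the model was prompted to answer with a JSON array; inside a pipeline, `JsonParser` plays this role):
+
+```python
+import json
+import re
+
+def parse_qa_pairs(raw_output: str) -> list:
+    # Pull the first JSON array out of the model's free-form text, if any.
+    match = re.search(r"\[.*\]", raw_output, re.DOTALL)
+    if match is None:
+        return []
+    try:
+        return json.loads(match.group(0))
+    except json.JSONDecodeError:
+        return []
+```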
+ +----- ### 📥 Example Input @@ -107,15 +114,15 @@ qa_generator.run( {"image": ["./test/street_scene.jpg"]} ``` -### 📤 Example Output +### 📤 Example Output (Structured) ```jsonl { "image": ["./test/street_scene.jpg"], "qa_pairs": [ - {"question": "How many cars are in the image?", "answer": "Two"}, - {"question": "What kind of scene is captured in this photo?", "answer": "City street"}, - {"question": "What is the main type of transportation in the image?", "answer": "Cars"} + {"question": "How many cars are in the image?", "answer": "Two cars"}, + {"question": "What is the scene depicted in this photo?", "answer": "A city street"}, + {"question": "What is the main mode of transportation shown?", "answer": "A car"} ] } ``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md b/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md index 986c2cd7..5c9c4ea2 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md @@ -36,8 +36,8 @@ def __init__( def run( self, storage: DataFlowStorage, - image_key: str = "image", - bbox_key: str = "bbox", + input_image_key: str = "image", + input_bbox_key: str = "bbox", output_key: str = "mdvp_record" ): ``` @@ -47,8 +47,8 @@ def run( | Parameter | Type | Default | Description | | :--- | :--- | :--- | :--- | | storage | DataFlowStorage | No default (required) | Storage instance for file operations and cache path management | -| image_key | str | "image" | Field name for image path in input JSONL data | -| bbox_key | str | "bbox" | Field name for bounding boxes in input data. If missing, automatically extracts from image | +| input_image_key | str | "image" | Field name for image path in input JSONL data | +| input_bbox_key | str | "bbox" | Field name for bounding boxes in input data. If missing, automatically extracts from image | | output_key | str | "mdvp_record" | Field name for output region caption records in result data | @@ -88,8 +88,8 @@ operator = ImageRegionCaptionGenerate(llm_serving=model, config=cfg) operator.run( storage=storage.step(), - image_key="image", - bbox_key="bbox", + input_image_key="image", + input_bbox_key="bbox", output_key="mdvp_record" ) diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md b/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md index 01586b25..4423cad6 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md @@ -66,7 +66,7 @@ Returns the operator description in either Chinese or English. def run( self, storage: DataFlowStorage, - image_key: str = "image", + input_image_key: str = "image", output_key: str = "scalecap_record" ) ``` @@ -76,7 +76,7 @@ def run( | Parameter | Type | Default | Description | | ------------ | ----------------- | ------------------- | --------------------------------------------------------- | | `storage` | `DataFlowStorage` | — | DataFlow storage interface for reading and writing data. | -| `image_key` | `str` | `"image"` | The column name for image paths in the input. | +| `input_image_key` | `str` | `"image"` | The column name for image paths in the input. | | `output_key` | `str` | `"scalecap_record"` | The column name where the generated captions are written. 
| --- @@ -107,7 +107,7 @@ operator = ImageScaleCaptionGenerate(vlm_serving=model, config=cfg) # Run the operator operator.run( storage=storage, - image_key="image", + input_image_key="image", output_key="scalecap_record" ) ``` diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_skvqa.md b/docs/en/notes/mm_operators/image_understanding/generate/image_skvqa.md deleted file mode 100644 index 5850e976..00000000 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_skvqa.md +++ /dev/null @@ -1,163 +0,0 @@ ---- - -title: Image-based Knowledge-Enhanced Question Answering Generation (SKVQA) -createTime: 2025/10/26 15:00:00 -icon: material-symbols-light:image -permalink: /en/mm_operators/generate/image_skvqa/ ---- - -## 📘 Overview - -`ImageSKVQAGenerate` is an operator designed to generate **Synthetic Knowledge Visual Question Answering (SKVQA)** data. -Unlike standard Visual Question Answering (VQA), SKVQA integrates external **contextual knowledge** into the question–answer generation process, -enabling the model to reason based not only on the image itself but also on relevant textual descriptions or background information. - -This capability is highly useful for **visual knowledge understanding, product manual QA generation, and multimodal knowledge-enhanced training** tasks. - ---- - -## 🏗️ `__init__` Function - -```python -def __init__( - self, - llm_serving: LLMServingABC -): - ... -``` - -## 🧾 `__init__` Parameters - -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :---------------------------------------------------------------------------------------- | -| `llm_serving` | `LLMServingABC` | - | The model serving object used to call a Vision-Language Model (VLM) for SKVQA generation. | - ---- - -## ⚡ `run` Function - -```python -def run( - self, - storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "skvqa" -): - ... -``` - -Executes the main operator logic to generate structured SKVQA outputs — including contextual text (`context`) and question–answer pairs (`QAs`) — for each input image. - ---- - -## 🧾 `run` Parameters - -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :-------- | :----------------------------------------------------------- | -| `storage` | `DataFlowStorage` | - | The DataFlow storage object. | -| `multi_modal_key` | `str` | `"image"` | The multimodal input field name (usually the image path). | -| `output_key` | `str` | `"skvqa"` | The output field name used to store the parsed SKVQA result. | - ---- - -## 🧠 Operator Functionality - -* Automatically generates a structured **SKVQA output** for each image, containing: - - * `context`: Contextual background information or knowledge relevant to the image. - * `qas`: A list of question–answer pairs (`question`, `answer`). - -* Parses model outputs formatted in Markdown, such as: - - ``` - ### Wikipedia Article - (context text) - - ### Question Answer Pairs - 1. **Question** - - Answer - 2. **Question** - - Answer - ``` - -* Supports **fault-tolerant parsing**, meaning even imperfectly formatted text can be interpreted as best as possible. - -* Applicable for **visual knowledge enhancement, multimodal training, and QA generation** tasks. 
- ---- - -## 🧩 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision.generate.sk_vqa_generator import ImageSKVQAGenerate - -# Step 1: Launch a local vision-language model -serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, - vllm_top_p=0.9, - vllm_max_tokens=512 -) - -# Step 2: Prepare input data -storage = FileStorage( - first_entry_file_name="data/example_skvqa.jsonl", - cache_path="./cache_skvqa", - cache_type="jsonl" -) -storage.step() - -# Step 3: Initialize the operator and run it -skvqa_generator = ImageSKVQAGenerate(serving) -skvqa_generator.run( - storage=storage, - multi_modal_key="image", - output_key="skvqa" -) -``` - ---- - -## 🧾 Default Output Format - -| Field | Type | Description | -| :------ | :--------------- | :----------------------------------------------------------------------------------- | -| `image` | `List[str]` | List of input image paths. | -| `skvqa` | `Dict[str, Any]` | The structured SKVQA output generated by the model, including context and Q&A pairs. | - ---- - -### 📥 Example Input - -```jsonl -{"image": ["./data/product_manual.jpg"]} -``` - -### 📤 Example Output - -```jsonl -{ - "image": ["./data/product_manual.jpg"], - "skvqa": { - "context": "This is a section from a smartwatch user manual showing the health monitoring interface.", - "qas": [ - {"question": "What device is shown in the image?", "answer": "A smartwatch"}, - {"question": "What are its main features?", "answer": "It supports heart rate monitoring, step tracking, and sleep analysis."}, - {"question": "What is the main topic of this text?", "answer": "An introduction to smartwatch functions"} - ] - } -} -``` - ---- - -## 💡 Key Features - -* ✅ Supports batch image inputs -* ✅ Automatically generates structured context + Q&A results -* ✅ Built-in format cleaning and fault tolerance -* ✅ Compatible with any vision–language model (e.g., Qwen-VL, InternVL, MiniCPM-V) -* ✅ Ideal for multimodal knowledge enhancement, retrieval QA, and data generation tasks diff --git a/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md b/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md index 5a4de97b..7a237e23 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md @@ -1,23 +1,23 @@ --- -title: Multimodal Math Question Generation +title: MultimodalMathGenerator createTime: 2025/10/15 19:00:00 -icon: material-symbols-light:functions +# icon: material-symbols-light:functions permalink: /en/mm_operators/generate/multimodal_math/ --- ## 📘 Overview -`MultimodalMathGenerate` is an operator for **automatically generating math function plots along with math question-answer pairs**. -It supports various function types such as linear, quadratic, sine, and exponential functions. Users can generate both simple and complex math problems, with automatically plotted corresponding function images. This is suitable for educational scenarios, visual QA model training, and mathematical reasoning evaluation. +`MultimodalMathGenerator` is a data generation operator for **automatically creating function plots (images) and corresponding math Question-Answer (QA) pairs**. 
+It supports various function types (linear, quadratic, sine, exponential, etc.) and generates simple calculation problems or advanced conceptual problems based on the `mode` field (`simple` or `complex`) in the input data. This operator is suitable for educational applications, visual QA model training, and math reasoning evaluation. ---- +----- ## 🏗️ `__init__` Function ```python def __init__( self, - image_dir: str = "/data0/mt/Dataflow-MM-Preview/cache", + image_dir: str = "~/cache", seed: int | None = None ): ... @@ -25,12 +25,12 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :---------- | :------------ | :-------------------------------------- | :----------------------------------------- | -| `image_dir` | `str` | `"/data0/mt/Dataflow-MM-Preview/cache"` | Directory to save generated function plots | -| `seed` | `int \| None` | `None` | Random seed for reproducibility | +| Parameter | Type | Default | Description | +| :---------- | :------------ | :------ | :-------------------------------------------------------------- | +| `image_dir` | `str` | `"~/cache"` | Directory used to save the generated function plots | +| `seed` | `int \| None` | `None` | Random seed to ensure reproducibility of generated results | ---- +----- ## ⚡ `run` Function @@ -38,108 +38,117 @@ def __init__( def run( self, storage: DataFlowStorage, - n: int = 200, - mode: str = "complex", - output_key: str = "multimodal_math" + input_key: str = "mode", ): ... ``` -Executes the main workflow, automatically generating a specified number of function plots along with corresponding math QA pairs. +The `run` function executes the main operator logic: it reads the data from `storage`, generates the corresponding function image and math QA pair based on the value in the `input_key` field for each row, and then horizontally concatenates the newly generated columns back to the original data. ---- +----- ## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :----------- | :---------------- | :------------------ | :---------------------------------------------------------------------------------------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `n` | `int` | `200` | Number of samples to generate | -| `mode` | `str` | `"complex"` | Generation mode: `"simple"` for straightforward numeric problems, `"complex"` for advanced concept problems | -| `output_key` | `str` | `"multimodal_math"` | Output field name prefix for generated data | +| Parameter | Type | Default | Description | +| :---------- | :---------------- | :------ | :----------------------------------------------------------------------- | +| `storage` | `DataFlowStorage` | - | Dataflow storage object (contains the rows to be processed) | +| `input_key` | `str` | `"mode"` | **Field name for the mode column**. Its value determines whether to generate a `"simple"` or `"complex"` problem. | ---- +----- + +## 🧠 Mode Description and Example Usage -## 🧠 Example Usage +### 📐 Mode Description + +| Mode | `mode` Column Value | Characteristics | Problem Type | +| :--- | :--- | :--- | :--- | +| **Simple** | `"simple"` | Basic function recognition and numerical substitution. | Given the function expression $f(x)$, find the function value $f(a)$ at $x=a$. | +| **Complex** | Other values (e.g., `"complex"`) | Emphasizes mathematical analysis skills (derivatives, extrema, monotonicity). 
| Randomly generates questions on derivative sign, extreme points within an interval, or monotonicity judgment. | + +### 🧩 Example Usage (Requires an input file pre-populated with a `mode` column) ```python from dataflow.utils.storage import FileStorage -from dataflow.operators.core_math import MultimodalMathGenerate +from dataflow.operators.core_math import MultimodalMathGenerator +import pandas as pd + +# Step 1: Prepare an input file containing the 'mode' column (e.g., data/math_tasks.jsonl) +# Assuming data/math_tasks.jsonl contains: +# {"id": 1, "mode": "simple"} +# {"id": 2, "mode": "complex"} +# {"id": 3, "mode": "complex"} -# Step 1: Prepare storage storage = FileStorage( - first_entry_file_name="data/math_samples.jsonl", + first_entry_file_name="data/math_tasks.jsonl", cache_path="./cache_local", - file_name_prefix="math", + file_name_prefix="math_out", cache_type="jsonl" ) +storage.step() # Load data -# Step 2: Initialize operator -math_generator = MultimodalMathGenerate( +# Step 2: Initialize the operator +math_generator = MultimodalMathGenerator( image_dir="./math_plots", seed=42 ) -# Step 3: Generate complex math problems (derivatives, extrema, monotonicity) +# Step 3: Run the operator, generating problems based on the 'mode' column of each row math_generator.run( storage=storage, - n=10, - mode="complex", - output_key="multimodal_math" -) - -# Step 4: Generate simple numeric problems -math_generator.run( - storage=storage, - n=10, - mode="simple", - output_key="multimodal_math_simple" + input_key="mode" # Specify 'mode' column to control generation ) ``` ---- +----- ## 🧾 Default Output Format -| Field | Type | Description | -| :----------- | :---- | :------------------------------------ | -| `image_path` | `str` | Path to the generated function plot | -| `question` | `str` | Automatically generated math question | -| `answer` | `str` | Answer to the question | -| `solution` | `str` | Detailed step-by-step solution | +The operator will **horizontally concatenate** the following four fields onto the original input DataFrame: ---- +| Field | Type | Description | +| :----------- | :--- | :-------------------------------------------- | +| `image_path` | `str` | Local path where the function plot image is saved | +| `question` | `str` | Automatically generated mathematical question | +| `answer` | `str` | Answer | +| `solution` | `str` | Detailed solution steps and explanation | + +----- ### 📥 Example Input +> **Note:** The operator relies on the number of rows in the input `storage` and the value of the `input_key` column (defaults to `mode`) to generate data. + ```jsonl -{} +{"id": 1, "mode": "simple"} +{"id": 2, "mode": "complex"} ``` -> This operator does not depend on external input data and generates samples directly. - ---- +----- -### 📤 Example Output (Simple Mode) +### 📤 Example Output (Simple Mode Row) ```jsonl { + "id": 1, + "mode": "simple", "image_path": "./math_plots/plot_0.png", - "question": "The function plot represents f(x) = x². What is the value of the function at x=3.5?", + "question": "The function plot represents f(x) = x². What is the function value at x=3.5?", "answer": "12.25", - "solution": "According to the function expression f(x) = x², substituting x=3.5 gives y=12.25." + "solution": "According to the function expression f(x) = x², substitute x=3.5 to get y=12.25." 
} ``` -----
-### 📤 Example Output (Complex Mode)
+### 📤 Example Output (Complex Mode Row)

```jsonl
{
-  "image_path": "./math_plots/plot_7.png",
-  "question": "The function plot represents f(x) = sin(x). Determine whether the rate of change of the function at x=2.5 is positive or negative.",
-  "answer": "Negative",
-  "solution": "By observing the slope near x=2.5 on the plot, the rate of change is negative."
+  "id": 2,
+  "mode": "complex",
+  "image_path": "./math_plots/plot_1.png",
+  "question": "The function plot represents f(x) = sin(x). Is the rate of change (derivative) at x=2.5 positive or negative?",
+  "answer": "negative",
+  "solution": "By observing the slope of the plot near x=2.5, the rate of change is negative."
}
```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md b/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
new file mode 100644
index 00000000..2f45e04c
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
@@ -0,0 +1,140 @@
+---
+title: MultiRole Video QA Generation
+createTime: 2025/12/2 20:00:00
+icon: material-symbols-light:video
+permalink: /en/mm_operators/generate/multirole_videoqa/
+---
+
+## 📘 Overview
+
+`MultiroleVideoQAGenerate` is a data generation operator for **automatically creating Question-Answer (QA) pairs from preprocessed video data**.
+Given preprocessed video data as input, it constructs several QA pairs related to the video. This is suitable for advertisement video annotation, dataset construction, and video understanding tasks.
+
+**Features:**
+* Supports batch processing of multiple preprocessed videos.
+* Generates high-quality QA pairs using VLMs such as Qwen2.5-VL.
+* Automatically handles video input and prompt construction for data generation.
+
+---
+
+## 🏗️ `__init__` Function
+
+```python
+def __init__(
+    self,
+    llm_serving: VLMServingABC
+):
+    ...
+```
+## 🧾 `__init__` Parameters
+
+| Parameter | Type | Default | Description |
+| :------------ | :-------------- | :------ | :-------------------------------------------------------------- |
+| `llm_serving` | `VLMServingABC` | - | **Model Serving Object** used to call the VLM for QA-pair generation |
+
+-----
+
+## ⚡ `run` Function
+
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_meta_key: str = "Meta",
+    input_clips_key: str = "Clips",
+    output_key: str = "QA"
+):
+    ...
+```
+
+The `run` function executes the main QA-pair generation workflow:
+read data paths → **validate DataFrame** → construct prompts → call the model → generate QA pairs → write results to output.
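+
+Before running, each input row must already carry the preprocessed structure that the prompts are built from. A minimal sketch of writing one such row (all field values are placeholders):
+
+```python
+import json
+
+record = {
+    "Meta": "30-second advertisement for a sports drink",
+    "Clips": [
+        {
+            "Audio_Text": "transcribed speech for clip 1",
+            "Frames_Images": ["frames/clip1_f0.jpg", "frames/clip1_f1.jpg"],
+            "Description": "an athlete running at sunrise",
+        },
+    ],
+}
+with open("multirole_input.jsonl", "w") as f:
+    f.write(json.dumps(record) + "\n")
+```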
## 🧾 `run` Parameters
+
+| Parameter | Type | Default | Description |
+| :---------------- | :---------------- | :---------- | :---------------------------------------------------- |
+| `storage` | `DataFlowStorage` | - | Dataflow storage object |
+| `input_meta_key` | `str` | `"Meta"` | **Input Field Name** for the video's meta information |
+| `input_clips_key` | `str` | `"Clips"` | **Input Field Name** for the list of video clips |
+| `output_key` | `str` | `"QA"` | **Model Output Field Name** (the generated QA pairs) |
+
+-----
+
+## 🧠 Example Usage
+
+```python
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import MultiroleVideoQAInitialGenerator, MultiroleVideoQAMultiAgentGenerator, MultiroleVideoQAFinalGenerator
+
+# Step 1: Launch local model service
+llm_serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
+    vllm_tensor_parallel_size=1,
+    vllm_temperature=0.7,
+    vllm_top_p=0.9,
+    vllm_max_tokens=6000,
+)
+
+# Step 2: Prepare input data
+storage = FileStorage(
+    first_entry_file_name="data/multirole_input.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="multirole_videoqa",
+    cache_type="jsonl",
+)
+
+# Step 3: Initialize and run the three-stage operator chain
+initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving=llm_serving)
+multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(llm_serving=llm_serving, max_iterations=3)
+final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving=llm_serving)
+
+init_df = initial_QA_generation.run(
+    storage=storage.step(),
+    input_meta_key="Meta",
+    input_clips_key="Clips",
+    output_key="QA"
+)
+middle_df = multiAgent_QA_generation.run(
+    df=init_df,
+    input_meta_key="Meta",
+    input_clips_key="Clips",
+    output_key="QA"
+)
+final_QA_generation.run(
+    storage=storage,
+    df=middle_df,
+    input_meta_key="Meta",
+    input_clips_key="Clips",
+    output_key="QA"
+)
+```
+
+-----
+
+## 🧾 Default Output Format
+
+| Field | Type | Description |
+| :-------- | :----------- | :------------------------------- |
+| `Meta` | `str` | Meta information for the video |
+| `Clips` | `List[Dict]` | Interleaved-modality video clips |
+| `QA` | `List[Dict]` | Generated QA pairs |
+
+-----
+
+### 📥 Example Input
+
+```jsonl
+{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}]}
+```
+
+### 📤 Example Output
+
+```jsonl
+{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}], "QA":[{"Label":"label1", "Question": "Question1", "Answer": "Answer1"},{"Label":"label2", "Question": "Question2", "Answer": "Answer2"}]}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/prompt_templated_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/prompt_templated_vqa_generator.md
new file mode 100644
index 00000000..425b5273
--- /dev/null
+++ 
b/docs/en/notes/mm_operators/image_understanding/generate/prompt_templated_vqa_generator.md @@ -0,0 +1,132 @@ +--- +title: PromptTemplatedVQAGenerator +createTime: 2026/01/11 21:25:34 +permalink: /en/mm_operators/generate/prompt_templated_vqa_generator/ +--- +## 📘 Overview + +`PromptTemplatedVQAGenerator` is a **Template-Based Multimodal VQA Operator**. It allows users to dynamically inject multiple fields from a DataFrame into a predefined Prompt Template to generate customized text instructions, which are then combined with image or video inputs for batch inference. + +Unlike standard VQA operators, this operator supports complex prompt construction logic (e.g., dynamically filling in categories, context descriptions, etc.), making it highly suitable for scenarios requiring **structured prompt engineering**, such as attribute-guided image captioning or controlled dialogue simulation. + +## 🏗️ `__init__` Function + +```python +def __init__( + self, + serving: LLMServingABC, + prompt_template: NamedPlaceholderPromptTemplate, + system_prompt: str = "You are a helpful assistant.", +): + +``` + +### 🧾 Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (must support multimodal inputs). | +| `prompt_template` | `NamedPlaceholderPromptTemplate` | N/A | A template object implementing `build_prompt` to convert dictionary data into a string prompt. | +| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. | + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_video_key: str = "video", + output_answer_key: str = "answer", + **input_keys, +): + ... + +``` + +Executes the main logic: + +1. **Read Data** +Reads the DataFrame from `storage`. +2. **Dynamic Prompt Construction** +Iterates through each row of the DataFrame: +* Extracts data from columns specified in `input_keys` (e.g., `descriptions` column, `type` column). +* Calls `prompt_template.build_prompt()` to fill these values into the template, generating a unique `prompt_text` for that sample. + + +3. **Multimodal Input Assembly** +* Reads media paths from `input_image_key` or `input_video_key`. +* Packages the generated text prompt with the corresponding image/video data into the format required by the model. + + +4. **Inference & Output** +* Calls the model service for batch generation. +* Writes the results to the column specified by `output_answer_key` and saves the updated DataFrame. + + + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | DataFlow storage object. | +| `input_image_key` | `str` | `"image"` | Column name for image paths (mutually exclusive with video_key). | +| `input_video_key` | `str` | `"video"` | Column name for video paths (mutually exclusive with image_key). | +| `output_answer_key` | `str` | `"answer"` | Column name for the generated output. | +| `**input_keys` | `kwargs` | N/A | **Key Parameter**. Defines the mapping between template placeholders and DataFrame columns.
Format: `template_var="dataframe_column"`. |
+
+## 🧩 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServing
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.operators.generate import PromptTemplatedVQAGenerator
+
+# 1) Define a template with placeholders
+# We want the model to check for a specific object type, referencing existing descriptions
+TEMPLATE = (
+    "Context: {descriptions}\n\n"
+    "Task: Describe the appearance of {type} in the image based on the context above."
+)
+prompt_template = NamedPlaceholderPromptTemplate(template=TEMPLATE)
+
+# 2) Initialize Operator
+op = PromptTemplatedVQAGenerator(
+    serving=LLMServing(model_path="Qwen/Qwen2.5-VL-3B-Instruct"),
+    prompt_template=prompt_template
+)
+
+# 3) Prepare Data (assuming the jsonl has image, meta_desc, obj_type columns)
+storage = FileStorage(file_name_prefix="vqa_task")
+storage.step()
+
+# 4) Run Operator: Map 'meta_desc' to {descriptions}, 'obj_type' to {type}
+op.run(
+    storage=storage,
+    input_image_key="image",
+    output_answer_key="generated_caption",
+    # Dynamic Mapping:
+    descriptions="meta_desc",
+    type="obj_type"
+)
+```
+
+### 🧾 Input/Output Example
+
+**Input DataFrame Row:**
+| image | meta_desc | obj_type |
+| :--- | :--- | :--- |
+| `"/path/to/car.jpg"` | `"A photo taken on a sunny day."` | `"vintage car"` |
+
+**Constructed Prompt:**
+
+> "Context: A photo taken on a sunny day.\n\nTask: Describe the appearance of **vintage car** in the image based on the context above."
+
+**Output DataFrame Row:**
+| image | meta_desc | obj_type | generated_caption |
+| :--- | :--- | :--- | :--- |
+| `"/path/to/car.jpg"` | `...` | `...` | `"The vintage car is red with..."` |
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/prompted_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/prompted_vqa_generator.md
new file mode 100644
index 00000000..4cb870be
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/generate/prompted_vqa_generator.md
@@ -0,0 +1,133 @@
+---
+title: PromptedVQAGenerator
+createTime: 2026/01/11 21:37:37
+permalink: /en/mm_operators/generate/prompted_vqa_generator/
+---
+## 📘 Overview
+
+`PromptedVQAGenerator` is a **General-Purpose Multimodal VQA Operator**.
+
+It reads **Prompts** and **Optional Media Paths (Image/Video)** directly from a DataFrame to generate answers. This operator is highly flexible:
+
+* **Multimodal Support**: Performs VQA with text and image/video inputs.
+* **Pure Text Support**: Automatically switches to pure text chat mode if no image or video columns are provided or if paths are empty.
+* **Flexible Input Formats**: Can read raw text prompts or parse conversation-style lists.
+* **Compatibility**: Automatically handles Chat Template encapsulation for local models (Local VLLM) and direct calls for API models.
+
+## 🏗️ `__init__` Function
+
+```python
+def __init__(
+    self,
+    serving: LLMServingABC,
+    system_prompt: str = "You are a helpful assistant."
+):
+
+```
+
+### 🧾 Parameters
+
+| Parameter | Type | Default | Description |
+| --- | --- | --- | --- |
+| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (supports Local or API models). |
+| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. 
| + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_prompt_key: str = None, + input_conversation_key: str = None, + input_image_key: str = None, + input_video_key: str = None, + output_answer_key: str = "answer", +): + ... + +``` + +Executes the main logic: + +1. **Data Loading & Prompt Extraction** +* Reads the DataFrame from `storage`. +* **Prompt Source (Mutually Exclusive)**: +* `input_prompt_key`: Reads the text string from this column as the User Prompt. +* `input_conversation_key`: Reads the conversation list (List[Dict]) and extracts the content of the first User Message. + + + + +2. **Media Processing** +* Attempts to read `input_image_key` and `input_video_key`. +* **Pure Text Mode Detection**: If media columns are not provided or media paths are empty/None for a row, the operator constructs a **Pure Text** request without `` or `