20 changes: 20 additions & 0 deletions docs/.vuepress/notes/en/mm_operators.ts
@@ -22,6 +22,26 @@ export const MMOperators: ThemeNote = defineNoteConfig({
prefix: 'image_understanding',
items: [
'install_image_understanding',
'generate/image_caption',
'generate/image_qa',
'generate/image_pers_qa',
'generate/multimodal_math',
'generate/vision_mct_reasoning',
'generate/image_region_caption',
'generate/image_scale_caption',
'generate/image_gcot',
'generate/image_skvqa',
'generate/image_caprl',
'eval/clip_image_text_evaluator',
'eval/longclip_image_text_evaluator',
'eval/vqa_score_image_text_evaluator',
'filter/cat_filter',
'filter/clip_filter',
'filter/complexity_filter',
'filter/deduplication_filter',
'filter/image_aesthetic_filter',
'filter/sensitive_filter',
'filter/text_image_diversity_filter'
],
},
{
20 changes: 20 additions & 0 deletions docs/.vuepress/notes/zh/mm_operators.ts
@@ -23,6 +23,26 @@ export const MMOperators: ThemeNote = defineNoteConfig({
prefix: 'image_understanding',
items: [
'install_image_understanding',
'generate/image_caption',
'generate/image_qa',
'generate/image_pers_qa',
'generate/multimodal_math',
'generate/vision_mct_reasoning',
'generate/image_region_caption',
'generate/image_scale_caption',
'generate/image_gcot',
'generate/image_skvqa',
'generate/image_caprl',
'eval/clip_image_text_evaluator',
'eval/longclip_image_text_evaluator',
'eval/vqa_score_image_text_evaluator',
'filter/cat_filter',
'filter/clip_filter',
'filter/complexity_filter',
'filter/deduplication_filter',
'filter/image_aesthetic_filter',
'filter/sensitive_filter',
'filter/text_image_diversity_filter'
],
},
{
@@ -0,0 +1,107 @@
---
title: clip_image_text_evaluator
createTime: 2025/10/15 19:56:33
icon: material-symbols-light:image
permalink: /en/mm_operators/eval/clip_image_text_evaluator/
---
## 📘 Overview
`CLIPEvaluator` computes an **image-text alignment score** using **CLIP**, producing scores in the range `[0,1]`.
Internally, it encodes both the image and the text with the CLIP model, applies L2 normalization,
then computes their cosine similarity and linearly maps it to `[0,1]` via `(cos + 1) / 2`.
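
The normalize-then-map step can be sketched in plain Python (a minimal illustration with a hypothetical `clip_style_score` helper; in the operator the embedding vectors come from the CLIP image and text encoders):

```python
import math

def clip_style_score(img_emb, txt_emb):
    """Map the cosine similarity of two embeddings to [0, 1] via (cos + 1) / 2."""
    def l2_normalize(v):
        norm = math.sqrt(sum(x * x for x in v))
        return [x / norm for x in v]

    a = l2_normalize(img_emb)
    b = l2_normalize(txt_emb)
    cos = sum(x * y for x, y in zip(a, b))
    return (cos + 1) / 2

# Identical directions -> score 1.0; opposite directions -> score 0.0
print(clip_style_score([1.0, 0.0], [2.0, 0.0]))   # 1.0
print(clip_style_score([1.0, 0.0], [-1.0, 0.0]))  # 0.0
```

The `(cos + 1) / 2` mapping simply rescales cosine similarity from `[-1, 1]` to `[0, 1]`, so the score is monotonic in the raw similarity.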


## ```__init__```
```python
def __init__(
self,
model_name: str = "/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32",
device: str = None
):
...
```


## `__init__` Parameters
| Parameter | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `model_name` | `str` | `"/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32"` | Local path or Hugging Face Model ID for the CLIP model. Loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). |
| `device` | `str \| None` | `None` | The inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. |



## `run`
```python
def run(
self,
storage: DataFlowStorage,
image_key: str = "image_path",
text_key: str = "text",
output_key: str = "clip_score"
):
...
```

Parameters
| Parameter | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. |
| `image_key` | `str` | `"image_path"` | The column name containing the image path. |
| `text_key` | `str` | `"text"` | The column name containing the text input. |
| `output_key` | `str` | `"clip_score"` | The column name for storing the output alignment score (range `[0,1]`). |



## 🧠 Example Usage

```python
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_vision import CLIPEvaluator

# 1) Prepare FileStorage (must contain image_path and text columns)
storage = FileStorage(
first_entry_file_name="data/clip_input.jsonl",
cache_path="./cache_local",
file_name_prefix="clip_eval",
cache_type="jsonl"
)

# 2) Initialize the operator (can also use HF model ID, e.g. "openai/clip-vit-base-patch32")
evaluator = CLIPEvaluator(
model_name="/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32",
device=None # automatically chooses cuda/cpu
)

# 3) Execute evaluation
cols = evaluator.run(
storage=storage.step(),
image_key="image_path",
text_key="text",
output_key="clip_score"
)
print(cols) # ["clip_score"]
```

### 🧾 Default Output Format
| Field | Type | Description |
| :--- | :--- | :--- |
| `image_path` / specified `image_key` | `string` | The input image path. |
| `text` / specified `text_key` | `string` | The input text. |
| `clip_score` / specified `output_key` | `float` | The image-text alignment score (range `[0,1]`). |



Example Input:
```jsonl
{
"image_path": "1.png",
"text": "The image shows a man and a woman in what appears to be a car."
}
```

Example Output:
```jsonl
{
"image_path": "1.png",
"text": "The image shows a man and a woman in what appears to be a car.",
"clip_score": 0.642
}
```
@@ -0,0 +1,109 @@
---
title: longclip_image_text_evaluator
createTime: 2025/10/15 19:56:29
icon: material-symbols-light:image
permalink: /en/mm_operators/eval/longclip_image_text_evaluator/
---
## 📘 Overview
`LongCLIPEvaluator` computes **image–long-text alignment scores** using **LongCLIP**, producing scores in the range `[0,1]`.
Compared to the standard CLIP model, LongCLIP supports longer text contexts (default `context_length=248`),
making it ideal for paragraph-level description evaluation and alignment tasks.
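
Because standard CLIP truncates text at 77 tokens while LongCLIP handles up to 248, a rough word-count heuristic can help decide which evaluator to apply (hypothetical `needs_long_context` helper, not part of DataFlow; whitespace tokens only approximate the model's BPE tokens):

```python
def needs_long_context(text, clip_token_limit=77):
    # Whitespace splitting underestimates BPE token counts,
    # so stay well below the limit before trusting standard CLIP.
    return len(text.split()) > clip_token_limit // 2

short = "A man and a woman in a car."
long_desc = " ".join(["A detailed paragraph describing the scene."] * 20)
print(needs_long_context(short))      # False
print(needs_long_context(long_desc))  # True
```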



## ```__init__```
```python
def __init__(
self,
ckpt_path: str = "/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt",
device: str = None,
):
...
```


## `__init__` Parameters
| Parameter | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `ckpt_path` | `str` | `"/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt"` | Local path to the LongCLIP checkpoint (`.pt` file) to load. |
| `device` | `str \| None` | `None` | The inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. |



## `run`
```python
def run(
self,
storage: DataFlowStorage,
image_key: str = "image_path",
text_key: str = "text",
output_key: str = "clip_score"
):
...
```

Parameters
| Parameter | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. |
| `image_key` | `str` | `"image_path"` | The column name containing the image path. |
| `text_key` | `str` | `"text"` | The column name containing the text input. |
| `output_key` | `str` | `"clip_score"` | The column name for storing the output alignment score (range `[0,1]`). |



## 🧠 Example Usage

```python
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_vision import LongCLIPEvaluator

# 1) Prepare FileStorage (must include image_path and text columns)
storage = FileStorage(
first_entry_file_name="data/longclip_input.jsonl",
cache_path="./cache_local",
file_name_prefix="longclip_eval",
cache_type="jsonl"
)

# 2) Initialize LongCLIP evaluator (replace ckpt_path with your checkpoint)
evaluator = LongCLIPEvaluator(
ckpt_path="/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt",
device=None # auto-selects cuda/cpu
)

# 3) Run evaluation — adds a new column 'longclip_score' ∈ [0,1]
cols = evaluator.run(
storage=storage.step(),
image_key="image_path",
text_key="text",
output_key="longclip_score"
)
print(cols) # ["longclip_score"]
```

### 🧾 Default Output Format
| Field | Type | Description |
| :--- | :--- | :--- |
| `image_path` / specified `image_key` | `string` | The input image path. |
| `text` / specified `text_key` | `string` | The input text. |
| `clip_score` / specified `output_key` | `float` | The image-text alignment score (range `[0,1]`). |



Example Input:
```jsonl
{
"image_path": "1.png",
"text": "The image shows a man and a woman in what appears to be a car."
}
```

Example Output:
```jsonl
{
"image_path": "1.png",
"text": "The image shows a man and a woman in what appears to be a car.",
"clip_score": 0.642
}
```
@@ -0,0 +1,113 @@
---
title: vqa_score_image_text_evaluator
createTime: 2025/10/15 19:56:31
icon: material-symbols-light:image
permalink: /en/mm_operators/eval/vqa_score_image_text_evaluator/
---
## 📘 Overview
`VQAScoreEvaluator` uses **BLIP VQA** to turn the question *“Does this image match the description?”* into a **Yes/No probability score** in the range `[0,1]`.
- Constructs the question: `Does this image match the description: {text}? Answer yes or no.`
- Runs two forward passes, one with label `"yes"` and one with `"no"`, to obtain losses `L_yes` and `L_no`.
- Normalizes the two losses into a Yes-probability.
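
Since each loss is a cross-entropy (`L = -log p`), one natural normalization (a sketch of the idea; the operator's exact formula may differ) is a softmax over the negative losses:

```python
import math

def yes_probability(loss_yes, loss_no):
    """Softmax over negative losses: a lower 'yes' loss yields a higher Yes-probability."""
    p_yes = math.exp(-loss_yes)
    p_no = math.exp(-loss_no)
    return p_yes / (p_yes + p_no)

# If the 'yes' label is easier to predict (lower loss), the score exceeds 0.5
print(round(yes_probability(0.5, 1.5), 3))  # 0.731
print(round(yes_probability(1.0, 1.0), 3))  # 0.5
```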

## ```__init__```
```python
def __init__(
self,
model_name: str = "/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base",
device: str = None,
local_only: bool = True,
):
...
```


## `__init__` Parameters
| Parameter | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `model_name` | `str` | `"/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base"` | Path or Hugging Face Model ID of the BLIP VQA model. Loaded via `BlipProcessor` and `BlipForQuestionAnswering`. |
| `device` | `str \| None` | `None` | Inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. |
| `local_only` | `bool` | `True` | If `True`, load model weights only from local files (`local_files_only=True`). |




## `run`
```python
def run(
self,
storage: DataFlowStorage,
image_key: str = "image_path",
text_key: str = "text",
output_key: str = "vqa_score"
):
...
```

Parameters
| Parameter | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing data. |
| `image_key` | `str` | `"image_path"` | Column name containing the input image path. |
| `text_key` | `str` | `"text"` | Column name containing the text description. |
| `output_key` | `str` | `"vqa_score"` | Column name where the computed Yes-probability score (range `[0,1]`) is stored. |



## 🧠 Example Usage

```python
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_vision import VQAScoreEvaluator

# 1) Prepare FileStorage (must include image_path and text columns)
storage = FileStorage(
first_entry_file_name="data/vqa_input.jsonl",
cache_path="./cache_local",
file_name_prefix="vqa_score",
cache_type="jsonl"
)

# 2) Initialize the evaluator (you can replace with your own model path or HF model ID)
evaluator = VQAScoreEvaluator(
model_name="/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base",
device=None, # auto-select cuda/cpu
local_only=True # load from local weights only
)

# 3) Run evaluation (adds a column 'vqa_score')
cols = evaluator.run(
storage=storage.step(),
image_key="image_path",
text_key="text",
output_key="vqa_score"
)
print(cols) # ["vqa_score"]
```

### 🧾 Default Output Format
| Field | Type | Description |
| :--- | :--- | :--- |
| `image_path` / specified `image_key` | `string` | The input image path. |
| `text` / specified `text_key` | `string` | The input text description. |
| `vqa_score` / specified `output_key` | `float` | The BLIP-predicted probability that the image matches the text (Yes probability, range `[0,1]`). |




Example Input:
```jsonl
{
"image_path": "1.png",
"text": "The image shows a man and a woman in what appears to be a car."
}
```

Example Output:
```jsonl
{
"image_path": "1.png",
"text": "The image shows a man and a woman in what appears to be a car.",
"vqa_score": 0.774
}
```