diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts
index 4986b80b..a156098f 100644
--- a/docs/.vuepress/notes/en/mm_guide.ts
+++ b/docs/.vuepress/notes/en/mm_guide.ts
@@ -22,6 +22,12 @@ export const MMGuide: ThemeNote = defineNoteConfig({
       prefix: 'image_understanding',
       items: [
         'install_image_understanding',
+        'context_vqa',
+        'image_gcot',
+        'vision_mct_reasoning_pipeline',
+        'image_region_caption_pipeline',
+        'image_scale_caption_pipeline',
+        'image_visual_only_mcq_pipeline',
       ],
     },
     {
@@ -35,7 +41,8 @@ export const MMGuide: ThemeNote = defineNoteConfig({
         'video_clip_and_filter',
         'video_qa',
         'video_cotqa',
-        'video_longvideo_cotqa_api'
+        'video_longvideo_cotqa_api',
+        'multirole_videoqa_pipeline'
       ],
     },
     {
diff --git a/docs/.vuepress/notes/en/mm_operators.ts b/docs/.vuepress/notes/en/mm_operators.ts
index af463fac..a7be6236 100644
--- a/docs/.vuepress/notes/en/mm_operators.ts
+++ b/docs/.vuepress/notes/en/mm_operators.ts
@@ -21,27 +21,71 @@ export const MMOperators: ThemeNote = defineNoteConfig({
       icon: 'carbon:idea',
       prefix: 'image_understanding',
       items: [
-        'install_image_understanding',
-        'generate/image_caption',
-        'generate/image_qa',
-        'generate/image_pers_qa',
-        'generate/multimodal_math',
-        'generate/vision_mct_reasoning',
-        'generate/image_region_caption',
-        'generate/image_scale_caption',
-        'generate/image_gcot',
-        'generate/image_skvqa',
-        'generate/image_caprl',
-        'eval/clip_image_text_evaluator',
-        'eval/longclip_image_text_evaluator',
-        'eval/vqa_score_image_text_evaluator',
-        'filter/cat_filter',
-        'filter/clip_filter',
-        'filter/complexity_filter',
-        'filter/deduplication_filter',
-        'filter/image_aesthetic_filter',
-        'filter/sensitive_filter',
-        'filter/text_image_diversity_filter'
+        {
+          text: 'install',
+          collapsed: false,
+          prefix: '',
+          items: ['install_image_understanding'],
+        },
+        {
+          text: "generate",
+          collapsed: false,
+          prefix: 'generate/',
+          items: [
+            'image_caption',
+            'image_qa',
+            'image_pers_qa',
+            'multimodal_math',
+            "prompt_templated_vqa_generator",
+            "fix_prompted_vqa_generator",
+            "prompted_vqa_generator",
+            "batch_vqa_generator",
+            "visual_reasoning_generator",
+            "vlm_bbox_generator",
+            "image_bbox_generator"
+            // 'vision_mct_reasoning',
+            // 'image_region_caption',
+            // 'image_scale_caption',
+            // 'image_gcot',
+            // 'image_caprl',
+            // 'multirole_videoqa',
+          ]
+        },
+        {
+          text: "eval",
+          collapsed: false,
+          prefix: 'eval/',
+          items: [
+            'image_clip_evaluator',
+            'image_longclip_evaluator',
+            'image_vqa_evaluator',
+          ]
+        },
+        {
+          text: "filter",
+          collapsed: false,
+          prefix: 'filter/',
+          items: [
+            'image_aesthetic_filter',
+            'image_cat_filter',
+            'image_clip_filter',
+            'image_complexity_filter',
+            'image_consistency_filter',
+            'image_deduplication_filter',
+            'image_diversity_filter',
+            'image_sensitive_filter',
+          ]
+        },
+        {
+          text: "refine",
+          collapsed: false,
+          prefix: 'refine/',
+          items: [
+            'visual_dependency_refiner',
+            'visual_grounding_refiner',
+            'wiki_qa_refiner',
+          ]
+        }
       ],
     },
     {
diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts
index 7c9f6e2e..47399ba1 100644
--- a/docs/.vuepress/notes/zh/mm_guide.ts
+++ b/docs/.vuepress/notes/zh/mm_guide.ts
@@ -23,6 +23,12 @@ export const MMGuide: ThemeNote = defineNoteConfig({
       prefix: 'image_understanding',
       items: [
         'install_image_understanding',
+        'context_vqa',
+        'image_gcot',
+        'vision_mct_reasoning_pipeline',
+        'image_region_caption_pipeline',
+        'image_scale_caption_pipeline',
+        'image_visual_only_mcq_pipeline',
       ],
     },
     {
@@ -36,7 +42,8 @@ export const MMGuide: ThemeNote = defineNoteConfig({
         'video_clip_and_filter',
         'video_qa',
         'video_cotqa',
-        'video_longvideo_cotqa_api'
+        'video_longvideo_cotqa_api',
+        'multirole_videoqa_pipeline'
       ],
     },
     {
diff --git a/docs/.vuepress/notes/zh/mm_operators.ts b/docs/.vuepress/notes/zh/mm_operators.ts
index 77c6fbf0..28097da1 100644
--- a/docs/.vuepress/notes/zh/mm_operators.ts
+++ b/docs/.vuepress/notes/zh/mm_operators.ts
@@ -22,27 +22,71 @@ export const MMOperators: ThemeNote = defineNoteConfig({
       icon: 'carbon:idea',
       prefix: 'image_understanding',
       items: [
-        'install_image_understanding',
-        'generate/image_caption',
-        'generate/image_qa',
-        'generate/image_pers_qa',
-        'generate/multimodal_math',
-        'generate/vision_mct_reasoning',
-        'generate/image_region_caption',
-        'generate/image_scale_caption',
-        'generate/image_gcot',
-        'generate/image_skvqa',
-        'generate/image_caprl',
-        'eval/clip_image_text_evaluator',
-        'eval/longclip_image_text_evaluator',
-        'eval/vqa_score_image_text_evaluator',
-        'filter/cat_filter',
-        'filter/clip_filter',
-        'filter/complexity_filter',
-        'filter/deduplication_filter',
-        'filter/image_aesthetic_filter',
-        'filter/sensitive_filter',
-        'filter/text_image_diversity_filter'
+        {
+          text: '安装',
+          collapsed: false,
+          prefix: '',
+          items: ['install_image_understanding'],
+        },
+        {
+          text: "generate",
+          collapsed: false,
+          prefix: 'generate/',
+          items: [
+            'image_caption',
+            'image_qa',
+            'image_pers_qa',
+            'multimodal_math',
+            'prompt_templated_vqa_generator',
+            'fix_prompted_vqa_generator',
+            "prompted_vqa_generator",
+            "batch_vqa_generator",
+            "visual_reasoning_generator",
+            "vlm_bbox_generator",
+            "image_bbox_generator"
+            // 'vision_mct_reasoning',
+            // 'image_region_caption',
+            // 'image_scale_caption',
+            // 'image_gcot',
+            // 'image_caprl',
+            // 'multirole_videoqa',
+          ]
+        },
+        {
+          text: "eval",
+          collapsed: false,
+          prefix: 'eval/',
+          items: [
+            'image_clip_evaluator',
+            'image_longclip_evaluator',
+            'image_vqa_evaluator',
+          ]
+        },
+        {
+          text: "filter",
+          collapsed: false,
+          prefix: 'filter/',
+          items: [
+            'image_aesthetic_filter',
+            'image_cat_filter',
+            'image_clip_filter',
+            'image_complexity_filter',
+            'image_consistency_filter',
+            'image_deduplication_filter',
+            'image_diversity_filter',
+            'image_sensitive_filter',
+          ]
+        },
+        {
+          text: "refine",
+          collapsed: false,
+          prefix: 'refine/',
+          items: [
+            'visual_dependency_refiner',
+            'visual_grounding_refiner',
+            'wiki_qa_refiner',
+          ]
+        }
       ],
     },
     {
diff --git a/docs/en/notes/mm_guide/image_understanding/context_vqa.md b/docs/en/notes/mm_guide/image_understanding/context_vqa.md
new file mode 100644
index 00000000..8d228fe7
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/context_vqa.md
@@ -0,0 +1,297 @@
+---
+title: ContextVQA Multimodal Question Answering Data Generation Pipeline
+icon: mdi:image-text
+createTime: 2025/06/16 14:30:00
+permalink: /en/mm_guide/contextvqa_pipeline/
+---
+## 1. Overview
+
+The **ContextVQA Generation Pipeline** is designed to automatically generate **Context-based Visual Question Answering (VQA)** data from images. This pipeline utilizes a Vision-Language Model (VLM) to generate a Wikipedia-style article and Q&A pairs related to an image, and subsequently parses them into structured data.
+
+We support the following application scenarios:
+
+* **Knowledge-based VQA Synthesis**: Constructing datasets requiring reasoning over external knowledge.
+* **Multimodal RAG Construction**: Generating high-quality data for Retrieval-Augmented Generation training.
+* **Visual Reasoning Tasks**: Creating data where questions point to the image, but answers are derived from the text context.
+
+The main process of the pipeline includes:
+
+1. **Data Loading**: Reading data files containing image paths.
+2. **Context & QA Generation**: Using a VLM to analyze the image and generate a Wiki-style article with Q&A pairs.
+3. **Refining & Structuring**: Parsing the raw generated text into a structured `{context, qas}` format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_context_vqa
+cd run_context_vqa
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code provided in the "Pipeline Example" section below as `context_vqa_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+This pipeline is configured entirely through CLI arguments, so the model path and input file can be specified directly on the command line (see Step 4). First, make sure the dependencies are installed:
+
+```bash
+# Ensure dependencies are installed
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python context_vqa_pipeline.py \
+    --model_path "Qwen/Qwen2.5-VL-3B-Instruct" \
+    --images_file "path/to/your/images.jsonl" \
+    --cache_path "./cache_local"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data for this process includes the following fields:
+
+* **image**: Path to the image file (local path or URL).
+* **id** (optional): Unique identifier for the data entry.
+
+Data is managed via `FileStorage`, which supports resuming from checkpoints.
+
+**Input Data Example**:
+
+```json
+[
+  {
+    "id": 1,
+    "image": "./images/landmark.jpg"
+  },
+  {
+    "id": 2,
+    "image": "./images/animal.jpg"
+  }
+]
+
+```
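+
+If you do not already have such a file, it can be created with a few lines of Python. This is a minimal sketch; the image paths are placeholders for your own data:
+
+```python
+import json
+
+# Placeholder records; point "image" at your own files.
+records = [
+    {"id": 1, "image": "./images/landmark.jpg"},
+    {"id": 2, "image": "./images/animal.jpg"},
+]
+
+with open("images.jsonl", "w", encoding="utf-8") as f:
+    for rec in records:
+        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+```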
+
+### 2. **Core Operator Logic**
+
+The pipeline chains two core operators to complete the task:
+
+#### A. **FixPromptedVQAGenerator (Context Generation)**
+
+This operator uses the VLM model to generate raw text based on a predefined Prompt template.
+
+**Functionality:**
+
+* Generates a Wikipedia-style article based on the image.
+* Generates Q&A pairs based on the article.
+* **Prompt Constraints**: Questions refer to the image but avoid naming objects directly; answers must come from the text and not be objects in the image; answers must be concise.
+
+**Model Serving Configuration**:
+
+```python
+self.serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path=model_path,
+    hf_cache_dir=hf_cache_dir,
+    vllm_tensor_parallel_size=1,
+    vllm_temperature=0.7,  # Maintains some creativity
+    vllm_top_p=0.9,
+    vllm_max_tokens=512,
+)
+
+```
+
+**Operator Execution**:
+
+```python
+self.vqa_generator.run(
+    storage=self.storage.step(),
+    input_image_key="image",
+    output_answer_key="vqa"  # Outputs raw generated text
+)
+
+```
+
+#### B. **WikiQARefiner (Refining)**
+
+This operator cleans and converts the unstructured text generated by the VLM into a standard format.
+
+**Functionality:**
+
+* Cleans Markdown formatting and excess whitespace.
+* Separates the article content (Context) from the Q&A pairs (QAs).
+
+**Operator Execution**:
+
+```python
+self.refiner.run(
+    storage=self.storage.step(),
+    input_key="vqa",          # Input raw text from previous step
+    output_key="context_vqa"  # Output final structured data
+)
+
+```
+
+### 3. **Output Data**
+
+Finally, the output data generated by the pipeline will contain:
+
+* **image**: Original image path.
+* **vqa**: Raw text generated by the VLM (intermediate result).
+* **context_vqa**: Structured final result containing `context` (article) and `qas` (list of Q&A).
+
+**Output Data Example**:
+
+```json
+{
+  "id": 1,
+  "image": "./images/landmark.jpg",
+  "context_vqa": {
+    "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France...",
+    "qas": [
+      {
+        "question": "In which city is this structure located?",
+        "answer": "Paris"
+      },
+      {
+        "question": "What material is the tower primarily constructed from?",
+        "answer": "wrought-iron"
+      }
+    ]
+  }
+}
+
+```
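+
+Downstream code can consume the structured field directly. A minimal sketch for flattening the Q&A pairs, assuming the pipeline's jsonl cache as input (the file name is a placeholder):
+
+```python
+import json
+
+# Placeholder path; use the cache file written by the pipeline.
+with open("./cache_local/context_vqa_step2.jsonl", encoding="utf-8") as f:
+    for line in f:
+        row = json.loads(line)
+        context = row["context_vqa"]["context"]
+        for qa in row["context_vqa"]["qas"]:
+            print(qa["question"], "->", qa["answer"])
+```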
+ """ + ) + + # Refine and structure results + self.refiner = WikiQARefiner() + + # ------------------------------------------------------------------ # + def forward(self): + input_image_key = "image" + output_answer_key = "vqa" + output_wiki_key = "context_vqa" + + # Step 1: Generate raw text + self.vqa_generator.run( + storage=self.storage.step(), + input_image_key=input_image_key, + output_answer_key=output_answer_key + ) + + # Step 2: Parse into structured data + self.refiner.run( + storage=self.storage.step(), + input_key=output_answer_key, + output_key=output_wiki_key + ) + +# ---------------------------- CLI Entry -------------------------------- # +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow") + + parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") + parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") + parser.add_argument("--download_dir", default="./ckpt") + parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") + + parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") + parser.add_argument("--cache_path", default="./cache_local") + parser.add_argument("--file_name_prefix", default="context_vqa") + parser.add_argument("--cache_type", default="jsonl") + + args = parser.parse_args() + + pipe = ContextVQAPipeline( + model_path=args.model_path, + hf_cache_dir=args.hf_cache_dir, + download_dir=args.download_dir, + device=args.device, + first_entry_file=args.images_file, + cache_path=args.cache_path, + file_name_prefix=args.file_name_prefix, + cache_type=args.cache_type, + ) + pipe.forward() + +``` \ No newline at end of file diff --git a/docs/en/notes/mm_guide/image_understanding/image_gcot.md b/docs/en/notes/mm_guide/image_understanding/image_gcot.md new file mode 100644 index 00000000..85a41201 --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/image_gcot.md @@ -0,0 +1,340 @@ +--- +title: Image Grounded CoT (GCoT) Pipeline +icon: mdi:image-text +createTime: 2026/01/11 20:44:55 +permalink: /en/mm_guide/image_gcot/ +--- +## 1. Overview + +The **Image Grounded Chain-of-Thought (GCoT) Pipeline** is designed to automatically generate **Grounded Chain-of-Thought** data. This pipeline generates multi-step reasoning to answer a question and simultaneously spatially locates (via Bounding Boxes) the key objects mentioned during the reasoning process. This significantly enhances the interpretability and precision of multimodal data. + +Unlike traditional methods, this pipeline uses a **Single VLM (e.g., Qwen2.5-VL)** to handle both "Reasoning" and "Grounding" tasks, making the process streamlined and efficient. + +We support the following application scenarios: + +* **Enhanced Multimodal Data Construction**: Adding interpretability and grounding annotations to VQA datasets. +* **Complex Scene Understanding**: Generating detailed reasoning steps containing object coordinates. +* **Model Reasoning Training**: Building data to train models to be "grounded" and reduce hallucinations. + +The main process of the pipeline includes: + +1. **CoT Generation**: The model generates step-by-step reasoning text and extracts key nouns. +2. **Keyword Parsing**: Cleaning and extracting keywords to be grounded from the generated text. +3. **Visual Grounding**: The model generates bounding boxes (BBoxes) for the extracted keywords. +4. 
**Information Injection**: Injecting BBox coordinates back into the reasoning text to form the final GCoT. + +--- + +## 2. Quick Start + +### Step 1: Create a Working Directory + +```bash +mkdir run_gcot +cd run_gcot + +``` + +### Step 2: Prepare the Script + +Save the code in the "Pipeline Example" section below as `image_gcot_pipeline.py`. + +### Step 3: Configure Parameters + +Ensure you have a VLM model capable of grounding (e.g., Qwen2.5-VL-7B-Instruct). + +```bash +# Install dependencies +pip install open-dataflow vllm + +``` + +### Step 4: Run + +```bash +python image_gcot_pipeline.py \ + --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ + --input_file "data/image_qa.jsonl" + +``` + +--- + +## 3. Data Flow & Logic + +### 1. **Input Data** + +The input data for this process typically consists of standard VQA data: + +* **image**: Path to the image file. +* **question**: Question about the image. +* **answer**: Standard answer to the question (used to assist CoT generation). + +**Input Data Example**: + +```json +{ + "image": "./images/cat_dog.jpg", + "question": "Is the cat looking at the dog?", + "answer": "Yes" +} + +``` + +### 2. **Core Operator Logic** + +This pipeline combines multiple fine-grained operators to achieve complex GCoT generation logic: + +#### A. **CoT Generation (PromptTemplatedVQAGenerator)** + +Uses a predefined `GCOT_PROMPT_TEMPLATE` to guide the model to generate "Step-by-step Reasoning" and a "Keyword List". + +* **Prompt Strategy**: Asks the model to output in the format `Step 1: ...`, `Step 2: ...`, `Keywords: ...`. +* **Output**: Raw string containing reasoning text and keywords. + +#### B. **Text Cleaning & Extraction (FunctionalRefiner)** + +Uses custom functions to parse the output from the previous step: + +* `extract_clean_cot_logic`: Strips the keyword section, keeping pure CoT text. +* `extract_keywords_logic`: Parses the content after `Keywords:` to generate a Python List. + +#### C. **Visual Grounding (VLMBBoxGenerator)** + +Calls the VLM's grounding capability to generate bounding boxes for each extracted keyword. + +* **Input**: Image + List of Keywords. +* **Output**: Dictionary mapping keywords to bounding box coordinates. + +#### D. **Coordinate Injection (FunctionalRefiner)** + +Uses the `inject_bboxes_logic` function to intelligently insert the generated BBox coordinates back into the original CoT text after the corresponding words. + +### 3. **Output Data** + +Finally, the output data generated by the pipeline will contain the following key fields: + +* **raw_cot_output**: Raw text generated by the model. +* **cleaned_cot**: Cleaned reasoning text. +* **bbox_mapping**: Mapping of keywords to their coordinates. +* **gcot**: Final result, reasoning chain containing coordinate information. + +**Output Data Example (gcot field)**: + +```text +Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left. +Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right. +Step 3: Observe their gaze. The cat is facing the dog. +Answer: Yes + +``` + +--- + +## 4. Pipeline Example + +Below is the complete `ImageGCoTPipeline` code implementation. 
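+
+To make the injection step concrete, here is a toy run of `inject_bboxes_logic` (defined in the pipeline code below); the coordinates are invented for illustration:
+
+```python
+cot = "Step 1: Locate the cat. It sits on the left.\nAnswer: Yes"
+bbox_map = {"cat": ["[200, 300, 400, 500]"]}
+
+# The box string is appended after the first keyword occurrence,
+# but only in the text before the 'Answer:' section:
+# "Step 1: Locate the cat [200, 300, 400, 500]. It sits on the left.\nAnswer: Yes"
+print(inject_bboxes_logic(cot, bbox_map))
+```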
+
+### 3. **Output Data**
+
+Finally, the output data generated by the pipeline will contain the following key fields:
+
+* **raw_cot_output**: Raw text generated by the model.
+* **cleaned_cot**: Cleaned reasoning text.
+* **bbox_mapping**: Mapping of keywords to their coordinates.
+* **gcot**: Final result, the reasoning chain containing coordinate information.
+
+**Output Data Example (gcot field)**:
+
+```text
+Step 1: Locate the cat [200, 300, 400, 500]. The cat is sitting on the left.
+Step 2: Locate the dog [500, 300, 700, 500]. The dog is sleeping on the right.
+Step 3: Observe their gaze. The cat is facing the dog.
+Answer: Yes
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageGCoTPipeline` code implementation.
+
+```python
+import re
+from typing import List, Dict, Any
+import argparse
+import torch
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import PromptTemplatedVQAGenerator, VLMBBoxGenerator
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+
+# Prompt template that forces the model to output reasoning steps and keywords
+GCOT_PROMPT_TEMPLATE = (
+    "Question: {question}\n"
+    "Answer: {answer}\n\n"
+    "Task: Provide a detailed step-by-step reasoning (Chain-of-Thought) that explains "
+    "how to arrive at this answer based on the image.\n"
+    "Then, extract key nouns and objects mentioned in your reasoning that are "
+    "visible in the image and can be spatially located.\n\n"
+    "Format:\n"
+    "Step 1: ...\n"
+    "Step 2: ...\n"
+    "Answer: {answer}\n"
+    "Keywords: object1, object2\n"
+)
+
+DEFAULT_BBOX_PROMPT = 'Detect "{keyword}".'
+
+# ----------------- Helper logic functions ----------------- #
+
+def _parse_base(text: str) -> Dict[str, Any]:
+    """Base parsing logic: separate the CoT text from the Keywords line"""
+    if not text: return {"cot": "", "keywords": []}
+    lines = text.split('\n')
+    cot_lines = []
+    keywords = []
+    for line in lines:
+        if line.strip().lower().startswith('keywords:'):
+            keyword_str = line.split(':', 1)[-1].strip()
+            # Simple keyword tokenization
+            raw_kws = [kw.strip().strip('.,;:!?"\'') for kw in keyword_str.replace(';', ',').split(',')]
+            keywords = [k for k in raw_kws if k]
+        else:
+            cot_lines.append(line)
+    return {"cot": '\n'.join(cot_lines).strip(), "keywords": keywords}
+
+def extract_clean_cot_logic(text: str) -> str:
+    return _parse_base(text)["cot"]
+
+def extract_keywords_logic(text: str) -> List[str]:
+    return _parse_base(text)["keywords"]
+
+def inject_bboxes_logic(cot_text: str, bbox_map: Dict[str, List[str]]) -> str:
+    """Inject BBoxes back into the CoT text"""
+    if not cot_text or not bbox_map: return cot_text
+    # Match longer keywords first to avoid substring mismatches
+    sorted_keywords = sorted(bbox_map.keys(), key=lambda x: len(x), reverse=True)
+    result_text = cot_text
+    replaced = set()
+
+    for keyword in sorted_keywords:
+        if keyword in replaced: continue
+        # Simple strategy: only inject before 'Answer:' to avoid corrupting the answer section
+        answer_pos = result_text.find('Answer:')
+        search_limit = answer_pos if answer_pos != -1 else len(result_text)
+
+        # Case-insensitive search
+        pos = result_text.lower().find(keyword.lower(), 0, search_limit)
+        if pos == -1: continue
+
+        boxes = bbox_map[keyword]  # List[str]
+        box_str = "".join(boxes)
+        # Replace: keep the original word and append the box
+        replacement = f"{keyword} {box_str}"
+
+        result_text = result_text[:pos] + replacement + result_text[pos + len(keyword):]
+        replaced.add(keyword)
+    return result_text
+
+# ----------------- Pipeline definition ----------------- #
+
+class ImageGCoTPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        first_entry_file: str,
+        cache_path: str = "./cache_gcot",
+        file_name_prefix: str = "gcot",
+        # Key configuration
+        question_key: str = "question",
+        answer_key: str = "answer",
+        image_key: str = "image",
+        output_key: str = "gcot",
+        vllm_max_tokens: int = 512
+    ):
+        # 1. Storage initialization
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        # 2. Model serving (a single model)
+        self.vlm_serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_max_tokens=vllm_max_tokens
+        )
+
+        self.keys = {
+            "q": question_key,
+            "a": answer_key,
+            "img": image_key,
+            "raw_cot": "raw_cot_output",
+            "clean_cot": "cleaned_cot",
+            "keywords": "extracted_keywords",
+            "bbox_map": "bbox_mapping",
+            "final": output_key
+        }
+
+        # 3. Operator chain configuration
+
+        # Step A: Generate CoT and keywords
+        self.op_gen_cot = PromptTemplatedVQAGenerator(
+            serving=self.vlm_serving,
+            system_prompt="You are a helpful assistant.",
+            prompt_template=NamedPlaceholderPromptTemplate(template=GCOT_PROMPT_TEMPLATE)
+        )
+
+        # Step B: Parse and clean the CoT
+        self.op_extract_cot = FunctionalRefiner(func=extract_clean_cot_logic)
+
+        # Step C: Parse the keywords
+        self.op_extract_kws = FunctionalRefiner(func=extract_keywords_logic)
+
+        # Step D: Generate BBoxes (grounding)
+        self.op_bbox_gen = VLMBBoxGenerator(
+            serving=self.vlm_serving,
+            prompt_template=DEFAULT_BBOX_PROMPT
+        )
+
+        # Step E: Inject BBoxes into the CoT
+        self.op_inject = FunctionalRefiner(func=inject_bboxes_logic)
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating CoT...")
+        self.op_gen_cot.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_cot"],
+            question=self.keys["q"],
+            answer=self.keys["a"]
+        )
+
+        print(">>> [Pipeline] Step 2: Parsing Outputs...")
+        self.op_extract_cot.run(
+            self.storage.step(),
+            output_key=self.keys["clean_cot"],
+            text=self.keys["raw_cot"]
+        )
+        self.op_extract_kws.run(
+            self.storage.step(),
+            output_key=self.keys["keywords"],
+            text=self.keys["raw_cot"]
+        )
+
+        print(">>> [Pipeline] Step 3: Generating BBoxes (Grounding)...")
+        self.op_bbox_gen.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            input_kws_key=self.keys["keywords"],
+            output_key=self.keys["bbox_map"]
+        )
+
+        print(">>> [Pipeline] Step 4: Injecting GCoT...")
+        self.op_inject.run(
+            self.storage.step(),
+            output_key=self.keys["final"],
+            cot_text=self.keys["clean_cot"],
+            bbox_map=self.keys["bbox_map"]
+        )
+
+        print(f">>> [Pipeline] Done. Final GCoT saved to: {self.keys['final']}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/image_qa_result.jsonl")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+
+    args = parser.parse_args()
+
+    pipe = ImageGCoTPipeline(
+        model_path=args.model_path,
+        first_entry_file=args.input_file
+    )
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
new file mode 100644
index 00000000..a033dcf1
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md
@@ -0,0 +1,276 @@
+---
+title: Image Region Captioning Pipeline
+createTime: 2026/01/11 22:04:27
+icon: mdi:image-text
+permalink: /en/mm_guide/image_region_caption_pipeline/
+---
+## 1. Overview
+
+The **Image Region Captioning Pipeline** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them.
+
+This pipeline supports processing **pre-defined Bounding Box** data, visualizing these boxes, and then feeding them into a VLM for caption generation.
+
+We support the following application scenarios:
+
+* **Dense Captioning**: Generating descriptions for multiple objects within a single image.
+* **Fine-grained Image Understanding**: Focusing on local details rather than global descriptions.
+* **Dataset Augmentation**: Constructing image-text pair datasets that include localization information.
+
+The main process of the pipeline includes:
+
+1. **Data Loading**: Reading source data containing image paths and bounding box information.
+2. **BBox Processing & Visualization**: Processing input bounding boxes and generating a version of the image with visual markers (e.g., drawn boxes).
+3. **Region Caption Generation**: Using a VLM to generate text descriptions based on the marked images or specific regions.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_region_caption
+cd run_region_caption
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `region_caption_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the input file (jsonl) contains the `image` and `bbox` fields, and install the dependencies:
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python region_caption_pipeline.py \
+    --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+    --first_entry_file "data/region_captions.jsonl" \
+    --output_jsonl_path "data/results.jsonl"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data typically contains the image path and a list of corresponding bounding boxes:
+
+* **image**: Path to the image file.
+* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` or `[[x1, y1, x2, y2], ...]` format (depending on configuration).
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/kitchen.jpg",
+  "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]]
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline chains two core operators to complete the task (a standalone usage sketch follows this section):
+
+#### A. **ImageBboxGenerator**
+
+This operator handles the vision-level tasks.
+
+* **Input**: Raw image + `bbox` data.
+* **Functionality**: Reads bounding boxes and draws them onto the image (visualization) or preprocesses them according to configuration.
+* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls parameters like `max_boxes` and visualization options (`draw_visualization`).
+* **Output**: Generates a new image path containing visual markers (`image_with_bbox`).
+
+#### B. **PromptedVQAGenerator**
+
+This operator is responsible for generating text using the VLM.
+
+* **Input**: The `image_with_bbox` generated in the previous step.
+* **Functionality**: The VLM receives the marked image and generates descriptions for the corresponding regions based on prompts.
+* **Output**: Region description text.
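+
+If you only need the localization/visualization stage, the first operator can also be run on its own. A minimal sketch using the same classes as the full pipeline below (all paths are placeholders):
+
+```python
+from dataflow.operators.core_vision.generate.image_bbox_generator import (
+    ImageBboxGenerator,
+    ExistingBBoxDataGenConfig,
+)
+from dataflow.utils.storage import FileStorage
+
+cfg = ExistingBBoxDataGenConfig(
+    max_boxes=10,
+    input_jsonl_path="data/region_captions.jsonl",
+    output_jsonl_path="data/image_with_bbox.jsonl",
+)
+storage = FileStorage(
+    first_entry_file_name="data/region_captions.jsonl",
+    cache_path="./cache",
+    file_name_prefix="bbox_only",
+    cache_type="jsonl",
+)
+
+# Draw the provided boxes and record the visualized image path.
+ImageBboxGenerator(config=cfg).run(
+    storage=storage.step(),
+    input_image_key="image",
+    input_bbox_key="bbox",
+    output_key="image_with_bbox",
+)
+```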
+
+### 3. **Output Data**
+
+The final output data will contain the processed image path and the generated descriptions:
+
+* **image_with_bbox**: Path to the image with drawn boxes.
+* **mdvp_record**: List of generated region descriptions.
+
+**Output Data Example**:
+
+```json
+{
+  "image": "./images/kitchen.jpg",
+  "image_with_bbox": "./images/kitchen_visualized.jpg",
+  "mdvp_record": [
+    "A wooden chair located near the table.",
+    "A white refrigerator in the background."
+  ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageRegionCaptioningPipeline` code implementation.
+
+```python
+import argparse
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+from dataflow.operators.core_vision.generate.image_bbox_generator import (
+    ImageBboxGenerator,
+    ExistingBBoxDataGenConfig
+)
+from dataflow.operators.core_vision.generate.prompted_vqa_generator import (
+    PromptedVQAGenerator
+)
+from dataflow.utils.storage import FileStorage
+
+
+class ImageRegionCaptioningPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
+        device: str = "cuda",
+        # Storage & Paths
+        first_entry_file: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
+        cache_path: str = "./dataflow/example/cache",
+        file_name_prefix: str = "region_caption",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        input_bbox_key: str = "bbox",
+        image_with_bbox_path: str = 'image_with_bbox',  # Key for intermediate image
+        output_key: str = "mdvp_record",
+        # BBox Config
+        max_boxes: int = 10,
+        input_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl",
+        output_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl",
+        output_image_with_bbox_path: str = "./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl",
+        draw_visualization: bool = True
+    ):
+        # 1. Initialize storage
+        # Storage for the BBox generation stage
+        self.bbox_storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+
+        # 2. Configure the BBox generator
+        self.cfg = ExistingBBoxDataGenConfig(
+            max_boxes=max_boxes,
+            input_jsonl_path=input_jsonl_path,
+            output_jsonl_path=output_image_with_bbox_path,
+        )
+
+        # 3. Initialize storage for the captioning stage
+        # Note: this picks up the output path of the previous step
+        self.caption_storage = FileStorage(
+            first_entry_file_name=output_image_with_bbox_path,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type
+        )
+
+        # 4. Initialize the VLM serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_top_p=0.9,
+            vllm_max_tokens=512,
+        )
+
+        # 5. Initialize the core operators
+        self.bbox_generator = ImageBboxGenerator(config=self.cfg)
+        self.caption_generator = PromptedVQAGenerator(serving=self.serving)
+
+        self.input_image_key = input_image_key
+        self.input_bbox_key = input_bbox_key
+        self.output_key = output_key
+        self.image_with_bbox_path = image_with_bbox_path
+
+    def forward(self):
+        # Step 1: Generate images with visualized BBoxes
+        print(">>> [Pipeline] Step 1: Processing BBoxes and Visualizing...")
+        self.bbox_generator.run(
+            storage=self.bbox_storage.step(),
+            input_image_key=self.input_image_key,
+            input_bbox_key=self.input_bbox_key,
+            output_key=self.image_with_bbox_path,
+        )
+
+        # Step 2: Generate captions from the visualized images
+        print(">>> [Pipeline] Step 2: Generating Region Captions...")
+        self.caption_generator.run(
+            storage=self.caption_storage.step(),
+            input_image_key='image_with_bbox'  # Use the boxed images produced in the previous step
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Image region captioning with DataFlow")
+
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
+    parser.add_argument("--download_dir", default="./ckpt/models")
+    parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda")
+
+    parser.add_argument("--first_entry_file", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
+    parser.add_argument("--cache_path", default="./dataflow/example/cache")
+    parser.add_argument("--file_name_prefix", default="region_caption")
+    parser.add_argument("--cache_type", default="jsonl")
+
+    parser.add_argument("--input_image_key", default="image")
+    parser.add_argument("--input_bbox_key", default="bbox")
+    parser.add_argument("--output_key", default="mdvp_record")
+
+    parser.add_argument("--max_boxes", type=int, default=10)
+    parser.add_argument("--input_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl")
+    parser.add_argument("--output_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl")
+    parser.add_argument("--output_image_with_bbox_path", default="./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl")
+    parser.add_argument("--draw_visualization", type=bool, default=True)
+
+    args = parser.parse_args()
+
+    pipe = ImageRegionCaptioningPipeline(
+        model_path=args.model_path,
+        hf_cache_dir=args.hf_cache_dir,
+        download_dir=args.download_dir,
+        device=args.device,
+        first_entry_file=args.first_entry_file,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        cache_type=args.cache_type,
+        input_image_key=args.input_image_key,
+        input_bbox_key=args.input_bbox_key,
+        output_key=args.output_key,
+        max_boxes=args.max_boxes,
+        input_jsonl_path=args.input_jsonl_path,
+        output_image_with_bbox_path=args.output_image_with_bbox_path,
+        draw_visualization=args.draw_visualization
+    )
+    pipe.forward()
+
+```
+
diff --git a/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
new file mode 100644
index 00000000..8dc770ae
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_scale_caption_pipeline.md
@@ -0,0 +1,367 @@
+---
+title: ScaleCap High-Density Captioning Pipeline
+createTime: 2026/01/11 22:08:57
+icon: mdi:image-text
+permalink: /en/mm_guide/image_scale_caption_pipeline/
+---
+## 1. Overview
+
+The **ScaleCap High-Density Captioning Pipeline** implements an advanced **"Generate-Verify-Expand-Fuse"** paradigm for image captioning. This pipeline is designed to generate captions of **extremely high information density** with **minimal hallucinations**, making it ideal for scenarios requiring deep understanding of image details.
+
+Based on the paper *ScaleCap: Inference-Time Scalable Image Captioning via Dual-Modality Debiasing*, this method progressively mines object and position details through multi-turn dialogue and visual self-verification (Visual Grounding), filtering out hallucinations along the way.
+
+We support the following application scenarios:
+
+* **High-Quality Multimodal Dataset Construction**: Generating training data that is more detailed and accurate than standard captions.
+* **Fine-Grained Image Retrieval**: Providing index text rich in detail.
+* **Accessibility/Blind Assistance**: Generating "What You See Is What You Get" (WYSIWYG) detailed narrations.
+
+The main process of the pipeline includes:
+
+1. **Initial Caption Generation**: The VLM generates a baseline description.
+2. **Visual Debiasing**: Splitting the description into sentences and verifying each sentence against visual evidence (Visual Grounding).
+3. **Detail Expansion**: Generating follow-up questions about object attributes and positions based on the verified "Golden Sentences".
+4. **Answering & Re-verification**: The VLM answers the questions and performs another round of visual grounding to filter incorrect details.
+5. **Final Fusion**: Merging all verified information into a coherent, long description.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_scalecap
+cd run_scalecap
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `scalecap_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the VLM model path (e.g., Qwen2.5-VL) is correct, and install the dependencies:
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python scalecap_pipeline.py \
+    --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+    --input_jsonl "data/images.jsonl" \
+    --output_key "final_caption"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+The input data requires only the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/complex_scene.jpg"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline is a complex orchestration of multiple atomic operators:
+
+#### A. **Initial Generation (PromptedVQAGenerator)**
+
+* **Function**: Generates a preliminary description (`init_caption`) of the image using a basic prompt.
+
+#### B. **Visual Debiasing (VisualGroundingRefiner)**
+
+* **Function**: The core anti-hallucination mechanism of ScaleCap.
+* **Logic**:
+  1. Uses `split_sentences` to break the draft into single sentences.
+  2. Asks the VLM: "Given the image, is the description '{text}' directly supported by visual evidence?".
+  3. Keeps only sentences where the answer is "Yes", forming the **"Golden Sentences"**.
+
+#### C. **Question Generation & Parsing (PromptTemplatedQAGenerator)**
+
+* **Function**: Generates targeted follow-up questions based on the Golden Sentences using LLM capabilities.
+* **Logic**: The model generates text like "Describe more details about the [Object]", which is then automatically expanded into **Object Detail** and **Positional Relation** questions via `parse_questions_logic`.
+
+#### D. **Batch Answering & Refiltering (BatchVQAGenerator & Refiner)**
+
+* **Function**: Mining deep image information.
+* **Logic**:
+  1. Uses `BatchVQAGenerator` to have the VLM answer all generated questions in a batch.
+  2. Uses `VisualGroundingRefiner` again to check whether these new details are accurate.
+  3. Retains the reliable details (`final_details`).
+
+#### E. **Final Fusion (PromptTemplatedQAGenerator)**
+
+* **Function**: Rewrites the "Golden Sentences" and "Verified Details" into a fluent text.
+* **Output**: `final_caption`.
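+
+Both verification passes share the same shape: fill a yes/no template per candidate sentence and keep only the confirmed ones. The following stand-in sketch mirrors what `VisualGroundingRefiner` does conceptually (`ask_vlm` is a hypothetical callable, not a DataFlow API):
+
+```python
+from typing import Callable, List
+
+def keep_grounded(sentences: List[str], ask_vlm: Callable[[str], str]) -> List[str]:
+    """Keep only sentences the VLM confirms with a 'yes' (stand-in logic)."""
+    template = (
+        "Given the image, is the description '{text}' directly supported "
+        "by visual evidence? Answer strictly yes or no."
+    )
+    return [
+        s for s in sentences
+        if ask_vlm(template.format(text=s)).strip().lower().startswith("yes")
+    ]
+```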
+
+### 3. **Output Data**
+
+The output data records the entire pipeline process, facilitating debugging and analysis:
+
+* **init_caption**: Raw generated draft.
+* **golden_sentences**: List of sentences that passed the first check.
+* **q_list**: List of generated follow-up questions.
+* **final_details**: Detailed answers that passed the second check.
+* **final_caption**: The final high-density description.
+
+**Output Data Example**:
+
+```json
+{
+  "image": "./images/complex_scene.jpg",
+  "init_caption": "A dog sitting on a bench.",
+  "golden_sentences": ["A dog is sitting on a wooden bench."],
+  "q_list": ["Describe more details about the dog.", "Describe position of the bench."],
+  "final_details": ["The dog is a Golden Retriever with a red collar.", "The bench is located in a park."],
+  "final_caption": "A Golden Retriever with a red collar is sitting on a wooden bench located in a park..."
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `ImageScaleCaptionPipeline` code implementation.
+
+```python
+import re
+import argparse
+from typing import Callable, Any, List
+
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.prompts.image import ImageScaleCaptionPrompt
+from dataflow.operators.core_vision import PromptedVQAGenerator, BatchVQAGenerator, VisualGroundingRefiner
+from dataflow.operators.core_text import PromptTemplatedQAGenerator, FunctionalRefiner
+
+# ----------------- Helper logic functions ----------------- #
+# NOTE: minimal sketches; the exact implementations are assumptions and can
+# be swapped for more robust splitting/parsing.
+
+def split_sentences(text: str) -> List[str]:
+    """Split a caption draft into sentences on ., !, ? boundaries (assumed implementation)."""
+    if not text:
+        return []
+    parts = re.split(r"(?<=[.!?])\s+", text.strip())
+    return [p.strip() for p in parts if p.strip()]
+
+def join_list(data: List[str]) -> str:
+    """Join a list of strings into a newline-separated block."""
+    return "\n".join(data or [])
+
+def parse_questions_logic(text: str) -> List[str]:
+    """Parse the LLM's question-generation output into a flat question list
+    (assumed format: one question per line)."""
+    if not text:
+        return []
+    return [ln.strip(" -*") for ln in text.splitlines() if ln.strip()]
+
+
+class ImageScaleCaptionPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt/models",
+        device: str = "cuda",
+        # Storage params
+        first_entry_file: str = "images.jsonl",
+        cache_path: str = "./cache_scalecap",
+        file_name_prefix: str = "scalecap",
+        cache_type: str = "jsonl",
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_caption",
+        # VLLM Config
+        vllm_tensor_parallel_size: int = 1,
+        vllm_temperature: float = 0.7,
+        vllm_top_p: float = 0.9,
+        vllm_max_tokens: int = 512,
+    ):
+        # 1. Storage
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type,
+        )
+
+        # 2. Serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
+            vllm_tensor_parallel_size=vllm_tensor_parallel_size,
+            vllm_temperature=vllm_temperature,
+            vllm_top_p=vllm_top_p,
+            vllm_max_tokens=vllm_max_tokens,
+        )
+
+        # 3. Prompts
+        self.prompts_db = ImageScaleCaptionPrompt().build_prompt()
+
+        # 4. Keys
+        self.input_image_key = input_image_key
+        self.output_key = output_key
+
+        # ================== Operator Initialization ==================
+
+        # --- Step A: Generate Init Caption ---
+        self.refine_const_prompt = FunctionalRefiner(func=lambda: self.prompts_db["VLM_PROMPT_1"])
+        self.gen_init_caption = PromptedVQAGenerator(
+            serving=self.serving,
+            system_prompt="You are a helpful assistant."
+        )
+
+        # --- Step B: Refine Golden Sentences ---
+        self.refine_split = FunctionalRefiner(func=split_sentences)
+        # Visual self-check (keep sentences answered 'Yes')
+        self.refine_golden = VisualGroundingRefiner(
+            serving=self.serving,
+            prompt_template="Given the image, is the description '{text}' directly supported by visual evidence? Answer strictly yes or no."
+        )
+
+        # --- Step C: Generate Questions ---
+        self.refine_join = FunctionalRefiner(func=join_list)
+        tpl_q = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_1"],
+            join_list_with="\n"
+        )
+        self.gen_questions_text = PromptTemplatedQAGenerator(
+            serving=self.serving,
+            prompt_template=tpl_q
+        )
+        self.refine_parse_qs = FunctionalRefiner(func=parse_questions_logic)
+
+        # --- Step D: Generate Answers ---
+        self.gen_answers = BatchVQAGenerator(serving=self.serving)
+        self.refine_answers = VisualGroundingRefiner(
+            serving=self.serving,
+            prompt_template="Given the image, is the statement '{text}' grounded in the image and not generic? Answer strictly yes or no."
+        )
+
+        # --- Step E: Integrate Final Caption ---
+        tpl_final = NamedPlaceholderPromptTemplate(
+            template=self.prompts_db["LLM_PROMPT_4"],
+            join_list_with="\n"
+        )
+        self.gen_final_caption = PromptTemplatedQAGenerator(
+            serving=self.serving,
+            prompt_template=tpl_final
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 0: Preparing Prompts...")
+        self.refine_const_prompt.run(
+            self.storage.step(),
+            output_key="init_prompt"
+        )
+
+        print(">>> [Pipeline] Step 1: Generating Initial Caption...")
+        self.gen_init_caption.run(
+            self.storage.step(),
+            input_prompt_key="init_prompt",
+            input_image_key=self.input_image_key,
+            output_answer_key="init_caption"
+        )
+
+        print(">>> [Pipeline] Step 2: Refining Golden Sentences...")
+        self.refine_split.run(
+            self.storage.step(),
+            output_key="sentences",
+            text="init_caption"
+        )
+        self.refine_golden.run(
+            self.storage.step(),
+            input_list_key="sentences",
+            input_image_key=self.input_image_key,
+            output_key="golden_sentences"
+        )
+
+        print(">>> [Pipeline] Step 3: Generating Details Questions...")
+        self.refine_join.run(
+            self.storage.step(),
+            output_key="golden_str",
+            data="golden_sentences"
+        )
+        self.gen_questions_text.run(
+            self.storage.step(),
+            output_answer_key="raw_q_text",
+            sentence="golden_str"
+        )
+        self.refine_parse_qs.run(
+            self.storage.step(),
+            output_key="q_list",
+            text="raw_q_text"
+        )
+
+        print(">>> [Pipeline] Step 4: Generating & Filtering Answers...")
+        self.gen_answers.run(
+            self.storage.step(),
+            input_prompts_key="q_list",
+            input_image_key=self.input_image_key,
+            output_key="raw_answers"
+        )
+        self.refine_answers.run(
+            self.storage.step(),
+            input_list_key="raw_answers",
+            input_image_key=self.input_image_key,
+            output_key="final_details"
+        )
+
+        print(">>> [Pipeline] Step 5: Integrating Final Caption...")
+        self.refine_join.run(
+            self.storage.step(),
+            output_key="details_str",
+            data="final_details"
+        )
+        self.gen_final_caption.run(
+            self.storage.step(),
+            output_answer_key=self.output_key,
+            context="golden_str",
+            object_info="details_str",
+            position_info="details_str"
+        )
+
+        print(f">>> [Pipeline] All Done. Result saved to: {self.storage.cache_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="ScaleCap Dense Captioning Pipeline")
+
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface")
+    parser.add_argument("--download_dir", default="./ckpt/models")
+    parser.add_argument("--device", default="cuda")
+
+    parser.add_argument("--input_jsonl", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
+    parser.add_argument("--cache_path", default="./cache_scalecap_results")
+    parser.add_argument("--file_name_prefix", default="scalecap")
+    parser.add_argument("--input_image_key", default="image")
+    parser.add_argument("--output_key", default="final_caption")
+
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--max_tokens", type=int, default=1024)
+
+    args = parser.parse_args()
+
+    pipe = ImageScaleCaptionPipeline(
+        model_path=args.model_path,
+        hf_cache_dir=args.hf_cache_dir,
+        download_dir=args.download_dir,
+        device=args.device,
+        first_entry_file=args.input_jsonl,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        input_image_key=args.input_image_key,
+        output_key=args.output_key,
+        vllm_tensor_parallel_size=args.tp,
+        vllm_max_tokens=args.max_tokens
+    )
+
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
new file mode 100644
index 00000000..cc3806af
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/image_visual_only_mcq_pipeline.md
@@ -0,0 +1,325 @@
+---
+title: Visual-Only MCQ Pipeline
+createTime: 2026/01/11 22:13:45
+icon: mdi:image-text
+permalink: /en/mm_guide/image_visual_only_mcq_pipeline/
+---
+## 1. Overview
+
+The **Visual-Only MCQ Pipeline** is a core component of the CapRL (Caption Reinforcement Learning) framework. Its goal is to generate a set of high-quality Multiple Choice Questions (MCQs) that satisfy **strict visual dependency**: the model must "see" the image to answer correctly; answering from the text alone (guessing or common sense) is not possible.
+
+This pipeline uses a **Generate-Parse-Verify** three-step method, leveraging **Option Rotation** and **Blind Tests** to rigorously filter out hallucinations and overly simple questions. The generated questions serve as a robust reward signal for Reinforcement Learning.
+
+The main process includes:
+
+1. **MCQ Generation**: The VLM generates raw QA pairs based on the image.
+2. **Structured Parsing**: Using regex logic to parse the text into standard question/option structures.
+3. **Visual Dependency Verification**:
+   * **Rotation Test**: Shuffling options multiple times to eliminate positional bias.
+   * **Dual Filtering**: Requiring high "Visual Accuracy" and low "Text-only Accuracy".
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create Working Directory
+
+```bash
+mkdir run_vis_mcq
+cd run_vis_mcq
+
+```
+
+### Step 2: Prepare Script
+
+Save the code in the "Pipeline Example" section below as `visual_mcq_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Filtering thresholds are controlled via the CLI; for example, the Step 4 command below requires 100% visual accuracy and at most 25% blind accuracy. First install the dependencies:
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python visual_mcq_pipeline.py \
+    --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+    --input_file "data/captions.jsonl" \
+    --rotate_num 4 \
+    --pass_vis 1.0 \
+    --pass_txt 0.25
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input only requires the image path:
+
+* **image**: Path to the image file.
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/sample_01.jpg"
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline chains three key operators:
+
+#### A. **FixPromptedVQAGenerator (Raw Generation)**
+
+* **Function**: Uses CapRL predefined Prompt templates (`SYS_PROMPT_MCQ` / `USER_PROMPT_MCQ`) to generate 5 MCQs at once.
+* **Output**: Unstructured text block containing multiple `#### Question` blocks and options.
+
+#### B. **FunctionalRefiner (Regex Parsing)**
+
+* **Logic Function**: `parse_mcq_text_logic`
+* **Function**: Extracts questions, options (A-F), and correct answers from the raw text using regex.
+* **Output**: Structured MCQ list (`parsed_mcq_list`).
+
+#### C. **VisualDependencyRefiner (Dependency Verification)**
+
+This is the core filter. It performs N inferences (N = `rotate_num`) for each question:
+
+1. **Option Rotation**: Randomly shuffles options (e.g., moving the answer from A to C) to prevent the model from cheating by "always picking A".
+2. **Visual Pass**: Input Image + Question. Records the model's accuracy.
+3. **Textual Pass**: Input Question only (no image). Records the model's blind guessing accuracy.
+4. **Filtering Criteria** (see the sketch below):
+   * Keep the question IF AND ONLY IF: `Visual_Acc >= pass_visual_min` **AND** `Textual_Acc <= pass_textual_max`.
+   * *Example*: If a question can be answered correctly without the image (high text accuracy), it tests common sense rather than vision, so it is **discarded**.
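+
+The dual-filtering rule reduces to a single predicate over the two measured accuracies. A minimal sketch mirroring the thresholds passed to `VisualDependencyRefiner` below:
+
+```python
+def keeps_question(visual_acc: float, text_acc: float,
+                   pass_visual_min: float = 1.0,
+                   pass_textual_max: float = 0.25) -> bool:
+    """A question survives only if it is reliably answered WITH the image
+    and near chance level WITHOUT it."""
+    return visual_acc >= pass_visual_min and text_acc <= pass_textual_max
+
+# rotate_num=4: 4/4 correct with the image, 0/4 without -> kept.
+assert keeps_question(1.0, 0.0)
+# Answerable blind (common-sense question) -> discarded.
+assert not keeps_question(1.0, 0.75)
+```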
+
+### 3. **Output Data**
+
+The output data (`final_mcqs`) contains only questions that passed the rigorous verification. These questions possess high quality and visual relevance.
+
+**Output Data Example**:
+
+```json
+{
+  "image": "./images/sample_01.jpg",
+  "final_mcqs": [
+    {
+      "question": "What is the color of the car on the far left?\n - A) Red\n - B) Blue...",
+      "answer": "A",
+      "stats": {
+        "visual_acc": 1.0,  # 4/4 correct with image
+        "text_acc": 0.0     # 0/4 correct without image
+      }
+    }
+  ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisualOnlyMCQPipeline` code implementation.
+
+```python
+import argparse
+import re
+from typing import List, Dict, Any
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+from dataflow.operators.core_vision import FixPromptedVQAGenerator, VisualDependencyRefiner
+from dataflow.operators.core_text import FunctionalRefiner
+from dataflow.prompts.image import ImageCaprlPrompt
+
+# Regex parsing logic
+_Q_BLOCK_SPLIT = re.compile(r"^####\s*\d+\.\s*\*\*(.*?)\*\*\s*$", re.M)
+_OPT_LINE_RE = re.compile(r"^\s*-\s*([A-F])\)\s*(.+?)\s*$")
+_ANS_LINE_RE = re.compile(r"^\s*\*\*Answer:\*\*\s*([A-F])\)\s*(.+?)\s*$", re.I)
+
+def parse_mcq_text_logic(mcq_text: str, expected: int = 5) -> List[Dict[str, Any]]:
+    """Parse the raw text generated by the VLM into a list of structured dicts"""
+    if not mcq_text or not isinstance(mcq_text, str): return []
+
+    indices = [m.start() for m in _Q_BLOCK_SPLIT.finditer(mcq_text)]
+    if not indices: return []
+    indices.append(len(mcq_text))
+    blocks = [mcq_text[indices[i]:indices[i+1]].strip() for i in range(len(indices)-1)]
+
+    parsed = []
+    for block in blocks:
+        lines = [ln.rstrip() for ln in block.splitlines() if ln.strip()]
+        q_title_m = _Q_BLOCK_SPLIT.search(block)
+        if not q_title_m: continue
+
+        q_title = q_title_m.group(1).strip()
+        options = {}
+        ans_letter, ans_text = None, None
+
+        for ln in lines:
+            m_opt = _OPT_LINE_RE.match(ln)
+            if m_opt:
+                options[m_opt.group(1)] = m_opt.group(2).strip()
+                continue
+            m_ans = _ANS_LINE_RE.match(ln)
+            if m_ans:
+                ans_letter = m_ans.group(1).upper()
+                ans_text = m_ans.group(2).strip()
+                break
+
+        if options and ans_letter and ans_letter in options:
+            q_lines = [q_title]
+            for lbl in ["A", "B", "C", "D", "E", "F"]:
+                if lbl in options:
+                    q_lines.append(f" - {lbl}) {options[lbl]}")
+
+            parsed.append({
+                "question": "\n".join(q_lines),
+                "question_title": q_title,
+                "options": options,
+                "answer": ans_letter,
+                "answer_text": ans_text
+            })
+
+    if expected > 0:
+        parsed = parsed[:expected]
+
+    uniq = []
+    seen = set()
+    for it in parsed:
+        key = (it["question_title"], it["answer"])
+        if key not in seen:
+            seen.add(key)
+            uniq.append(it)
+    return uniq
+
+
+class VisualOnlyMCQPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        first_entry_file: str,
+        cache_path: str = "./cache_mcq",
+        file_name_prefix: str = "vis_mcq",
+        # Config
+        rotate_num: int = 4,
+        pass_visual_min: float = 1.0,
+        pass_textual_max: float = 0.25,
+        add_none_above: bool = True,
+        # Keys
+        input_image_key: str = "image",
+        output_key: str = "final_mcqs",
+        # VLLM
+        device: str = "cuda",
+        vllm_max_tokens: int = 2048
+    ):
+        # 1. Initialize storage
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        # 2. Initialize the VLM serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.1,  # Low temperature for stable output formatting
+            vllm_max_tokens=vllm_max_tokens
+        )
+
+        # Key configuration
+        self.keys = {
+            "img": input_image_key,
+            "raw_text": "raw_mcq_text",
+            "parsed_list": "parsed_mcq_list",
+            "final": output_key
+        }
+
+        # Load the prompt library
+        self.prompts_db = ImageCaprlPrompt().build_prompt()
+
+        # ================== Operator initialization ==================
+
+        # Operator 1: Generate raw MCQ text
+        self.op_gen_raw = FixPromptedVQAGenerator(
+            serving=self.serving,
+            system_prompt=self.prompts_db["SYS_PROMPT_MCQ"],
+            user_prompt=self.prompts_db["USER_PROMPT_MCQ"]
+        )
+
+        # Operator 2: Parse text into structured data
+        self.op_parse = FunctionalRefiner(func=parse_mcq_text_logic)
+
+        # Operator 3: Visual dependency verification (core filter)
+        # Includes option rotation and the text-only (no-image) check
+        self.op_verify = VisualDependencyRefiner(
+            serving=self.serving,
+            instruction_template=self.prompts_db["ANSWER_INSTRUCTION"],
+            rotate_num=rotate_num,
+            pass_visual_min=pass_visual_min,
+            pass_textual_max=pass_textual_max,
+            add_none_above_visual=add_none_above
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Generating Raw MCQs (FixPrompted)...")
+        self.op_gen_raw.run(
+            self.storage.step(),
+            input_image_key=self.keys["img"],
+            output_answer_key=self.keys["raw_text"]
+        )
+
+        print(">>> [Pipeline] Step 2: Parsing MCQs...")
+        self.op_parse.run(
+            self.storage.step(),
+            output_key=self.keys["parsed_list"],
+            mcq_text=self.keys["raw_text"],
+            expected=5
+        )
+
+        print(">>> [Pipeline] Step 3: Verifying Visual Dependency (Rotation Check)...")
+        self.op_verify.run(
+            self.storage.step(),
+            input_list_key=self.keys["parsed_list"],
+            input_image_key=self.keys["img"],
+            output_key=self.keys["final"]
+        )
+
+        print(f">>> [Pipeline] Done. Results in: {self.keys['final']}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", default="./dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--rotate_num", type=int, default=4)
+    parser.add_argument("--pass_vis", type=float, default=1.0)
+    parser.add_argument("--pass_txt", type=float, default=0.25)
+
+    args = parser.parse_args()
+
+    pipe = VisualOnlyMCQPipeline(
+        model_path=args.model_path,
+        first_entry_file=args.input_file,
+        rotate_num=args.rotate_num,
+        pass_visual_min=args.pass_vis,
+        pass_textual_max=args.pass_txt
+    )
+    pipe.forward()
+
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
new file mode 100644
index 00000000..0904691e
--- /dev/null
+++ b/docs/en/notes/mm_guide/image_understanding/vision_mct_reasoning_pipeline.md
@@ -0,0 +1,230 @@
+---
+title: Vision MCTS Reasoning Pipeline
+icon: mdi:image-text
+createTime: 2026/01/11 21:59:59
+permalink: /en/mm_guide/vision_mct_reasoning_pipeline/
+---
+
+## 1. Overview
+
+The **Vision MCTS Reasoning Pipeline** is designed to construct high-quality **Process Supervision Data** for multimodal large models. This pipeline handles two types of data sources: existing Monte Carlo Tree Search (MCTS) trajectory data, or direct generation of new reasoning chains using a VLM.
+
+This pipeline is a core tool for **Grounded-RL** and **SFT Data Construction**, converting complex tree-search processes into a linearized `......` format that models can learn from.
+
+We support the following application scenarios:
+
+* **MCTS Data Extraction**: Converting high-value paths (Rollouts) from search trees into linear training data.
+* **Hybrid Data Construction**: Automatically falling back to VLM-based CoT generation for samples without search trees.
+* **Spatial Reasoning Enhancement**: Supporting the generation of spatial reasoning chains containing explicit coordinates (Bounding Boxes).
+
+The main process of the pipeline includes:
+
+1. **MCTS Tree Parsing**: Parsing the search tree structure in the input data to extract successful reasoning paths.
+2. **Visual Reasoning Generation (Fallback)**: Using a VLM to regenerate reasoning chains for samples where the tree structure is missing or parsing fails.
+3. **Data Standardization**: Outputting reasoning chain data in a unified format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_mcts_reasoning
+cd run_mcts_reasoning
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `vision_mcts_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the input file (jsonl) contains the `tree` field (for extraction) or just `question`/`image` (for generation).
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python vision_mcts_pipeline.py \
+  --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \
+  --input_file "data/mcts_trajectories.jsonl" \
+  --prompt_type "spatial"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input data typically comes from MCTS search logs or unlabelled image-text pairs:
+
+* **image**: Path to the image.
+* **question**: Visual question.
+* **tree** (optional): JSON structure of the MCTS search tree, containing node values, visit counts, and actions.
+
+**Input Data Example**:
+
+```json
+{
+  "image": "./images/puzzle.jpg",
+  "question": "What is the next step to solve this?",
+  "tree": { "root": { "children": [...], "value": 1.0, "text": "Step 1..." } }
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+The pipeline employs an **"Extract First, Fallback to Generate"** hybrid strategy:
+
+#### A. **MCTSTreeRefiner**
+
+This operator is responsible for processing the `tree` field. It traverses the tree structure and filters for the best paths from root to leaf based on node Q-values.
+
+* **Input**: `tree` object.
+* **Functionality**: Linearizes tree paths, filtering out low-value or incomplete search branches.
+* **Output**: List of extracted reasoning chains (`mcts_chains`).
+
+#### B. **VisualReasoningGenerator**
+
+This operator is the "Generation Engine" of the pipeline. It takes the extraction results from the previous step as input.
+
+* **Mechanism**: Checks `input_existing_chains_key` (i.e., `mcts_chains`); see the sketch after this list.
+  * If MCTS parsing was successful (chains exist), it reuses them directly without running inference (saving compute).
+  * If MCTS chains are empty (tree missing or parsing failed), it calls the VLM to generate reasoning chains from scratch based on the `prompt_type`.
+* **Prompt Type**: Supports modes like `spatial` (spatial coordinate reasoning), `logical` (logical reasoning), etc.
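+
+A minimal sketch of that per-sample decision (with `generate_with_vlm` as a hypothetical stand-in for the VLM call, not the operator's actual internals):
+
+```python
+from typing import Callable, List, Optional
+
+def resolve_chains(
+    existing_chains: Optional[List[str]],
+    question: str,
+    image: str,
+    generate_with_vlm: Callable[[str, str], List[str]],
+) -> List[str]:
+    # Reuse chains recovered from the MCTS tree when present (zero inference cost)
+    if existing_chains:
+        return existing_chains
+    # Otherwise fall back to generating reasoning chains with the VLM
+    return generate_with_vlm(question, image)
+```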
+
+### 3. **Output Data**
+
+The final output data (`final_reasoning_chains`) will contain high-quality Chain-of-Thought data ready for SFT training.
+
+**Output Example**:
+
+```json
+{
+  "image": "./images/puzzle.jpg",
+  "final_reasoning_chains": [
+    "First, locate the red block at [100, 200]. To solve the puzzle, it needs to move right...Move Red Block"
+  ]
+}
+
+```
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `VisionMCTSReasoningPipeline` code implementation.
+
+```python
+import argparse
+from dataflow.utils.storage import FileStorage
+from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm
+
+# Import the atomic operators
+from dataflow.operators.core_text import MCTSTreeRefiner
+from dataflow.operators.core_vision import VisualReasoningGenerator
+
+class VisionMCTSReasoningPipeline:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        # Storage
+        first_entry_file: str,
+        cache_path: str = "./cache_mcts",
+        file_name_prefix: str = "mcts_reason",
+        # Config
+        prompt_type: str = "spatial",
+        max_samples_per_file: int = 10000,
+        # Keys
+        input_question_key: str = "question",
+        input_image_key: str = "image",
+        input_tree_key: str = "tree",
+        output_key: str = "final_reasoning_chains",
+        # VLLM
+        vllm_max_tokens: int = 1024
+    ):
+        # 1. Storage initialization
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type="jsonl"
+        )
+
+        # 2. Model serving
+        self.serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_max_tokens=vllm_max_tokens
+        )
+
+        self.keys = {
+            "q": input_question_key,
+            "img": input_image_key,
+            "tree": input_tree_key,
+            "mcts_chains": "mcts_extracted_chains",  # intermediate result
+            "final": output_key
+        }
+
+        # ================== Operators ==================
+
+        # Operator 1: MCTS tree -> chains (extractor)
+        # Flattens the tree structure into linear chains
+        self.op_mcts_refine = MCTSTreeRefiner(
+            max_chains_per_sample=max_samples_per_file
+        )
+
+        # Operator 2: VLM -> chains (generator / fallback)
+        # If MCTS extraction fails, generate with the VLM; if it succeeds, skip generation
+        self.op_vlm_gen = VisualReasoningGenerator(
+            serving=self.serving,
+            prompt_type=prompt_type
+        )
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Extracting Chains from MCTS Trees...")
+        self.op_mcts_refine.run(
+            self.storage.step(),
+            input_tree_key=self.keys["tree"],
+            output_key=self.keys["mcts_chains"]
+        )
+
+        print(">>> [Pipeline] Step 2: Generating Chains via VLM (Fallback)...")
+        # Note: input_existing_chains_key implements the hybrid/fallback logic
+        self.op_vlm_gen.run(
+            self.storage.step(),
+            input_question_key=self.keys["q"],
+            input_image_key=self.keys["img"],
+            input_existing_chains_key=self.keys["mcts_chains"],
+            output_key=self.keys["final"]
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file", default="dataflow/example/image_to_text_pipeline/mct_reasoning.jsonl")
+    parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct")
+    parser.add_argument("--prompt_type", default="spatial")
+    args = parser.parse_args()
+
+    pipe = VisionMCTSReasoningPipeline(
+        model_path=args.model_path,
+        first_entry_file=args.input_file,
+        prompt_type=args.prompt_type
+    )
+    pipe.forward()
+
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md b/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
new file mode 100644
index 00000000..bbe78d17
--- /dev/null
+++ b/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md
@@ -0,0 +1,288 @@
+---
+title: Multi-Role Video QA Pipeline
+createTime: 2026/01/11 22:15:28
+icon: mdi:image-text
+permalink: /en/mm_guide/multirole_videoqa_pipeline/
+---
+## 1. Overview
+
+The **Multi-Role Video QA Pipeline** leverages Multimodal Large Models (VLMs) and a Multi-Agent collaboration mechanism to automatically generate high-quality, deep Question-Answer (QA) pairs from long videos or advertising footage.
+
+Unlike standard single-pass generation, this pipeline introduces a **Multi-Agent Iterative Refinement** phase. It first generates initial QA pairs, then refines them through multiple rounds of interaction that simulate different agent roles (e.g., Questioner, Checker, Polisher), and finally outputs logically consistent, accurate QA data.
+
+We support the following application scenarios:
+
+* **Ad Video Understanding**: Extracting key selling points, emotional tone, and narrative logic from ads.
+* **Complex Video Reasoning**: Constructing deep QA datasets requiring reasoning across different time segments.
+* **Long Video Summarization & QA**: Handling video data containing rich Metadata (`Meta`) and multiple Clips (`Clips`).
+
+The main process of the pipeline includes:
+
+1. **Initial Generation**: Generates baseline QA pairs based on video metadata and clips.
+2. **Multi-Agent Refinement**: Critiques, corrects, and optimizes QA pairs through multiple iterations (default 3 rounds).
+3. **Final Generation**: Cleans the data and outputs the final QA set in a standard format.
+
+---
+
+## 2. Quick Start
+
+### Step 1: Create a Working Directory
+
+```bash
+mkdir run_video_qa
+cd run_video_qa
+
+```
+
+### Step 2: Prepare the Script
+
+Save the code in the "Pipeline Example" section below as `multirole_videoqa_pipeline.py`.
+
+### Step 3: Configure Parameters
+
+Ensure the input data contains `Meta` and `Clips` fields.
+
+```bash
+# Install dependencies
+pip install open-dataflow vllm
+
+```
+
+### Step 4: Run
+
+```bash
+python multirole_videoqa_pipeline.py \
+  --model_path "/path/to/Qwen2.5-VL-7B-Instruct" \
+  --images_file "data/adsQA.jsonl" \
+  --card_id "0"
+
+```
+
+---
+
+## 3. Data Flow & Logic
+
+### 1. **Input Data**
+
+Input data is typically pre-processed video data containing global metadata and segment information:
+
+* **Meta**: Global description, title, or background info of the video.
+* **Clips**: List of video clips, where each clip contains audio text, frame image paths, and clip descriptions.
+
+**Input Data Example**:
+
+```json
+{
+  "Meta": "A commercial for a new sports car featuring dynamic driving scenes.",
+  "Clips": [
+    {
+      "Audio_Text": "Experience the speed.",
+      "Frames_Images": ["./frames/001.jpg", "./frames/002.jpg"],
+      "Description": "Car accelerating on a highway."
+    },
+    {
+      "Audio_Text": "Safety meets luxury.",
+      "Frames_Images": ["./frames/003.jpg"],
+      "Description": "Interior shot showing leather seats."
+    }
+  ]
+}
+
+```
+
+### 2. **Core Operator Logic**
+
+This pipeline executes through a chain of three specialized operators:
+
+#### A. **MultiroleVideoQAInitialGenerator**
+
+* **Function**: Acts as the "Draft Author", reading `Meta` and `Clips` to generate the first version of QA pairs using the VLM.
+* **Output**: A DataFrame containing preliminary QAs.
+
+#### B. **MultiroleVideoQAMultiAgentGenerator**
+
+* **Function**: Acts as the "Editorial Team", polishing the draft.
+* **Mechanism**: Configured with `max_iterations` (e.g., 3 rounds). During these rounds, the model may simulate different roles (e.g., a reviewer pointing out errors, a polisher improving wording) to progressively enhance QA quality; see the sketch after this list.
+* **Input**: Initial DataFrame.
+* **Output**: Intermediate DataFrame after multiple rounds of correction.
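+
+Conceptually, the refinement stage behaves like the following critique-and-revise loop (a simplified sketch; the role prompts and the `ask` callable are illustrative stand-ins, not the operator's actual internals):
+
+```python
+from typing import Callable
+
+def refine_qa(draft: str, ask: Callable[[str], str], max_iterations: int = 3) -> str:
+    """Alternate a reviewer role that critiques the draft QA pairs
+    and a polisher role that rewrites them."""
+    for _ in range(max_iterations):
+        critique = ask(f"[Reviewer] List factual or logical problems in these QA pairs:\n{draft}")
+        if "no problems" in critique.lower():
+            break  # stop early once the reviewer is satisfied
+        draft = ask(f"[Polisher] Rewrite the QA pairs to fix the problems below:\n{critique}\n---\n{draft}")
+    return draft
+```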
+#### C. **MultiroleVideoQAFinalGenerator**
+
+* **Function**: Acts as the "Publisher", responsible for final formatting and cleaning.
+* **Output**: Standardized `QA` list.
+
+### 3. **Output Data**
+
+The output data adds a high-quality QA list to the original fields:
+
+* **QA**: List of generated QA pairs, including labels (e.g., question type), question text, and answer text.
+
+**Output Data Example**:
+
+```json
+{
+  "Meta": "...",
+  "Clips": [...],
+  "QA": [
+    {
+      "Label": "Feature Extraction",
+      "Question": "What specific features of the car are highlighted in the interior shots?",
+      "Answer": "The video highlights the luxury leather seats and the advanced dashboard interface."
+    },
+    {
+      "Label": "Narrative Analysis",
+      "Question": "How does the audio complement the visual transition?",
+      "Answer": "The narration 'Experience speed' coincides with the acceleration scene, reinforcing the dynamic visual."
+    }
+  ]
+}
+
+```
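+
+Once the pipeline finishes, the cached jsonl can be flattened into chat-style SFT samples. A minimal sketch (the input path is a placeholder for whichever final cache file your run produces):
+
+```python
+import json
+
+sft_samples = []
+with open("qa_output.jsonl") as f:  # placeholder: point this at your final cache file
+    for line in f:
+        row = json.loads(line)
+        for qa in row.get("QA", []):
+            sft_samples.append({
+                "messages": [
+                    {"role": "user", "content": qa["Question"]},
+                    {"role": "assistant", "content": qa["Answer"]},
+                ]
+            })
+print(f"Built {len(sft_samples)} SFT samples")
+```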
+
+---
+
+## 4. Pipeline Example
+
+Below is the complete `MultiRoleVideoQAPipeline` code implementation.
+
+```python
+import argparse
+import os
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import (
+    MultiroleVideoQAInitialGenerator,
+    MultiroleVideoQAMultiAgentGenerator,
+    MultiroleVideoQAFinalGenerator
+)
+
+try:
+    import torch
+    # Set the multiprocessing start method to spawn to avoid CUDA initialization conflicts
+    if 'spawn' not in torch.multiprocessing.get_all_start_methods():
+        torch.multiprocessing.set_start_method('spawn', force=True)
+except ImportError:
+    pass
+
+
+class MultiRoleVideoQAPipeline():
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        hf_cache_dir: str | None = None,
+        download_dir: str = "./ckpt",
+        first_entry_file: str = "./dataflow/example/ads_QA/adsQA.jsonl",
+        cache_path: str = "./cache_local",
+        file_name_prefix: str = "dataflow_cache_step",
+        cache_type: str = "jsonl",
+        # Keys Configuration
+        Meta_key: str = "Meta",
+        clips_key: str = "Clips",
+        output_key: str = "QA"
+    ):
+        # 1. Storage initialization
+        self.storage = FileStorage(
+            first_entry_file_name=first_entry_file,
+            cache_path=cache_path,
+            file_name_prefix=file_name_prefix,
+            cache_type=cache_type,
+        )
+
+        # Force vLLM's multiprocessing method
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "spawn"
+
+        # 2. VLM serving initialization
+        self.llm_serving = LocalModelVLMServing_vllm(
+            hf_model_name_or_path=model_path,
+            hf_cache_dir=hf_cache_dir,
+            hf_local_dir=download_dir,
+            vllm_tensor_parallel_size=1,
+            vllm_temperature=0.7,
+            vllm_top_p=0.9,
+            vllm_max_tokens=6000,  # Video QA usually needs a long context
+        )
+
+        # 3. Initialize the operator chain
+        # Stage 1: initial generation
+        self.initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving=self.llm_serving)
+
+        # Stage 2: multi-agent iterative refinement (the key differentiator)
+        self.multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(
+            llm_serving=self.llm_serving,
+            max_iterations=3
+        )
+
+        # Stage 3: final formatting
+        self.final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving=self.llm_serving)
+
+        self.input_meta_key = Meta_key
+        self.input_clips_key = clips_key
+        self.output_key = output_key
+
+    def forward(self):
+        print(">>> [Pipeline] Step 1: Initial QA Generation...")
+        init_df = self.initial_QA_generation.run(
+            storage=self.storage.step(),
+            input_meta_key=self.input_meta_key,
+            input_clips_key=self.input_clips_key,
+            output_key=self.output_key
+        )
+
+        print(">>> [Pipeline] Step 2: Multi-Agent Refinement (3 iterations)...")
+        # Note: this operator takes the previous stage's DataFrame (init_df) as input
+        middle_df = self.multiAgent_QA_generation.run(
+            df=init_df,
+            input_meta_key=self.input_meta_key,
+            input_clips_key=self.input_clips_key,
+            output_key=self.output_key
+        )
+
+        print(">>> [Pipeline] Step 3: Finalizing QA Pairs...")
+        self.final_QA_generation.run(
+            storage=self.storage,
+            df=middle_df,
+            input_meta_key=self.input_meta_key,
+            input_clips_key=self.input_clips_key,
+            output_key=self.output_key
+        )
+        print(">>> [Pipeline] Done.")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Batch video QA generation with DataFlow (Single GPU)")
+
+    parser.add_argument("--model_path", default="../../Models/Qwen2.5-VL-7B-Instruct",
+                        help="Path to the local model or HuggingFace repo ID.")
+    parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface",
+                        help="HuggingFace cache directory.")
+    parser.add_argument("--download_dir", default="./ckpt",
+                        help="Local directory for downloading models.")
+
+    parser.add_argument("--card_id", type=str, default="0",
+                        help="The single CUDA device ID to use (e.g., '0' or '1').")
+
+    parser.add_argument("--images_file", default="./dataflow/example/ads_QA/adsQA.jsonl",
+                        help="Path to the first entry file for DataFlow.")
+    parser.add_argument("--cache_path", default="./cache_local",
+                        help="Directory for caching DataFlow steps.")
+    parser.add_argument("--file_name_prefix", default="caption",
+                        help="Prefix for cache file names.")
+    parser.add_argument("--cache_type", default="jsonl",
+                        help="Type of cache file (e.g., jsonl).")
+
+    args = parser.parse_args()
+
+    os.environ['CUDA_VISIBLE_DEVICES'] = args.card_id.replace(' ', '')
+
+    pipe = MultiRoleVideoQAPipeline(
+        model_path=args.model_path,
+        hf_cache_dir=args.hf_cache_dir,
+        download_dir=args.download_dir,
+        first_entry_file=args.images_file,
+        cache_path=args.cache_path,
+        file_name_prefix=args.file_name_prefix,
+        cache_type=args.cache_type,
+    )
+    pipe.forward()
+
+```
diff --git a/docs/en/notes/mm_operators/image_understanding/eval/clip_image_text_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/clip_image_text_evaluator.md
deleted file mode 100644
index 6a998fef..00000000
--- a/docs/en/notes/mm_operators/image_understanding/eval/clip_image_text_evaluator.md
+++ /dev/null
@@ -1,107 +0,0 @@
----
-title: clip_image_text_evaluator
-createTime: 2025/10/15 19:56:33
-icon: material-symbols-light:image
-permalink: /en/mm_operators/eval/clip_image_text_evaluator/
----
-## 📘 Overview
-`CLIPEvaluator` computes the **image-text alignment score** using **CLIP**, with scores ranging from `[0,1]`.
-Internally, it encodes both the image and text using the CLIP model, performs L2 normalization, -then calculates cosine similarity and linearly maps it to `[0,1]` via `(cos + 1) / 2`. - - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32", - device: str = None -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32"` | Local path or Hugging Face Model ID for the CLIP model. Loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). | -| `device` | `str \| None` | `None` | The inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image_path", - text_key: str = "text", - output_key: str = "clip_score" -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | `"image_path"` | The column name containing the image path. | -| `text_key` | `str` | `"text"` | The column name containing the text input. | -| `output_key` | `str` | `"clip_score"` | The column name for storing the output alignment score (range `[0,1]`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import CLIPEvaluator - -# 1) Prepare FileStorage (must contain image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/clip_input.jsonl", - cache_path="./cache_local", - file_name_prefix="clip_eval", - cache_type="jsonl" -) - -# 2) Initialize the operator (can also use HF model ID, e.g. "openai/clip-vit-base-patch32") -evaluator = CLIPEvaluator( - model_name="/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32", - device=None # automatically chooses cuda/cpu -) - -# 3) Execute evaluation -cols = evaluator.run( - storage=storage.step(), - image_key="image_path", - text_key="text", - output_key="clip_score" -) -print(cols) # ["clip_score"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The input image path. | -| `text` / specified `text_key` | `string` | The input text. | -| `clip_score` / specified `output_key` | `float` | The image-text alignment score (range `[0,1]`). | - - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car." 
-} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car.", - "clip_score": 0.642 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/image_clip_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/image_clip_evaluator.md new file mode 100644 index 00000000..4476615c --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/eval/image_clip_evaluator.md @@ -0,0 +1,108 @@ +--- +title: ImageCLIPEvaluator +createTime: 2025/10/15 19:56:33 +# icon: material-symbols-light:image +permalink: /en/mm_operators/eval/image_clip_evaluator/ +--- +## 📘 Overview +`ImageCLIPEvaluator` computes an **image–text alignment score** based on **CLIP**, with scores ranging in `[0, 1]`. +Internally, it encodes the image and the text with CLIP → normalizes the embeddings → computes cosine similarity and linearly maps it to `[0, 1]` via `(cos + 1) / 2`. + + + + +## ```__init__``` +```python +def __init__( + self, + model_name: str = "openai/clip-vit-base-patch32", + device: str = None +): + ... +``` + + +## `init` Parameters +| Parameter | Type | Default | Description | +| :----------- | :------------ | :------------------------------- | :---------- | +| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | Local path or Hugging Face Model ID of the CLIP model; loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). | +| `device` | `str \| None` | `None` | Inference device; when `None`, the operator automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text", + output_key: str = "clip_score" +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :-------------- | :---------- | +| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | +| `input_image_key`| `str` | `"image_path"` | Column name of the input image path. | +| `input_text_key` | `str` | `"text"` | Column name of the input text. | +| `output_key` | `str` | `"clip_score"` | Column name for the output alignment score (range `[0, 1]`). | + + + +## 🧠 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.core_vision import ImageCLIPEvaluator + +# 1) Prepare FileStorage (must contain at least image_path and text columns) +storage = FileStorage( + first_entry_file_name="./dataflow/example/test_image_eval/test_image_eval.jsonl", + cache_path="./cache_local", + file_name_prefix="clip_eval", + cache_type="jsonl" +) + +# 2) Initialize the operator (can also use an HF model ID such as "openai/clip-vit-base-patch32") +evaluator = ImageCLIPEvaluator( + model_name="openai/clip-vit-base-patch32", + device=None # automatically selects cuda/cpu +) + +# 3) Run evaluation +cols = evaluator.run( + storage=storage.step(), + input_image_key="image_path", + input_text_key="text", + output_key="clip_score" +) +print(cols) # ["clip_score"] +``` + +### 🧾 Default Output Format +| Field name | Type | Default | Description | +| :------------------------------------------- | :------- | :------ | :---------- | +| `image_path` (or column given by `input_image_key`) | `string` | — | Input image path. | +| `text` (or column given by `input_text_key`) | `string` | — | Input text. 
| +| `clip_score` (or `output_key`) | `float` | — | Image–text alignment score in the range `[0, 1]`. | + + + +Example Input: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car." +} +``` + +Example Output: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car.", + "clip_score": 0.642 +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/image_longclip_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/image_longclip_evaluator.md new file mode 100644 index 00000000..1f38a452 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/eval/image_longclip_evaluator.md @@ -0,0 +1,114 @@ +--- +title: ImageLongCLIPEvaluator +createTime: 2025/10/15 14:30:52 +# icon: material-symbols-light:image +permalink: /en/mm_operators/eval/image_longclip_evaluator/ +--- +## 📘 Overview + +`ImageLongCLIPEvaluator` computes an **alignment score between images and long-form text** using **LongCLIP**, with scores in the range `[0, 1]`. +Compared with standard CLIP, LongCLIP supports substantially longer textual context (in this implementation, the default is `context_length = 248`), which makes it suitable for paragraph-level description matching and alignment evaluation. + +The internal pipeline is analogous to CLIP: it encodes the image and the text separately, applies L2 normalization to both embeddings, computes cosine similarity, and then maps the similarity to the interval `[0, 1]` via `(cos + 1) / 2`. + + + + +## ```__init__``` +```python +def __init__( + self, + model_name: str = "BeichenZhang/LongCLIP-L-336px", + device: str = None, +): + ... +``` + + +## `__init__` Parameters +| Parameter | Type | Default | Description | +| :----------- | :------------ | :------------------------------ | :---------- | +| `model_name` | `str` | `"BeichenZhang/LongCLIP-L-336px"` | LongCLIP checkpoint spec. If it is a **directory path**, the operator will search for files ending with `.pt`, `.bin` or `.ckpt` and automatically pick one checkpoint to load; if it is a **file path**, that file is used directly as the checkpoint. | +| `device` | `str \| None` | `None` | Inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | + + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text", + output_key: str = "longclip_score", +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :---------------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the DataFrame inside the operator. | +| `input_image_key`| `str` | `"image_path"` | Name of the input image column corresponding to file paths in the DataFrame. | +| `input_text_key` | `str` | `"text"` | Name of the input long-text column corresponding to paragraph-level descriptions in the DataFrame. | +| `output_key` | `str` | `"longclip_score"`| Name of the output score column (range `[0, 1]`) used to store LongCLIP image–text alignment scores. 
|
+
+
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageLongCLIPEvaluator
+
+# 1) Prepare FileStorage (must contain at least image_path and text columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_eval/test_image_eval.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="longclip_eval",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the LongCLIP evaluator (model_name can be a directory or a specific checkpoint file)
+evaluator = ImageLongCLIPEvaluator(
+    model_name="BeichenZhang/LongCLIP-L-336px",
+    device=None  # automatically selects cuda/cpu
+)
+
+# 3) Run evaluation: adds longclip_score ∈ [0, 1] for each row
+cols = evaluator.run(
+    storage=storage.step(),
+    input_image_key="image_path",
+    input_text_key="text",
+    output_key="longclip_score"
+)
+print(cols)  # ["longclip_score"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :------------------------------------------- | :------- | :------ | :---------- |
+| `image_path` (or the column given by `input_image_key`) | `string` | — | Input image path. |
+| `text` (or the column given by `input_text_key`) | `string` | — | Input long-text description. |
+| `longclip_score` (or `output_key`) | `float` | — | Long-text image–text alignment score in the range `[0, 1]`. |
+
+
+
+
+Example Input:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "The image shows a man and a woman in what appears to be a car."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "The image shows a man and a woman in what appears to be a car.",
+    "longclip_score": 0.642
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/eval/image_vqa_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/image_vqa_evaluator.md
new file mode 100644
index 00000000..59f82843
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/eval/image_vqa_evaluator.md
@@ -0,0 +1,111 @@
+---
+title: ImageVQAScoreEvaluator
+createTime: 2025/10/15 14:52:29
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/eval/image_vqa_evaluator/
+---
+## 📘 Overview
+`ImageVQAScoreEvaluator` leverages a **BLIP visual question answering (VQA) model** to compute a **Yes-probability score** that quantifies whether an image is aligned with a given textual description, with values in the interval `[0, 1]`.
+The core idea is as follows: the textual description is wrapped into an English interrogative prompt of the form *“Does this image match the description?”*, using `"yes"` and `"no"` as candidate answers.
+The model is then queried twice with `"yes"` and `"no"` as labels, and their respective losses are converted into relative probabilities. The normalized probability assigned to `"yes"` is taken as the image–text consistency score.
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "Salesforce/blip-vqa-base",
+    device: str = None,
+    local_only: bool = True,
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :----------- | :------------ | :---------------------------- | :---------- |
+| `model_name` | `str` | `"Salesforce/blip-vqa-base"` | Hugging Face Model ID or local path of the BLIP VQA model; loaded via `BlipProcessor` / `BlipForQuestionAnswering`. |
+| `device` | `str \| None` | `None` | Inference device.
When `None`, the operator automatically selects `"cuda"` if available; otherwise it falls back to `"cpu"`. | +| `local_only` | `bool` | `True` | Whether to load model weights strictly from local files. When `True`, the model is loaded with `local_files_only=True` (recommended for offline or restricted-network environments). | + + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text", + output_key: str = "vqa_score" +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :-------------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading and writing the DataFrame. | +| `input_image_key`| `str` | `"image_path"` | Name of the column containing image paths. | +| `input_text_key` | `str` | `"text"` | Name of the column containing textual descriptions (which will be wrapped into English questions). | +| `output_key` | `str` | `"vqa_score"` | Name of the output field storing the VQA score (range `[0, 1]`), representing the model’s probability that “the image matches the description”. | + + + +## 🧠 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.core_vision import ImageVQAScoreEvaluator + +# 1) Prepare FileStorage (must contain at least image_path and text columns) +storage = FileStorage( + first_entry_file_name="./dataflow/example/test_image_eval/test_image_eval.jsonl", + cache_path="./cache_local", + file_name_prefix="vqa_eval", + cache_type="jsonl" +) + +# 2) Initialize the VQA-based evaluator (can be pointed to a local model path) +evaluator = ImageVQAScoreEvaluator( + model_name="Salesforce/blip-vqa-base", + device=None, # automatically selects cuda/cpu + local_only=True # load from local files only (recommended offline) +) + +# 3) Run evaluation: adds vqa_score ∈ [0, 1] for each row +cols = evaluator.run( + storage=storage.step(), + input_image_key="image_path", + input_text_key="text", + output_key="vqa_score" +) +print(cols) # ["vqa_score"] +``` + +### 🧾 Default Output Format +| Field name | Type | Default | Description | +| :--------------------------------------------- | :------- | :------ | :---------- | +| `image_path` (or the column given by `input_image_key`) | `string` | — | Input image path. | +| `text` (or the column given by `input_text_key`) | `string` | — | Input textual description. | +| `vqa_score` (or `output_key`) | `float` | — | Yes-probability produced by BLIP VQA for the question “Does this image match the description?”, in the range `[0, 1]`. | + + + +Example Input: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car." 
+} +``` + +Example Output: +```jsonl +{ + "image_path": "1.png", + "text": "The image shows a man and a woman in what appears to be a car.", + "vqa_score": 0.774 +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/longclip_image_text_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/longclip_image_text_evaluator.md deleted file mode 100644 index d554083f..00000000 --- a/docs/en/notes/mm_operators/image_understanding/eval/longclip_image_text_evaluator.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -title: longclip_image_text_evaluator -createTime: 2025/10/15 19:56:29 -icon: material-symbols-light:image -permalink: /en/mm_operators/eval/longclip_image_text_evaluator/ ---- -## 📘 Overview -`LongCLIPEvaluator` computes **image–long-text alignment scores** using **LongCLIP**, producing scores in the range `[0,1]`. -Compared to the standard CLIP model, LongCLIP supports longer text contexts (default `context_length=248`), -making it ideal for paragraph-level description evaluation and alignment tasks. - - - -## ```__init__``` -```python -def __init__( - self, - ckpt_path: str = "/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt", - device: str = None, -): - ... -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"/data0/happykeyan/workspace/ckpt/clip-vit-base-patch32"` | Local path or Hugging Face Model ID for the CLIP model. Loaded via `CLIPProcessor` / `CLIPModel` (`use_safetensors=True`). | -| `device` | `str \| None` | `None` | The inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image_path", - text_key: str = "text", - output_key: str = "clip_score" -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | `"image_path"` | The column name containing the image path. | -| `text_key` | `str` | `"text"` | The column name containing the text input. | -| `output_key` | `str` | `"clip_score"` | The column name for storing the output alignment score (range `[0,1]`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import LongCLIPEvaluator - -# 1) Prepare FileStorage (must include image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/longclip_input.jsonl", - cache_path="./cache_local", - file_name_prefix="longclip_eval", - cache_type="jsonl" -) - -# 2) Initialize LongCLIP evaluator (replace ckpt_path with your checkpoint) -evaluator = LongCLIPEvaluator( - ckpt_path="/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/LongCLIP-L-336px/longclip-L@336px.pt", - device=None # auto-selects cuda/cpu -) - -# 3) Run evaluation — adds a new column 'longclip_score' ∈ [0,1] -cols = evaluator.run( - storage=storage.step(), - image_key="image_path", - text_key="text", - output_key="longclip_score" -) -print(cols) # ["longclip_score"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The input image path. | -| `text` / specified `text_key` | `string` | The input text. 
| -| `clip_score` / specified `output_key` | `float` | The image-text alignment score (range `[0,1]`). | - - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car.", - "clip_score": 0.642 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/eval/vqa_score_image_text_evaluator.md b/docs/en/notes/mm_operators/image_understanding/eval/vqa_score_image_text_evaluator.md deleted file mode 100644 index a88ba543..00000000 --- a/docs/en/notes/mm_operators/image_understanding/eval/vqa_score_image_text_evaluator.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -title: vqa_score_image_text_evaluator -createTime: 2025/10/15 19:56:31 -icon: material-symbols-light:image -permalink: /en/mm_operators/eval/vqa_score_image_text_evaluator/ ---- -## 📘 Overview -`VQAScoreEvaluator` uses **BLIP VQA** to transform the question *“Does this image match the description?”* into a **Yes/No probability score** ranging from `[0,1]`. -- Constructed question: `Does this image match the description: {text}? Answer yes or no.` -- Forward pass twice with labels `"yes"` and `"no"` to obtain respective losses `L_yes` and `L_no`. -- Compute normalized probabilities. - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base", - device: str = None, - local_only: bool = True, -): - ... -``` - - -## `__init__` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base"` | Path or Hugging Face Model ID of the BLIP VQA model. Loaded via `BlipProcessor` and `BlipForQuestionAnswering`. | -| `device` | `str \| None` | `None` | Inference device. Automatically selects `"cuda"` if available, otherwise falls back to `"cpu"`. | -| `local_only` | `bool` | `True` | If `True`, load model weights only from local files (`local_files_only=True`). | - - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image_path", - text_key: str = "text", - output_key: str = "clip_score" -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing data. | -| `image_key` | `str` | `"image_path"` | Column name containing the input image path. | -| `text_key` | `str` | `"text"` | Column name containing the text description. | -| `output_key` | `str` | `"vqa_score"` | Column name where the computed Yes-probability score (range `[0,1]`) is stored. 
| - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import VQAScoreEvaluator - -# 1) Prepare FileStorage (must include image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/vqa_input.jsonl", - cache_path="./cache_local", - file_name_prefix="vqa_score", - cache_type="jsonl" -) - -# 2) Initialize the evaluator (you can replace with your own model path or HF model ID) -evaluator = VQAScoreEvaluator( - model_name="/data0/happykeyan/DataFlow-MM/Dataflow-MM-Preview/ckpt/blip-vqa-base", - device=None, # auto-select cuda/cpu - local_only=True # load from local weights only -) - -# 3) Run evaluation (adds a column 'vqa_score') -cols = evaluator.run( - storage=storage.step(), - image_key="image_path", - text_key="text", - output_key="vqa_score" -) -print(cols) # ["vqa_score"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The input image path. | -| `text` / specified `text_key` | `string` | The input text description. | -| `vqa_score` / specified `output_key` | `float` | The BLIP-predicted probability that the image matches the text (Yes probability, range `[0,1]`). | - - - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "The image shows a man and a woman in what appears to be a car.", - "vqa_score": 0.774 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/cat_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/cat_filter.md deleted file mode 100644 index c1e542f5..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/cat_filter.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -title: cat_filter -createTime: 2025/10/15 19:56:39 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/cat_filter/ ---- -## 📘 Overview -`CatFilter` (Caption-as-Teacher) is a **joint filtering operator** based on **textual complexity** and **OCR overlap rate**. -For each `caption`, it performs the following operations: -1. Uses **Stanza** for dependency parsing to extract subject-verb-object triples and assess **semantic complexity**. -2. Requires the sentence to **contain at least one verb** (actional requirement). -3. Applies **OCR** on the associated image and computes the **Jaccard overlap** between OCR text and `caption`; captions with excessive overlap are considered direct OCR copies and are filtered out. - -A sample is retained only if it meets **all three conditions**. - - -## ```__init__``` -```python -def __init__( - self, - min_triples: int = 2, - ocr_overlap_threshold: float = 0.2 -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `min_triples` | `int` | `2` | The minimum number of dependency triples (subject-verb-object or argument structures) required to determine sufficient caption complexity. | -| `ocr_overlap_threshold` | `float` | `0.2` | The maximum allowed Jaccard overlap between OCR text and caption. If overlap **≥** this threshold, the sample is considered an OCR copy and is filtered out. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str, - caption_key: str -): - ... 
-``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing. | -| `image_key` | `str` | — | The column name for image paths (e.g., `"image_path"`). | -| `caption_key` | `str` | — | The column name for caption text (e.g., `"caption"`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import CatFilter - -# 1) Prepare FileStorage (must contain image_path and caption columns) -storage = FileStorage( - first_entry_file_name="data/cat_input.jsonl", - cache_path="./cache_local", - file_name_prefix="cat_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator -filt = CatFilter( - min_triples=2, # minimum complexity threshold - ocr_overlap_threshold=0.2 # maximum allowed OCR overlap -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - image_key="image_path", - caption_key="caption" -) -print(cols) # ["image_path", "caption"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` | `string` | The retained image path after filtering. | -| `caption` | `string` | The retained caption text that meets all three conditions: complexity ≥ `min_triples`, contains verbs, and OCR overlap < `ocr_overlap_threshold`. | - - -Example Input: -```jsonl - "caption": "SALE SALE SALE 50% OFF" - -{ - "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "caption": "A bride smiles while the groom points ahead inside a car, their hands resting together on the seat." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/clip_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/clip_filter.md deleted file mode 100644 index 2b7d2a01..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/clip_filter.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: clip_filter -createTime: 2025/10/15 19:56:41 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/clip_filter/ ---- -## 📘 Overview -`ClipFilter` is a **consistency filtering operator** based on **CLIP similarity**. -For each sample, it computes the cosine similarity between the normalized image and text embeddings (mapped to the `[0,1]` range). -Samples with similarity **≥ `threshold`** are retained, while others are filtered out. - - - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "../ckpt/clip-vit-base-patch32", - device: str = None -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"../ckpt/clip-vit-base-patch32"` | The local path or Hugging Face Model ID for the CLIP model. Internally loaded using `CLIPProcessor` and `CLIPModel` (`use_safetensors=True`, `weights_only=False`). | -| `device` | `str \| None` | `None` | The inference device; if `None`, automatically selects `"cuda"` when available, otherwise falls back to `"cpu"`. | - - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str = "image", - caption_key: str = "caption", - threshold: float = 0.25 -): - ... 
-``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | `"image"` | The column name containing the image path. | -| `caption_key` | `str` | `"caption"` | The column name containing the text description. | -| `threshold` | `float` | `0.25` | The minimum CLIP similarity required to retain a sample; samples below this value will be filtered out. | - - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import ClipFilter - -# 1) Prepare FileStorage (must contain "image" and "caption" columns) -storage = FileStorage( - first_entry_file_name="data/clip_filter_input.jsonl", - cache_path="./cache_local", - file_name_prefix="clip_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator (can use local or Hugging Face model) -flt = ClipFilter( - model_name="../ckpt/clip-vit-base-patch32", # or "openai/clip-vit-base-patch32" - device=None # auto-select cuda/cpu -) - -# 3) Execute filtering (retains only samples with similarity ≥ 0.25) -cols = flt.run( - storage=storage.step(), - image_key="image", - caption_key="caption", - threshold=0.25 -) -print(cols) # ["image", "caption"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image` | `string` | The retained image path after filtering. | -| `caption` | `string` | The retained caption text whose image-text similarity is **≥ `threshold`**. | - -Example Input: -```jsonl -{ - "image": "1.png", - "caption": "A bride and groom smiling in a car." -} -{ - "image": "2.jpg", - "caption": "A red bus driving across a snowy mountain road at night." -} -``` - -Example Output: -```jsonl -{ - "image": "1.png", - "caption": "A bride and groom smiling in a car." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/complexity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/complexity_filter.md deleted file mode 100644 index 77e25b6d..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/complexity_filter.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -title: complexity_filter -createTime: 2025/10/15 19:56:44 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/complexity_filter/ ---- -## 📘 Overview -`ComplexityFilter` is a **text filtering operator** based on **Natural Language Inference (NLI)**. It evaluates whether a caption covers multiple visual capability elements (such as color, shape, action recognition, counting, and spatial relationships), thereby determining its **capability richness**. -For each caption, the operator constructs hypothesis sentences (template: `"The following text describes {}."`) and uses an MNLI model to calculate the **entailment** probability. A sample is retained if the number of matched capability elements reaches the threshold (`min_k`); otherwise, it is filtered out. - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "../ckpt/bart-large-mnli", - threshold: float = 0.4, - min_k: int = 2, - device: str = None -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"../ckpt/bart-large-mnli"` | The local path or Hugging Face Model ID for the NLI model. 
Internally loaded using `AutoTokenizer` and `AutoModelForSequenceClassification` (`local_files_only=True`, `use_safetensors=True`, `weights_only=False`). | -| `threshold` | `float` | `0.4` | The minimum entailment probability threshold for a capability element to be considered “hit.” Higher values imply stricter filtering. | -| `min_k` | `int` | `2` | The minimum number of capability elements that must be hit; samples below this threshold are filtered out. | -| `device` | `str \| None` | `None` | The inference device; if `None`, automatically selects `"cuda"` when available, otherwise falls back to `"cpu"`. | - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - caption_key: str -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object for reading and writing. | -| `caption_key` | `str` | — | The name of the column containing the captions to be evaluated (e.g., `"caption"`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import ComplexityFilter - -# 1) Prepare FileStorage (must contain a caption column) -storage = FileStorage( - first_entry_file_name="data/complexity_input.jsonl", - cache_path="./cache_local", - file_name_prefix="complexity_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator (can use local or HF model) -filt = ComplexityFilter( - model_name="../ckpt/bart-large-mnli", # or "facebook/bart-large-mnli" - threshold=0.4, # entailment probability threshold - min_k=2, # minimum number of matched elements - device=None # auto-select cuda/cpu -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - caption_key="caption" -) -print(cols) # ["caption"] -``` - -#### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `caption` | `string` | The filtered caption text; only samples with the number of matched capability elements `≥ min_k` are retained. | - -Example Input: -```jsonl -{ - "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk." -} -{ - "caption": "SALE SALE SALE 50% OFF" -} -{ - "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella." -} - -``` - -Example Output: -```jsonl -{ - "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk." -} -{ - "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/deduplication_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/deduplication_filter.md deleted file mode 100644 index d99f63ce..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/deduplication_filter.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -title: deduplication_filter -createTime: 2025/10/15 19:56:47 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/deduplication_filter/ ---- -## 📘 Overview -`DeduplicateFilter` is a **duplicate removal operator** based on **CLIP image embedding similarity**. -It extracts CLIP feature vectors for all images in a dataset and computes pairwise cosine similarity. -For any image pair with similarity **≥ `threshold`**, the operator keeps the **first** image and removes the **later duplicates**. 
-Additionally, it records the **maximum similarity score** for each retained image (stored in the column `output_score_key`, default: `max_similarity`) for auditing purposes. - - -## ```__init__``` -```python -def __init__( - self, - model_name: str = "openai/clip-vit-base-patch32", - threshold: float = 0.90, - batch_size: int = 32, - device: str = "cuda" if torch.cuda.is_available() else "cpu" -) -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | The CLIP model used to extract image embeddings (Hugging Face Model ID or local path). | -| `threshold` | `float` | `0.90` | The deduplication threshold; if cosine similarity between two images **≥** this value, the later image is considered a duplicate and removed. | -| `batch_size` | `int` | `32` | The batch size for CLIP inference; higher values increase throughput but also memory usage. | -| `device` | `str` | `"cuda"` if available, otherwise `"cpu"` | The device used for model inference. | - - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - input_image_key: str = "image", - output_score_key: str = "max_similarity" -) -> None: - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `input_image_key` | `str` | `"image"` | The column name containing image paths or objects that can be parsed by `_load_image`. | -| `output_score_key` | `str` | `"max_similarity"` | The name of the column storing each image’s maximum similarity with all others. | - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import DeduplicateFilter - -# 1) Prepare FileStorage (must contain an "image" column) -storage = FileStorage( - first_entry_file_name="data/dedup_input.jsonl", - cache_path="./cache_local", - file_name_prefix="image_dedup", - cache_type="jsonl" -) - -# 2) Initialize the operator -dedup = DeduplicateFilter( - model_name="openai/clip-vit-base-patch32", - threshold=0.90, - batch_size=32, - device="cuda" # or "cpu" -) - -# 3) Execute deduplication -dedup.run( - storage=storage.step(), - input_image_key="image", # image column - output_score_key="max_similarity" # column to record max similarity -) -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image` (or the column specified by `input_image_key`) | `string/any` | The retained image entries after deduplication. | -| `max_similarity` (or the column specified by `output_score_key`) | `float` | The maximum similarity score of this image compared to all others (for audit; duplicate rows are excluded from output). 
| - -Example Input: -```jsonl -{ - "image": "a.jpg" -} -{ - "image": "b.jpg" -} -{ - "image": "a_copy.jpg" -} -``` - -Example Output: -```jsonl -{ - "image": "a.jpg", - "max_similarity": 0.96 -} -{ - "image": "b.jpg", - "max_similarity": 0.12 -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md index a4d9e470..0cfd39aa 100644 --- a/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_aesthetic_filter.md @@ -1,13 +1,18 @@ --- -title: image_aesthetic_filter -createTime: 2025/10/15 19:56:51 -icon: material-symbols-light:image +title: ImageAestheticFilter +createTime: 2025/10/15 15:45:04 +# icon: material-symbols-light:image permalink: /en/mm_operators/filter/image_aesthetic_filter/ --- ## 📘 Overview -`ImageAestheticFilter` is a **basic image aesthetic filtering operator** designed to quickly remove low-quality images. -It evaluates four grayscale-based metrics for each image: **sharpness** (Laplacian variance), **brightness** (mean), **contrast** (standard deviation), and **extreme pixel ratio** (proportion of near-black or near-white pixels). -A sample is retained only if all four metrics meet the defined thresholds. +`ImageAestheticFilter` performs **basic quality and aesthetic filtering** over input images by jointly evaluating: + +- Sharpness (degree of blur) +- Global brightness (overly dark / overly bright) +- Contrast (whether the image appears flat and washed-out) +- Proportions of near-black / near-white pixels (whether the image is almost entirely black or white) + +The operator is intended to remove **low-quality images** that are blurry, strongly mis-exposed, or almost uniform in color, thereby providing cleaner inputs for subsequent detection, recognition, retrieval, or generation tasks. @@ -20,18 +25,20 @@ def __init__( contrast_thresh: float = 40.0, max_black_ratio: float = 0.90, max_white_ratio: float = 0.90 -) +): + ... ``` ## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `blur_thresh` | `float` | `150.0` | Threshold for image sharpness (Laplacian variance); higher values indicate stricter sharpness requirements. | -| `brightness_range` | `tuple[float, float]` | `(30, 230)` | Allowed average brightness range (grayscale values 0–255). | -| `contrast_thresh` | `float` | `40.0` | Minimum required contrast level (grayscale standard deviation); higher means more contrast required. | -| `max_black_ratio` | `float` | `0.90` | Upper limit for near-black pixel ratio (<10); exceeding this suggests extreme darkness or large occluded areas. | -| `max_white_ratio` | `float` | `0.90` | Upper limit for near-white pixel ratio (>245); exceeding this indicates overexposure or excessive white areas. | +| Parameter | Type | Default | Description | +| :----------------- | :---------------------- | :-------------------- | :---------- | +| `blur_thresh` | `float` | `150.0` | Sharpness threshold based on the variance of the Laplacian. Higher values correspond to sharper images; images with values below this threshold are treated as blurry. | +| `brightness_range` | `tuple[float, float]` | `(30, 230)` | Admissible range of global brightness (mean grayscale intensity). Images with mean intensity below the lower bound are considered too dark; those above the upper bound are considered too bright. 
Only images whose mean lies within this interval are regarded as properly exposed. | +| `contrast_thresh` | `float` | `40.0` | Contrast threshold based on the standard deviation of the grayscale image. Values below this threshold indicate insufficient contrast (visually “flat” or washed-out images). | +| `max_black_ratio` | `float` | `0.90` | Maximum allowed proportion of **near-black pixels** (`gray < 10`). Images exceeding this ratio are treated as almost entirely black. | +| `max_white_ratio` | `float` | `0.90` | Maximum allowed proportion of **near-white pixels** (`gray > 245`). Images exceeding this ratio are treated as almost entirely white. | + ## `run` @@ -39,16 +46,16 @@ def __init__( def run( self, storage: DataFlowStorage, - image_key: str + input_image_key: str = "image_path", ): ... ``` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | — | The column name containing the image path (e.g., `"image_path"`). | +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :-------------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the DataFrame. | +| `input_image_key`| `str` | `"image_path"` | Name of the column containing image paths. | @@ -58,36 +65,36 @@ Parameters from dataflow.utils.storage import FileStorage from dataflow.operators.core_vision import ImageAestheticFilter -# 1) Prepare FileStorage (must contain an image_path column) +# 1) Prepare FileStorage (must contain at least an image_path column) storage = FileStorage( - first_entry_file_name="data/aesthetic_input.jsonl", + first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl", cache_path="./cache_local", - file_name_prefix="img_aesthetic", + file_name_prefix="aes_filter", cache_type="jsonl" ) -# 2) Initialize the operator (thresholds can be adjusted as needed) -flt = ImageAestheticFilter( +# 2) Initialize the aesthetic filter (thresholds can be tuned as needed) +aes_filter = ImageAestheticFilter( blur_thresh=150.0, brightness_range=(30, 230), contrast_thresh=40.0, max_black_ratio=0.90, - max_white_ratio=0.90 + max_white_ratio=0.90, ) -# 3) Execute filtering -cols = flt.run( +# 3) Run filtering: only images passing the quality checks are retained +cols = aes_filter.run( storage=storage.step(), - image_key="image_path" + input_image_key="image_path", ) print(cols) # ["image_path"] ``` ### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` | `string` | The retained image path after filtering. | -| `quality` | `bool` | The image quality flag; only samples with `quality=True` are kept in the output. | +| Field name | Type | Default | Description | +| :---------------------------------------------- | :-------- | :------ | :---------- | +| `image_path` (or the column specified by `input_image_key`) | `string` | — | Input image path. | +| `quality` | `boolean` | — | Indicates whether the image passes the aesthetic/quality filter. Only rows with `quality == true` are preserved in the final output. 
| Example Input: diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_cat_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_cat_filter.md new file mode 100644 index 00000000..9db6ad96 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_cat_filter.md @@ -0,0 +1,120 @@ +--- +title: ImageCatFilter +createTime: 2025/10/15 15:00:00 +# icon: material-symbols-light:image +permalink: /en/mm_operators/filter/image_cat_filter/ +--- +## 📘 Overview +`ImageCatFilter` is a caption-quality filtering operator inspired by the **Caption-as-Teacher** paradigm. It combines a **BART-large-MNLI natural language inference (NLI) model** with optional **Tesseract OCR**, and applies a three-stage criterion—**semantic complexity**, **action description**, and **OCR-style transcription**—to image–text pairs. The operator is designed to retain only captions that are semantically rich and genuinely describe the visual content of the corresponding image. + + +## ```__init__``` +```python +def __init__( + self, + model_name: str = "facebook/bart-large-mnli", + complexity_thresh: float = 0.4, + min_caps: int = 2, + action_thresh: float = 0.4, + ocr_overlap_threshold: float = 0.2, + ocr_nli_thresh: float = 0.6, + device: str | None = None, +): + ... +``` + + +## `init` Parameters +| Parameter | Type | Default | Description | +| :--------------------- | :---------------- | :---------------------------- | :---------- | +| `model_name` | `str` | `"facebook/bart-large-mnli"` | Name or local path of the pretrained NLI model. Loaded via `AutoTokenizer` and `AutoModelForSequenceClassification`. | +| `complexity_thresh` | `float` | `0.4` | Entailment probability threshold used when matching the caption against a set of “capability hypotheses”. Entailment scores above this threshold indicate that the corresponding capability is covered by the caption. | +| `min_caps` | `int` | `2` | Minimum number of capability hypotheses that must be supported by the caption (e.g., actions, interactions, scene details) for it to be considered sufficiently complex. | +| `action_thresh` | `float` | `0.4` | Entailment probability threshold for the `ACTION_HYPOTHESIS` (“The caption clearly describes an action happening in the scene.”). Captions below this threshold are considered to lack adequate action description. | +| `ocr_overlap_threshold`| `float` | `0.2` | Jaccard-overlap threshold between OCR tokens and caption tokens. Only when this overlap is high will the operator further check via NLI whether the caption is primarily an OCR transcription. | +| `ocr_nli_thresh` | `float` | `0.6` | Entailment probability threshold for `OCR_ONLY_HYPOTHESIS` (“The caption mainly transcribes the visible text in the image instead of describing the visual scene.”). Samples with high overlap and entailment above this threshold are treated as OCR transcriptions and removed. | +| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. | + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_caption_key: str = "caption", +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :------------------- | :---------------- | :------------ | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the DataFrame. 
|
+| `input_image_key` | `str` | `"image"` | Name of the column containing image paths. |
+| `input_caption_key` | `str` | `"caption"` | Name of the column containing the English image descriptions. |
+
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageCatFilter
+
+# 1) Prepare FileStorage (must contain at least `image` and `caption` columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="cat_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the CatFilter operator (complexity and OCR-related thresholds can be tuned)
+cat_filter = ImageCatFilter(
+    model_name="facebook/bart-large-mnli",
+    complexity_thresh=0.4,
+    min_caps=2,
+    action_thresh=0.4,
+    ocr_overlap_threshold=0.2,
+    ocr_nli_thresh=0.6,
+    device=None  # automatically selects cuda/cpu
+)
+
+# 3) Run filtering: retain only captions that are semantically complex, action-descriptive,
+#    and not mere OCR transcriptions
+cols = cat_filter.run(
+    storage=storage.step(),
+    input_image_key="image",
+    input_caption_key="caption",
+)
+print(cols)  # ["image", "caption"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :------------------------------------------ | :------- | :------ | :---------- |
+| `image` (or the column specified by `input_image_key`) | `string` | — | Input image path. |
+| `caption` (or the column specified by `input_caption_key`)| `string` | — | Input English image description. |
+
+
+Example Input:
+```jsonl
+{
+    "image": "1.png",
+    "caption": "A bride smiles while the groom points ahead inside a car, their hands resting together on the seat."
+}
+{
+    "image": "2.jpg",
+    "caption": "SALE SALE SALE 50% OFF"
+}
+
+```
+
+Example Output:
+```jsonl
+{
+    "image": "1.png",
+    "caption": "A bride smiles while the groom points ahead inside a car, their hands resting together on the seat."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_clip_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_clip_filter.md
new file mode 100644
index 00000000..30e508cc
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_clip_filter.md
@@ -0,0 +1,112 @@
+---
+title: ImageClipFilter
+createTime: 2025/10/15 15:48:32
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_clip_filter/
+---
+## 📘 Overview
+`ImageClipFilter` is an image–text consistency operator built on a pretrained **CLIP** model.
+It computes the **semantic similarity** between an image and its accompanying textual description, and then filters image–text pairs according to a user-specified similarity threshold.
+Pairs whose similarity score falls below the threshold are discarded as semantically inconsistent.
+
+
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "openai/clip-vit-base-patch32",
+    device: str = None
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :----------- | :------------ | :-------------------------------- | :---------- |
+| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | Local path or Hugging Face Model ID of the CLIP model. Internally loaded via `CLIPProcessor` / `CLIPModel` with `use_safetensors=True` and `weights_only=False`. |
+| `device` | `str \| None` | `None` | Inference device.
If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. | + + + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_caption_key: str = "caption", + threshold: float = 0.25 +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :------------------- | :---------------- | :----------- | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading and writing the DataFrame. | +| `input_image_key` | `str` | `"image"` | Column name containing image paths. | +| `input_caption_key` | `str` | `"caption"` | Column name containing the textual description (caption). | +| `threshold` | `float` | `0.25` | Minimum CLIP similarity threshold; only image–text pairs with similarity `≥ threshold` are retained. | + + + + +## 🧠 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.core_vision import ImageClipFilter + +# 1) Prepare FileStorage (must contain at least `image` and `caption` columns) +storage = FileStorage( + first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl", + cache_path="./cache_local", + file_name_prefix="image_clip_filter", + cache_type="jsonl" +) + +# 2) Initialize the operator (model_name can be an HF model ID such as "openai/clip-vit-base-patch32") +filt = ImageClipFilter( + model_name="openai/clip-vit-base-patch32", + device=None # automatically select cuda/cpu +) + +# 3) Run filtering: keep only image–text pairs with CLIP similarity ≥ 0.25 +cols = filt.run( + storage=storage.step(), + input_image_key="image", + input_caption_key="caption", + threshold=0.25 +) +print(cols) # ["image", "caption"] +``` + +### 🧾 Default Output Format +| Field | Type | Default | Description | +| :-------- | :------ | :------ | :---------- | +| `image` | `string`| — | Image path for retained samples. | +| `caption` | `string`| — | Textual description of retained samples (for which CLIP similarity ≥ `threshold`). | + +Example Input: +```jsonl +{ + "image": "1.png", + "caption": "A bride and groom smiling in a car." +} +{ + "image": "2.jpg", + "caption": "A red bus driving across a snowy mountain road at night." +} +``` + +Example Output: +```jsonl +{ + "image": "1.png", + "caption": "A bride and groom smiling in a car." +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_complexity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_complexity_filter.md new file mode 100644 index 00000000..a0a8633a --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_complexity_filter.md @@ -0,0 +1,114 @@ +--- +title: ComplexityFilter +createTime: 2025/10/15 16:10:28 +# icon: material-symbols-light:image +permalink: /en/mm_operators/filter/complexity_filter/ +--- +## 📘 Overview +`ComplexityFilter` is an **NLI-based (Natural Language Inference)** text filtering operator designed to evaluate whether a caption simultaneously covers multiple **visual capability dimensions** (e.g., color, shape, action recognition, counting, spatial relations). +The operator thereby estimates the **capability richness** of a caption. + +For each caption, the operator constructs a set of hypothesis sentences using a common template, e.g. +`"The following text describes {}."` +An MNLI-style model is then used to compute the **entailment probability** for each capability hypothesis. 
+If the number of “hit” capabilities (those whose entailment probability exceeds `threshold`) reaches at least `min_k`, the sample is retained; otherwise it is filtered out.
+
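+To make this criterion concrete, the sketch below shows one way to score a caption against capability hypotheses with an MNLI model. The capability list and the softmax over all three MNLI classes (`[contradiction, neutral, entailment]` for `facebook/bart-large-mnli`) are illustrative assumptions, not the operator's exact internals:
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
+model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
+
+CAPABILITIES = ["colors", "actions", "counting", "spatial relations"]  # hypothetical set
+TEMPLATE = "The following text describes {}."
+
+def capability_hits(caption: str, threshold: float = 0.4) -> int:
+    hits = 0
+    for cap in CAPABILITIES:
+        # Premise = caption, hypothesis = capability statement
+        inputs = tokenizer(caption, TEMPLATE.format(cap),
+                           return_tensors="pt", truncation=True)
+        with torch.no_grad():
+            logits = model(**inputs).logits  # [contradiction, neutral, entailment]
+        entail_prob = logits.softmax(dim=-1)[0, 2].item()
+        if entail_prob >= threshold:
+            hits += 1
+    return hits
+
+# Keep the sample only if capability_hits(caption) >= min_k
+```
+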
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "facebook/bart-large-mnli",
+    threshold: float = 0.4,
+    min_k: int = 2,
+    device: str = None
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :----------- | :------------ | :-------------------------- | :---------- |
+| `model_name` | `str` | `"facebook/bart-large-mnli"` | Local path or Hugging Face Model ID of the NLI model. Internally loaded using `AutoTokenizer` / `AutoModelForSequenceClassification` with `local_files_only=True`, `use_safetensors=True`, and `weights_only=False`. |
+| `threshold` | `float` | `0.4` | Minimum entailment probability required to mark a single capability as “hit”. Higher values yield stricter filtering. |
+| `min_k` | `int` | `2` | Minimum number of capability dimensions that must be hit. Captions with fewer than `min_k` hits are discarded. |
+| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. |
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_caption_key: str = "caption"
+):
+    ...
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :----------------- | :---------------- | :----------- | :---------- |
+| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading and writing the DataFrame. |
+| `input_caption_key`| `str` | `"caption"` | Name of the text column to be evaluated, usually the image description (caption) field. |
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ComplexityFilter
+
+# 1) Prepare FileStorage (must contain at least a `caption` column)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="complexity_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator (can use a local or HF model)
+filt = ComplexityFilter(
+    model_name="facebook/bart-large-mnli",  # or "../ckpt/bart-large-mnli"
+    threshold=0.4,  # entailment probability threshold
+    min_k=2,        # require at least 2 capability hits
+    device=None     # automatically select cuda/cpu
+)
+
+# 3) Run filtering
+cols = filt.run(
+    storage=storage.step(),
+    input_caption_key="caption"
+)
+print(cols)  # ["caption"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :--------------------------------------------- | :------ | :------ | :---------- |
+| `caption` (or the column specified by `input_caption_key`) | `string` | — | Caption text retained after filtering; only samples with a number of capability hits `≥ min_k` are kept. |
+
+
+Example Input:
+```jsonl
+{
+    "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk."
+}
+{
+    "caption": "SALE SALE SALE 50% OFF"
+}
+{
+    "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella."
+}
+
+```
+
+Example Output:
+```jsonl
+{
+    "caption": "A red double-decker bus turns left at a city intersection while pedestrians wait at the crosswalk."
+}
+{
+    "caption": "Two kids count seashells on a sandy beach while their mother reads under a blue umbrella."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_consistency_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_consistency_filter.md
new file mode 100644
index 00000000..a22e2513
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_consistency_filter.md
@@ -0,0 +1,120 @@
+---
+title: ImageConsistencyFilter
+createTime: 2025/10/15 15:48:32
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_consistency_filter/
+---
+## 📘 Overview
+`ImageConsistencyFilter` is an **NLI-based (Natural Language Inference)** consistency filtering operator.
+It evaluates whether, for the same image, the triplet
+**(caption, question, answer)** is semantically coherent; that is, whether the **answer can be logically inferred from caption + question**.
+
+Internally, the operator treats `caption + question` as the **premise** and `answer` as the **hypothesis**, and then uses the `bart-large-mnli` model to compute the **entailment probability**.
+If this probability falls below `threshold`, the sample is deemed semantically inconsistent and is filtered out.
+
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "facebook/bart-large-mnli",
+    threshold: float = 0.35,
+    device: str = None
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :---------- | :------------ | :--------------------------- | :---------- |
+| `model_name` | `str` | `"facebook/bart-large-mnli"` | Local path or Hugging Face Model ID for the NLI model. Internally loaded via `AutoTokenizer` / `AutoModelForSequenceClassification` with `local_files_only=True`, `use_safetensors=True`, and `weights_only=False`. |
+| `threshold` | `float` | `0.35` | Entailment probability threshold. If the entailment probability for **caption + question → answer** is below this value, the sample is treated as semantically inconsistent and discarded. Higher values result in stricter filtering. |
+| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. |
+
+
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_caption_key: str = "caption",
+    input_question_key: str = "question",
+    input_answer_key: str = "answer",
+):
+    ...
+
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :------------------- | :---------------- | :----------- | :---------- |
+| `storage` | `DataFlowStorage` | — | Dataflow storage object used for reading from and writing to the DataFrame. |
+| `input_caption_key` | `str` | `"caption"` | Column name of the caption text, typically the natural-language description of the image. |
+| `input_question_key` | `str` | `"question"` | Column name of the question text associated with the image. |
+| `input_answer_key` | `str` | `"answer"` | Column name of the answer text, representing the response to the question. |
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageConsistencyFilter
+
+# 1) Prepare FileStorage (must contain at least caption / question / answer)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="image_consistency_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator (can use a local or HF model)
+filt = ImageConsistencyFilter(
+    model_name="facebook/bart-large-mnli",  # or a local path "../ckpt/bart-large-mnli"
+    threshold=0.35,  # entailment probability threshold
+    device=None      # automatically select cuda/cpu
+)
+
+# 3) Run filtering
+cols = filt.run(
+    storage=storage.step(),
+    input_caption_key="caption",
+    input_question_key="question",
+    input_answer_key="answer"
+)
+print(cols)  # ["caption", "question", "answer"]
+```
+
+### 🧾 Default Output Format
+| Field name | Type | Default | Description |
+| :----------------------------------------------- | :------ | :------ | :---------- |
+| `caption` (or the column specified by `input_caption_key`) | `string` | — | Caption text retained after filtering. |
+| `question` (or the column specified by `input_question_key`) | `string` | — | Question text that, together with the caption, is deemed to entail the answer. |
+| `answer` (or the column specified by `input_answer_key`) | `string` | — | Answer text whose entailment probability from caption + question is `≥ threshold` under the NLI model. |
+
+
+Example Input:
+```jsonl
+{
+    "caption": "A groom in a black tuxedo sits in a car next to his smiling bride.",
+    "question": "Where are the couple sitting?",
+    "answer": "They are sitting inside a car."
+}
+{
+    "caption": "A groom in a black tuxedo sits in a car next to his smiling bride.",
+    "question": "What color is the sky in this picture?",
+    "answer": "The sky is green with purple stripes."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "caption": "A groom in a black tuxedo sits in a car next to his smiling bride.",
+    "question": "Where are the couple sitting?",
+    "answer": "They are sitting inside a car."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_deduplication_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_deduplication_filter.md
new file mode 100644
index 00000000..1be6a002
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_deduplication_filter.md
@@ -0,0 +1,118 @@
+---
+title: ImageDeduplicateFilter
+createTime: 2025/10/15 19:56:47
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_deduplication_filter/
+---
+## 📘 Overview
+`ImageDeduplicateFilter` is an **image-level deduplication operator** built upon **CLIP-based image embeddings**.
+The operator encodes all images in a dataset into CLIP feature vectors and computes pairwise cosine similarities.
+For any pair of images whose similarity is **greater than or equal to `threshold`**, the operator **retains the first occurrence** and
+marks subsequent ones as near-duplicate samples to be removed.
+
+In addition, for every retained image, the operator records its **maximum cosine similarity** with all other images in the column
+specified by `output_score_key` (by default, `max_similarity`). This value can be used for downstream quality control, auditing,
+or further manual examination of near-duplicate content.
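+
+Conceptually, the keep-first pass looks like the following minimal sketch. It is an illustrative approximation built on `transformers`' `CLIPModel` / `CLIPProcessor`; the operator's actual batching and `_load_image` handling may differ:
+
+```python
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+
+def dedup_keep_first(paths, threshold=0.90):
+    images = [Image.open(p).convert("RGB") for p in paths]
+    inputs = processor(images=images, return_tensors="pt")
+    with torch.no_grad():
+        emb = model.get_image_features(**inputs)
+    emb = emb / emb.norm(dim=-1, keepdim=True)  # L2-normalize
+    sim = emb @ emb.T                           # pairwise cosine similarity
+    keep, removed = [], set()
+    for i in range(len(paths)):
+        if i in removed:
+            continue
+        keep.append(i)                          # first occurrence is kept
+        for j in range(i + 1, len(paths)):
+            if sim[i, j] >= threshold:
+                removed.add(j)                  # later occurrence is the duplicate
+    return [paths[i] for i in keep]
+```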
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "openai/clip-vit-base-patch32",
+    threshold: float = 0.90,
+    batch_size: int = 32,
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+):
+    ...
+```
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :------------- | :----- | :---------------------------------------- | :---------- |
+| `model_name` | `str` | `"openai/clip-vit-base-patch32"` | Identifier or local path of the CLIP model used to extract image embeddings (Hugging Face Model ID or local checkpoint directory). |
+| `threshold` | `float`| `0.90` | Deduplication similarity threshold. If the cosine similarity between two image embeddings is **greater than or equal to** this value, the later image in the sequence is treated as a near-duplicate and removed. |
+| `batch_size` | `int` | `32` | Batch size used during CLIP inference. Larger batch sizes improve throughput but increase GPU/CPU memory consumption. |
+| `device` | `str` | `"cuda"` if available, otherwise `"cpu"` | Computational device used for CLIP inference. The operator automatically defaults to GPU when available; otherwise, it falls back to CPU execution. |
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_image_key: str = "image",
+    output_score_key: str = "max_similarity"
+) -> None:
+    ...
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. |
+| `input_image_key` | `str` | `"image"` | The column name containing image paths or objects that can be parsed by `_load_image`. |
+| `output_score_key` | `str` | `"max_similarity"` | The name of the column storing each image’s maximum similarity with all others. |
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageDeduplicateFilter
+
+# 1) Prepare FileStorage (must contain an "image" column)
+storage = FileStorage(
+    first_entry_file_name="data/dedup_input.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="image_dedup",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator
+dedup = ImageDeduplicateFilter(
+    model_name="openai/clip-vit-base-patch32",
+    threshold=0.90,
+    batch_size=32,
+    device="cuda"  # or "cpu"
+)
+
+# 3) Execute deduplication (run() returns None; results are written back to storage)
+dedup.run(
+    storage=storage.step(),
+    input_image_key="image",            # image column
+    output_score_key="max_similarity"   # column to record max similarity
+)
+```
+
+### 🧾 Default Output Format
+| Field | Type | Description |
+| :--------------------------------------------- | :----------- | :---------- |
+| `image` (or the column specified by `input_image_key`) | `string/any` | The retained image entries after deduplication; near-duplicate images are removed according to the similarity threshold. |
+| `max_similarity` (or the column specified by `output_score_key`) | `float` | Maximum cosine similarity between this image and all other images in the dataset (for auditing and analysis; removed duplicates are not present in the final output).
| + + +Example Input: +```jsonl +{ + "image": "a.jpg" +} +{ + "image": "b.jpg" +} +{ + "image": "a_copy.jpg" +} +``` + +Example Output: +```jsonl +{ + "image": "a.jpg", + "max_similarity": 0.96 +} +{ + "image": "b.jpg", + "max_similarity": 0.12 +} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_diversity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_diversity_filter.md new file mode 100644 index 00000000..887639e6 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/filter/image_diversity_filter.md @@ -0,0 +1,126 @@ +--- +title: ImageDiversityFilter +createTime: 2025/10/15 19:57:00 +# icon: material-symbols-light:image +permalink: /en/mm_operators/filter/image_diversity_filter/ +--- +## 📘 Overview +`ImageDiversityFilter` is a joint **text–image deduplication operator** designed to preserve **content diversity** when cleaning multimodal datasets. +It relies on two complementary signals: + +1. **Text side**: estimates similarity between the current caption and previously retained captions using **TF–IDF with cosine similarity**. +2. **Image side**: measures visual redundancy using **perceptual hash (pHash) Hamming distance** over images. + +A sample is retained **only if** it is sufficiently novel **both** in text and image space; otherwise, it is treated as a near-duplicate and filtered out. + +This dual-view strategy avoids failure modes that occur when only one modality is considered (e.g., different images with nearly identical text, or vice versa), and helps construct **de-duplicated, semantically diverse** multimodal corpora. + + + + + +## ```__init__``` +```python +def __init__( + self, + text_thresh: float = 0.8, + hash_size: int = 8, + img_dist_thresh: int = 5 +): + ... +``` + + +## `init` Parameters +| Parameter | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `text_thresh` | `float` | `0.8` | Text uniqueness threshold. The maximum cosine similarity with the most recent corpus (managed by the internal `TextDuplicateFilter`) must be **< this value** to be considered unique. | +| `hash_size` | `int` | `8` | Hash size used for perceptual hashing (pHash). Larger values capture finer visual details but require more computation and memory (used by `ImageDuplicateFilter`). | +| `img_dist_thresh` | `int` | `5` | Image uniqueness threshold. The minimum Hamming distance with the most recent image hashes must be **> this value** to be considered unique. | + + +## `run` +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image_path", + input_text_key: str = "text" +): + ... +``` + +Parameters +| Parameter | Type | Default | Description | +| :--------------- | :---------------- | :------------ | :---------- | +| `storage` | `DataFlowStorage` | — | Dataflow storage object containing the multimodal table to be de-duplicated. | +| `input_image_key`| `str` | `"image_path"`| Name of the image column. Entries should be image paths (or other disk-resident locations that `PIL` can open). | +| `input_text_key` | `str` | `"text"` | Name of the text column, typically a caption or description field used for computing TF–IDF similarity. 
|
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageDiversityFilter
+
+# 1) Prepare FileStorage (must contain at least "image_path" and "text" columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="image_diversity_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator
+filt = ImageDiversityFilter(
+    text_thresh=0.8,    # text similarity ceiling; kept only if max similarity < this value (lower → stricter)
+    hash_size=8,        # perceptual hash size
+    img_dist_thresh=5   # minimum Hamming distance threshold (higher → require larger visual difference)
+)
+
+# 3) Run filtering
+cols = filt.run(
+    storage=storage.step(),
+    input_image_key="image_path",
+    input_text_key="text"
+)
+print(cols)  # ["image_path", "text"]
+```
+
+
+### 🧾 Default Output Format
+
+| Field | Type | Default | Description |
+| :-------------------------------------------- | :------- | :------ | :---------- |
+| `image_path` (or the column specified by `input_image_key`) | `string` | — | Image paths retained after filtering; only rows whose text and image are both sufficiently dissimilar from historical samples are kept. |
+| `text` (or the column specified by `input_text_key`) | `string` | — | Text descriptions paired with the retained images, guaranteed not to be overly similar to previously kept texts in TF–IDF space. |
+
+
+Example Input:
+```jsonl
+{
+    "image_path": "a.jpg",
+    "text": "A cat sitting on a wooden chair."
+}
+{
+    "image_path": "a_dup.jpg",
+    "text": "A cat sits on a wooden chair."
+}
+{
+    "image_path": "b.jpg",
+    "text": "A bus driving through a snowy mountain pass at night."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "image_path": "a.jpg",
+    "text": "A cat sitting on a wooden chair."
+}
+{
+    "image_path": "b.jpg",
+    "text": "A bus driving through a snowy mountain pass at night."
+}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/filter/image_sensitive_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/image_sensitive_filter.md
new file mode 100644
index 00000000..fee0c87c
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/filter/image_sensitive_filter.md
@@ -0,0 +1,119 @@
+---
+title: ImageSensitiveFilter
+createTime: 2025/10/15 15:31:35
+# icon: material-symbols-light:image
+permalink: /en/mm_operators/filter/image_sensitive_filter/
+---
+## 📘 Overview
+`ImageSensitiveFilter` is a **multi-label safety filtering operator** built on top of the **BART Large MNLI** zero-shot natural language inference model.
+It evaluates multiple text fields associated with an image and automatically identifies and filters samples containing the following **high-risk content categories**:
+
+- Sexual content (pornography, nudity, etc.)
+- Violence or physical harm
+- Suicide or self-harm
+- Hate speech
+- Harassment or insults
+- Threats or intimidation
+
+Unlike traditional keyword-based blacklists, this operator leverages NLI-style reasoning between **input text** and **natural-language risk descriptions** to decide whether sensitive content is present. This design is more **flexible and extensible**, and is particularly suitable for safety- and compliance-critical cleaning of multimodal datasets.
+
+
+
+## ```__init__```
+```python
+def __init__(
+    self,
+    model_name: str = "facebook/bart-large-mnli",
+    threshold: float = 0.5,
+    device: str | None = None,
+):
+    ...
+```
+
+
+## `init` Parameters
+| Parameter | Type | Default | Description |
+| :---------- | :---------------- | :--------------------------- | :---------- |
+| `model_name` | `str` | `"facebook/bart-large-mnli"` | Local path or Hugging Face Model ID of the NLI model. Internally loaded via `AutoTokenizer` / `AutoModelForSequenceClassification` (`local_files_only=True`, `use_safetensors=True`, `weights_only=False`). |
+| `threshold` | `float` | `0.5` | **Entailment probability threshold** for risk categories. If the entailment probability for any risk label is `≥ threshold`, the corresponding text is deemed *unsafe* and the sample is removed. Lower values make the filter more aggressive; higher values flag fewer samples. |
+| `device` | `str \| None` | `None` | Inference device. If `None`, the operator automatically selects `"cuda"` when available; otherwise it falls back to `"cpu"`. |
+
+
+## `run`
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_image_key: str,
+    input_text_keys: list
+):
+    ...
+```
+
+Parameters
+| Parameter | Type | Default | Description |
+| :--------------- | :---------------- | :------ | :---------- |
+| `storage` | `DataFlowStorage` | — | Dataflow storage object used to read and write the underlying DataFrame. |
+| `input_image_key` | `str` | — | Name of the column containing image paths (e.g., `"image_path"`). Used only to check path existence; no visual inference is performed. |
+| `input_text_keys` | `list[str]` | — | List of text column names to be evaluated for safety (e.g., `["caption", "question", "answer"]`). Each of these fields is scored against all risk labels. |
+
+
+
+
+## 🧠 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import ImageSensitiveFilter
+
+# 1) Prepare FileStorage (must contain at least "image_path" and "text" columns)
+storage = FileStorage(
+    first_entry_file_name="./dataflow/example/test_image_filter/test_image_filter.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="imgtext_sensitive_filter",
+    cache_type="jsonl"
+)
+
+# 2) Initialize the operator (using either a local or HF model)
+filt = ImageSensitiveFilter(
+    model_name="facebook/bart-large-mnli",  # or a local checkpoint path
+    threshold=0.5,  # risk decision threshold
+    device=None     # automatically choose cuda/cpu
+)
+
+# 3) Run filtering: jointly check the image path and all listed text fields for sensitive content
+cols = filt.run(
+    storage=storage.step(),
+    input_image_key="image_path",
+    input_text_keys=["text"]  # or ["caption", "question", "answer"]
+)
+print(cols)  # ["image_path", "text"]
+```
+
+### 🧾 Default Output Format
+
+| Field | Type | Default | Description |
+| :----------------------------------------- | :------ | :------ | :---------- |
+| column specified by `input_image_key` | `string`| — | Original image-path column; after filtering, only rows that pass the safety check are retained. |
+| columns specified by `input_text_keys` | `string`| — | Original text columns (caption / question / answer, etc.); after filtering, only rows whose texts are all judged safe are retained. |
+
+
+Example Input:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "A bride and groom smiling in a car."
+}
+{
+    "image_path": "2.jpg",
+    "text": "Some abusive or hateful phrase here."
+}
+```
+
+Example Output:
+```jsonl
+{
+    "image_path": "1.png",
+    "text": "A bride and groom smiling in a car."
+} +``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/sensitive_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/sensitive_filter.md deleted file mode 100644 index 4fab238a..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/sensitive_filter.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -title: sensitive_filter -createTime: 2025/10/15 19:56:56 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/sensitive_filter/ ---- -## 📘 Overview -`SensitiveFilter` is a **sensitive content filtering operator** that combines **image NSFW classification** and **text toxicity/hate speech detection** to remove unsafe samples. - -- **Image side**: Uses an image classification model to detect labels such as `{porn, hentai, sexy, nsfw}`; if the confidence score **≥ `img_thresh`**, the image is marked unsafe. -- **Text side**: Uses a text classification model to detect labels such as `{toxic, offensive, hate, obscene, threat, sexual_explicit, identity_attack}`; if the confidence score **≥ `txt_thresh`**, the text is marked unsafe. -A sample is filtered out if **either** the image **or any text field** is classified as unsafe. - - -## ```__init__``` -```python -def __init__( - self, - img_model_name="../ckpt/nsfw_image_detection", - txt_model_name="../ckpt/toxic-bert", - img_thresh=0.5, - txt_thresh=0.5, -): - ... -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `img_model_name` | `str` | `"../ckpt/nsfw_image_detection"` | Local path or Hugging Face Model ID for the image-sensitive detection model. Internally loaded using `AutoImageProcessor` and `AutoModelForImageClassification` (`use_safetensors=True`, `weights_only=False`). | -| `txt_model_name` | `str` | `"../ckpt/toxic-bert"` | Local path or Hugging Face Model ID for the text-toxicity detection model. Internally loaded using `AutoTokenizer` and `AutoModelForSequenceClassification` (`use_safetensors=True`, `weights_only=False`). | -| `img_thresh` | `float` | `0.5` | The image sensitivity threshold; if the image matches `{porn, hentai, sexy, nsfw}` with score **≥** this value, it is classified as unsafe. | -| `txt_thresh` | `float` | `0.5` | The text sensitivity threshold; if any text field matches `{toxic, offensive, hate, obscene, threat, sexual_explicit, identity_attack}` with score **≥** this value, it is classified as unsafe. | - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str, - text_keys: list -): - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | — | The column name containing the image path (e.g., `"image_path"`). | -| `text_keys` | `list[str]` | — | A list of text column names to be checked (e.g., `["caption", "question", "answer"]`). 
| - - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import SensitiveFilter - -# 1) Prepare FileStorage (must contain image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/sensitive_input.jsonl", - cache_path="./cache_local", - file_name_prefix="sensitive_filter", - cache_type="jsonl" -) - -# 2) Initialize the operator (you can also use HF model IDs like -# img_model_name="Falconsai/nsfw_image_detection", -# txt_model_name="unitary/toxic-bert") -filt = SensitiveFilter( - img_model_name="../ckpt/nsfw_image_detection", - txt_model_name="../ckpt/toxic-bert", - img_thresh=0.5, - txt_thresh=0.5, -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - image_key="image_path", - text_keys=["text"] # or ["caption", "question", "answer"] -) -print(cols) # ["image_path", "text"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The retained image path after filtering. | -| Each field in `text_keys` | `string` | The retained text fields; only samples where both the image and all texts are safe are included in the output. | - - -Example Input: -```jsonl -{ - "image_path": "1.png", - "text": "A bride and groom smiling in a car." -} -{ - "image_path": "2.jpg", - "text": "Some abusive or hateful phrase here." -} -``` - -Example Output: -```jsonl -{ - "image_path": "1.png", - "text": "A bride and groom smiling in a car." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/filter/text_image_diversity_filter.md b/docs/en/notes/mm_operators/image_understanding/filter/text_image_diversity_filter.md deleted file mode 100644 index de3cebe6..00000000 --- a/docs/en/notes/mm_operators/image_understanding/filter/text_image_diversity_filter.md +++ /dev/null @@ -1,118 +0,0 @@ ---- -title: text_image_diversity_filter -createTime: 2025/10/15 19:57:00 -icon: material-symbols-light:image -permalink: /en/mm_operators/filter/text_image_diversity_filter/ ---- -## 📘 Overview -`TextImageDiversityFilter` is a **joint text-image deduplication operator** that ensures both textual and visual diversity in a dataset. - -- **Text side:** Uses **TF-IDF + cosine similarity** to compute the maximum similarity between the current text and the historical corpus. A sample is considered *textually unique* if the similarity is below `text_thresh`. -- **Image side:** Uses **perceptual hash (pHash)** and computes the Hamming distance. A sample is considered *visually unique* if the distance is greater than `img_dist_thresh`. - -A sample is retained only if **both the text and image are unique**; otherwise, it is filtered out. - - - -## ```__init__``` -```python -def __init__( - self, - text_thresh: float = 0.8, - hash_size: int = 8, - img_dist_thresh: int = 5 -): - ... -``` - - -## `init` Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `text_thresh` | `float` | `0.8` | Text uniqueness threshold. The maximum cosine similarity with the most recent corpus (managed by the internal `TextDuplicateFilter`) must be **< this value** to be considered unique. | -| `hash_size` | `int` | `8` | Hash size used for perceptual hashing (pHash). Larger values capture finer visual details but require more computation and memory (used by `ImageDuplicateFilter`). | -| `img_dist_thresh` | `int` | `5` | Image uniqueness threshold. 
The minimum Hamming distance with the most recent image hashes must be **> this value** to be considered unique. | - - -## `run` -```python -def run( - self, - storage: DataFlowStorage, - image_key: str, - te - ... -``` - -Parameters -| Parameter | Type | Default | Description | -| :--- | :--- | :--- | :--- | -| `storage` | `DataFlowStorage` | — | The Dataflow storage object used for reading and writing data. | -| `image_key` | `str` | — | The column name containing the image path (e.g., `"image_path"`). | -| `text_key` | `str` | — | The column name containing the text content (e.g., `"text"` or `"caption"`). | - - - -## 🧠 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import TextImageDiversityFilter - -# 1) Prepare FileStorage (must contain image_path and text columns) -storage = FileStorage( - first_entry_file_name="data/ti_diversity_input.jsonl", - cache_path="./cache_local", - file_name_prefix="ti_diversity", - cache_type="jsonl" -) - -# 2) Initialize the operator -filt = TextImageDiversityFilter( - text_thresh=0.8, # Text uniqueness threshold (lower = looser) - hash_size=8, # pHash size - img_dist_thresh=5 # Image uniqueness threshold (higher = stricter) -) - -# 3) Execute filtering -cols = filt.run( - storage=storage.step(), - image_key="image_path", - text_key="text" -) -print(cols) # ["image_path", "text"] -``` - -### 🧾 Default Output Format -| Field | Type | Description | -| :--- | :--- | :--- | -| `image_path` / specified `image_key` | `string` | The retained image path after deduplication. | -| `text` / specified `text_key` | `string` | The retained text content after deduplication. | - -Example Input: -```jsonl -{ - "image_path": "a.jpg", - "text": "A cat sitting on a wooden chair." -} -{ - "image_path": "a_dup.jpg", - "text": "A cat sits on a wooden chair." -} -{ - "image_path": "b.jpg", - "text": "A bus driving through a snowy mountain pass at night." -} -``` - -Example Output: -```jsonl -{ - "image_path": "a.jpg", - "text": "A cat sitting on a wooden chair." -} -{ - "image_path": "b.jpg", - "text": "A bus driving through a snowy mountain pass at night." -} -``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/batch_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/batch_vqa_generator.md new file mode 100644 index 00000000..287ed23b --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/generate/batch_vqa_generator.md @@ -0,0 +1,115 @@ +--- +title: BatchVQAGenerator +createTime: 2026/01/11 21:54:10 +permalink: /en/mm_operators/generate/batch_vqa_generator/ +--- +## 📘 Overview + +`BatchVQAGenerator` is a **Batch Visual Question Answering Operator**. + +It is designed for **"One Image, Many Questions"** scenarios. The input consists of a single image and a list of questions (e.g., ["What color?", "How many?", "What action?"]). The operator automatically pairs the image with each question in the list, constructs a batch request, and generates answers in parallel. + +This mechanism is highly efficient for dense captioning, multi-perspective image analysis, or attribute-based Q&A tasks. + +## 🏗️ `__init__` Function + +```python +def __init__( + self, + serving: LLMServingABC, + system_prompt: str = "You are a helpful assistant." 
+): + +``` + +### 🧾 Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (must support VLM multimodal inputs). | +| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. | + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_prompts_key: str, + input_image_key: str, + output_key: str +): + ... + +``` + +Executes the main logic: + +1. **Data Loading** +Reads the image path (`input_image_key`) and the list of questions (`input_prompts_key`) from the DataFrame. +2. **Broadcasting & Batch Construction** +For each row: +* Retrieves the single image path. +* Iterates through every question `q` in the list. +* Constructs a standard multimodal message `[Image, Text(q)]` for each question. +* Packages all Q&A requests for that single image into one Batch. + + +3. **Parallel Inference** +Calls `serving.generate_from_input` to generate answers for all questions related to that image simultaneously using GPU parallelism. +4. **Save Results** +Writes the list of generated answers (in the same order as the question list) to the `output_key` column. + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | DataFlow storage object. | +| `input_prompts_key` | `str` | N/A | Column name containing the **list of questions** (`List[str]`). | +| `input_image_key` | `str` | N/A | Column name containing the **single image** path. | +| `output_key` | `str` | N/A | Output column name for the list of answers (`List[str]`). | + +## 🧩 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServing +from dataflow.operators.generate import BatchVQAGenerator + +# 1) Initialize Model +serving = LLMServing(model_path="Qwen/Qwen2.5-VL-7B-Instruct") + +# 2) Initialize Operator +generator = BatchVQAGenerator( + serving=serving, + system_prompt="Answer briefly." +) + +# 3) Prepare Data (jsonl) +# Format: {"image": "scene.jpg", "questions": ["Weather?", "Object count?", "Action?"]} +storage = FileStorage(file_name_prefix="dense_captioning") +storage.step() + +# 4) Execute Batch VQA +generator.run( + storage=storage, + input_prompts_key="questions", + input_image_key="image", + output_key="answers" +) + +``` + +### 🧾 Output Format + +The `output_key` column contains a list of strings corresponding to the input question list. + +**Example Input DataFrame:** +| image | questions | +| :--- | :--- | +| `"park.jpg"` | `["Weather?", "Count?", "Action?"]` | + +**Example Output DataFrame:** +| image | questions | answers | +| :--- | :--- | :--- | +| `"park.jpg"` | `["Weather?", "Count?", "Action?"]` | `["Sunny", "3 people", "Running"]` | \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/fix_prompted_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/fix_prompted_vqa_generator.md new file mode 100644 index 00000000..8440c5c0 --- /dev/null +++ b/docs/en/notes/mm_operators/image_understanding/generate/fix_prompted_vqa_generator.md @@ -0,0 +1,126 @@ +--- +title: FixPromptedVQAGenerator +createTime: 2026/01/11 21:31:49 +permalink: /en/mm_operators/fix_prompted_vqa_generator/ +--- +## 📘 Overview + +`FixPromptedVQAGenerator` is a **Fixed-Prompt Multimodal VQA Operator**. + +It is designed to execute the **same** instruction task on a batch of images or videos. 
Unlike dynamic templating operators, this operator accepts a static `user_prompt` (e.g., "Please caption this image") during initialization and applies it uniformly to every media sample in the input DataFrame. + +Use Cases: + +* Batch Image/Video Captioning. +* Uniform VQA queries across a dataset (e.g., "Is there any violence in this image?"). + +## 🏗️ `__init__` Function + +```python +def __init__( + self, + serving: LLMServingABC, + system_prompt: str = "You are a helpful assistant.", + user_prompt: str = "Please caption the media in detail." +): + +``` + +### 🧾 Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (must support multimodal inputs). | +| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. | +| `user_prompt` | `str` | `"Please caption..."` | **Core Parameter**. The user instruction (Prompt) applied uniformly to all input samples. | + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_video_key: str = "video", + output_answer_key: str = "answer", +): + ... + +``` + +Executes the main logic: + +1. **Read Data** +Reads the DataFrame from `storage`. +2. **Input Construction** +* Checks for and reads the `input_image_key` or `input_video_key` column. +* Constructs the input message for each media file, combining the fixed `system_prompt`, the media file itself, and the fixed `user_prompt`. + + +3. **Batch Inference** +* Packages the constructed prompts and media data into a batch. +* Calls `serving.generate_from_input` to execute parallel inference. + + +4. **Save Results** +* Writes the text generated by the model into the `output_answer_key` column. +* Updates and saves the DataFrame. + + + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | DataFlow storage object. | +| `input_image_key` | `str` | `"image"` | Column name for image paths (mutually exclusive with video_key). | +| `input_video_key` | `str` | `"video"` | Column name for video paths (mutually exclusive with image_key). | +| `output_answer_key` | `str` | `"answer"` | Column name for the generated output. | + +## 🧩 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServing +from dataflow.operators.generate import FixPromptedVQAGenerator + +# 1) Initialize Model +serving = LLMServing(model_path="Qwen/Qwen2.5-VL-3B-Instruct") + +# 2) Initialize Operator: Set a fixed prompt +# Example: Generate detailed descriptions for a batch of images +generator = FixPromptedVQAGenerator( + serving=serving, + system_prompt="You are a helpful visual assistant.", + user_prompt="Describe the content of this image in detail, including objects, colors, and spatial relationships." 
+)
+
+# 3) Prepare Data
+storage = FileStorage(
+    file_name_prefix="image_captioning_task",
+    cache_path="./cache_data"
+)
+storage.step()
+
+# 4) Execute Generation
+generator.run(
+    storage=storage,
+    input_image_key="image_path",
+    output_answer_key="detailed_caption"
+)
+
+```
+
+### 🧾 Input/Output Example
+
+**Input DataFrame Row:**
+| image_path |
+| :--- |
+| `"/data/cat.jpg"` |
+| `"/data/dog.png"` |
+
+**Output DataFrame Row:**
+| image_path | detailed_caption |
+| :--- | :--- |
+| `"/data/cat.jpg"` | `"A black and white cat sitting on a sofa..."` |
+| `"/data/dog.png"` | `"A golden retriever running on the grass..."` |
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
new file mode 100644
index 00000000..e930c005
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md
@@ -0,0 +1,147 @@
+---
+title: ImageBboxGenerator
+createTime: 2026/01/11 21:49:37
+permalink: /en/mm_operators/generate/image_bbox_generator/
+---
+## 📘 Overview
+
+`ImageBboxGenerator` is an **Image Region Annotation & Prompt Preparation Operator**.
+
+It is primarily used for data preprocessing in multimodal tasks (such as Grounding Caption). It handles raw data containing image paths, normalizes Regions of Interest (RoI), visualizes them, and generates structured Prompts for subsequent VLM inference.
+
+Key Capabilities:
+
+1. **Dual BBox Acquisition**:
+* **Existing Mode**: Reads existing BBox coordinates directly from the input data.
+* **Auto-Extraction Mode**: If no BBox is provided, automatically extracts salient object regions using OpenCV (Edge Detection + Contour Fitting); a sketch of this extraction step is shown after this list.
+
+
+2. **Coordinate Normalization**: Converts pixel coordinates into normalized coordinates (0-1 range) compliant with VLM input standards.
+3. **Visualization Enhancement**: Generates images with numbered, colored bounding boxes to help the model understand "Region N" references.
+4. **Prompt Construction**: Automatically generates prompts containing region count information (e.g., "Describe the content of each marked region...").
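+
+As a rough illustration of the Auto-Extraction Mode, the sketch below mimics the described OpenCV pipeline (adaptive thresholding → morphology → contours). The helper name and parameter values are assumptions; the operator's real `extract_boxes_from_image` (including its NMS step) may differ:
+
+```python
+import cv2
+
+def extract_boxes_sketch(image_path: str, max_boxes: int = 10):
+    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+    # Adaptive threshold -> binary mask that is robust to uneven lighting
+    mask = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                 cv2.THRESH_BINARY_INV, 11, 2)
+    # Morphological closing to merge fragmented edges into solid regions
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
+    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
+    # External contours -> axis-aligned bounding rectangles [x, y, w, h]
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    boxes = [cv2.boundingRect(c) for c in contours]
+    # Keep the largest regions (the operator additionally applies NMS)
+    boxes.sort(key=lambda b: b[2] * b[3], reverse=True)
+    return boxes[:max_boxes]
+```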
+* **Type B (Without BBox)**: Calls `extract_boxes_from_image` to extract object contours via adaptive thresholding and morphology, applying NMS (Non-Maximum Suppression) to remove duplicates. + + +3. **Normalization & Visualization** +* **Normalization**: Converts `[x, y, w, h]` to normalized `[x1, y1, x2, y2]` format, truncating or padding with `0.0` to match `max_boxes`. +* **Visualization**: Draws green rectangles and numeric labels on the original image, saving the result to `storage.cache_path`. + + +4. **Prompt Generation** +* Generates a fixed template prompt based on the valid box count: +> "Describe the content of each marked region in the image. There are {N} regions: \ to \." + + + + +5. **Result Export** +* Writes the complete record containing raw info, normalized BBoxes, visualization paths, and the Prompt to `config.output_jsonl_path`. + + + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | Storage object, mainly used to provide the `cache_path`. | +| `input_image_key` | `str` | `"image"` | Field name for image paths in the input JSONL. | +| `input_bbox_key` | `str` | `"bbox"` | Field name for BBox data in the input JSONL. | +| `output_key` | `str` | `"mdvp_record"` | (Reserved) Key name for the output record. | + +## 🧩 Example Usage + +```python +from dataflow.utils.storage import FileStorage +from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig + +# 1) Configure Parameters +config = ExistingBBoxDataGenConfig( + max_boxes=5, + input_jsonl_path="./data/raw_images.jsonl", + output_jsonl_path="./data/processed_with_prompts.jsonl" +) + +# 2) Initialize Operator +# Note: This operator is for data prep and does not require a Serving instance +generator = ImageBboxGenerator(config=config) + +# 3) Prepare Storage (Only for providing cache path) +storage = FileStorage( + cache_path="./cache_vis_images", + file_name_prefix="bbox_gen" +) + +# 4) Execute Processing +# Automatically reads from config input, writes to config output +generator.run( + storage=storage, + input_image_key="image_path", + input_bbox_key="ground_truth_bbox" # Will auto-extract if this column is missing +) + +``` + +### 🧾 Output Data Format (Output JSONL) + +Each line in the `output_jsonl_path` file contains: + +```json +{ + "image": "/data/raw/cat.jpg", + "type": "without_bbox", // or "with_bbox" + "bbox": [[100, 200, 50, 60], ...], // Raw pixel coords [x, y, w, h] + "normalized_bbox": [ + [0.1, 0.2, 0.15, 0.26], + [0.0, 0.0, 0.0, 0.0] // Zero-padded + ], + "result_file": "./cache_vis_images", + "image_with_bbox": "./cache_vis_images/1_bbox_vis.jpg", // Path to visualized image + "valid_bboxes_num": 1, + "prompt": "Describe the content of each marked region in the image. There are 1 regions: \ to \." +} + +``` diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md b/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md index 9649a5c2..c8e82317 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_caprl.md @@ -64,7 +64,7 @@ class CapRLMCQGenerate(OperatorABC): def run( self, storage: DataFlowStorage, - image_key: str = "image", + input_image_key: str = "image", output_key: str = "cap_rl_qa", ): ... 
@@ -79,7 +79,7 @@ Reads images, generates MCQs, parses, rotates, validates under both conditions, | Parameter | Type | Default | Description | | ---------- | ----------------- | ------------- | ------------------------------------------ | | storage | `DataFlowStorage` | required | IO and caching. | -| image_key | str | `"image"` | Input field (image path). | +| input_image_key | str | `"image"` | Input field (image path). | | output_key | str | `"cap_rl_qa"` | Output field to store the full stats JSON. | --- @@ -111,7 +111,7 @@ cfg = CapRLMCQConfig( ) op = CapRLMCQGenerate(serving, cfg) -op.run(storage=storage.step(), image_key="image", output_key="cap_rl_qa") +op.run(storage=storage.step(), input_image_key="image", output_key="cap_rl_qa") ``` --- diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md b/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md index 746397f1..0171f15a 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_caption.md @@ -1,15 +1,19 @@ --- -title: Image Caption Generation +title: ImageCaptionGenerator createTime: 2025/10/15 15:00:00 -icon: material-symbols-light:image +# icon: material-symbols-light:image permalink: /en/mm_operators/generate/image_caption/ --- ## 📘 Overview -`ImageCaptionGenerate` is an operator designed to **automatically generate image captions using large vision-language models (VLMs)**. -Given input images, it constructs prompts to guide the model in producing high-quality scene or object descriptions. -This is suitable for multimodal annotation, dataset construction, and image-text matching tasks. +`ImageCaptionGenerator` is an operator designed to **automatically generate image captions using large vision-language models (VLMs)**. +Given input images, it constructs prompts to guide the model in producing high-quality scene or object descriptions. This is suitable for multimodal annotation, dataset construction, and image-text matching tasks. + +**Features:** +* Supports batch processing of multiple images. +* Generates high-quality descriptions using VLMs like Qwen. +* Automatically handles image input and prompt construction. --- @@ -25,11 +29,11 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :------------------------------------------------------------ | -| `llm_serving` | `LLMServingABC` | - | Model serving object used to call VLM for generating captions | +| Parameter | Type | Default | Description | +| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | +| `llm_serving` | `LLMServingABC` | - | **Model Serving Object** used to call the VLM for caption generation | ---- +----- ## ⚡ `run` Function @@ -37,35 +41,35 @@ def __init__( def run( self, storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "caption" + input_modal_key: str = "image", + output_key: str = "output" ): ... ``` The `run` function executes the main caption generation workflow: -read image paths → construct prompts → call the model → generate text captions → write results to output. +read image paths → **validate DataFrame** → construct prompts → call the model → generate text captions → write results to output. 
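+
+As a rough illustration of the **validate DataFrame** step: the input column must exist and hold usable image paths before any prompts are built. The sketch below is hypothetical (the helper name `validate_media_column` and the exact checks are illustrative, not the operator's internal API):
+
+```python
+import os
+import pandas as pd
+
+def validate_media_column(df: pd.DataFrame, input_modal_key: str = "image") -> None:
+    # The input column must be present, and each cell is expected to be a
+    # list of image paths pointing at readable files.
+    if input_modal_key not in df.columns:
+        raise KeyError(f"missing input column: {input_modal_key}")
+    for paths in df[input_modal_key]:
+        for p in paths:
+            if not os.path.isfile(p):
+                raise FileNotFoundError(f"image not found: {p}")
+```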
## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :---------- | :----------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `multi_modal_key` | `str` | `"image"` | Multimodal input field name | -| `output_key` | `str` | `"caption"` | Output field name for captions | +| Parameter | Type | Default | Description | +| :---------------- | :---------------- | :---------- | :---------------------------------------------------- | +| `storage` | `DataFlowStorage` | - | Dataflow storage object | +| `input_modal_key` | `str` | `"image"` | **Multimodal Input Field Name** (e.g., image paths) | +| `output_key` | `str` | `"output"` | **Model Output Field Name** (the generated description text) | ---- +----- ## 🧠 Example Usage ```python from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import ImageCaptionGenerate +from dataflow.operators.core_vision import ImageCaptionGenerator # Step 1: Launch local model service serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, @@ -74,25 +78,23 @@ serving = LocalModelVLMServing_vllm( # Step 2: Prepare input data storage = FileStorage( - first_entry_file_name="data/example_caption.jsonl", + first_entry_file_name="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl", cache_path="./cache_local", - file_name_prefix="caption", + file_name_prefix="dataflow_cache_step", cache_type="jsonl", - media_key="image", - media_type="image" ) -storage.step() +storage.step() # Load data # Step 3: Initialize and run the operator -generator = ImageCaptionGenerate(serving) +generator = ImageCaptionGenerator(serving) generator.run( storage=storage, - multi_modal_key="image", - output_key="caption" + input_modal_key="image", + output_key="caption" # Explicitly specifying output field as "caption" in the example ) ``` ---- +----- ## 🧾 Default Output Format @@ -101,7 +103,7 @@ generator.run( | `image` | `List[str]` | Input image paths | | `caption` | `str` | Generated image caption text | ---- +----- ### 📥 Example Input diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md b/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md index 389d835d..6bf2dfab 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_gcot.md @@ -54,9 +54,9 @@ def __init__( def run( self, storage: DataFlowStorage, - question_key: str = "question", - answer_key: str = "answer", - image_key: str = "image", + input_question_key: str = "question", + input_answer_key: str = "answer", + input_image_key: str = "image", output_key: str = "gcot", save_intermediate: bool = True, qwen_unload_callback = None @@ -74,9 +74,9 @@ Executes the complete GCoT generation pipeline: | Parameter | Type | Default | Description | | :--------------------- | :---------------- | :----------- | :---------------------------------------------------- | | `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `question_key` | `str` | `"question"` | Input question field name | -| `answer_key` | `str` | `"answer"` | Input answer field name | -| `image_key` | `str` | `"image"` | Input image field name | +| 
`input_question_key` | `str` | `"question"` | Input question field name | +| `input_answer_key` | `str` | `"answer"` | Input answer field name | +| `input_image_key` | `str` | `"image"` | Input image field name | | `output_key` | `str` | `"gcot"` | Output GCoT field name | | `save_intermediate` | `bool` | `True` | Whether to save intermediate results and visualizations | | `qwen_unload_callback` | `Callable` | `None` | Callback function to unload Qwen model (for memory management) | @@ -124,9 +124,9 @@ def unload_qwen(): gcot_generator.run( storage=storage, - question_key="question", - answer_key="answer", - image_key="image", + input_question_key="question", + input_answer_key="answer", + input_image_key="image", output_key="gcot", save_intermediate=True, qwen_unload_callback=unload_qwen diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md b/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md index 968f0b4b..4b965947 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_pers_qa.md @@ -1,20 +1,30 @@ --- -title: Personalized Image QA Generation +title: PersQAGenerator createTime: 2025/10/15 18:20:00 -icon: material-symbols-light:quiz +# icon: material-symbols-light:quiz permalink: /en/mm_operators/generate/image_pers_qa/ --- ## 📘 Overview -`PersQAGenerate` is an operator for **generating personalized image question-answer pairs using large vision-language models (VLMs)**. -It can: -- Automatically assign name tags to main characters in images (e.g., ``); -- Randomly select suitable questions from predefined templates; -- Guide the model to start answers with the character's name; -- Output structured QA pairs suitable for multimodal QA dataset construction and evaluation of character understanding. +`PersQAGenerator` is an operator designed to **generate personalized image Question-Answer (QA) pairs based on large vision-language models (VLMs)**. +It performs the following steps: ---- + * Automatically assigns a name tag to the main character in the image (hardcoded as `` in the implementation). + * Randomly selects an appropriate question from predefined templates. + * Guides the VLM to start the answer with the character's name tag. + * Outputs structured QA pairs, suitable for multimodal QA dataset construction and character role understanding evaluation. + +**Features:** + + * Supports generating personalized QA for specific characters in images. + * Automatically assigns name tags (e.g., ``) to the main subject. + * Randomly selects relevant questions from predefined templates. + * Requires the model to start answers with the main character's name tag. + * Supports batch processing of multiple images. + * Output includes the complete Question-Answer pair in the format: `Question: ..., Answer: ...`. 
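+
+Because the operator emits each QA pair as a single string, a small post-processing step is often needed to split it into fields. A minimal sketch (the `split_pers_qa` helper and its splitting logic are our assumption about the `Question: ..., Answer: ...` format, not part of the operator):
+
+```python
+def split_pers_qa(text: str) -> dict:
+    # Expects "Question: <q>, Answer: <a>"; returns {} if the format differs.
+    q_tag, a_tag = "Question:", "Answer:"
+    if q_tag not in text or a_tag not in text:
+        return {}
+    question, answer = text.split(a_tag, 1)
+    question = question.replace(q_tag, "", 1).strip().rstrip(",")
+    return {"question": question, "answer": answer.strip()}
+
+print(split_pers_qa("Question: Where is she?, Answer: is in a cafe."))
+# -> {'question': 'Where is she?', 'answer': 'is in a cafe.'}
+```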
+ +----- ## 🏗️ `__init__` Function @@ -28,11 +38,11 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :------------------------------------------------------------ | -| `llm_serving` | `LLMServingABC` | - | Model serving object used to call VLM for generating QA pairs | +| Parameter | Type | Default | Description | +| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | +| `llm_serving` | `LLMServingABC` | - | **Model Serving Object** used to call the VLM for QA generation | ---- +----- ## ⚡ `run` Function @@ -40,71 +50,68 @@ def __init__( def run( self, storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "pers_qa" + input_modal_key: str = "image", + output_key: str = "output" ): ... ``` -The `run` function executes the main QA generation workflow: -read image paths → construct questions and prompts → call the model → return structured QA results. +The `run` function executes the main QA generation logic: read image paths → construct questions and prompts → call the model → return structured QA results. ## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :---------- | :----------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `multi_modal_key` | `str` | `"image"` | Multimodal input field name | -| `output_key` | `str` | `"pers_qa"` | Output field name for QA pairs | +| Parameter | Type | Default | Description | +| :---------------- | :---------------- | :---------- | :------------------------------------------------------------------- | +| `storage` | `DataFlowStorage` | - | Dataflow storage object | +| `input_modal_key` | `str` | `"image"` | **Multimodal Input Field Name** (image path) | +| `output_key` | `str` | `"output"` | **Model Output Field Name** (personalized QA text, defaults to `output`) | ---- +----- ## 🧠 Example Usage ```python from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import PersQAGenerate +from dataflow.operators.core_vision import PersQAGenerator # Step 1: Launch local model service serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, vllm_max_tokens=512 ) -# Step 2: Prepare storage +# Step 2: Set up storage storage = FileStorage( - first_entry_file_name="data/example.jsonl", + first_entry_file_name="dataflow/example/Image2TextPipeline/test_image2caption.jsonl", cache_path="./cache_local", file_name_prefix="pers_qa", cache_type="jsonl", - media_key="image", - media_type="image" ) storage.step() # Step 3: Initialize and run the operator -generator = PersQAGenerate(serving) +generator = PersQAGenerator(serving) generator.run( storage=storage, - multi_modal_key="image", + input_modal_key="image", output_key="pers_qa" ) ``` ---- +----- ## 🧾 Default Output Format -| Field | Type | Description | -| :-------- | :---------- | :----------------------------------------------------------------------- | -| `image` | `List[str]` | Input image paths | -| `pers_qa` | `str` | Generated personalized QA text in the format `Question: ... 
Answer: ...` | +| Field | Type | Description | +| :-------- | :---------- | :------------------------------------------------------------------- | +| `image` | `List[str]` | Input image paths | +| `pers_qa` | `str` | Generated personalized QA pair text, format: `Question: ..., Answer: ...` | ---- +----- ### 📥 Example Input @@ -116,6 +123,8 @@ generator.run( ### 📤 Example Output ```jsonl -{"image": ["./test/example1.jpg"], "pers_qa": "Question: What is she doing? Answer: is smiling at the camera."} -{"image": ["./test/example2.jpg"], "pers_qa": "Question: Where is she? Answer: is in a cafe."} -``` \ No newline at end of file +{"image": ["./test/example1.jpg"], "pers_qa": "Question: is doing what?, Answer: is smiling at the camera."} +{"image": ["./test/example2.jpg"], "pers_qa": "Question: Where is ?, Answer: is in a cafe."} +``` + +> **Tips:** Using a stronger Multimodal Large Language Model (MLLM) can ensure more accurate format generation. \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md b/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md index a76dde2a..db71f0f9 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_qa.md @@ -1,16 +1,23 @@ --- -title: Image Question-Answer Generation +title: ImageQAGenerator createTime: 2025/10/15 16:00:00 -icon: material-symbols-light:quiz +# icon: material-symbols-light:quiz permalink: /en/mm_operators/generate/image_qa/ --- ## 📘 Overview -`ImageQAGenerate` is an operator for **automatically generating question-answer pairs (Visual QA) based on image content**. -It can intelligently ask relevant questions about the image scene and generate reference answers. This is suitable for multimodal QA dataset construction, retrieval augmentation, and image-text matching enhancement. +`ImageQAGenerator` is an operator designed to **automatically generate Question-Answer (QA) pairs based on image content (Visual QA)**. +It intelligently proposes relevant questions based on the image scene and generates corresponding reference answers. ---- +**Features:** + + * Supports batch processing of multiple images. + * Automatically generates relevant QA pairs using Vision-Language Models (VLMs). + * Applicable for Visual QA dataset construction and model training. + * Automatically handles image input and QA prompt construction. + +----- ## 🏗️ `__init__` Function @@ -24,11 +31,11 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :-------------------------------------------------------------------------- | -| `llm_serving` | `LLMServingABC` | - | Model serving object used to call a vision-language model for QA generation | +| Parameter | Type | Default | Description | +| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | +| `llm_serving` | `LLMServingABC` | - | **Model Serving Object** used to call the VLM for QA generation | ---- +----- ## ⚡ `run` Function @@ -36,34 +43,34 @@ def __init__( def run( self, storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "qa_pairs" + input_modal_key: str = "image", + output_key: str = "output" ): ... ``` -The `run` function executes the main QA generation workflow: generates multiple QA pairs for the input images and writes them to the output file. 
+The `run` function executes the main operator logic: read image paths → **validate DataFrame** → construct prompts → call the model → generate Visual QA pairs and write them to the output file. ## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :----------- | :----------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `multi_modal_key` | `str` | `"image"` | Multimodal input field name | -| `output_key` | `str` | `"qa_pairs"` | Output field name for QA pairs | +| Parameter | Type | Default | Description | +| :---------------- | :---------------- | :---------- | :------------------------------------------------------------------------ | +| `storage` | `DataFlowStorage` | - | Dataflow storage object | +| `input_modal_key` | `str` | `"image"` | **Multimodal Input Field Name** (e.g., image paths) | +| `output_key` | `str` | `"output"` | **Output QA Field Name** (defaults to `output`, can be customized) | ---- +----- ## 🧠 Example Usage ```python from dataflow.utils.storage import FileStorage from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import ImageQAGenerate +from dataflow.operators.core_vision import ImageQAGenerator # Step 1: Launch local model service serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, @@ -72,34 +79,34 @@ serving = LocalModelVLMServing_vllm( # Step 2: Prepare input data storage = FileStorage( - first_entry_file_name="data/example_qa.jsonl", + first_entry_file_name="dataflow/example/Image2TextPipeline/test_image2qa.jsonl", cache_path="./cache_local", file_name_prefix="imageqa", cache_type="jsonl", - media_key="image", - media_type="image" ) -storage.step() +storage.step() # Load data # Step 3: Initialize and run the operator -qa_generator = ImageQAGenerate(serving) +qa_generator = ImageQAGenerator(serving) qa_generator.run( storage=storage, - multi_modal_key="image", - output_key="qa_pairs" + input_modal_key="image", + output_key="qa_pairs" # Explicitly specifying output field as "qa_pairs" in the example ) ``` ---- +----- ## 🧾 Default Output Format -| Field | Type | Description | -| :--------- | :--------------------- | :----------------------------------------------------------------- | -| `image` | `List[str]` | Input image paths | -| `qa_pairs` | `List[Dict[str, str]]` | Generated QA pairs, each containing `question` and `answer` fields | +| Field | Type | Description | +| :--------- | :--------------------- | :------------------------------------------------------------------- | +| `image` | `List[str]` | Input image paths | +| `qa_pairs` | `List[Dict[str, str]]` | Generated QA pairs, containing `question` and `answer` fields | ---- +> **Note:** The raw output from the model (`output_key`) is typically a single string containing all QA pairs. A subsequent operator (like `JsonParser`) is usually required to structure this output into the `List[Dict[str, str]]` format shown here. 
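+
+For reference, a minimal standalone parse of that raw string might look as follows (a sketch assuming the model was prompted to answer with a JSON array; inside a pipeline, `JsonParser` plays this role):
+
+```python
+import json
+import re
+
+def parse_qa_pairs(raw_output: str) -> list:
+    # Pull the first JSON array out of the model's free-form text, if any.
+    match = re.search(r"\[.*\]", raw_output, re.DOTALL)
+    if match is None:
+        return []
+    try:
+        return json.loads(match.group(0))
+    except json.JSONDecodeError:
+        return []
+```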
+ +----- ### 📥 Example Input @@ -107,15 +114,15 @@ qa_generator.run( {"image": ["./test/street_scene.jpg"]} ``` -### 📤 Example Output +### 📤 Example Output (Structured) ```jsonl { "image": ["./test/street_scene.jpg"], "qa_pairs": [ - {"question": "How many cars are in the image?", "answer": "Two"}, - {"question": "What kind of scene is captured in this photo?", "answer": "City street"}, - {"question": "What is the main type of transportation in the image?", "answer": "Cars"} + {"question": "How many cars are in the image?", "answer": "Two cars"}, + {"question": "What is the scene depicted in this photo?", "answer": "A city street"}, + {"question": "What is the main mode of transportation shown?", "answer": "A car"} ] } ``` \ No newline at end of file diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md b/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md index 986c2cd7..5c9c4ea2 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_region_caption.md @@ -36,8 +36,8 @@ def __init__( def run( self, storage: DataFlowStorage, - image_key: str = "image", - bbox_key: str = "bbox", + input_image_key: str = "image", + input_bbox_key: str = "bbox", output_key: str = "mdvp_record" ): ``` @@ -47,8 +47,8 @@ def run( | Parameter | Type | Default | Description | | :--- | :--- | :--- | :--- | | storage | DataFlowStorage | No default (required) | Storage instance for file operations and cache path management | -| image_key | str | "image" | Field name for image path in input JSONL data | -| bbox_key | str | "bbox" | Field name for bounding boxes in input data. If missing, automatically extracts from image | +| input_image_key | str | "image" | Field name for image path in input JSONL data | +| input_bbox_key | str | "bbox" | Field name for bounding boxes in input data. If missing, automatically extracts from image | | output_key | str | "mdvp_record" | Field name for output region caption records in result data | @@ -88,8 +88,8 @@ operator = ImageRegionCaptionGenerate(llm_serving=model, config=cfg) operator.run( storage=storage.step(), - image_key="image", - bbox_key="bbox", + input_image_key="image", + input_bbox_key="bbox", output_key="mdvp_record" ) diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md b/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md index 01586b25..4423cad6 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_scale_caption.md @@ -66,7 +66,7 @@ Returns the operator description in either Chinese or English. def run( self, storage: DataFlowStorage, - image_key: str = "image", + input_image_key: str = "image", output_key: str = "scalecap_record" ) ``` @@ -76,7 +76,7 @@ def run( | Parameter | Type | Default | Description | | ------------ | ----------------- | ------------------- | --------------------------------------------------------- | | `storage` | `DataFlowStorage` | — | DataFlow storage interface for reading and writing data. | -| `image_key` | `str` | `"image"` | The column name for image paths in the input. | +| `input_image_key` | `str` | `"image"` | The column name for image paths in the input. | | `output_key` | `str` | `"scalecap_record"` | The column name where the generated captions are written. 
| --- @@ -107,7 +107,7 @@ operator = ImageScaleCaptionGenerate(vlm_serving=model, config=cfg) # Run the operator operator.run( storage=storage, - image_key="image", + input_image_key="image", output_key="scalecap_record" ) ``` diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_skvqa.md b/docs/en/notes/mm_operators/image_understanding/generate/image_skvqa.md deleted file mode 100644 index 5850e976..00000000 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_skvqa.md +++ /dev/null @@ -1,163 +0,0 @@ ---- - -title: Image-based Knowledge-Enhanced Question Answering Generation (SKVQA) -createTime: 2025/10/26 15:00:00 -icon: material-symbols-light:image -permalink: /en/mm_operators/generate/image_skvqa/ ---- - -## 📘 Overview - -`ImageSKVQAGenerate` is an operator designed to generate **Synthetic Knowledge Visual Question Answering (SKVQA)** data. -Unlike standard Visual Question Answering (VQA), SKVQA integrates external **contextual knowledge** into the question–answer generation process, -enabling the model to reason based not only on the image itself but also on relevant textual descriptions or background information. - -This capability is highly useful for **visual knowledge understanding, product manual QA generation, and multimodal knowledge-enhanced training** tasks. - ---- - -## 🏗️ `__init__` Function - -```python -def __init__( - self, - llm_serving: LLMServingABC -): - ... -``` - -## 🧾 `__init__` Parameters - -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :---------------------------------------------------------------------------------------- | -| `llm_serving` | `LLMServingABC` | - | The model serving object used to call a Vision-Language Model (VLM) for SKVQA generation. | - ---- - -## ⚡ `run` Function - -```python -def run( - self, - storage: DataFlowStorage, - multi_modal_key: str = "image", - output_key: str = "skvqa" -): - ... -``` - -Executes the main operator logic to generate structured SKVQA outputs — including contextual text (`context`) and question–answer pairs (`QAs`) — for each input image. - ---- - -## 🧾 `run` Parameters - -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :-------- | :----------------------------------------------------------- | -| `storage` | `DataFlowStorage` | - | The DataFlow storage object. | -| `multi_modal_key` | `str` | `"image"` | The multimodal input field name (usually the image path). | -| `output_key` | `str` | `"skvqa"` | The output field name used to store the parsed SKVQA result. | - ---- - -## 🧠 Operator Functionality - -* Automatically generates a structured **SKVQA output** for each image, containing: - - * `context`: Contextual background information or knowledge relevant to the image. - * `qas`: A list of question–answer pairs (`question`, `answer`). - -* Parses model outputs formatted in Markdown, such as: - - ``` - ### Wikipedia Article - (context text) - - ### Question Answer Pairs - 1. **Question** - - Answer - 2. **Question** - - Answer - ``` - -* Supports **fault-tolerant parsing**, meaning even imperfectly formatted text can be interpreted as best as possible. - -* Applicable for **visual knowledge enhancement, multimodal training, and QA generation** tasks. 
- ---- - -## 🧩 Example Usage - -```python -from dataflow.utils.storage import FileStorage -from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision.generate.sk_vqa_generator import ImageSKVQAGenerate - -# Step 1: Launch a local vision-language model -serving = LocalModelVLMServing_vllm( - hf_model_name_or_path="./models/Qwen2.5-VL-3B-Instruct", - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, - vllm_top_p=0.9, - vllm_max_tokens=512 -) - -# Step 2: Prepare input data -storage = FileStorage( - first_entry_file_name="data/example_skvqa.jsonl", - cache_path="./cache_skvqa", - cache_type="jsonl" -) -storage.step() - -# Step 3: Initialize the operator and run it -skvqa_generator = ImageSKVQAGenerate(serving) -skvqa_generator.run( - storage=storage, - multi_modal_key="image", - output_key="skvqa" -) -``` - ---- - -## 🧾 Default Output Format - -| Field | Type | Description | -| :------ | :--------------- | :----------------------------------------------------------------------------------- | -| `image` | `List[str]` | List of input image paths. | -| `skvqa` | `Dict[str, Any]` | The structured SKVQA output generated by the model, including context and Q&A pairs. | - ---- - -### 📥 Example Input - -```jsonl -{"image": ["./data/product_manual.jpg"]} -``` - -### 📤 Example Output - -```jsonl -{ - "image": ["./data/product_manual.jpg"], - "skvqa": { - "context": "This is a section from a smartwatch user manual showing the health monitoring interface.", - "qas": [ - {"question": "What device is shown in the image?", "answer": "A smartwatch"}, - {"question": "What are its main features?", "answer": "It supports heart rate monitoring, step tracking, and sleep analysis."}, - {"question": "What is the main topic of this text?", "answer": "An introduction to smartwatch functions"} - ] - } -} -``` - ---- - -## 💡 Key Features - -* ✅ Supports batch image inputs -* ✅ Automatically generates structured context + Q&A results -* ✅ Built-in format cleaning and fault tolerance -* ✅ Compatible with any vision–language model (e.g., Qwen-VL, InternVL, MiniCPM-V) -* ✅ Ideal for multimodal knowledge enhancement, retrieval QA, and data generation tasks diff --git a/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md b/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md index 5a4de97b..7a237e23 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/multimodal_math.md @@ -1,23 +1,23 @@ --- -title: Multimodal Math Question Generation +title: MultimodalMathGenerator createTime: 2025/10/15 19:00:00 -icon: material-symbols-light:functions +# icon: material-symbols-light:functions permalink: /en/mm_operators/generate/multimodal_math/ --- ## 📘 Overview -`MultimodalMathGenerate` is an operator for **automatically generating math function plots along with math question-answer pairs**. -It supports various function types such as linear, quadratic, sine, and exponential functions. Users can generate both simple and complex math problems, with automatically plotted corresponding function images. This is suitable for educational scenarios, visual QA model training, and mathematical reasoning evaluation. +`MultimodalMathGenerator` is a data generation operator for **automatically creating function plots (images) and corresponding math Question-Answer (QA) pairs**. 
+It supports various function types (linear, quadratic, sine, exponential, etc.) and generates simple calculation problems or advanced conceptual problems based on the `mode` field (`simple` or `complex`) in the input data. This operator is suitable for educational applications, visual QA model training, and math reasoning evaluation. ---- +----- ## 🏗️ `__init__` Function ```python def __init__( self, - image_dir: str = "/data0/mt/Dataflow-MM-Preview/cache", + image_dir: str = "~/cache", seed: int | None = None ): ... @@ -25,12 +25,12 @@ def __init__( ## 🧾 `__init__` Parameters -| Parameter | Type | Default | Description | -| :---------- | :------------ | :-------------------------------------- | :----------------------------------------- | -| `image_dir` | `str` | `"/data0/mt/Dataflow-MM-Preview/cache"` | Directory to save generated function plots | -| `seed` | `int \| None` | `None` | Random seed for reproducibility | +| Parameter | Type | Default | Description | +| :---------- | :------------ | :------ | :-------------------------------------------------------------- | +| `image_dir` | `str` | `"~/cache"` | Directory used to save the generated function plots | +| `seed` | `int \| None` | `None` | Random seed to ensure reproducibility of generated results | ---- +----- ## ⚡ `run` Function @@ -38,108 +38,117 @@ def __init__( def run( self, storage: DataFlowStorage, - n: int = 200, - mode: str = "complex", - output_key: str = "multimodal_math" + input_key: str = "mode", ): ... ``` -Executes the main workflow, automatically generating a specified number of function plots along with corresponding math QA pairs. +The `run` function executes the main operator logic: it reads the data from `storage`, generates the corresponding function image and math QA pair based on the value in the `input_key` field for each row, and then horizontally concatenates the newly generated columns back to the original data. ---- +----- ## 🧾 `run` Parameters -| Parameter | Type | Default | Description | -| :----------- | :---------------- | :------------------ | :---------------------------------------------------------------------------------------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `n` | `int` | `200` | Number of samples to generate | -| `mode` | `str` | `"complex"` | Generation mode: `"simple"` for straightforward numeric problems, `"complex"` for advanced concept problems | -| `output_key` | `str` | `"multimodal_math"` | Output field name prefix for generated data | +| Parameter | Type | Default | Description | +| :---------- | :---------------- | :------ | :----------------------------------------------------------------------- | +| `storage` | `DataFlowStorage` | - | Dataflow storage object (contains the rows to be processed) | +| `input_key` | `str` | `"mode"` | **Field name for the mode column**. Its value determines whether to generate a `"simple"` or `"complex"` problem. | ---- +----- + +## 🧠 Mode Description and Example Usage -## 🧠 Example Usage +### 📐 Mode Description + +| Mode | `mode` Column Value | Characteristics | Problem Type | +| :--- | :--- | :--- | :--- | +| **Simple** | `"simple"` | Basic function recognition and numerical substitution. | Given the function expression $f(x)$, find the function value $f(a)$ at $x=a$. | +| **Complex** | Other values (e.g., `"complex"`) | Emphasizes mathematical analysis skills (derivatives, extrema, monotonicity). 
| Randomly generates questions on derivative sign, extreme points within an interval, or monotonicity judgment. | + +### 🧩 Example Usage (Requires an input file pre-populated with a `mode` column) ```python from dataflow.utils.storage import FileStorage -from dataflow.operators.core_math import MultimodalMathGenerate +from dataflow.operators.core_math import MultimodalMathGenerator +import pandas as pd + +# Step 1: Prepare an input file containing the 'mode' column (e.g., data/math_tasks.jsonl) +# Assuming data/math_tasks.jsonl contains: +# {"id": 1, "mode": "simple"} +# {"id": 2, "mode": "complex"} +# {"id": 3, "mode": "complex"} -# Step 1: Prepare storage storage = FileStorage( - first_entry_file_name="data/math_samples.jsonl", + first_entry_file_name="data/math_tasks.jsonl", cache_path="./cache_local", - file_name_prefix="math", + file_name_prefix="math_out", cache_type="jsonl" ) +storage.step() # Load data -# Step 2: Initialize operator -math_generator = MultimodalMathGenerate( +# Step 2: Initialize the operator +math_generator = MultimodalMathGenerator( image_dir="./math_plots", seed=42 ) -# Step 3: Generate complex math problems (derivatives, extrema, monotonicity) +# Step 3: Run the operator, generating problems based on the 'mode' column of each row math_generator.run( storage=storage, - n=10, - mode="complex", - output_key="multimodal_math" -) - -# Step 4: Generate simple numeric problems -math_generator.run( - storage=storage, - n=10, - mode="simple", - output_key="multimodal_math_simple" + input_key="mode" # Specify 'mode' column to control generation ) ``` ---- +----- ## 🧾 Default Output Format -| Field | Type | Description | -| :----------- | :---- | :------------------------------------ | -| `image_path` | `str` | Path to the generated function plot | -| `question` | `str` | Automatically generated math question | -| `answer` | `str` | Answer to the question | -| `solution` | `str` | Detailed step-by-step solution | +The operator will **horizontally concatenate** the following four fields onto the original input DataFrame: ---- +| Field | Type | Description | +| :----------- | :--- | :-------------------------------------------- | +| `image_path` | `str` | Local path where the function plot image is saved | +| `question` | `str` | Automatically generated mathematical question | +| `answer` | `str` | Answer | +| `solution` | `str` | Detailed solution steps and explanation | + +----- ### 📥 Example Input +> **Note:** The operator relies on the number of rows in the input `storage` and the value of the `input_key` column (defaults to `mode`) to generate data. + ```jsonl -{} +{"id": 1, "mode": "simple"} +{"id": 2, "mode": "complex"} ``` -> This operator does not depend on external input data and generates samples directly. - ---- +----- -### 📤 Example Output (Simple Mode) +### 📤 Example Output (Simple Mode Row) ```jsonl { + "id": 1, + "mode": "simple", "image_path": "./math_plots/plot_0.png", - "question": "The function plot represents f(x) = x². What is the value of the function at x=3.5?", + "question": "The function plot represents f(x) = x². What is the function value at x=3.5?", "answer": "12.25", - "solution": "According to the function expression f(x) = x², substituting x=3.5 gives y=12.25." + "solution": "According to the function expression f(x) = x², substitute x=3.5 to get y=12.25." 
} ``` -----
-### 📤 Example Output (Complex Mode)
+### 📤 Example Output (Complex Mode Row)

```jsonl
{
-  "image_path": "./math_plots/plot_7.png",
-  "question": "The function plot represents f(x) = sin(x). Determine whether the rate of change of the function at x=2.5 is positive or negative.",
-  "answer": "Negative",
-  "solution": "By observing the slope near x=2.5 on the plot, the rate of change is negative."
+  "id": 2,
+  "mode": "complex",
+  "image_path": "./math_plots/plot_1.png",
+  "question": "The function plot represents f(x) = sin(x). Is the rate of change (derivative) at x=2.5 positive or negative?",
+  "answer": "negative",
+  "solution": "By observing the slope of the plot near x=2.5, the rate of change is negative."
}
```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md b/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
new file mode 100644
index 00000000..2f45e04c
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md
@@ -0,0 +1,140 @@
+---
+title: MultiRole Video QA Generation
+createTime: 2025/12/2 20:00:00
+icon: material-symbols-light:video
+permalink: /en/mm_operators/generate/multirole_videoqa/
+---
+
+## 📘 Overview
+
+`MultiroleVideoQAGenerate` is a data generation operator for **automatically creating Question-Answer (QA) pairs from preprocessed video data**.
+Given preprocessed video data as input, it constructs several QA pairs related to the video. This is suitable for advertisement video annotation, dataset construction, and video understanding tasks.
+
+**Features:**
+* Supports batch processing of multiple preprocessed videos.
+* Generates high-quality QA pairs using VLMs such as Qwen2.5-VL.
+* Automatically handles video input and prompt construction for data generation.
+
+---
+
+## 🏗️ `__init__` Function
+
+```python
+def __init__(
+    self,
+    llm_serving: VLMServingABC
+):
+    ...
+```
+## 🧾 `__init__` Parameters
+
+| Parameter | Type | Default | Description |
+| :------------ | :-------------- | :------ | :-------------------------------------------------------------- |
+| `llm_serving` | `VLMServingABC` | - | **Model Serving Object** used to call the VLM for QA-pair generation |
+
+-----
+
+## ⚡ `run` Function
+
+```python
+def run(
+    self,
+    storage: DataFlowStorage,
+    input_meta_key: str = "Meta",
+    input_clips_key: str = "Clips",
+    output_key: str = "QA"
+):
+    ...
+```
+
+The `run` function executes the main QA-pair generation workflow:
+read data paths → **validate DataFrame** → construct prompts → call the model → generate QA pairs → write results to output.
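+
+Before running, each input row must already carry the preprocessed structure that the prompts are built from. A minimal sketch of writing one such row (all field values are placeholders):
+
+```python
+import json
+
+record = {
+    "Meta": "30-second advertisement for a sports drink",
+    "Clips": [
+        {
+            "Audio_Text": "transcribed speech for clip 1",
+            "Frames_Images": ["frames/clip1_f0.jpg", "frames/clip1_f1.jpg"],
+            "Description": "an athlete running at sunrise",
+        },
+    ],
+}
+with open("multirole_input.jsonl", "w") as f:
+    f.write(json.dumps(record) + "\n")
+```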
## 🧾 `run` Parameters
+
+| Parameter | Type | Default | Description |
+| :---------------- | :---------------- | :---------- | :---------------------------------------------------- |
+| `storage` | `DataFlowStorage` | - | Dataflow storage object |
+| `input_meta_key` | `str` | `"Meta"` | **Input Field Name** for the video's meta information |
+| `input_clips_key` | `str` | `"Clips"` | **Input Field Name** for the list of video clips |
+| `output_key` | `str` | `"QA"` | **Model Output Field Name** (the generated QA pairs) |
+
+-----
+
+## 🧠 Example Usage
+
+```python
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.operators.core_vision import MultiroleVideoQAInitialGenerator, MultiroleVideoQAMultiAgentGenerator, MultiroleVideoQAFinalGenerator
+
+# Step 1: Launch local model service
+llm_serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct",
+    vllm_tensor_parallel_size=1,
+    vllm_temperature=0.7,
+    vllm_top_p=0.9,
+    vllm_max_tokens=6000,
+)
+
+# Step 2: Prepare input data
+storage = FileStorage(
+    first_entry_file_name="data/multirole_input.jsonl",
+    cache_path="./cache_local",
+    file_name_prefix="multirole_videoqa",
+    cache_type="jsonl",
+)
+
+# Step 3: Initialize and run the three-stage operator chain
+initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving=llm_serving)
+multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(llm_serving=llm_serving, max_iterations=3)
+final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving=llm_serving)
+
+init_df = initial_QA_generation.run(
+    storage=storage.step(),
+    input_meta_key="Meta",
+    input_clips_key="Clips",
+    output_key="QA"
+)
+middle_df = multiAgent_QA_generation.run(
+    df=init_df,
+    input_meta_key="Meta",
+    input_clips_key="Clips",
+    output_key="QA"
+)
+final_QA_generation.run(
+    storage=storage,
+    df=middle_df,
+    input_meta_key="Meta",
+    input_clips_key="Clips",
+    output_key="QA"
+)
+```
+
+-----
+
+## 🧾 Default Output Format
+
+| Field | Type | Description |
+| :-------- | :----------- | :------------------------------- |
+| `Meta` | `str` | Meta information for the video |
+| `Clips` | `List[Dict]` | Interleaved-modality video clips |
+| `QA` | `List[Dict]` | Generated QA pairs |
+
+-----
+
+### 📥 Example Input
+
+```jsonl
+{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}]}
+```
+
+### 📤 Example Output
+
+```jsonl
+{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}], "QA":[{"Label":"label1", "Question": "Question1", "Answer": "Answer1"},{"Label":"label2", "Question": "Question2", "Answer": "Answer2"}]}
+```
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/prompt_templated_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/prompt_templated_vqa_generator.md
new file mode 100644
index 00000000..425b5273
--- /dev/null
+++ 
b/docs/en/notes/mm_operators/image_understanding/generate/prompt_templated_vqa_generator.md @@ -0,0 +1,132 @@ +--- +title: PromptTemplatedVQAGenerator +createTime: 2026/01/11 21:25:34 +permalink: /en/mm_operators/generate/prompt_templated_vqa_generator/ +--- +## 📘 Overview + +`PromptTemplatedVQAGenerator` is a **Template-Based Multimodal VQA Operator**. It allows users to dynamically inject multiple fields from a DataFrame into a predefined Prompt Template to generate customized text instructions, which are then combined with image or video inputs for batch inference. + +Unlike standard VQA operators, this operator supports complex prompt construction logic (e.g., dynamically filling in categories, context descriptions, etc.), making it highly suitable for scenarios requiring **structured prompt engineering**, such as attribute-guided image captioning or controlled dialogue simulation. + +## 🏗️ `__init__` Function + +```python +def __init__( + self, + serving: LLMServingABC, + prompt_template: NamedPlaceholderPromptTemplate, + system_prompt: str = "You are a helpful assistant.", +): + +``` + +### 🧾 Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (must support multimodal inputs). | +| `prompt_template` | `NamedPlaceholderPromptTemplate` | N/A | A template object implementing `build_prompt` to convert dictionary data into a string prompt. | +| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. | + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_image_key: str = "image", + input_video_key: str = "video", + output_answer_key: str = "answer", + **input_keys, +): + ... + +``` + +Executes the main logic: + +1. **Read Data** +Reads the DataFrame from `storage`. +2. **Dynamic Prompt Construction** +Iterates through each row of the DataFrame: +* Extracts data from columns specified in `input_keys` (e.g., `descriptions` column, `type` column). +* Calls `prompt_template.build_prompt()` to fill these values into the template, generating a unique `prompt_text` for that sample. + + +3. **Multimodal Input Assembly** +* Reads media paths from `input_image_key` or `input_video_key`. +* Packages the generated text prompt with the corresponding image/video data into the format required by the model. + + +4. **Inference & Output** +* Calls the model service for batch generation. +* Writes the results to the column specified by `output_answer_key` and saves the updated DataFrame. + + + +### 🧾 `run` Parameters + +| Parameter | Type | Default | Description | +| --- | --- | --- | --- | +| `storage` | `DataFlowStorage` | N/A | DataFlow storage object. | +| `input_image_key` | `str` | `"image"` | Column name for image paths (mutually exclusive with video_key). | +| `input_video_key` | `str` | `"video"` | Column name for video paths (mutually exclusive with image_key). | +| `output_answer_key` | `str` | `"answer"` | Column name for the generated output. | +| `**input_keys` | `kwargs` | N/A | **Key Parameter**. Defines the mapping between template placeholders and DataFrame columns.
Format: `template_var="dataframe_column"`. |
+
+## 🧩 Example Usage
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.core import LLMServing
+from dataflow.prompts.prompt_template import NamedPlaceholderPromptTemplate
+from dataflow.operators.generate import PromptTemplatedVQAGenerator
+
+# 1) Define a template with placeholders
+# We want the model to check for a specific object type, referencing existing descriptions
+TEMPLATE = (
+    "Context: {descriptions}\n\n"
+    "Task: Describe the appearance of {type} in the image based on the context above."
+)
+prompt_template = NamedPlaceholderPromptTemplate(template=TEMPLATE)
+
+# 2) Initialize Operator
+op = PromptTemplatedVQAGenerator(
+    serving=LLMServing(model_path="Qwen/Qwen2.5-VL-3B-Instruct"),
+    prompt_template=prompt_template
+)
+
+# 3) Prepare Data (assuming the jsonl has image, meta_desc, obj_type columns)
+storage = FileStorage(file_name_prefix="vqa_task")
+storage.step()
+
+# 4) Run Operator: Map 'meta_desc' to {descriptions}, 'obj_type' to {type}
+op.run(
+    storage=storage,
+    input_image_key="image",
+    output_answer_key="generated_caption",
+    # Dynamic Mapping:
+    descriptions="meta_desc",
+    type="obj_type"
+)
+```
+
+### 🧾 Input/Output Example
+
+**Input DataFrame Row:**
+| image | meta_desc | obj_type |
+| :--- | :--- | :--- |
+| `"/path/to/car.jpg"` | `"A photo taken on a sunny day."` | `"vintage car"` |
+
+**Constructed Prompt:**
+
+> "Context: A photo taken on a sunny day.\n\nTask: Describe the appearance of **vintage car** in the image based on the context above."
+
+**Output DataFrame Row:**
+| image | meta_desc | obj_type | generated_caption |
+| :--- | :--- | :--- | :--- |
+| `"/path/to/car.jpg"` | `...` | `...` | `"The vintage car is red with..."` |
\ No newline at end of file
diff --git a/docs/en/notes/mm_operators/image_understanding/generate/prompted_vqa_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/prompted_vqa_generator.md
new file mode 100644
index 00000000..4cb870be
--- /dev/null
+++ b/docs/en/notes/mm_operators/image_understanding/generate/prompted_vqa_generator.md
@@ -0,0 +1,133 @@
+---
+title: PromptedVQAGenerator
+createTime: 2026/01/11 21:37:37
+permalink: /en/mm_operators/generate/prompted_vqa_generator/
+---
+## 📘 Overview
+
+`PromptedVQAGenerator` is a **General-Purpose Multimodal VQA Operator**.
+
+It reads **Prompts** and **Optional Media Paths (Image/Video)** directly from a DataFrame to generate answers. This operator is highly flexible:
+
+* **Multimodal Support**: Performs VQA with text and image/video inputs.
+* **Pure Text Support**: Automatically switches to pure text chat mode if no image or video columns are provided or if paths are empty.
+* **Flexible Input Formats**: Can read raw text prompts or parse conversation-style lists.
+* **Compatibility**: Automatically handles Chat Template encapsulation for local models (Local VLLM) and direct calls for API models.
+
+## 🏗️ `__init__` Function
+
+```python
+def __init__(
+    self,
+    serving: LLMServingABC,
+    system_prompt: str = "You are a helpful assistant."
+):
+
+```
+
+### 🧾 Parameters
+
+| Parameter | Type | Default | Description |
+| --- | --- | --- | --- |
+| `serving` | `LLMServingABC` | N/A | The model serving instance for inference (supports Local or API models). |
+| `system_prompt` | `str` | `"You are..."` | The system prompt sent to the model. 
| + +## ⚡ `run` Function + +```python +def run( + self, + storage: DataFlowStorage, + input_prompt_key: str = None, + input_conversation_key: str = None, + input_image_key: str = None, + input_video_key: str = None, + output_answer_key: str = "answer", +): + ... + +``` + +Executes the main logic: + +1. **Data Loading & Prompt Extraction** +* Reads the DataFrame from `storage`. +* **Prompt Source (Mutually Exclusive)**: +* `input_prompt_key`: Reads the text string from this column as the User Prompt. +* `input_conversation_key`: Reads the conversation list (List[Dict]) and extracts the content of the first User Message. + + + + +2. **Media Processing** +* Attempts to read `input_image_key` and `input_video_key`. +* **Pure Text Mode Detection**: If media columns are not provided or media paths are empty/None for a row, the operator constructs a **Pure Text** request without `` or `