11 changes: 5 additions & 6 deletions docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
@@ -19,10 +19,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_audio import (
SileroVADGenerator,
- MergeChunksByTimestamps,
+ MergeChunksRowGenerator,
PromptedAQAGenerator,
- # CTCForcedAlignFilter, # Import this for filtering instead of evaluation
- CTCForcedAlignSampleEvaluator,
+ # CTCForcedAlignmentFilter, # Import this for filtering instead of evaluation
+ CTCForcedAlignmentSampleEvaluator,
)
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -57,7 +57,7 @@ class Pipeline:
num_workers=2, # Process count; each process loads one model instance
)

- self.merger = MergeChunksByTimestamps(num_workers=2)
+ self.merger = MergeChunksRowGenerator(num_workers=2)

self.prompted_generator = PromptedAQAGenerator(
vlm_serving=self.serving,
@@ -70,7 +70,7 @@ class Pipeline:
# num_workers=1,
# )

- self.evaluator = CTCForcedAlignSampleEvaluator(
+ self.evaluator = CTCForcedAlignmentSampleEvaluator(
model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
device=["cuda:3"], # GPUs that the model can be loaded on
num_workers=2, # Process count; each process loads one model instance
@@ -90,7 +90,6 @@ class Pipeline:
return_seconds=True,
time_resolution=1,
neg_threshold=0.35,
- window_size_samples=512,
min_silence_at_max_speech=0.098,
use_max_poss_sil_at_max_speech=True
)
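For orientation, the operators configured above are driven by the pipeline's `forward()` method, which the diff collapses; only the VAD call's parameters are visible. Below is a minimal sketch of the wiring, assuming each operator follows the `run(storage=...)` pattern that `PromptedAQAGenerator` uses elsewhere in these guides; the method chain and key usage are illustrative, not confirmed by this diff.

```python
# Hypothetical sketch of Pipeline.forward(); only the VAD parameters below
# are taken from the hunk above, the rest is an assumed run(storage=...) chain.
def forward(self):
    # 1. Voice-activity detection (parameters shown in the hunk above;
    #    note that window_size_samples=512 was removed from this call)
    self.vad_generator.run(
        storage=self.storage.step(),
        return_seconds=True,
        time_resolution=1,
        neg_threshold=0.35,
        min_silence_at_max_speech=0.098,
        use_max_poss_sil_at_max_speech=True,
    )
    # 2. Merge the VAD chunks into rows for transcription
    self.merger.run(storage=self.storage.step())
    # 3. Transcribe with the Whisper prompt via the VLM serving
    self.prompted_generator.run(storage=self.storage.step())
    # 4. Score the transcriptions with CTC forced alignment
    self.evaluator.run(storage=self.storage.step())
```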
37 changes: 24 additions & 13 deletions docs/en/notes/mm_guide/audio_understanding/audio_caption.md
@@ -10,41 +10,52 @@ permalink: /en/mm_guide/2gjc47qb/
## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

- ## Step 2: Start the Local Model Service
+ ## Step 2: Import Relevant Packages
+ ```python
+ from dataflow.operators.core_audio import PromptedAQAGenerator
+ from dataflow.serving import LocalModelVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+ ```

+ ## Step 3: Start the Local Model Service
The local model serving method is as follows:
```python
- llm_serving = LocalModelLLMServing_vllm(
- hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
+ vlm_serving = LocalModelVLMServing_vllm(
+ hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=2,
vllm_max_tokens=8192,
vllm_gpu_memory_utilization=0.7
)
```

- ## Step 3: Prepare the Audio Data for Caption Generation
+ ## Step 4: Prepare the Audio Data for Caption Generation
Fill in the audio paths in the following format:
```jsonl
{"audio": ["your_audio_path"]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "<audio>\nTranscribe the audio into Chinese." }]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/audio_vqa/Santa%20Motor.wav"], "conversation": [{"from": "human", "value": "<audio>\nDescribe the sound in this audio clip." }]}

```

- ## Step 4: Add the Data Path to FileStorage in the Following Format
+ ## Step 5: Add the Data Path to FileStorage in the Following Format
```python
storage = FileStorage(
first_entry_file_name="your_path",
first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
cache_path="./cache",
file_name_prefix="audio_caption",
cache_type="jsonl",
media_key="audio",
media_type="audio"
)
```

- ## Step 5: Initialize the CaptionGenerator Operator
+ ## Step 6: Initialize the PromptedAQAGenerator Operator
```python
- generator = CaptionGenerator(llm_serving)
+ prompt_generator = PromptedAQAGenerator(
+ vlm_serving=vlm_serving,
+ system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
+ )
```

- ## Step 6: Execute the Operator
+ ## Step 7: Execute the Operator
```python
- generator.run(storage=storage.step(), output_key="caption")
+ prompt_generator.run(storage=storage.step(), output_key="caption")
```
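Read end to end, the new Steps 2 through 7 amount to the following single script; every call and parameter below is taken directly from the blocks above.

```python
from dataflow.operators.core_audio import PromptedAQAGenerator
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.utils.storage import FileStorage
from dataflow.prompts.audio import AudioCaptionGeneratorPrompt

# Local Qwen2-Audio serving (Step 3)
vlm_serving = LocalModelVLMServing_vllm(
    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct",
    vllm_tensor_parallel_size=2,
    vllm_max_tokens=8192,
    vllm_gpu_memory_utilization=0.7
)

# Input and cache storage over the sample jsonl (Step 5)
storage = FileStorage(
    first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
    cache_path="./cache",
    file_name_prefix="audio_caption",
    cache_type="jsonl",
    media_key="audio",
    media_type="audio"
)

# Caption generator with the caption system prompt (Step 6)
prompt_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
)

# Run and write captions under the "caption" key (Step 7)
prompt_generator.run(storage=storage.step(), output_key="caption")
```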
@@ -11,5 +11,5 @@ conda create -n myvenv python=3.10
conda activate myvenv

cd ./DataFlow-MM
- pip install open-dataflow[audio]
+ pip install open-dataflow-mm[audio]
```
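As a quick post-install sanity check, the imports used throughout these guides should resolve. A minimal check, assuming the package exposes the `dataflow` module as the examples above do:

```python
# Minimal import check; these are the modules the audio guides use.
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.operators.core_audio import PromptedAQAGenerator

print("open-dataflow-mm audio modules import OK")
```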
69 changes: 38 additions & 31 deletions docs/en/notes/mm_guide/audio_understanding/whisper_asr.md
@@ -10,59 +10,66 @@ permalink: /en/mm_guide/dl0jhc6u/
## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

- ## Step 2: Start the Local Model Service
+ ## Step 2: Import Relevant Packages
+ ```python
+ from dataflow.operators.core_audio import PromptedAQAGenerator
+ from dataflow.serving import LocalModelVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.prompts.audio import WhisperTranscriptionPrompt
+ ```

+ ## Step 3: Start the Local Model Service
The method for launching the local model serving service is as follows:
```python
- llm_serving = LocalModelLLMServing_vllm(
- hf_model_name_or_path="./models/whisper-large-v3", # set to your own model path
+ vlm_serving = LocalModelVLMServing_vllm(
+ hf_model_name_or_path="openai/whisper-large-v3", # set to your own model path
+ hf_cache_dir='./dataflow_cache',
vllm_tensor_parallel_size=2,
- vllm_max_tokens=None,
- vllm_gpu_memory_utilization=0.7
+ vllm_temperature=0.3,
+ vllm_top_p=0.9,
+ vllm_max_tokens=512,
+ vllm_max_model_len=448,
+ vllm_gpu_memory_utilization=0.9
)
```

- ## Step 3: Prepare the Audio Data for Transcription or Translation
+ ## Step 4: Prepare the Audio Data for Transcription or Translation
Fill in the audio paths in the following format:
```jsonl
{"audio": ["your_audio_path"]}
{"conversation": [{"from": "human", "value": "<audio>"}], "audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"]}

```

- ## Step 4: Add the Data Path to FileStorage
+ ## Step 5: Add the Data Path to FileStorage
```python
storage = FileStorage(
first_entry_file_name="your_path",
first_entry_file_name="./dataflow/example/whisper_transcription/sample_data.jsonl",
cache_path="./cache",
file_name_prefix="whisper_transcription",
cache_type="jsonl",
media_key="audio",
media_type="audio"
)
```

- ## Step 5: Initialize the WhisperTranscriptionGenerator Operator
+ ## Step 6: Initialize the PromptedAQAGenerator Operator
```python
- generator = WhisperTranscriptionGenerator(self.llm_serving)
- ```
- 
- ## Step 6: Execute the Operator
- Speech Transcription
- ```python
- generator.run(
- storage=self.storage.step(),
- task="transcribe", # Indicates that the task is speech transcription
- language="mandarin", # Spoken language in the audio; default is "english"
- use_no_time_stamps=True, # Whether to use the no-timestamp format; default is True
- output_key="transcription" # Key for the output result
+ prompt_generator = PromptedAQAGenerator(
+ vlm_serving=vlm_serving,
+ system_prompt=WhisperTranscriptionPrompt().generate_prompt(
+ language="mandarin",
+ task="transcribe", # If task == 'translate', the model will translate input speech into English text.
+ with_timestamps=False
+ )
)
```
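Per the inline comment above, setting `task="translate"` switches the model from transcription to translating the input speech into English. A variant of the same initialization with only that argument changed would look like this:

```python
# Same operator and prompt class as above; per the inline comment,
# task="translate" makes the model translate the input speech into English text.
prompt_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=WhisperTranscriptionPrompt().generate_prompt(
        language="mandarin",
        task="translate",
        with_timestamps=False
    )
)
```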

- Speech Translation (translate audio content into English)
+ ## Step 7: Execute the Operator
+ Speech Transcription
```python
- generator.run(
- storage=self.storage.step(),
- task="translate", # Indicates that the task is speech translation
- language="mandarin", # Spoken language in the audio; default is "english"
- use_no_time_stamps=True, # Whether to use the no-timestamp format; default is True
- output_key="transcription" # Key for the output result
+ prompt_generator.run(
+ storage=storage.step(),
+ input_audio_key="audio",
+ input_conversation_key="conversation",
+ output_answer_key="answer",
)
```
16 changes: 7 additions & 9 deletions docs/zh/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
@@ -18,10 +18,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_audio import (
SileroVADGenerator,
- MergeChunksByTimestamps,
- PromptedAQAGenerator,
- # CTCForcedAlignFilter, # Import this for filtering instead of evaluation
- CTCForcedAlignSampleEvaluator,
+ MergeChunksRowGenerator,
+ PromptedAQAGenerator,
+ # CTCForcedAlignmentFilter, # Import this for filtering instead of evaluation
+ CTCForcedAlignmentSampleEvaluator,
)
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -56,20 +56,20 @@ class Pipeline:
num_workers=2, # Process count; each process loads one model instance, distributed evenly across the devices in the device list
)

- self.merger = MergeChunksByTimestamps(num_workers=2)
+ self.merger = MergeChunksRowGenerator(num_workers=2)

self.prompted_generator = PromptedAQAGenerator(
vlm_serving=self.serving,
system_prompt=WhisperTranscriptionPrompt().generate_prompt(language="german", task="transcribe", with_timestamps=False),
)

- # self.filter = CTCForcedAlignFilter(
+ # self.filter = CTCForcedAlignmentFilter(
# model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
# device=["cuda:3"],
# num_workers=1,
# )

- self.evaluator = CTCForcedAlignSampleEvaluator(
+ self.evaluator = CTCForcedAlignmentSampleEvaluator(
model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
device=["cuda:3"], # 可以加载模型的GPU列表
num_workers=2, # num_workers为进程数, 每个进程启动一个模型, 平均分配在device列表中的每个设备上
@@ -89,7 +89,6 @@ class Pipeline:
return_seconds=True,
time_resolution=1,
neg_threshold=0.35,
- window_size_samples=512,
min_silence_at_max_speech=0.098,
use_max_poss_sil_at_max_speech=True
)
@@ -151,5 +150,4 @@ class Pipeline:
if __name__ == "__main__":
pipeline = Pipeline()
pipeline.forward()

```
49 changes: 24 additions & 25 deletions docs/zh/notes/mm_guide/audio_understanding/audio_caption.md
@@ -8,55 +8,54 @@ permalink: /zh/mm_guide/2gjc47qb/

## Audio Caption Generation

- <!-- ## Step 1: Prepare the Dataflow Environment
- ```bash
- conda create -n myvenv python=3.12
- pip install open-dataflow
- pip install open-dataflow[vllm]
- ```
- 
- ## Step 2: Install the Dataflow Audio Module
- ```bash
- pip install open-dataflow[audio]
- ``` -->

## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

- ## Step 2: Start the Local Model Service
+ ## Step 2: Import Relevant Packages
+ ```python
+ from dataflow.operators.core_audio import PromptedAQAGenerator
+ from dataflow.serving import LocalModelVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+ ```

+ ## Step 3: Start the Local Model Service
The local model serving method is as follows:
```python
- llm_serving = LocalModelLLMServing_vllm(
- hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
+ vlm_serving = LocalModelVLMServing_vllm(
+ hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=2,
vllm_max_tokens=8192,
vllm_gpu_memory_utilization=0.7
)
```

- ## Step 3: Fill in Audio Paths in the Following Format to Prepare the Data for Captioning
+ ## Step 4: Fill in Audio Paths in the Following Format to Prepare the Data for Captioning
```jsonl
{"audio": ["your_audio_path"]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "<audio>\nTranscribe the audio into Chinese." }]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/audio_vqa/Santa%20Motor.wav"], "conversation": [{"from": "human", "value": "<audio>\nDescribe the sound in this audio clip." }]}

```

- ## Step 4: Add the Data Path to FileStorage in the Following Format
+ ## Step 5: Add the Data Path to FileStorage in the Following Format
```python
storage = FileStorage(
first_entry_file_name="your_path",
first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
cache_path="./cache",
file_name_prefix="audio_caption",
cache_type="jsonl",
media_key="audio",
media_type="audio"
)
```

- ## Step 5: Initialize the CaptionGenerator Operator
+ ## Step 6: Initialize the PromptedAQAGenerator Operator
```python
- generator = CaptionGenerator(llm_serving)
+ prompt_generator = PromptedAQAGenerator(
+ vlm_serving=vlm_serving,
+ system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
+ )
```

- ## Step 6: Execute the Operator
+ ## Step 7: Execute the Operator
```python
- generator.run(storage=storage.step(), output_key="caption")
+ prompt_generator.run(storage=storage.step(), output_key="caption")
```
@@ -11,5 +11,5 @@ conda create -n myvenv python=3.10
conda activate myvenv

cd ./DataFlow-MM
- pip install open-dataflow[audio]
+ pip install open-dataflow-mm[audio]
```