1 change: 1 addition & 0 deletions docs/.vuepress/notes/en/mm_guide.ts
@@ -43,6 +43,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({
'install_audio_understanding',
'audio_caption',
'whisper_asr',
'audio_asr_pipeline'
],
},
{
1 change: 1 addition & 0 deletions docs/.vuepress/notes/zh/mm_guide.ts
@@ -44,6 +44,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({
'install_audio_understanding',
'audio_caption',
'whisper_asr',
'audio_asr_pipeline',
],
},
{
156 changes: 156 additions & 0 deletions docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
@@ -0,0 +1,156 @@
---
title: Speech Recognition and Cleaning Pipeline
icon: material-symbols:speech-to-text
createTime: 2025/11/17 14:38:19
permalink: /en/mm_guide/4qyvw1fp/
---


## Speech Recognition and Cleaning Pipeline

## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)
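
For quick reference, the setup from that page boils down to a conda environment plus the audio extra:

```bash
conda create -n myvenv python=3.10
conda activate myvenv

cd ./DataFlow-MM
pip install open-dataflow[audio]
```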

## Step 2: Import Relevant Packages
```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices

from dataflow.utils.storage import FileStorage
from dataflow.operators.core_audio import (
    SileroVADGenerator,
    MergeChunksByTimestamps,
    PromptedAQAGenerator,
    # CTCForcedAlignFilter, # Import this for filtering instead of evaluation
    CTCForcedAlignSampleEvaluator,
)
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
```
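
The entry file read by `FileStorage` below follows the same JSONL layout as the other audio operators: one record per line with an `audio` list. A minimal sketch (the path is hypothetical):

```jsonl
{"audio": ["./dataflow/example/audio_asr_pipeline/sample1.wav"]}
```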

## Step 3: Define the Pipeline
```python
class Pipeline:
    def __init__(self):
        self.storage = FileStorage(
            first_entry_file_name="./dataflow/example/audio_asr_pipeline/sample_data_local.jsonl",
            cache_path="./cache",
            file_name_prefix="audio_asr_pipeline",
            cache_type="jsonl",
        )

        self.serving = LocalModelVLMServing_vllm(
            hf_model_name_or_path="openai/whisper-large-v3",
            hf_cache_dir="./dataflow_cache",
            vllm_tensor_parallel_size=2,
            vllm_temperature=0.6,
            vllm_top_p=0.9,
            vllm_max_tokens=512,
            vllm_max_model_len=448,
            vllm_gpu_memory_utilization=0.9
        )

        self.silero_vad_generator = SileroVADGenerator(
            repo_or_dir="snakers4/silero-vad", # Model loading path
            source="github", # Load weights from GitHub, or set "local" to load from a local path
            device=['cuda:2'], # GPUs that the model can be loaded on
            num_workers=2, # Process count; each process loads one model instance
        )

        self.merger = MergeChunksByTimestamps(num_workers=2)

        self.prompted_generator = PromptedAQAGenerator(
            vlm_serving=self.serving,
            system_prompt=WhisperTranscriptionPrompt().generate_prompt(language="german", task="transcribe", with_timestamps=False),
        )

        # self.filter = CTCForcedAlignFilter(
        #     model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
        #     device=["cuda:3"],
        #     num_workers=1,
        # )

        self.evaluator = CTCForcedAlignSampleEvaluator(
            model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
            device=["cuda:3"], # GPUs that the model can be loaded on
            num_workers=2, # Process count; each process loads one model instance
        )

    def forward(self):
        self.silero_vad_generator.run(
            storage=self.storage.step(),
            input_audio_key='audio',
            output_answer_key='timestamps',
            threshold=0.5,
            use_min_cut=True,
            sampling_rate=16000,
            max_speech_duration_s=30.0,
            min_silence_duration_s=0.1,
            speech_pad_s=0.03,
            return_seconds=True,
            time_resolution=1,
            neg_threshold=0.35,
            window_size_samples=512,
            min_silence_at_max_speech=0.098,
            use_max_poss_sil_at_max_speech=True
        )

        self.silero_vad_generator.close() # Close multiprocessing

        self.merger.run(
            storage=self.storage.step(),
            dst_folder="./cache",
            input_audio_key="audio",
            input_timestamps_key="timestamps",
            timestamp_type="time",
            max_audio_duration=30.0,
            hop_size_samples=512,
            sampling_rate=16000,
        )

        self.merger.close()

        self.prompted_generator.run(
            storage=self.storage.step(),
            input_audio_key="audio",
            input_conversation_key="conversation",
            output_answer_key="transcript"
        )

        # self.filter.run(
        #     storage=self.storage.step(),
        #     input_audio_key="audio",
        #     input_conversation_key="transcript",
        #     sampling_rate=16000,
        #     language="de",
        #     micro_batch_size=16,
        #     chinese_to_pinyin=False,
        #     retain_word_level_alignment=True,
        #     threshold=0.1,
        #     threshold_mode="min",
        #     romanize=True,
        # )
        # self.filter.close()

        self.evaluator.run(
            storage=self.storage.step(),
            input_audio_key="audio",
            input_conversation_key="transcript",
            sampling_rate=16000,
            language="de",
            micro_batch_size=16,
            chinese_to_pinyin=False,
            retain_word_level_alignment=True,
            romanize=True,
        )

        self.evaluator.close()
```
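
The stages run strictly in sequence, each `run` call consuming the previous step's cache via `storage.step()`: `SileroVADGenerator` detects speech timestamps, `MergeChunksByTimestamps` cuts and merges the audio into chunks of at most 30 s, `PromptedAQAGenerator` transcribes each chunk with Whisper, and `CTCForcedAlignSampleEvaluator` scores how well each transcript aligns with its audio. Swap in the commented-out `CTCForcedAlignFilter` to drop poorly aligned samples instead of just scoring them.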

## Step 4: Run the Pipeline
```python
if __name__ == "__main__":
    pipeline = Pipeline()
    pipeline.forward()

```
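
Assuming the script above is saved as `audio_asr_pipeline.py` (any filename works), launch it with:

```bash
python audio_asr_pipeline.py
```

Intermediate results of each step are written to `./cache` with the `audio_asr_pipeline` file-name prefix, as configured in `FileStorage`.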
31 changes: 11 additions & 20 deletions docs/en/notes/mm_guide/audio_understanding/audio_caption.md
@@ -1,27 +1,17 @@
---
title: Audio Caption Generation
createTime: 2025/07/15 21:33:01
icon: material-symbols-light:autoplay
permalink: /en/mm_guide/2gjc47qb/
---

## Audio Caption Generation

## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

## Step 2: Start the Local Model Service
The local model serving method is as follows:
```python
llm_serving = LocalModelLLMServing_vllm(
    hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
@@ -31,12 +21,13 @@ llm_serving = LocalModelLLMServing_vllm(
)
```
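
For completeness, a minimal import sketch for the snippets in this guide. The `FileStorage` and serving paths follow the ASR pipeline guide; the `CaptionGenerator` location is an assumption, so adjust it to your install:

```python
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.operators.core_audio import CaptionGenerator  # assumed module path
```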

## Step 3: Prepare the Audio Data for Caption Generation
Fill in the audio paths in the following format:
```jsonl
{"audio": ["your_audio_path"]}
```

## Step 4: Add the Data Path to FileStorage in the Following Format
```python
storage = FileStorage(
    first_entry_file_name="your_path",
@@ -48,12 +39,12 @@ storage = FileStorage(
)
```

## Step 5: Initialize the CaptionGenerator Operator
```python
generator = CaptionGenerator(llm_serving)
```

## Step 6: Execute the Operator
```python
generator.run(storage=storage.step(), output_key="caption")
```
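
After the run, each cached record should carry the new key; schematically (the caption text is illustrative):

```jsonl
{"audio": ["your_audio_path"], "caption": "A dog barks while traffic passes in the background."}
```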
docs/en/notes/mm_guide/audio_understanding/install_audio_understanding.md
@@ -1,8 +1,15 @@
---
title: Audio Environment Setup
icon: material-symbols-light:download-rounded
createTime: 2025/06/09 10:29:31
permalink: /en/mm_guide/install_audio_understanding/
---
## Environment Setup

```bash
conda create -n myvenv python=3.10
conda activate myvenv

cd ./DataFlow-MM
pip install open-dataflow[audio]
```
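
To sanity-check the install, try importing the package (the examples in these guides all import from the `dataflow` namespace):

```bash
python -c "import dataflow"
```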
50 changes: 21 additions & 29 deletions docs/en/notes/mm_guide/audio_understanding/whisper_asr.md
@@ -1,26 +1,17 @@
---
title: Using Whisper for Speech Transcription or Translation
createTime: 2025/07/15 21:32:36
icon: material-symbols-light:interpreter-mode
permalink: /en/mm_guide/dl0jhc6u/
---

## Using Whisper for Speech Transcription or Translation

## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

## Step 2: Start the Local Model Service
The method for launching the local model serving service is as follows:
```python
llm_serving = LocalModelLLMServing_vllm(
    hf_model_name_or_path="./models/whisper-large-v3", # set to your own model path
@@ -30,12 +21,13 @@ llm_serving = LocalModelLLMServing_vllm(
)
```
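
As in the other audio guides, a minimal import sketch; the `WhisperTranscriptionGenerator` module path is an assumption, so adjust it to your install:

```python
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.operators.core_audio import WhisperTranscriptionGenerator  # assumed module path
```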

## Step 3: Prepare the Audio Data for Transcription or Translation
Fill in the audio paths in the following format:
```jsonl
{"audio": ["your_audio_path"]}
```

## Step 4: Add the Data Path to FileStorage
```python
storage = FileStorage(
    first_entry_file_name="your_path",
@@ -47,30 +39,30 @@ storage = FileStorage(
)
```

## Step 5: Initialize the WhisperTranscriptionGenerator Operator
```python
generator = WhisperTranscriptionGenerator(llm_serving)
```

## Step 6: Execute the Operator
Speech Transcription
```python
generator.run(
    storage=storage.step(),
    task="transcribe", # Indicates that the task is speech transcription
    language="mandarin", # Spoken language in the audio; default is "english"
    use_no_time_stamps=True, # Whether to use the no-timestamp output format; default is True
    output_key="transcription" # Key for the output result
)
```

Speech Translation (translate audio content into English)
```python
generator.run(
    storage=storage.step(),
    task="translate", # Indicates that the task is speech translation
    language="mandarin", # Spoken language in the audio; default is "english"
    use_no_time_stamps=True, # Whether to use the no-timestamp output format; default is True
    output_key="transcription" # Key for the output result
)
```
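
In both cases the result is written under the configured `output_key`: transcription keeps the source language, while translation yields English text. Schematically (values illustrative):

```jsonl
{"audio": ["your_audio_path"], "transcription": "Hello, welcome to DataFlow."}
```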