11 changes: 5 additions & 6 deletions docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
@@ -19,10 +19,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_audio import (
SileroVADGenerator,
- MergeChunksByTimestamps,
+ MergeChunksRowGenerator,
PromptedAQAGenerator,
- # CTCForcedAlignFilter, # Import this for filtering instead of evaluation
- CTCForcedAlignSampleEvaluator,
+ # CTCForcedAlignmentFilter, # Import this for filtering instead of evaluation
+ CTCForcedAlignmentSampleEvaluator,
)
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -57,7 +57,7 @@ class Pipeline:
num_workers=2, # Process count; each process loads one model instance
)

- self.merger = MergeChunksByTimestamps(num_workers=2)
+ self.merger = MergeChunksRowGenerator(num_workers=2)

self.prompted_generator = PromptedAQAGenerator(
vlm_serving=self.serving,
@@ -70,7 +70,7 @@ class Pipeline:
# num_workers=1,
# )

- self.evaluator = CTCForcedAlignSampleEvaluator(
+ self.evaluator = CTCForcedAlignmentSampleEvaluator(
model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
device=["cuda:3"], # GPUs that the model can be loaded on
num_workers=2, # Process count; each process loads one model instance
@@ -90,7 +90,6 @@ class Pipeline:
return_seconds=True,
time_resolution=1,
neg_threshold=0.35,
- window_size_samples=512,
min_silence_at_max_speech=0.098,
use_max_poss_sil_at_max_speech=True
)
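For orientation, the operators configured above are driven by the pipeline's `forward()` method, which the diff collapses; only the VAD call's parameters are visible. Below is a minimal sketch of the wiring, assuming each operator follows the `run(storage=...)` pattern that `PromptedAQAGenerator` uses elsewhere in these guides; the method chain and key usage are illustrative, not confirmed by this diff.

```python
# Hypothetical sketch of Pipeline.forward(); only the VAD parameters below
# are taken from the hunk above, the rest is an assumed run(storage=...) chain.
def forward(self):
    # 1. Voice-activity detection (parameters shown in the hunk above;
    #    note that window_size_samples=512 was removed from this call)
    self.vad_generator.run(
        storage=self.storage.step(),
        return_seconds=True,
        time_resolution=1,
        neg_threshold=0.35,
        min_silence_at_max_speech=0.098,
        use_max_poss_sil_at_max_speech=True,
    )
    # 2. Merge the VAD chunks into rows for transcription
    self.merger.run(storage=self.storage.step())
    # 3. Transcribe with the Whisper prompt via the VLM serving
    self.prompted_generator.run(storage=self.storage.step())
    # 4. Score the transcriptions with CTC forced alignment
    self.evaluator.run(storage=self.storage.step())
```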
37 changes: 24 additions & 13 deletions docs/en/notes/mm_guide/audio_understanding/audio_caption.md
@@ -10,41 +10,52 @@ permalink: /en/mm_guide/2gjc47qb/
## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

- ## Step 2: Start the Local Model Service
+ ## Step 2: Import Relevant Packages
+ ```python
+ from dataflow.operators.core_audio import PromptedAQAGenerator
+ from dataflow.serving import LocalModelVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+ ```

+ ## Step 3: Start the Local Model Service
The local model serving method is as follows:
```python
- llm_serving = LocalModelLLMServing_vllm(
- hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
+ vlm_serving = LocalModelVLMServing_vllm(
+ hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=2,
vllm_max_tokens=8192,
vllm_gpu_memory_utilization=0.7
)
```

- ## Step 3: Prepare the Audio Data for Caption Generation
+ ## Step 4: Prepare the Audio Data for Caption Generation
Fill in the audio paths in the following format:
```jsonl
{"audio": ["your_audio_path"]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "<audio>\nTranscribe the audio into Chinese." }]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/audio_vqa/Santa%20Motor.wav"], "conversation": [{"from": "human", "value": "<audio>\nDescribe the sound in this audio clip." }]}

```

- ## Step 4: Add the Data Path to FileStorage in the Following Format
+ ## Step 5: Add the Data Path to FileStorage in the Following Format
```python
storage = FileStorage(
first_entry_file_name="your_path",
first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
cache_path="./cache",
file_name_prefix="audio_caption",
cache_type="jsonl",
media_key="audio",
media_type="audio"
)
```

- ## Step 5: Initialize the CaptionGenerator Operator
+ ## Step 6: Initialize the PromptedAQAGenerator Operator
```python
- generator = CaptionGenerator(llm_serving)
+ prompt_generator = PromptedAQAGenerator(
+ vlm_serving=vlm_serving,
+ system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
+ )
```

- ## Step 6: Execute the Operator
+ ## Step 7: Execute the Operator
```python
- generator.run(storage=storage.step(), output_key="caption")
+ prompt_generator.run(storage=storage.step(), output_key="caption")
```
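Read end to end, the new Steps 2 through 7 amount to the following single script; every call and parameter below is taken directly from the blocks above.

```python
from dataflow.operators.core_audio import PromptedAQAGenerator
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.utils.storage import FileStorage
from dataflow.prompts.audio import AudioCaptionGeneratorPrompt

# Local Qwen2-Audio serving (Step 3)
vlm_serving = LocalModelVLMServing_vllm(
    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct",
    vllm_tensor_parallel_size=2,
    vllm_max_tokens=8192,
    vllm_gpu_memory_utilization=0.7
)

# Input and cache storage over the sample jsonl (Step 5)
storage = FileStorage(
    first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
    cache_path="./cache",
    file_name_prefix="audio_caption",
    cache_type="jsonl",
    media_key="audio",
    media_type="audio"
)

# Caption generator with the caption system prompt (Step 6)
prompt_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
)

# Run and write captions under the "caption" key (Step 7)
prompt_generator.run(storage=storage.step(), output_key="caption")
```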
@@ -11,5 +11,5 @@ conda create -n myvenv python=3.10
conda activate myvenv

cd ./DataFlow-MM
- pip install open-dataflow[audio]
+ pip install open-dataflow-mm[audio]
```
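As a quick post-install sanity check, the imports used throughout these guides should resolve. A minimal check, assuming the package exposes the `dataflow` module as the examples above do:

```python
# Minimal import check; these are the modules the audio guides use.
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.operators.core_audio import PromptedAQAGenerator

print("open-dataflow-mm audio modules import OK")
```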
69 changes: 38 additions & 31 deletions docs/en/notes/mm_guide/audio_understanding/whisper_asr.md
@@ -10,59 +10,66 @@ permalink: /en/mm_guide/dl0jhc6u/
## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

- ## Step 2: Start the Local Model Service
+ ## Step 2: Import Relevant Packages
+ ```python
+ from dataflow.operators.core_audio import PromptedAQAGenerator
+ from dataflow.serving import LocalModelVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.prompts.audio import WhisperTranscriptionPrompt
+ ```

+ ## Step 3: Start the Local Model Service
The method for launching the local model serving service is as follows:
```python
- llm_serving = LocalModelLLMServing_vllm(
- hf_model_name_or_path="./models/whisper-large-v3", # set to your own model path
+ vlm_serving = LocalModelVLMServing_vllm(
+ hf_model_name_or_path="openai/whisper-large-v3", # set to your own model path
+ hf_cache_dir='./dataflow_cache',
vllm_tensor_parallel_size=2,
- vllm_max_tokens=None,
- vllm_gpu_memory_utilization=0.7
+ vllm_temperature=0.3,
+ vllm_top_p=0.9,
+ vllm_max_tokens=512,
+ vllm_max_model_len=448,
+ vllm_gpu_memory_utilization=0.9
)
```

- ## Step 3: Prepare the Audio Data for Transcription or Translation
+ ## Step 4: Prepare the Audio Data for Transcription or Translation
Fill in the audio paths in the following format:
```jsonl
{"audio": ["your_audio_path"]}
{"conversation": [{"from": "human", "value": "<audio>"}], "audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"]}

```

- ## Step 4: Add the Data Path to FileStorage
+ ## Step 5: Add the Data Path to FileStorage
```python
storage = FileStorage(
first_entry_file_name="your_path",
first_entry_file_name="./dataflow/example/whisper_transcription/sample_data.jsonl",
cache_path="./cache",
file_name_prefix="whisper_transcription",
cache_type="jsonl",
media_key="audio",
media_type="audio"
)
```

- ## Step 5: Initialize the WhisperTranscriptionGenerator Operator
+ ## Step 6: Initialize the PromptedAQAGenerator Operator
```python
- generator = WhisperTranscriptionGenerator(self.llm_serving)
- ```
- 
- ## Step 6: Execute the Operator
- Speech Transcription
- ```python
- generator.run(
- storage=self.storage.step(),
- task="transcribe", # Indicates that the task is speech transcription
- language="mandarin", # Spoken language in the audio; default is "english"
- use_no_time_stamps=True, # Whether to use the no-timestamp format; default is True
- output_key="transcription" # Key for the output result
+ prompt_generator = PromptedAQAGenerator(
+ vlm_serving=vlm_serving,
+ system_prompt=WhisperTranscriptionPrompt().generate_prompt(
+ language="mandarin",
+ task="transcribe", # If task == 'translate', the model will translate input speech into English text.
+ with_timestamps=False
+ )
)
```
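Per the inline comment above, setting `task="translate"` switches the model from transcription to translating the input speech into English. A variant of the same initialization with only that argument changed would look like this:

```python
# Same operator and prompt class as above; per the inline comment,
# task="translate" makes the model translate the input speech into English text.
prompt_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=WhisperTranscriptionPrompt().generate_prompt(
        language="mandarin",
        task="translate",
        with_timestamps=False
    )
)
```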

- Speech Translation (translate audio content into English)
+ ## Step 7: Execute the Operator
+ Speech Transcription
```python
- generator.run(
- storage=self.storage.step(),
- task="translate", # Indicates that the task is speech translation
- language="mandarin", # Spoken language in the audio; default is "english"
- use_no_time_stamps=True, # Whether to use the no-timestamp format; default is True
- output_key="transcription" # Key for the output result
+ prompt_generator.run(
+ storage=storage.step(),
+ input_audio_key="audio",
+ input_conversation_key="conversation",
+ output_answer_key="answer",
)
```
16 changes: 7 additions & 9 deletions docs/zh/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
@@ -18,10 +18,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU devices
from dataflow.utils.storage import FileStorage
from dataflow.operators.core_audio import (
SileroVADGenerator,
- MergeChunksByTimestamps,
- PromptedAQAGenerator,
- # CTCForcedAlignFilter, # Import this for filtering instead of evaluation
- CTCForcedAlignSampleEvaluator,
+ MergeChunksRowGenerator,
+ PromptedAQAGenerator,
+ # CTCForcedAlignmentFilter, # Import this for filtering instead of evaluation
+ CTCForcedAlignmentSampleEvaluator,
)
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -56,20 +56,20 @@ class Pipeline:
num_workers=2, # Process count; each process loads one model instance, distributed evenly across the devices in the device list
)

- self.merger = MergeChunksByTimestamps(num_workers=2)
+ self.merger = MergeChunksRowGenerator(num_workers=2)

self.prompted_generator = PromptedAQAGenerator(
vlm_serving=self.serving,
system_prompt=WhisperTranscriptionPrompt().generate_prompt(language="german", task="transcribe", with_timestamps=False),
)

- # self.filter = CTCForcedAlignFilter(
+ # self.filter = CTCForcedAlignmentFilter(
# model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
# device=["cuda:3"],
# num_workers=1,
# )

- self.evaluator = CTCForcedAlignSampleEvaluator(
+ self.evaluator = CTCForcedAlignmentSampleEvaluator(
model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
device=["cuda:3"], # 可以加载模型的GPU列表
num_workers=2, # num_workers为进程数, 每个进程启动一个模型, 平均分配在device列表中的每个设备上
@@ -89,7 +89,6 @@ class Pipeline:
return_seconds=True,
time_resolution=1,
neg_threshold=0.35,
- window_size_samples=512,
min_silence_at_max_speech=0.098,
use_max_poss_sil_at_max_speech=True
)
@@ -151,5 +150,4 @@ class Pipeline:
if __name__ == "__main__":
pipeline = Pipeline()
pipeline.forward()

```
49 changes: 24 additions & 25 deletions docs/zh/notes/mm_guide/audio_understanding/audio_caption.md
@@ -8,55 +8,54 @@ permalink: /zh/mm_guide/2gjc47qb/

## Audio Caption Generation

- <!-- ## Step 1: Prepare the Dataflow Environment
- ```bash
- conda create -n myvenv python=3.12
- pip install open-dataflow
- pip install open-dataflow[vllm]
- ```
- 
- ## Step 2: Install the Dataflow Audio Module
- ```bash
- pip install open-dataflow[audio]
- ``` -->

## Step 1: Install Environment
See [Audio Environment Installation](./install_audio_understanding.md)

- ## Step 2: Start the Local Model Service
+ ## Step 2: Import Relevant Packages
+ ```python
+ from dataflow.operators.core_audio import PromptedAQAGenerator
+ from dataflow.serving import LocalModelVLMServing_vllm
+ from dataflow.utils.storage import FileStorage
+ from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+ ```

+ ## Step 3: Start the Local Model Service
The local model serving method is as follows:
```python
- llm_serving = LocalModelLLMServing_vllm(
- hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct", # set to your own model path
+ vlm_serving = LocalModelVLMServing_vllm(
+ hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=2,
vllm_max_tokens=8192,
vllm_gpu_memory_utilization=0.7
)
```

- ## Step 3: Fill in Audio Paths in the Following Format to Prepare the Data for Captioning
+ ## Step 4: Fill in Audio Paths in the Following Format to Prepare the Data for Captioning
```jsonl
{"audio": ["your_audio_path"]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "<audio>\nTranscribe the audio into Chinese." }]}
{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/audio_vqa/Santa%20Motor.wav"], "conversation": [{"from": "human", "value": "<audio>\nDescribe the sound in this audio clip." }]}

```

- ## Step 4: Add the Data Path to FileStorage in the Following Format
+ ## Step 5: Add the Data Path to FileStorage in the Following Format
```python
storage = FileStorage(
first_entry_file_name="your_path",
first_entry_file_name="./dataflow/example/audio_aqa/sample_data.jsonl",
cache_path="./cache",
file_name_prefix="audio_caption",
cache_type="jsonl",
media_key="audio",
media_type="audio"
)
```

- ## Step 5: Initialize the CaptionGenerator Operator
+ ## Step 6: Initialize the PromptedAQAGenerator Operator
```python
- generator = CaptionGenerator(llm_serving)
+ prompt_generator = PromptedAQAGenerator(
+ vlm_serving=vlm_serving,
+ system_prompt=AudioCaptionGeneratorPrompt().generate_prompt()
+ )
```

- ## Step 6: Execute the Operator
+ ## Step 7: Execute the Operator
```python
- generator.run(storage=storage.step(), output_key="caption")
+ prompt_generator.run(storage=storage.step(), output_key="caption")
```
@@ -11,5 +11,5 @@ conda create -n myvenv python=3.10
conda activate myvenv

cd ./DataFlow-MM
- pip install open-dataflow[audio]
+ pip install open-dataflow-mm[audio]
```