diff --git a/docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md b/docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
index d98f177c..c0424ffe 100644
--- a/docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
+++ b/docs/en/notes/mm_guide/audio_understanding/audio_asr_pipeline.md
@@ -19,10 +19,10 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7' # Set visible GPU device
 from dataflow.utils.storage import FileStorage
 from dataflow.operators.core_audio import (
     SileroVADGenerator,
-    MergeChunksByTimestamps,
+    MergeChunksRowGenerator,
     PromptedAQAGenerator,
-    # CTCForcedAlignFilter,  # Import this for filtering instead of evaluation
-    CTCForcedAlignSampleEvaluator,
+    # CTCForcedAlignmentFilter,  # Import this for filtering instead of evaluation
+    CTCForcedAlignmentSampleEvaluator,
 )
 from dataflow.serving import LocalModelVLMServing_vllm
 from dataflow.prompts.whisper_prompt_generator import WhisperTranscriptionPrompt
@@ -57,7 +57,7 @@ class Pipeline:
             num_workers=2,  # Process count; each process loads one model instance
         )
 
-        self.merger = MergeChunksByTimestamps(num_workers=2)
+        self.merger = MergeChunksRowGenerator(num_workers=2)
 
         self.prompted_generator = PromptedAQAGenerator(
             vlm_serving=self.serving,
@@ -70,7 +70,7 @@ class Pipeline:
         #     num_workers=1,
         # )
 
-        self.evaluator = CTCForcedAlignSampleEvaluator(
+        self.evaluator = CTCForcedAlignmentSampleEvaluator(
             model_path="MahmoudAshraf/mms-300m-1130-forced-aligner",
             device=["cuda:3"],  # GPUs that the model can be loaded on
             num_workers=2,  # Process count; each process loads one model instance
@@ -90,7 +90,6 @@ class Pipeline:
             return_seconds=True,
             time_resolution=1,
             neg_threshold=0.35,
-            window_size_samples=512,
             min_silence_at_max_speech=0.098,
             use_max_poss_sil_at_max_speech=True
         )
diff --git a/docs/en/notes/mm_guide/audio_understanding/audio_caption.md b/docs/en/notes/mm_guide/audio_understanding/audio_caption.md
index d298511c..a908ed2b 100644
--- a/docs/en/notes/mm_guide/audio_understanding/audio_caption.md
+++ b/docs/en/notes/mm_guide/audio_understanding/audio_caption.md
@@ -10,41 +10,52 @@ permalink: /en/mm_guide/2gjc47qb/
 
 ## Step 1: Install Environment
 See[ Audio Environment Installation](./install_audio_understanding.md)
 
-## Step 2: Start the Local Model Service
+## Step 2: Import Relevant Packages
+```python
+from dataflow.operators.core_audio import PromptedAQAGenerator
+from dataflow.serving import LocalModelVLMServing_vllm
+from dataflow.utils.storage import FileStorage
+from dataflow.prompts.audio import AudioCaptionGeneratorPrompt
+```
+
+## Step 3: Start the Local Model Service
 The local model serving method is as follows:
 ```python
-llm_serving = LocalModelLLMServing_vllm(
-    hf_model_name_or_path="./models/Qwen2-Audio-7B-Instruct",  # set to your own model path
+vlm_serving = LocalModelVLMServing_vllm(
+    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct",  # set to your own model path
     vllm_tensor_parallel_size=2,
     vllm_max_tokens=8192,
     vllm_gpu_memory_utilization=0.7
 )
 ```
 
-## Step 3: Prepare the Audio Data for Caption Generation
+## Step 4: Prepare the Audio Data for Caption Generation
 Fill in the audio paths in the following format:
 ```jsonl
-{"audio": ["your_audio_path"]}
+{"audio": ["https://raw.githubusercontent.com/gty1829/DataFlow-MM/df-audio-dev-1/dataflow/example/whisper_transcription/BAC009S0022W0165.wav"], "conversation": [{"from": "human", "value": "
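
Taken together, the renamed pieces from this patch compose as in the sketch below. This is a minimal illustration only: the serving and generator arguments are taken from the patch itself, while the `FileStorage` constructor arguments, the `build_prompt()` helper, and the `run(...)` keyword names are assumptions for illustration and may differ from the actual DataFlow-MM API.

```python
from dataflow.operators.core_audio import PromptedAQAGenerator
from dataflow.serving import LocalModelVLMServing_vllm
from dataflow.utils.storage import FileStorage
from dataflow.prompts.audio import AudioCaptionGeneratorPrompt

# Serving configuration copied from the patched Step 3.
vlm_serving = LocalModelVLMServing_vllm(
    hf_model_name_or_path="Qwen/Qwen2-Audio-7B-Instruct",
    vllm_tensor_parallel_size=2,
    vllm_max_tokens=8192,
    vllm_gpu_memory_utilization=0.7,
)

# Storage over the jsonl rows from the patched Step 4.
# NOTE: these constructor argument names are assumptions, not confirmed API.
storage = FileStorage(
    first_entry_file_name="./audio_caption_input.jsonl",
    cache_path="./cache",
    file_name_prefix="audio_caption",
    cache_type="jsonl",
)

# PromptedAQAGenerator is wired to the serving object via `vlm_serving`,
# exactly as in the ASR pipeline hunk; feeding AudioCaptionGeneratorPrompt
# through `system_prompt` and the `build_prompt()` helper are assumptions.
caption_generator = PromptedAQAGenerator(
    vlm_serving=vlm_serving,
    system_prompt=AudioCaptionGeneratorPrompt().build_prompt(),  # hypothetical helper
)

# Run signature (a storage handle plus an output key) is likewise assumed.
caption_generator.run(storage=storage.step(), output_key="caption")
```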