From d27f640bd3020ebe3034ca19d82726c5eacd4e91 Mon Sep 17 00:00:00 2001 From: niwa2 Date: Sun, 19 Jan 2025 20:12:37 +0100 Subject: [PATCH 1/2] add option to unload whisper and diarizer model when done --- backend/configs/config.yaml | 2 ++ configs/default_parameters.yaml | 2 ++ configs/translation.yaml | 22 +++++++++---------- .../whisper/base_transcription_pipeline.py | 4 ++++ modules/whisper/data_classes.py | 21 +++++++++++++++++- 5 files changed, 39 insertions(+), 12 deletions(-) diff --git a/backend/configs/config.yaml b/backend/configs/config.yaml index f0e3e925..044424c6 100644 --- a/backend/configs/config.yaml +++ b/backend/configs/config.yaml @@ -3,6 +3,8 @@ whisper: model_size: large-v2 # Compute type. 'float16' for CUDA, 'float32' for CPU. compute_type: float16 + # Whether to offload the model after the inference. + enable_offload: true bgm_separation: # UVR model sizes between ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"] diff --git a/configs/default_parameters.yaml b/configs/default_parameters.yaml index 4a7172fc..022bd984 100644 --- a/configs/default_parameters.yaml +++ b/configs/default_parameters.yaml @@ -31,6 +31,7 @@ whisper: language_detection_threshold: 0.5 language_detection_segments: 1 add_timestamp: false + enable_offload: true vad: vad_filter: false @@ -43,6 +44,7 @@ vad: diarization: is_diarize: false hf_token: "" + enable_offload: true bgm_separation: is_separate_bgm: false diff --git a/configs/translation.yaml b/configs/translation.yaml index a85124ab..b8197538 100644 --- a/configs/translation.yaml +++ b/configs/translation.yaml @@ -19,7 +19,7 @@ en: # English Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing Enable Background Music Remover Filter: Enable Background Music Remover Filter Save separated files to output: Save separated files to output - Offload sub model after removing background music: Offload sub model after removing background music + Offload sub model when finished: Offload sub model when finished Voice Detection Filter: Voice Detection Filter Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. Enable Silero VAD Filter: Enable Silero VAD Filter @@ -65,7 +65,7 @@ ko: # Korean Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다. Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화 Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장 - Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.) + Offload sub model when finished: 완료되면 하위 모델 언로드. (VRAM 이 부족할 시 체크하세요.) Voice Detection Filter: 목소리 감지 필터 Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다. Enable Silero VAD Filter: Silero VAD 필터 활성화 @@ -111,7 +111,7 @@ ja: # Japanese Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing Enable Background Music Remover Filter: Enable Background Music Remover Filter Save separated files to output: Save separated files to output - Offload sub model after removing background music: Offload sub model after removing background music + Offload sub model when finished: Offload sub model when finished Voice Detection Filter: Voice Detection Filter Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. Enable Silero VAD Filter: Enable Silero VAD Filter @@ -157,7 +157,7 @@ es: # Spanish Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing Enable Background Music Remover Filter: Enable Background Music Remover Filter Save separated files to output: Save separated files to output - Offload sub model after removing background music: Offload sub model after removing background music + Offload sub model when finished: Offload sub model when finished Voice Detection Filter: Voice Detection Filter Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel. Enable Silero VAD Filter: Enable Silero VAD Filter @@ -203,7 +203,7 @@ fr: # French Enabling this will remove background music: L'activation supprimera la musique de fond via un sous-modèle avant la transcription Enable Background Music Remover Filter: Activer le filtre de suppression de musique de fond Save separated files to output: Sauvegarder les fichiers séparés dans la sortie - Offload sub model after removing background music: Décharger le sous-modèle après avoir supprimé la musique de fond + Offload sub model when finished: Décharger le sous-modèle une fois terminé Voice Detection Filter: Filtre de détection vocale Enable this to transcribe only detected voice: Activer pour transcrire uniquement la voix détectée Enable Silero VAD Filter: Activer le filtre Silero VAD @@ -249,7 +249,7 @@ de: # Deutsch Enabling this will remove background music: Aktivierung entfernt Hintergrundmusik mithilfe des Submodells vor der Transkription Enable Background Music Remover Filter: Hintergrundmusik-Entfernungsfilter aktivieren Save separated files to output: Getrennte Dateien in der Ausgabe speichern - Offload sub model after removing background music: Submodell nach Entfernung der Hintergrundmusik entladen + Offload sub model when finished: Submodell entladen wenn nicht mehr benötigt Voice Detection Filter: Sprachfilter Enable this to transcribe only detected voice: Aktivieren, um nur erkannte Sprachsegmente mithilfe des Submodells zu transkribieren. Enable Silero VAD Filter: Silero VAD-Filter aktivieren @@ -296,7 +296,7 @@ zh: # Chinese Enabling this will remove background music: 启用此功能将会在进行转录前去除背景音乐 Enable Background Music Remover Filter: 去除背景音乐 Save separated files to output: 导出分离出的音频文件 - Offload sub model after removing background music: 移除背景音乐后卸载子模型 + Offload sub model when finished: 完成后卸载子模型 Voice Detection Filter: 话音检测设置 Enable this to transcribe only detected voice: 启用此功能将仅转录检测到的语音部分 Enable Silero VAD Filter: 启用 Silero 语音活动检测 (VAD) @@ -342,7 +342,7 @@ uk: # Ukrainian Enabling this will remove background music: Увімкнення цього видалить фонову музику за допомогою підмоделі перед транскрипцією Enable Background Music Remover Filter: Увімкнути фільтр видалення фонової музики Save separated files to output: Зберегти розділені файли до вихідної папки - Offload sub model after removing background music: Вивантажити підмодель після видалення фонової музики + Offload sub model when finished: Після завершення вивантажте підмодель Voice Detection Filter: Фільтр розпізнавання голосу Enable this to transcribe only detected voice: Увімкніть це, щоб транскрибувати лише розпізнані голосові частини за допомогою підмоделі Enable Silero VAD Filter: Увімкнути фільтр Silero VAD @@ -388,7 +388,7 @@ ru: # Russian Enabling this will remove background music: Включение этого удалит фоновую музыку с помощью подмодели перед транскрипцией Enable Background Music Remover Filter: Включить фильтр удаления фоновой музыки Save separated files to output: Сохранить разделенные файлы в выходную папку - Offload sub model after removing background music: Выгрузить подмодель после удаления фоновой музыки + Offload sub model when finished: Выгрузить подмодель после завершения Voice Detection Filter: Фильтр обнаружения голоса Enable this to transcribe only detected voice: Включите это, чтобы транскрибировать только обнаруженные голосовые части с помощью подмодели Enable Silero VAD Filter: Включить фильтр Silero VAD @@ -434,7 +434,7 @@ tr: # Turkish Enabling this will remove background music: Bunu etkinleştirmek, arka plan müziğini alt model tarafından transkripsiyondan önce kaldıracaktır Enable Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresini Etkinleştir Save separated files to output: Ayrılmış dosyaları çıktıya kaydet - Offload sub model after removing background music: Arka plan müziği kaldırıldıktan sonra alt modeli devre dışı bırak + Offload sub model when finished: Alt modeli bitirdiğinizde boşaltın Voice Detection Filter: Ses Algılama Filtresi Enable this to transcribe only detected voice: Bunu etkinleştirerek yalnızca alt model tarafından algılanan ses kısımlarını transkribe et Enable Silero VAD Filter: Silero VAD Filtresini Etkinleştir @@ -480,7 +480,7 @@ eu: # Basque Enabling this will remove background music: Hau aktibatuz, transkribatu aurretik atzeko musika ezabatuko zaio azpieredu baten bidez Enable Background Music Remover Filter: Aktibatu Atzeko Musika Ezabatzeko Filtroa Save separated files to output: Gorde fitxategi bereiziak irteeran - Offload sub model after removing background music: Atzeko musika kendu ondoren azpieredua memoriatik kendu + Offload sub model when finished: Deskargatu azpieredua amaitutakoan Voice Detection Filter: Ahots Detekzio Filtroa Enable this to transcribe only detected voice: Aktibatu hau azpieredu batekin soilik detektatutako ahots zatiak transkribatzeko. Enable Silero VAD Filter: Aktibatu Silero VAD Filtroa diff --git a/modules/whisper/base_transcription_pipeline.py b/modules/whisper/base_transcription_pipeline.py index 1cd47077..d2fb97ac 100644 --- a/modules/whisper/base_transcription_pipeline.py +++ b/modules/whisper/base_transcription_pipeline.py @@ -169,6 +169,8 @@ def run(self, progress, *whisper_params.to_list() ) + if whisper_params.enable_offload: + self.offload() if vad_params.vad_filter: restored_result = self.vad.restore_speech_timestamps( @@ -188,6 +190,8 @@ def run(self, transcribed_result=result, device=diarization_params.diarization_device ) + if diarization_params.enable_offload: + self.diarizer.offload() self.cache_parameters( params=params, diff --git a/modules/whisper/data_classes.py b/modules/whisper/data_classes.py index ad72ee33..700e2a8a 100644 --- a/modules/whisper/data_classes.py +++ b/modules/whisper/data_classes.py @@ -161,6 +161,10 @@ class DiarizationParams(BaseParams): default="", description="Hugging Face token for downloading diarization models" ) + enable_offload: bool = Field( + default=True, + description="Offload Diarization model after Speaker diarization" + ) @classmethod def to_gradio_inputs(cls, @@ -182,6 +186,10 @@ def to_gradio_inputs(cls, value=defaults.get("hf_token", cls.__fields__["hf_token"].default), info=_("This is only needed the first time you download the model") ), + gr.Checkbox( + label=_("Offload sub model when finished"), + value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default), + ) ] @@ -242,7 +250,7 @@ def to_gradio_input(cls, value=defaults.get("save_file", cls.__fields__["save_file"].default), ), gr.Checkbox( - label=_("Offload sub model after removing background music"), + label=_("Offload sub model when finished"), value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default), ) ] @@ -328,6 +336,10 @@ class WhisperParams(BaseParams): description="Number of segments for language detection" ) batch_size: int = Field(default=24, gt=0, description="Batch size for processing") + enable_offload: bool = Field( + default=True, + description="Offload Whisper model after transcription" + ) @field_validator('lang') def validate_lang(cls, v): @@ -556,6 +568,13 @@ def to_gradio_inputs(cls, inputs += faster_whisper_inputs + insanely_fast_whisper_inputs + inputs += [ + gr.Checkbox( + label=_("Offload sub model when finished"), + value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default), + ) + ] + return inputs From 1922554a4dad00e4b1d203f0409d615710f28f91 Mon Sep 17 00:00:00 2001 From: jhj0517 <97279763+jhj0517@users.noreply.github.com> Date: Mon, 27 Jan 2025 13:23:35 +0900 Subject: [PATCH 2/2] Update Korean translation --- configs/translation.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/translation.yaml b/configs/translation.yaml index b8197538..9ea0a90e 100644 --- a/configs/translation.yaml +++ b/configs/translation.yaml @@ -65,7 +65,7 @@ ko: # Korean Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다. Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화 Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장 - Offload sub model when finished: 완료되면 하위 모델 언로드. (VRAM 이 부족할 시 체크하세요.) + Offload sub model when finished: 완료 후 모델 오프로드. (VRAM 이 부족할 시 체크하세요.) Voice Detection Filter: 목소리 감지 필터 Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다. Enable Silero VAD Filter: Silero VAD 필터 활성화