diff --git a/.gitignore b/.gitignore index 774bdbd58..803da1836 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,4 @@ rust-executor/CUSTOM_DENO_SNAPSHOT.bin rust-executor/test_data .npmrc +.worktrees/ diff --git a/core/src/ai/AIClient.ts b/core/src/ai/AIClient.ts index b3fd89590..40384fb25 100644 --- a/core/src/ai/AIClient.ts +++ b/core/src/ai/AIClient.ts @@ -302,17 +302,19 @@ export class AIClient { endThreshold?: number; endWindow?: number; timeBeforeSpeech?: number; - } + }, + language?: string ): Promise { const { aiOpenTranscriptionStream } = unwrapApolloResult(await this.#apolloClient.mutate({ mutation: gql` - mutation AiOpenTranscriptionStream($modelId: String!, $params: VoiceActivityParamsInput) { - aiOpenTranscriptionStream(modelId: $modelId, params: $params) + mutation AiOpenTranscriptionStream($modelId: String!, $params: VoiceActivityParamsInput, $language: String) { + aiOpenTranscriptionStream(modelId: $modelId, params: $params, language: $language) } `, variables: { modelId, - params + params, + language } })); diff --git a/core/src/ai/AIResolver.ts b/core/src/ai/AIResolver.ts index 2942e5873..1faa0fc63 100644 --- a/core/src/ai/AIResolver.ts +++ b/core/src/ai/AIResolver.ts @@ -338,7 +338,8 @@ export default class AIResolver { @Mutation(() => String) aiOpenTranscriptionStream( @Arg("modelId") modelId: string, - @Arg("params", () => VoiceActivityParamsInput, { nullable: true }) params?: VoiceActivityParamsInput + @Arg("params", () => VoiceActivityParamsInput, { nullable: true }) params?: VoiceActivityParamsInput, + @Arg("language", { nullable: true }) language?: string ): string { return "streamId" } diff --git a/rust-executor/src/ai_service/mod.rs b/rust-executor/src/ai_service/mod.rs index f15890c5c..9d8ee2068 100644 --- a/rust-executor/src/ai_service/mod.rs +++ b/rust-executor/src/ai_service/mod.rs @@ -54,8 +54,8 @@ pub struct AIService { llm_channel: Arc>>>, transcription_streams: Arc>>, cleanup_task_shutdown: Arc>>>, - /// Shared Whisper models - ONE model per size, shared across ALL streams using that size - /// Key = WhisperSource (Tiny/Small/Medium/Large), Value = Arc + /// Shared Whisper models - ONE model per size+language combination, shared across ALL streams + /// Key = "{size:?}_{lang_key}" (e.g., "Small_auto", "Small_de"), Value = Arc /// Cloning Arc is cheap (just increments ref count), model weights stay in memory once /// Saves 500MB-1.5GB per stream! shared_whisper_models: Arc>>>, @@ -1113,42 +1113,73 @@ impl AIService { &self, model_id: String, params: Option, + language: Option, ) -> Result { let model_size = Self::get_whisper_model_size(model_id.clone())?; // MEMORY OPTIMIZATION: Load each Whisper model size ONCE and share across all streams using that size // Arc cloning is cheap (just increments ref count), saves 500MB-1.5GB per stream! + // Language is set at build time, so include it in the cache key let whisper_model = { - let mut shared_models = self.shared_whisper_models.lock().await; - let model_key = format!("{:?}", model_size); // Use Debug format as key (e.g., "Small", "Medium") - - if !shared_models.contains_key(&model_key) { + // Parse language upfront and use canonical form for cache key + let whisper_lang = if let Some(ref lang) = language { + Some( + lang.to_lowercase() + .parse::() + .map_err(|_| anyhow::anyhow!("Unsupported whisper language: {}", lang))?, + ) + } else { + None + }; + let lang_key = whisper_lang + .as_ref() + .map(|l| l.to_string()) + .unwrap_or_else(|| "auto".to_string()); + let model_key = format!("{:?}_{}", model_size, lang_key); + + // First check: acquire lock briefly to see if model already exists + let existing = { + let shared_models = self.shared_whisper_models.lock().await; + shared_models.get(&model_key).cloned() + }; // Lock dropped here + + if let Some(model) = existing { log::info!( - "Loading shared Whisper model {} ({:?}) (ONE model per size, ~500MB-1.5GB)...", + "Reusing existing shared Whisper model {:?} (language: {}) for new stream", + model_size, + lang_key + ); + model + } else { + // Model not found — build it outside the lock to avoid serializing callers + log::info!( + "Loading shared Whisper model {} ({:?}, language: {}) (ONE model per size, ~500MB-1.5GB)...", model_id, - model_size + model_size, + lang_key ); - let model = WhisperBuilder::default() + let mut builder = WhisperBuilder::default() .with_source(model_size) - .with_device(Self::new_candle_device()) - .build() - .await?; + .with_device(Self::new_candle_device()); + + builder = builder.with_language(whisper_lang); + + let model = builder.build().await?; log::info!( - "Shared Whisper model {:?} loaded! All streams using this size will reuse this model.", - model_size - ); - shared_models.insert(model_key.clone(), Arc::new(model)); - } else { - log::info!( - "Reusing existing shared Whisper model {:?} for new stream", - model_size + "Shared Whisper model {:?} (language: {}) loaded! All streams using this config will reuse this model.", + model_size, + lang_key ); - } - // Clone the Arc - this is CHEAP! Just increments a reference count - shared_models.get(&model_key).unwrap().clone() + // Re-acquire lock and insert; use entry to handle race where another caller built it first + let mut shared_models = self.shared_whisper_models.lock().await; + let model_arc = shared_models + .entry(model_key) + .or_insert_with(|| Arc::new(model)); + model_arc.clone() + } }; log::info!("Opening transcription stream with model {:?}", model_size); diff --git a/rust-executor/src/graphql/mutation_resolvers.rs b/rust-executor/src/graphql/mutation_resolvers.rs index 76ab28c87..f1e2645de 100644 --- a/rust-executor/src/graphql/mutation_resolvers.rs +++ b/rust-executor/src/graphql/mutation_resolvers.rs @@ -2614,11 +2614,12 @@ impl Mutation { context: &RequestContext, model_id: String, params: Option, + language: Option, ) -> FieldResult { check_capability(&context.capabilities, &AI_TRANSCRIBE_CAPABILITY)?; Ok(AIService::global_instance() .await? - .open_transcription_stream(model_id, params.map(|p| p.into())) + .open_transcription_stream(model_id, params.map(|p| p.into()), language) .await?) }