diff --git a/Package.resolved b/Package.resolved index d964957..8488a92 100644 --- a/Package.resolved +++ b/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "f02d1b06da5914ec9757af4da0eaa58ac89d06c25ef4f19477024d63596a814a", + "originHash" : "bd373e20fb0f4ce7e77093a139f872954cd08face9c8f999dd7ce38e1d0f0869", "pins" : [ { "identity" : "dtln-aec-coreml", @@ -10,6 +10,24 @@ "version" : "0.6.0-beta" } }, + { + "identity" : "eventsource", + "kind" : "remoteSourceControl", + "location" : "https://github.com/mattt/EventSource.git", + "state" : { + "revision" : "a3a85a85214caf642abaa96ae664e4c772a59f6e", + "version" : "1.4.1" + } + }, + { + "identity" : "fluidaudio", + "kind" : "remoteSourceControl", + "location" : "https://github.com/FluidInference/FluidAudio.git", + "state" : { + "revision" : "9830ce835881c0d0d40f90aabfaae3a6da5bebfb", + "version" : "0.12.4" + } + }, { "identity" : "sparkle", "kind" : "remoteSourceControl", @@ -18,6 +36,96 @@ "revision" : "21d8df80440b1ca3b65fa82e40782f1e5a9e6ba2", "version" : "2.9.0" } + }, + { + "identity" : "swift-asn1", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-asn1.git", + "state" : { + "revision" : "9f542610331815e29cc3821d3b6f488db8715517", + "version" : "1.6.0" + } + }, + { + "identity" : "swift-atomics", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-atomics.git", + "state" : { + "revision" : "b601256eab081c0f92f059e12818ac1d4f178ff7", + "version" : "1.3.0" + } + }, + { + "identity" : "swift-collections", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-collections.git", + "state" : { + "revision" : "8d9834a6189db730f6264db7556a7ffb751e99ee", + "version" : "1.4.0" + } + }, + { + "identity" : "swift-crypto", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-crypto.git", + "state" : { + "revision" : "fa308c07a6fa04a727212d793e761460e41049c3", + "version" : "4.3.0" + } + }, + { + "identity" : 
"swift-huggingface", + "kind" : "remoteSourceControl", + "location" : "https://github.com/huggingface/swift-huggingface.git", + "state" : { + "revision" : "b721959445b617d0bf03910b2b4aced345fd93bf", + "version" : "0.9.0" + } + }, + { + "identity" : "swift-jinja", + "kind" : "remoteSourceControl", + "location" : "https://github.com/huggingface/swift-jinja.git", + "state" : { + "revision" : "f731f03bf746481d4fda07f817c3774390c4d5b9", + "version" : "2.3.2" + } + }, + { + "identity" : "swift-nio", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-nio.git", + "state" : { + "revision" : "b31565862a8f39866af50bc6676160d8dda7de35", + "version" : "2.96.0" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "7c6ad0fc39d0763e0b699210e4124afd5041c5df", + "version" : "1.6.4" + } + }, + { + "identity" : "swift-transformers", + "kind" : "remoteSourceControl", + "location" : "https://github.com/huggingface/swift-transformers", + "state" : { + "revision" : "eed7264ac5e4ec5dfa6165c6e5c5577364344fe4", + "version" : "1.2.0" + } + }, + { + "identity" : "yyjson", + "kind" : "remoteSourceControl", + "location" : "https://github.com/ibireme/yyjson.git", + "state" : { + "revision" : "8b4a38dc994a110abaec8a400615567bd996105f", + "version" : "0.12.0" + } } ], "version" : 3 diff --git a/Package.swift b/Package.swift index 4a2db25..1eacd0e 100644 --- a/Package.swift +++ b/Package.swift @@ -7,6 +7,7 @@ let package = Package( dependencies: [ .package(url: "https://github.com/sparkle-project/Sparkle", from: "2.0.0"), .package(url: "https://github.com/MimicScribe/dtln-aec-coreml.git", from: "0.4.0-beta"), + .package(url: "https://github.com/FluidInference/FluidAudio.git", from: "0.12.4"), ], targets: [ .target( @@ -32,6 +33,7 @@ let package = Package( .product(name: "Sparkle", package: "Sparkle"), .product(name: "DTLNAecCoreML", package: 
"dtln-aec-coreml"), .product(name: "DTLNAec256", package: "dtln-aec-coreml"), + .product(name: "FluidAudio", package: "FluidAudio"), ], path: "Sources", exclude: ["ObjCExceptionCatcher", "Watchdog"], diff --git a/Sources/AudioMonitor.swift b/Sources/AudioMonitor.swift index 9b6689f..108643f 100644 --- a/Sources/AudioMonitor.swift +++ b/Sources/AudioMonitor.swift @@ -593,8 +593,9 @@ final class AudioMonitor { let url = await recorder.stop() savingCount -= 1 isSaving = savingCount > 0 - if url != nil { + if let url { notifyRecordingSaved(appName: appName) + autoTranscribe(recordingDirectory: url) } updateAutoState() } @@ -721,6 +722,25 @@ final class AudioMonitor { return count <= max } + // MARK: - Auto-Transcription + + private func autoTranscribe(recordingDirectory: URL) { + let provider = + UserDefaults.standard.string(forKey: TranscriptionProvider.defaultsKey) ?? "local" + guard provider == "local", LocalTranscriptionService.modelsReady else { return } + Task.detached { + do { + try await LocalTranscriptionService.transcribeAndSave( + recordingDirectory: recordingDirectory) + Log.info(Log.transcription, "auto", "auto-transcription complete") + } catch { + Log.error( + Log.transcription, "auto", + "auto-transcription failed: \(error.localizedDescription)") + } + } + } + // MARK: - Notifications private func notifyRecordingStarted(appName: String) { diff --git a/Sources/LocalTranscriptionService.swift b/Sources/LocalTranscriptionService.swift new file mode 100644 index 0000000..aa1641b --- /dev/null +++ b/Sources/LocalTranscriptionService.swift @@ -0,0 +1,430 @@ +@preconcurrency import AVFoundation +import CoreMedia +import FluidAudio + +/// On-device transcription using FluidAudio (Parakeet TDT v3 ASR + offline diarization). +/// Extracts dual tracks from the recording, transcribes and diarizes each independently, +/// then merges with speaker attribution via temporal overlap matching. 
+enum LocalTranscriptionService { + + /// Whether ASR and diarizer model files have been downloaded to disk. + nonisolated static var modelsReady: Bool { + UserDefaults.standard.bool(forKey: "localTranscriptionModelsReady") + } + + // MARK: - Public API + + /// Transcribe a recording and return the document. Status updates are sent via + /// `onStatus` which is `@Sendable` - callers must dispatch to MainActor themselves. + static func transcribe( + recordingDirectory: URL, + onStatus: @escaping @Sendable (TranscriptionStatus) -> Void + ) async throws -> TranscriptDocument { + let processedURL = recordingDirectory.appendingPathComponent("audio-processed.m4a") + let originalURL = recordingDirectory.appendingPathComponent("audio.m4a") + let audioURL = + FileManager.default.fileExists(atPath: processedURL.path) ? processedURL : originalURL + + let doc = try await Task.detached { + try await Self.run(audioURL: audioURL, onStatus: onStatus) + }.value + + onStatus(.completed) + return doc + } + + /// Convenience: transcribe and save to disk as `transcript-local.json`. + static func transcribeAndSave( + recordingDirectory: URL, + onStatus: @escaping @Sendable (TranscriptionStatus) -> Void = { _ in } + ) async throws { + let doc = try await transcribe( + recordingDirectory: recordingDirectory, onStatus: onStatus) + try doc.save(for: recordingDirectory, provider: .local) + } + + /// Download and prepare all models (ASR + diarizer). Call from Settings UI. + static func prepareModels( + onStatus: @escaping @Sendable (TranscriptionStatus) -> Void + ) async throws { + onStatus(.preparing) + try await Task.detached { + try await Self.downloadAllModels() + }.value + onStatus(.completed) + } + + // MARK: - Core Pipeline (runs off main actor) + + nonisolated private static func run( + audioURL: URL, + onStatus: @Sendable (TranscriptionStatus) -> Void + ) async throws -> TranscriptDocument { + try Task.checkCancellation() + + // 1. 
Load models (fast from cache on subsequent calls) + onStatus(.preparing) + let asrModels = try await downloadASRModels() + let diarizerManager = OfflineDiarizerManager() + try await diarizerManager.prepareModels() + UserDefaults.standard.set(true, forKey: "localTranscriptionModelsReady") + + try Task.checkCancellation() + + // 2. Extract tracks + onStatus(.transcribing) + let (systemSamples, micSamples) = try await extractTracks(from: audioURL) + + Log.info( + Log.transcription, "local", + "extracted tracks: system=\(systemSamples.count) samples" + + (micSamples != nil ? ", mic=\(micSamples!.count) samples" : " (single-track)")) + + try Task.checkCancellation() + + // 3. ASR + let asrManager = AsrManager() + try await asrManager.initialize(models: asrModels) + + let systemASR = try await asrManager.transcribe(systemSamples, source: .system) + Log.info( + Log.transcription, "local", + "system ASR: \(systemASR.text.prefix(80))... (\(String(format: "%.0f", systemASR.rtfx))x realtime)" + ) + + try Task.checkCancellation() + + var micASR: ASRResult? + if let micSamples { + micASR = try await asrManager.transcribe(micSamples, source: .microphone) + Log.info( + Log.transcription, "local", + "mic ASR: \(micASR!.text.prefix(80))... (\(String(format: "%.0f", micASR!.rtfx))x realtime)" + ) + } + + try Task.checkCancellation() + + // 4. Diarize (graceful degradation - transcript still works without diarization) + var systemDiarization: DiarizationResult? + var micDiarization: DiarizationResult? 
+ + do { + systemDiarization = try await diarizerManager.process(audio: systemSamples) + Log.info( + Log.transcription, "local", + "system diarization: \(systemDiarization!.segments.count) segments") + + if let micSamples { + try Task.checkCancellation() + micDiarization = try await diarizerManager.process(audio: micSamples) + Log.info( + Log.transcription, "local", + "mic diarization: \(micDiarization!.segments.count) segments") + } + } catch { + Log.error(Log.transcription, "local", "diarization failed, proceeding without: \(error)") + } + + // 5. Assign speakers to ASR segments via temporal overlap + let systemSegments = labelSegments(asr: systemASR, diarization: systemDiarization) + let micSegments = micASR != nil ? labelSegments(asr: micASR!, diarization: micDiarization) : nil + + // 6. Merge into document + let language = systemASR.ctcDetectedTerms?.first + + return mergeIntoDocument( + systemSegments: systemSegments, + micSegments: micSegments, + language: language + ) + } + + // MARK: - Model Loading + + /// Downloads ASR models only. Diarizer models are prepared in `run()` where + /// the `OfflineDiarizerManager` instance is actually used. + @discardableResult + nonisolated private static func downloadASRModels() async throws -> AsrModels { + let asrModels = try await AsrModels.downloadAndLoad(version: .v3) + Log.info(Log.transcription, "local", "ASR models ready") + return asrModels + } + + /// Downloads both ASR and diarizer models (for Settings pre-download). 
+ nonisolated private static func downloadAllModels() async throws { + _ = try await AsrModels.downloadAndLoad(version: .v3) + let diarizerManager = OfflineDiarizerManager() + try await diarizerManager.prepareModels() + UserDefaults.standard.set(true, forKey: "localTranscriptionModelsReady") + Log.info(Log.transcription, "local", "all models ready") + } + + // MARK: - Track Extraction + + nonisolated private static func extractTracks( + from url: URL + ) async throws -> (system: [Float], mic: [Float]?) { + let asset = AVURLAsset(url: url) + let tracks = try await asset.loadTracks(withMediaType: .audio) + + guard !tracks.isEmpty else { + throw LocalTranscriptionError.noAudioTracks + } + + let pcmSettings: [String: Any] = [ + AVFormatIDKey: kAudioFormatLinearPCM, + AVSampleRateKey: 16000 as Double, + AVNumberOfChannelsKey: 1, + AVLinearPCMBitDepthKey: 32, + AVLinearPCMIsFloatKey: true, + AVLinearPCMIsBigEndianKey: false, + AVLinearPCMIsNonInterleaved: false, + ] + + let duration = try await asset.load(.duration).seconds + let expectedSamples = Int(duration * 16000) + + let systemSamples = try readTrack( + tracks[0], asset: asset, settings: pcmSettings, reserveCount: expectedSamples) + let micSamples = + tracks.count >= 2 + ? try readTrack( + tracks[1], asset: asset, settings: pcmSettings, reserveCount: expectedSamples) : nil + + return (system: systemSamples, mic: micSamples) + } + + nonisolated private static func readTrack( + _ track: AVAssetTrack, asset: AVURLAsset, settings: [String: Any], + reserveCount: Int + ) throws -> [Float] { + let reader = try AVAssetReader(asset: asset) + let output = AVAssetReaderTrackOutput(track: track, outputSettings: settings) + output.alwaysCopiesSampleData = false + reader.add(output) + + guard reader.startReading() else { + throw LocalTranscriptionError.trackReadFailed( + reader.error?.localizedDescription ?? 
"unknown") + } + + var samples: [Float] = [] + samples.reserveCapacity(reserveCount) + while let sampleBuffer = output.copyNextSampleBuffer() { + guard let blockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer) else { continue } + let length = CMBlockBufferGetDataLength(blockBuffer) + let floatCount = length / MemoryLayout<Float>.size + var chunk = [Float](repeating: 0, count: floatCount) + chunk.withUnsafeMutableBufferPointer { ptr in + guard let base = ptr.baseAddress else { return } + _ = CMBlockBufferCopyDataBytes( + blockBuffer, atOffset: 0, dataLength: length, destination: base) + } + samples.append(contentsOf: chunk) + } + + return samples + } + + // MARK: - Speaker Assignment (Temporal Overlap Matching) + + private struct LabeledSegment { + let speakerId: String + let startTime: Double + let text: String + } + + /// Assign speaker labels to ASR tokens by finding the diarization segment + /// with maximum temporal overlap. Falls back to nearest segment by gap distance. + nonisolated private static func labelSegments( + asr: ASRResult, diarization: DiarizationResult? + ) -> [LabeledSegment] { + // No diarization or no token timings - entire text as one segment + guard let timings = asr.tokenTimings, !timings.isEmpty, + let diarization, !diarization.segments.isEmpty + else { + let speaker = + diarization?.segments + .max(by: { $0.durationSeconds < $1.durationSeconds })?.speakerId ?? "SPEAKER_0" + return [LabeledSegment(speakerId: speaker, startTime: 0, text: asr.text)] + } + + // For each token, find best matching diarization speaker + var labeledTokens: [(speakerId: String, timing: TokenTiming)] = [] + + for token in timings { + let tokenStart = Float(token.startTime) + let tokenEnd = Float(token.endTime) + + var bestSpeaker: String?
+ var bestOverlap: Float = 0 + + for seg in diarization.segments { + let overlap = max( + 0, min(tokenEnd, seg.endTimeSeconds) - max(tokenStart, seg.startTimeSeconds)) + if overlap > bestOverlap { + bestOverlap = overlap + bestSpeaker = seg.speakerId + } + } + + // Fallback: nearest diarization segment by gap distance + if bestSpeaker == nil { + var nearestGap: Float = .infinity + for seg in diarization.segments { + let gap: Float + if tokenEnd <= seg.startTimeSeconds { + gap = seg.startTimeSeconds - tokenEnd + } else if tokenStart >= seg.endTimeSeconds { + gap = tokenStart - seg.endTimeSeconds + } else { + gap = 0 + } + if gap < nearestGap { + nearestGap = gap + bestSpeaker = seg.speakerId + } + } + } + + labeledTokens.append((bestSpeaker ?? "SPEAKER_0", token)) + } + + // Group consecutive same-speaker tokens into segments + var segments: [LabeledSegment] = [] + var currentSpeaker: String? + var currentText = "" + var currentStart: Double = 0 + + for (speaker, token) in labeledTokens { + if speaker != currentSpeaker { + if let s = currentSpeaker, + !currentText.trimmingCharacters(in: .whitespaces).isEmpty + { + segments.append( + LabeledSegment( + speakerId: s, startTime: currentStart, + text: currentText.trimmingCharacters(in: .whitespaces))) + } + currentSpeaker = speaker + currentText = "" + currentStart = token.startTime + } + currentText += token.token + } + + if let s = currentSpeaker, !currentText.trimmingCharacters(in: .whitespaces).isEmpty { + segments.append( + LabeledSegment( + speakerId: s, startTime: currentStart, + text: currentText.trimmingCharacters(in: .whitespaces))) + } + + return segments + } + + // MARK: - Merge Dual-Track Results + + /// Merge labeled segments from system (remote) and mic (local) tracks into a + /// TranscriptDocument with sequential integer speaker IDs and default names. + nonisolated private static func mergeIntoDocument( + systemSegments: [LabeledSegment], + micSegments: [LabeledSegment]?, + language: String? 
+ ) -> TranscriptDocument { + // Collect unique speaker IDs per track + var micSpeakerIds: [String] = [] + if let micSegs = micSegments { + var seen = Set<String>() + for seg in micSegs where seen.insert(seg.speakerId).inserted { + micSpeakerIds.append(seg.speakerId) + } + } + + var remoteSpeakerIds: [String] = [] + var seenRemote = Set<String>() + for seg in systemSegments where seenRemote.insert(seg.speakerId).inserted { + remoteSpeakerIds.append(seg.speakerId) + } + + // Map to sequential integers: mic speakers first, then remote + var speakerMap: [String: Int] = [:] + var speakerNames: [String: String] = [:] + var nextId = 0 + + for id in micSpeakerIds { + speakerMap["M_\(id)"] = nextId + speakerNames[String(nextId)] = + micSpeakerIds.count == 1 ? "You" : "Local \(nextId + 1)" + nextId += 1 + } + + for id in remoteSpeakerIds { + speakerMap["R_\(id)"] = nextId + speakerNames[String(nextId)] = "Speaker \(nextId + 1)" + nextId += 1 + } + + // Convert to TranscriptSegments + var allSegments: [TranscriptSegment] = [] + + if let micSegs = micSegments { + for seg in micSegs { + allSegments.append( + TranscriptSegment( + speaker: speakerMap["M_\(seg.speakerId)"] ?? 0, + time: seg.startTime, + text: seg.text)) + } + } + + for seg in systemSegments { + allSegments.append( + TranscriptSegment( + speaker: speakerMap["R_\(seg.speakerId)"] ?? 0, + time: seg.startTime, + text: seg.text)) + } + + // Sort by time + allSegments.sort { $0.time < $1.time } + + // Single-track: no M_/R_ prefixes were used, simplify speaker names + if micSegments == nil { + speakerNames = [:] + for (i, id) in remoteSpeakerIds.enumerated() { + speakerMap["R_\(id)"] = i + speakerNames[String(i)] = "Speaker \(i + 1)" + } + // Re-map segments with corrected IDs + allSegments = systemSegments.map { seg in + TranscriptSegment( + speaker: speakerMap["R_\(seg.speakerId)"] ??
0, + time: seg.startTime, + text: seg.text) + } + } + + return TranscriptDocument( + segments: allSegments, + language: language, + createdAt: Date(), + speakers: speakerNames + ) + } +} + +// MARK: - Errors + +nonisolated enum LocalTranscriptionError: Error, LocalizedError, Sendable { + case noAudioTracks + case trackReadFailed(String) + + var errorDescription: String? { + switch self { + case .noAudioTracks: "No audio tracks found in recording" + case .trackReadFailed(let msg): "Failed to read audio track: \(msg)" + } + } +} diff --git a/Sources/MainWindowView.swift b/Sources/MainWindowView.swift index 1669093..ffced54 100644 --- a/Sources/MainWindowView.swift +++ b/Sources/MainWindowView.swift @@ -155,7 +155,21 @@ struct RecordingsView: View { let processedSize = hasProcessed ? ((try? processedURL.resourceValues(forKeys: [.fileSizeKey]).fileSize) ?? 0) : 0 - let sidecar = TranscriptDocument.sidecarURL(for: url) + TranscriptDocument.migrateLegacyTranscript(in: url) + var available = Set<TranscriptionProvider>() + for provider in TranscriptionProvider.allCases { + let sidecar = TranscriptDocument.sidecarURL(for: url, provider: provider) + if FileManager.default.fileExists(atPath: sidecar.path) { + available.insert(provider) + } + } + // Fallback: unmigrated legacy transcript counts as soniox + if !available.contains(.soniox), + FileManager.default.fileExists( + atPath: url.appendingPathComponent("transcript.json").path) + { + available.insert(.soniox) + } results.append( RecordingFile( url: url, @@ -167,7 +181,7 @@ ).contentModificationDate) ??
.distantPast, size: originalSize + processedSize, hasProcessed: hasProcessed, - hasTranscript: FileManager.default.fileExists(atPath: sidecar.path) + availableTranscripts: available )) } @@ -296,7 +310,7 @@ private struct RecordingRow: View { .foregroundStyle(.secondary) .help("Echo cancellation applied") } - if recording.hasTranscript { + if !recording.availableTranscripts.isEmpty { Image(systemName: "text.quote") .font(.caption2) .foregroundStyle(.secondary) @@ -346,8 +360,15 @@ struct RecordingDetailView: View { @State private var editedTitle = "" // Transcription - @State private var transcript: TranscriptDocument? - @State private var transcriptionStatus: TranscriptionStatus = .idle + @State private var activeProvider: TranscriptionProvider = { + TranscriptionProvider( + rawValue: UserDefaults.standard.string(forKey: TranscriptionProvider.defaultsKey) ?? "local" + ) ?? .local + }() + @State private var localTranscript: TranscriptDocument? + @State private var sonioxTranscript: TranscriptDocument? + @State private var localTranscriptionStatus: TranscriptionStatus = .idle + @State private var sonioxTranscriptionStatus: TranscriptionStatus = .idle + @State private var transcriptionTask: Task<Void, Never>? var body: some View { @@ -434,7 +455,7 @@ struct RecordingDetailView: View { } Spacer() HStack(spacing: 8) { - if transcript != nil { + if activeTranscript != nil { Button { exportTranscript() } label: { @@ -721,7 +742,24 @@ struct RecordingDetailView: View { onTitleChanged() } + private var activeTranscript: TranscriptDocument?
{ + switch activeProvider { + case .local: localTranscript + case .soniox: sonioxTranscript + } + } + + private var activeTranscriptionStatus: TranscriptionStatus { + switch activeProvider { + case .local: localTranscriptionStatus + case .soniox: sonioxTranscriptionStatus + } + } + private func speakerName(for speakerID: Int) -> String { + if let name = activeTranscript?.speakers?[String(speakerID)], !name.isEmpty { + return name + } if let name = metadata?.speakers[String(speakerID)], !name.isEmpty { return name } @@ -729,67 +767,94 @@ struct RecordingDetailView: View { } private func saveSpeakerName(_ name: String, for speakerID: Int) { - var meta = - metadata - ?? RecordingMetadata( - title: recording.title, - createdAt: recording.date, - appName: recording.title, - speakers: [:] - ) - meta.speakers[String(speakerID)] = name - try? meta.save(in: recording.url) - metadata = meta + switch activeProvider { + case .local: + if localTranscript?.speakers == nil { localTranscript?.speakers = [:] } + localTranscript?.speakers?[String(speakerID)] = name + if let doc = localTranscript { + try? doc.save(for: recording.url, provider: .local) + } + case .soniox: + if sonioxTranscript?.speakers == nil { sonioxTranscript?.speakers = [:] } + sonioxTranscript?.speakers?[String(speakerID)] = name + if let doc = sonioxTranscript { + try? 
doc.save(for: recording.url, provider: .soniox) + } + } } // MARK: - Transcript private var transcriptArea: some View { - Group { - if let transcript, !transcript.segments.isEmpty { - transcriptView(transcript) - } else if case .error(let msg) = transcriptionStatus { - VStack(spacing: 12) { - Image(systemName: "exclamationmark.triangle") - .font(.title) - .foregroundStyle(.secondary) - Text(msg) - .font(.caption) - .foregroundStyle(.secondary) - .multilineTextAlignment(.center) - Button("Retry") { startTranscription() } - .buttonStyle(.bordered) - } - .frame(maxWidth: .infinity, maxHeight: .infinity) - } else if transcriptionStatus != .idle { - VStack(spacing: 12) { - ProgressView() - Text(transcriptionStatusText) - .font(.caption) - .foregroundStyle(.secondary) - Button("Cancel") { cancelTranscription() } - .buttonStyle(.bordered) + VStack(spacing: 0) { + Picker("", selection: $activeProvider) { + ForEach(TranscriptionProvider.allCases) { p in + Text(p.label).tag(p) } - .frame(maxWidth: .infinity, maxHeight: .infinity) - } else { - VStack(spacing: 12) { - if sonioxAPIKey.isEmpty { - Text("Add your Soniox API key in Settings to enable transcription") + } + .pickerStyle(.segmented) + .frame(width: 180) + .padding(.vertical, 8) + + Divider() + + Group { + if let transcript = activeTranscript, !transcript.segments.isEmpty { + transcriptView(transcript) + } else if case .error(let msg) = activeTranscriptionStatus { + VStack(spacing: 12) { + Image(systemName: "exclamationmark.triangle") + .font(.title) + .foregroundStyle(.secondary) + Text(msg) .font(.caption) .foregroundStyle(.secondary) .multilineTextAlignment(.center) - } else { - Button("Transcribe") { startTranscription() } - .buttonStyle(.borderedProminent) + Button("Retry") { startActiveTranscription() } + .buttonStyle(.bordered) + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else if activeTranscriptionStatus != .idle { + VStack(spacing: 12) { + ProgressView() + Text(transcriptionStatusText) + 
.font(.caption) + .foregroundStyle(.secondary) + Button("Cancel") { cancelTranscription() } + .buttonStyle(.bordered) } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + providerIdleView + } + } + } + } + + private var providerIdleView: some View { + VStack(spacing: 12) { + switch activeProvider { + case .local: + Button("Transcribe") { startLocalTranscription() } + .buttonStyle(.borderedProminent) + case .soniox: + if sonioxAPIKey.isEmpty { + Text("Add your Soniox API key in Settings to enable cloud transcription") + .font(.caption) + .foregroundStyle(.secondary) + .multilineTextAlignment(.center) + } else { + Button("Transcribe") { startSonioxTranscription() } + .buttonStyle(.borderedProminent) } - .frame(maxWidth: .infinity, maxHeight: .infinity) } } + .frame(maxWidth: .infinity, maxHeight: .infinity) } private var transcriptionStatusText: String { - switch transcriptionStatus { + switch activeTranscriptionStatus { + case .preparing: "Loading models..." case .uploading: "Uploading audio..." case .queued: "Queued for transcription..." case .transcribing: "Transcribing..." @@ -840,49 +905,93 @@ struct RecordingDetailView: View { // MARK: - Transcription Lifecycle private func loadTranscript() { - transcript = TranscriptDocument.load(for: recording.url) - transcriptionStatus = .idle + localTranscript = TranscriptDocument.load(for: recording.url, provider: .local) + sonioxTranscript = + TranscriptDocument.load(for: recording.url, provider: .soniox) + ?? 
TranscriptDocument.loadLegacy(in: recording.url) + localTranscriptionStatus = .idle + sonioxTranscriptionStatus = .idle + } + + private func startActiveTranscription() { + switch activeProvider { + case .local: startLocalTranscription() + case .soniox: startSonioxTranscription() + } + } + + private func startLocalTranscription() { + transcriptionTask = Task { + localTranscriptionStatus = .preparing + do { + let doc = try await LocalTranscriptionService.transcribe( + recordingDirectory: recording.url + ) { status in + Task { @MainActor in + localTranscriptionStatus = status + } + } + localTranscript = doc + localTranscriptionStatus = .completed + onTranscriptChanged() + do { + try doc.save(for: recording.url, provider: .local) + } catch { + Log.error( + Log.transcription, "local", + "failed to save transcript: \(error.localizedDescription)") + } + } catch is CancellationError { + localTranscriptionStatus = .idle + } catch { + if Task.isCancelled { + localTranscriptionStatus = .idle + } else { + localTranscriptionStatus = .error(error.localizedDescription) + Log.error( + Log.transcription, "local", "failed: \(error.localizedDescription)") + } + } + } } - private func startTranscription() { + private func startSonioxTranscription() { guard !sonioxAPIKey.isEmpty else { return } transcriptionTask = Task { let service = TranscriptionService(apiKey: sonioxAPIKey) - transcriptionStatus = .uploading + sonioxTranscriptionStatus = .uploading do { let doc = try await service.transcribe( fileURL: recording.audioURL, onStatus: { status in - transcriptionStatus = status + sonioxTranscriptionStatus = status } ) - // Show transcript immediately, then persist - transcript = doc - transcriptionStatus = .completed + sonioxTranscript = doc + sonioxTranscriptionStatus = .completed onTranscriptChanged() do { - try doc.save(for: recording.url) + try doc.save(for: recording.url, provider: .soniox) } catch { Log.error( Log.transcription, "transcription", "failed to save transcript: 
\(error.localizedDescription)") + } + } catch is CancellationError { - transcriptionStatus = .idle + sonioxTranscriptionStatus = .idle } catch let error as URLError where error.code == .cancelled { - // URLSession throws URLError.cancelled when Task is cancelled if Task.isCancelled { - transcriptionStatus = .idle + sonioxTranscriptionStatus = .idle } else { - transcriptionStatus = .error(error.localizedDescription) + sonioxTranscriptionStatus = .error(error.localizedDescription) } } catch { if Task.isCancelled { - transcriptionStatus = .idle + sonioxTranscriptionStatus = .idle } else { - transcriptionStatus = .error(error.localizedDescription) + sonioxTranscriptionStatus = .error(error.localizedDescription) Log.error( Log.transcription, "transcription", "failed: \(error.localizedDescription)") @@ -894,14 +1003,20 @@ struct RecordingDetailView: View { private func cancelTranscription() { transcriptionTask?.cancel() transcriptionTask = nil - if case .error = transcriptionStatus { return } - transcriptionStatus = .idle + switch activeProvider { + case .local: + if case .error = localTranscriptionStatus { return } + localTranscriptionStatus = .idle + case .soniox: + if case .error = sonioxTranscriptionStatus { return } + sonioxTranscriptionStatus = .idle + } } // MARK: - Transcript Export private func exportTranscript() { - guard let transcript, !transcript.segments.isEmpty else { return } + guard let transcript = activeTranscript, !transcript.segments.isEmpty else { return } let panel = NSSavePanel() panel.nameFieldStringValue = "\(recording.title).json" @@ -1051,7 +1166,7 @@ struct RecordingFile: Identifiable { let date: Date // from metadata.createdAt let size: Int let hasProcessed: Bool - let hasTranscript: Bool + let availableTranscripts: Set<TranscriptionProvider> var sizeFormatted: String { ByteCountFormatter.string(fromByteCount: Int64(size), countStyle: .file) diff --git a/Sources/SettingsView.swift b/Sources/SettingsView.swift index 10950e4..b51609c 100644 ---
a/Sources/SettingsView.swift +++ b/Sources/SettingsView.swift @@ -174,8 +174,48 @@ struct SettingsView: View { // MARK: - Transcription + @AppStorage(TranscriptionProvider.defaultsKey) private var defaultProvider = "local" + @State private var isDownloadingModels = false + @State private var localModelsReady = LocalTranscriptionService.modelsReady + private var transcriptionSection: some View { Section("Transcription") { + Picker("Default provider", selection: $defaultProvider) { + Text("Local (on-device)").tag("local") + Text("Soniox (cloud)").tag("soniox") + } + + if localModelsReady { + Label("Models downloaded", systemImage: "checkmark.circle.fill") + .foregroundStyle(.green) + .font(.caption) + } else { + HStack { + Button("Download Models") { + isDownloadingModels = true + Task { + do { + try await LocalTranscriptionService.prepareModels { _ in } + localModelsReady = true + } catch { + Log.error( + Log.transcription, "settings", + "model download failed: \(error.localizedDescription)") + } + isDownloadingModels = false + } + } + .disabled(isDownloadingModels) + if isDownloadingModels { + ProgressView() + .controlSize(.small) + } + } + Text("Required for local transcription. Downloads ~300 MB of speech models.") + .font(.caption) + .foregroundStyle(.secondary) + } + SecureField("Soniox API Key", text: $sonioxAPIKey) Text( "Get your API key at soniox.com. Audio is sent to Soniox servers for transcription." diff --git a/Sources/TranscriptionService.swift b/Sources/TranscriptionService.swift index f6d2e4e..b933022 100644 --- a/Sources/TranscriptionService.swift +++ b/Sources/TranscriptionService.swift @@ -1,6 +1,31 @@ import AVFoundation import Foundation +// MARK: - Transcription Provider + +nonisolated enum TranscriptionProvider: String, CaseIterable, Identifiable, Sendable { + case local, soniox + + /// UserDefaults key for the user's default transcription provider. 
+ static let defaultsKey = "defaultTranscriptionProvider" + + var id: String { rawValue } + + var label: String { + switch self { + case .local: "Local" + case .soniox: "Soniox" + } + } + + var filename: String { + switch self { + case .local: "transcript-local.json" + case .soniox: "transcript-soniox.json" + } + } +} + // MARK: - Data Models nonisolated struct RecordingMetadata: Codable, Sendable { @@ -42,27 +67,52 @@ nonisolated struct TranscriptDocument: Codable, Sendable { var segments: [TranscriptSegment] var language: String? var createdAt: Date + var speakers: [String: String]? - nonisolated static func sidecarURL(for recordingURL: URL) -> URL { - recordingURL.appendingPathComponent("transcript.json") + nonisolated static func sidecarURL( + for recordingURL: URL, provider: TranscriptionProvider + ) -> URL { + recordingURL.appendingPathComponent(provider.filename) } - nonisolated static func load(for recordingURL: URL) -> TranscriptDocument? { - let url = sidecarURL(for: recordingURL) + nonisolated static func load( + for recordingURL: URL, provider: TranscriptionProvider + ) -> TranscriptDocument? { + let url = sidecarURL(for: recordingURL, provider: provider) guard let data = try? Data(contentsOf: url) else { return nil } let decoder = JSONDecoder() decoder.dateDecodingStrategy = .iso8601 return try? decoder.decode(TranscriptDocument.self, from: data) } - func save(for recordingURL: URL) throws { - let url = Self.sidecarURL(for: recordingURL) + func save(for recordingURL: URL, provider: TranscriptionProvider) throws { + let url = Self.sidecarURL(for: recordingURL, provider: provider) let encoder = JSONEncoder() encoder.dateEncodingStrategy = .iso8601 encoder.outputFormatting = [.prettyPrinted, .sortedKeys] let data = try encoder.encode(self) try data.write(to: url, options: .atomic) } + + /// Migrates legacy `transcript.json` to `transcript-soniox.json`. 
+ nonisolated static func migrateLegacyTranscript(in directory: URL) { + let legacy = directory.appendingPathComponent("transcript.json") + let target = directory.appendingPathComponent(TranscriptionProvider.soniox.filename) + if FileManager.default.fileExists(atPath: legacy.path), + !FileManager.default.fileExists(atPath: target.path) + { + try? FileManager.default.moveItem(at: legacy, to: target) + } + } + + /// Fallback: load pre-migration `transcript.json` if provider-specific file is missing. + nonisolated static func loadLegacy(in directory: URL) -> TranscriptDocument? { + let url = directory.appendingPathComponent("transcript.json") + guard let data = try? Data(contentsOf: url) else { return nil } + let decoder = JSONDecoder() + decoder.dateDecodingStrategy = .iso8601 + return try? decoder.decode(TranscriptDocument.self, from: data) + } } nonisolated struct TranscriptSegment: Codable, Identifiable, Sendable { @@ -89,6 +139,7 @@ nonisolated struct TranscriptSegment: Codable, Identifiable, Sendable { nonisolated enum TranscriptionStatus: Equatable, Sendable { case idle + case preparing case uploading case queued case transcribing @@ -448,7 +499,7 @@ final class TranscriptionService { })?["language"] as? String return TranscriptDocument( - segments: segments, language: language, createdAt: Date()) + segments: segments, language: language, createdAt: Date(), speakers: nil) } // MARK: - Cleanup