@@ -5,19 +5,20 @@ import CoreML
 import Foundation
 
 /// Responsible for transcribing an audio chunk to text using the provided models and configurations.
-final class TranscribeTask {
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
+open class TranscribeTask {
     private var timings: TranscriptionTimings
     private let progress: Progress
     private let audioEncoder: any AudioEncoding
     private let featureExtractor: any FeatureExtracting
     private let segmentSeeker: any SegmentSeeking
     private let textDecoder: any TextDecoding
-    private let tokenizer: any WhisperTokenizer
     private let audioProcessor: any AudioProcessing
 
+    public private(set) var tokenizer: any WhisperTokenizer
     public var segmentDiscoveryCallback: SegmentDiscoveryCallback?
 
-    init(
+    public init(
         currentTimings: TranscriptionTimings,
         progress: Progress?,
         audioProcessor: (any AudioProcessing)? = nil,
@@ -37,7 +38,23 @@ final class TranscribeTask {
         self.tokenizer = tokenizer
     }
 
-    func run(
+    /// Hook for subclasses to launch work that can run alongside the main decoder pipeline.
+    open func windowPreprocess(
+        for paddedAudio: any AudioProcessorOutputType,
+        seek: Int,
+        segmentSize: Int
+    ) async {}
+
+    /// Hook for subclasses to finalize side work and optionally replace the segments for the current window.
+    open func windowPostProcess(
+        seek: Int,
+        segmentSize: Int,
+        originalSegments: [TranscriptionSegment]
+    ) async -> [TranscriptionSegment] {
+        originalSegments
+    }
+
+    public func run(
         audioArray: [Float],
         decodeOptions: DecodingOptions? = nil,
         callback: TranscriptionCallback = nil
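These two hooks are designed as a pair: windowPreprocess fires once per audio window, right after padding, so a subclass can launch side work that overlaps the encoder and decoder passes, while windowPostProcess joins that work and may swap in different segments before they are committed. A minimal sketch of such a subclass, using only the types visible in this diff; SpeechActivityScorer and the 0.5 threshold are illustrative assumptions, not part of the change:

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
final class GatedTranscribeTask: TranscribeTask {
    // Side work per window, keyed by the window's seek offset.
    private var pendingScores: [Int: Task<Float, Never>] = [:]

    override func windowPreprocess(
        for paddedAudio: any AudioProcessorOutputType,
        seek: Int,
        segmentSize: Int
    ) async {
        // Launch scoring for this window; the decode loop continues meanwhile.
        pendingScores[seek] = Task {
            SpeechActivityScorer.score(paddedAudio, frames: segmentSize) // hypothetical helper
        }
    }

    override func windowPostProcess(
        seek: Int,
        segmentSize: Int,
        originalSegments: [TranscriptionSegment]
    ) async -> [TranscriptionSegment] {
        // Join the score started in windowPreprocess for the same window.
        guard let score = await pendingScores.removeValue(forKey: seek)?.value else {
            return originalSegments
        }
        // Returning a different array here replaces the window's segments.
        return score < 0.5 ? [] : originalSegments
    }
}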
@@ -61,7 +78,6 @@ final class TranscribeTask {
         // These accumulate across windows
         var allSegments: [TranscriptionSegment] = []
         var allTokens: [Int] = []
-        var transcription = ""
 
         let startDecoderInit = CFAbsoluteTimeGetCurrent()
         var decoderInputs = try textDecoder.prepareDecoderInputs(withPrompt: [tokenizer.specialTokens.startOfTranscriptToken])
@@ -107,6 +123,7 @@ final class TranscribeTask {
 
         let windowSamples = featureExtractor.windowSamples ?? Constants.defaultWindowSamples
         while seek < seekClipEnd - windowPadding {
+            let windowSeek = seek
             // calculate new encoder segment features
             let timeOffset = Float(seek) / Float(WhisperKit.sampleRate)
             let segmentSize = min(windowSamples, contentFrames - seek, seekClipEnd - seek)
@@ -119,6 +136,7 @@ final class TranscribeTask {
             guard let audioSamples = audioProcessor.padOrTrim(fromArray: clipAudioSamples, startAt: 0, toLength: windowSamples) else {
                 throw WhisperError.transcriptionFailed("Audio samples are nil")
             }
+            await windowPreprocess(for: audioSamples, seek: windowSeek, segmentSize: segmentSize)
             let processTime = Date().timeIntervalSince(audioProcessingStart)
             timings.audioProcessing += processTime
             timings.totalAudioProcessingRuns += 1
@@ -222,24 +240,30 @@ final class TranscribeTask {
                 seek = min(seek, maxSeekOffset)
             }
 
-            guard let currentSegments = currentSegments else {
+            guard let currentSegments else {
                 // No current segment found, skip to next window
                 continue
             }
 
+            let processedSegments = await windowPostProcess(
+                seek: windowSeek,
+                segmentSize: segmentSize,
+                originalSegments: currentSegments
+            )
+
             if options.verbose {
-                let lines = TranscriptionUtilities.formatSegments(currentSegments)
+                let lines = TranscriptionUtilities.formatSegments(processedSegments)
                 Logging.debug("Segments for window:")
                 for line in lines {
                     Logging.debug(line)
                 }
             }
 
-            segmentDiscoveryCallback?(currentSegments)
+            segmentDiscoveryCallback?(processedSegments)
 
             // add them to the `allSegments` list
-            allSegments.append(contentsOf: currentSegments)
-            let allCurrentTokens = currentSegments.flatMap { $0.tokens }
+            allSegments.append(contentsOf: processedSegments)
+            let allCurrentTokens = processedSegments.flatMap { $0.tokens }
            allTokens.append(contentsOf: allCurrentTokens)
 
             timings.decodingWindowing += Date().timeIntervalSince(windowingStart)
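Note that everything downstream of the guard now consumes processedSegments: verbose logging, the discovery callback, and the allSegments/allTokens accumulators. Whatever windowPostProcess returns therefore fully replaces the decoder's output for that window. Even without preprocess side work, a standalone override can prune segments; a sketch relying only on API shown in this diff:

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
final class PrunedTranscribeTask: TranscribeTask {
    override func windowPostProcess(
        seek: Int,
        segmentSize: Int,
        originalSegments: [TranscriptionSegment]
    ) async -> [TranscriptionSegment] {
        // Drop segments whose tokens are all special tokens (no actual words).
        originalSegments.filter { segment in
            segment.tokens.contains { $0 < tokenizer.specialTokens.specialTokenBegin }
        }
    }
}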
@@ -364,8 +388,23 @@ final class TranscribeTask {
         timings.decodingLoop = CFAbsoluteTimeGetCurrent() - startDecodeLoopTime
         timings.fullPipeline = CFAbsoluteTimeGetCurrent() - timings.pipelineStart
 
-        let wordTokens = allTokens.filter { $0 < tokenizer.specialTokens.specialTokenBegin }
-        transcription = tokenizer.decode(tokens: wordTokens).trimmingCharacters(in: .whitespaces)
+        let transcriptionResult = finalizeTranscriptionResult(
+            tokens: allTokens,
+            segments: allSegments,
+            language: detectedLanguage,
+            timings: timings
+        )
+        return transcriptionResult
+    }
+
+    open func finalizeTranscriptionResult(
+        tokens: [Int],
+        segments allSegments: [TranscriptionSegment],
+        language detectedLanguage: String?,
+        timings: TranscriptionTimings
+    ) -> TranscriptionResult {
+        let wordTokens = tokens.filter { $0 < tokenizer.specialTokens.specialTokenBegin }
+        let transcription = tokenizer.decode(tokens: wordTokens).trimmingCharacters(in: .whitespaces)
         return TranscriptionResult(
             text: transcription,
             segments: allSegments,
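Since the final assembly now lives in an open method rather than inline in run, a subclass can adjust the result while reusing the base token filtering and decoding via super. A sketch under the same assumptions; the blocklist is a hypothetical placeholder:

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
final class FilteringTranscribeTask: TranscribeTask {
    // Hypothetical token ids to strip from the final text; fill in per model.
    static let blockedTokenIDs: Set<Int> = []

    override func finalizeTranscriptionResult(
        tokens: [Int],
        segments allSegments: [TranscriptionSegment],
        language detectedLanguage: String?,
        timings: TranscriptionTimings
    ) -> TranscriptionResult {
        // Pre-filter tokens, then let the base implementation decode and
        // assemble the TranscriptionResult as usual.
        let kept = tokens.filter { !Self.blockedTokenIDs.contains($0) }
        return super.finalizeTranscriptionResult(
            tokens: kept,
            segments: allSegments,
            language: detectedLanguage,
            timings: timings
        )
    }
}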