From 85e847ae7c9282682ffac18eb871bea22628b78a Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Sat, 1 Feb 2025 09:28:10 -0800
Subject: [PATCH 01/26] Reorder text normalization to remove whitespace at final step

Co-authored-by: Andrew Wooster
---
 Sources/WhisperKit/Core/Utils/Utils.swift | 14 +++----
 Tests/WhisperKitTests/UnitTests.swift     | 50 +++++++++++++++++++++++
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/Sources/WhisperKit/Core/Utils/Utils.swift b/Sources/WhisperKit/Core/Utils/Utils.swift
index 241997d..2dafe11 100644
--- a/Sources/WhisperKit/Core/Utils/Utils.swift
+++ b/Sources/WhisperKit/Core/Utils/Utils.swift
@@ -56,7 +56,7 @@ extension Array where Element: Hashable {
 }
 
 extension MLMultiArray {
-    /// Calculate the linear offset by summing the products of each dimension’s index with the dimension’s stride.
+    /// Calculate the linear offset by summing the products of each dimension's index with the dimension's stride.
     /// More info [here](https://developer.apple.com/documentation/coreml/mlmultiarray/2879231-subscript)
     /// - Parameters:
     ///   - index: The index of the element
@@ -251,11 +251,8 @@ public extension Float {
 
 public extension String {
     var normalized: String {
-        // Trim whitespace and newlines
-        let trimmedString = self.trimmingCharacters(in: .whitespacesAndNewlines)
-
         // Convert to lowercase
-        let lowercaseString = trimmedString.lowercased()
+        let lowercaseString = self.lowercased()
 
         // Replace dashes with spaces
         let noDashesString = lowercaseString.replacingOccurrences(of: "-", with: " ")
@@ -265,8 +262,11 @@ public extension String {
 
         // Replace multiple spaces with a single space
         let singleSpacedString = noPunctuationString.replacingOccurrences(of: " +", with: " ", options: .regularExpression)
-
-        return singleSpacedString
+
+        // Trim whitespace and newlines
+        let trimmedString = singleSpacedString.trimmingCharacters(in: .whitespacesAndNewlines)
+
+        return trimmedString
     }
 
     func trimmingSpecialTokenCharacters() -> String {
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index 0fd780b..0fc4ba1 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -1353,6 +1353,32 @@ final class UnitTests: XCTestCase {
         XCTAssertEqual([Int]().batched(into: 3), [])
         XCTAssertEqual([1, 2, 3, 4].batched(into: 3), [[1, 2, 3], [4]])
     }
+
+    func testStringNormalization() throws {
+        // Basic cases
+        XCTAssertEqual("Hello World!".normalized, "hello world")
+        XCTAssertEqual("hello world".normalized, "hello world")
+        XCTAssertEqual("HELLO WORLD".normalized, "hello world")
+
+        // Punctuation
+        XCTAssertEqual("Hello, World!".normalized, "hello world")
+        XCTAssertEqual("Hello... World???".normalized, "hello world")
+        XCTAssertEqual("'Hello' \"World\"".normalized, "hello world")
+
+        // Dashes and hyphens
+        XCTAssertEqual("hello-world".normalized, "hello world")
+        XCTAssertEqual("hello -- world".normalized, "hello world")
+
+        // Whitespace handling
+        XCTAssertEqual(" hello world ".normalized, "hello world")
+
+        // Mixed cases
+        XCTAssertEqual("Hello!!!---World???".normalized, "hello world")
+
+        // Numbers and special characters
+        XCTAssertEqual("Hello: World".normalized, "hello world")
+        XCTAssertEqual("Hello% world 100".normalized, "hello world 100")
+    }
 
     func testTrimmingSpecialTokenCharacters() {
         XCTAssertEqual("<|en|>".trimmingSpecialTokenCharacters(), "en")
@@ -1969,6 +1995,30 @@ final class UnitTests: XCTestCase {
         }
     }
 
+    func testWordTimingComparison() throws {
+        let word1 = WordTiming(word: "Hello!", tokens: [1], start: 0, end: 1, probability: 1.0)
+        let word2 = WordTiming(word: "hello", tokens: [2], start: 0, end: 1, probability: 1.0)
+        let word3 = WordTiming(word: "World!", tokens: [3], start: 1, end: 2, probability: 1.0)
+        let word4 = WordTiming(word: "WORLD", tokens: [4], start: 1, end: 2, probability: 1.0)
+
+        // Test common prefix finding
+        let sequence1 = [word1, word3]
+        let sequence2 = [word2, word4]
+
+        let commonPrefix = findLongestCommonPrefix(sequence1, sequence2)
+        XCTAssertEqual(commonPrefix.count, 2)
+        XCTAssertEqual(commonPrefix[0].word, "hello")
+        XCTAssertEqual(commonPrefix[1].word, "WORLD")
+
+        // Test different suffix finding
+        let sequence3 = [word1, word3]
+        let sequence4 = [word2, word4, WordTiming(word: "suffix", tokens: [5], start: 2, end: 3, probability: 1.0)]
+
+        let differentSuffix = findLongestDifferentSuffix(sequence3, sequence4)
+        XCTAssertEqual(differentSuffix.count, 1)
+        XCTAssertEqual(differentSuffix[0].word, "suffix")
+    }
+
     func testWordTimestampCorrectness() async throws {
         let options = DecodingOptions(wordTimestamps: true)
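[Editor's note: a quick illustration of why the reorder in PATCH 01 matters. Dash and punctuation replacement can themselves leave new leading or trailing spaces behind, so trimming must happen last. The sketch below re-implements both pipelines as free functions for comparison; `normalizedOldOrder`/`normalizedNewOrder` are hypothetical names, and the punctuation-stripping step is an assumption modeled on the surrounding code, not copied from WhisperKit.]

```swift
import Foundation

// Hypothetical stand-in for the pre-patch pipeline: trim FIRST, then rewrite.
func normalizedOldOrder(_ s: String) -> String {
    let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
    let lowered = trimmed.lowercased()
    let noDashes = lowered.replacingOccurrences(of: "-", with: " ")
    let noPunctuation = noDashes.components(separatedBy: .punctuationCharacters).joined()
    return noPunctuation.replacingOccurrences(of: " +", with: " ", options: .regularExpression)
}

// Hypothetical stand-in for the post-patch pipeline: same steps, trim LAST.
func normalizedNewOrder(_ s: String) -> String {
    let lowered = s.lowercased()
    let noDashes = lowered.replacingOccurrences(of: "-", with: " ")
    let noPunctuation = noDashes.components(separatedBy: .punctuationCharacters).joined()
    let singleSpaced = noPunctuation.replacingOccurrences(of: " +", with: " ", options: .regularExpression)
    return singleSpaced.trimmingCharacters(in: .whitespacesAndNewlines)
}

// "-hello-" has no edge whitespace to trim, but the dash replacement creates some,
// so the old order leaves stray edge spaces while the new order removes them.
print(normalizedOldOrder("-hello-")) // " hello "
print(normalizedNewOrder("-hello-")) // "hello"
```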
World???".normalized, "hello world") + XCTAssertEqual("'Hello' \"World\"".normalized, "hello world") + + // Dashes and hyphens + XCTAssertEqual("hello-world".normalized, "hello world") + XCTAssertEqual("hello -- world".normalized, "hello world") + + // Whitespace handling + XCTAssertEqual(" hello world ".normalized, "hello world") + + // Mixed cases + XCTAssertEqual("Hello!!!---World???".normalized, "hello world") + + // Numbers and special characters + XCTAssertEqual("Hello: World".normalized, "hello world") + XCTAssertEqual("Hello% world 100".normalized, "hello world 100") + } func testTrimmingSpecialTokenCharacters() { XCTAssertEqual("<|en|>".trimmingSpecialTokenCharacters(), "en") @@ -1969,6 +1995,30 @@ final class UnitTests: XCTestCase { } } + func testWordTimingComparison() throws { + let word1 = WordTiming(word: "Hello!", tokens: [1], start: 0, end: 1, probability: 1.0) + let word2 = WordTiming(word: "hello", tokens: [2], start: 0, end: 1, probability: 1.0) + let word3 = WordTiming(word: "World!", tokens: [3], start: 1, end: 2, probability: 1.0) + let word4 = WordTiming(word: "WORLD", tokens: [4], start: 1, end: 2, probability: 1.0) + + // Test common prefix finding + let sequence1 = [word1, word3] + let sequence2 = [word2, word4] + + let commonPrefix = findLongestCommonPrefix(sequence1, sequence2) + XCTAssertEqual(commonPrefix.count, 2) + XCTAssertEqual(commonPrefix[0].word, "hello") + XCTAssertEqual(commonPrefix[1].word, "WORLD") + + // Test different suffix finding + let sequence3 = [word1, word3] + let sequence4 = [word2, word4, WordTiming(word: "suffix", tokens: [5], start: 2, end: 3, probability: 1.0)] + + let differentSuffix = findLongestDifferentSuffix(sequence3, sequence4) + XCTAssertEqual(differentSuffix.count, 1) + XCTAssertEqual(differentSuffix[0].word, "suffix") + } + func testWordTimestampCorrectness() async throws { let options = DecodingOptions(wordTimestamps: true) From 0cd78a223398c9d3ba867152c719de6ec8f4dc95 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Wed, 19 Feb 2025 22:10:39 -0800 Subject: [PATCH 02/26] Fix word alignment for long words or short token length segments --- Package.swift | 4 +- Sources/WhisperKit/Core/Models.swift | 3 + .../WhisperKit/Core/Text/SegmentSeeker.swift | 112 ++++++++--- Sources/WhisperKit/Core/TranscribeTask.swift | 4 +- Tests/WhisperKitTests/UnitTests.swift | 190 +++++++++++++++++- 5 files changed, 275 insertions(+), 38 deletions(-) diff --git a/Package.swift b/Package.swift index 8bbea16..b5c04b7 100644 --- a/Package.swift +++ b/Package.swift @@ -20,8 +20,8 @@ let package = Package( ), ], dependencies: [ - .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.8"), - .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"), + .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMajor(from: "0.1.8")), + .package(url: "https://github.com/apple/swift-argument-parser.git", .upToNextMajor(from: "1.3.0")), ], targets: [ .target( diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift index bf7d7eb..8ef5ba4 100644 --- a/Sources/WhisperKit/Core/Models.swift +++ b/Sources/WhisperKit/Core/Models.swift @@ -1512,6 +1512,9 @@ public enum Constants { public static let defaultWindowSamples: Int = 480_000 // 30s of audio at 16khz sample rate default for Whisper models + public static let defaultPrependPunctuations: String = "\"'“¡¿([{-" + public static let defaultAppendPunctuations: String = "\"'.。,,!!??::”)]}、" + public static 
     public static let fallbackModelSupportConfig: ModelSupportConfig = {
         var config = ModelSupportConfig(
             repoName: "whisperkit-coreml-fallback",
diff --git a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
index 33a45dc..ed696e6 100644
--- a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
+++ b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
@@ -279,8 +279,12 @@ open class SegmentSeeker: SegmentSeeking {
 
         return (textIndices.reversed(), timeIndices.reversed())
     }
-
-    func mergePunctuations(alignment: [WordTiming], prepended: String, appended: String) -> [WordTiming] {
+
+    func mergePunctuations(
+        alignment: [WordTiming],
+        prepended: String = Constants.defaultPrependPunctuations,
+        appended: String = Constants.defaultAppendPunctuations
+    ) -> [WordTiming] {
         var prependedAlignment = [WordTiming]()
         var appendedAlignment = [WordTiming]()
 
@@ -405,8 +409,8 @@ open class SegmentSeeker: SegmentSeeking {
         tokenizer: WhisperTokenizer,
         seek: Int,
         segmentSize: Int,
-        prependPunctuations: String,
-        appendPunctuations: String,
+        prependPunctuations: String = Constants.defaultPrependPunctuations,
+        appendPunctuations: String = Constants.defaultAppendPunctuations,
         lastSpeechTimestamp: Float,
         options: DecodingOptions,
         timings: TranscriptionTimings
@@ -415,7 +419,6 @@ open class SegmentSeeker: SegmentSeeking {
         var wordTokenIds = [Int]()
         var filteredLogProbs = [Float]()
         var filteredIndices = [Int]()
-        var lastSpeechTimestamp = lastSpeechTimestamp
 
         // Iterate through each segment
         var indexOffset = 0
@@ -455,6 +458,7 @@ open class SegmentSeeker: SegmentSeeking {
 
         Logging.debug("Alignment weights shape: \(filteredAlignmentWeights.shape)")
 
+        // Find alignment between text tokens and time indices
         var alignment = try findAlignment(
             wordTokenIds: wordTokenIds,
             alignmentWeights: filteredAlignmentWeights,
@@ -465,32 +469,69 @@ open class SegmentSeeker: SegmentSeeking {
 
         // TODO: This section is considered a "hack" in the source repo
         // Reference: https://github.com/openai/whisper/blob/ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab/whisper/timing.py#L305
+        let (constrainedMedianDuration, maxDuration) = calculateWordDurationConstraints(alignment: alignment)
+        alignment = truncateLongWordsAtSentenceBoundaries(alignment, maxDuration: maxDuration)
+
+        // Process alignment for punctuations
+        let mergedAlignment = mergePunctuations(alignment: alignment, prepended: prependPunctuations, appended: appendPunctuations)
+
+        // Update segments based on more accurate word timings
+        let updatedSegments = updateSegmentsWithWordTimings(
+            segments: segments,
+            mergedAlignment: mergedAlignment,
+            seek: seek,
+            lastSpeechTimestamp: lastSpeechTimestamp,
+            constrainedMedianDuration: constrainedMedianDuration,
+            maxDuration: maxDuration,
+            tokenizer: tokenizer
+        )
+
+        return updatedSegments
+    }
+
+    public func calculateWordDurationConstraints(alignment: [WordTiming]) -> (Float, Float) {
         var wordDurations = alignment.map { $0.end - $0.start }
         wordDurations = wordDurations.filter { $0 > 0 }
-
+
         let medianDuration: Float = wordDurations.isEmpty ? 0.0 : wordDurations.sorted(by: <)[wordDurations.count / 2]
         let constrainedMedianDuration = min(0.7, medianDuration)
         let maxDuration = constrainedMedianDuration * 2
-
-        // Truncate long words at sentence boundaries
+
+        return (constrainedMedianDuration, maxDuration)
+    }
+
+    public func truncateLongWordsAtSentenceBoundaries(_ alignment: [WordTiming], maxDuration: Float) -> [WordTiming] {
         let sentenceEndMarks = [".", "。", "!", "!", "?", "?"]
-        if !wordDurations.isEmpty {
-            for i in 1..<alignment.count {
-                if alignment[i].end - alignment[i].start > maxDuration {
-                    if sentenceEndMarks.contains(alignment[i].word) {
-                        alignment[i].end = alignment[i].start + maxDuration
-                    } else if i > 0, sentenceEndMarks.contains(alignment[i - 1].word) {
-                        alignment[i].start = alignment[i].end - maxDuration
+        var truncatedAlignment = alignment
+
+        if !truncatedAlignment.isEmpty {
+            for i in 1..<truncatedAlignment.count {
+                if truncatedAlignment[i].end - truncatedAlignment[i].start > maxDuration {
+                    if sentenceEndMarks.contains(truncatedAlignment[i].word) {
+                        truncatedAlignment[i].end = truncatedAlignment[i].start + maxDuration
+                    } else if i > 0, sentenceEndMarks.contains(truncatedAlignment[i - 1].word) {
+                        truncatedAlignment[i].start = truncatedAlignment[i].end - maxDuration
                     }
                 }
             }
         }
+
+        return truncatedAlignment
+    }
 
-        // Process alignment for punctuations
-        let mergedAlignment = mergePunctuations(alignment: alignment, prepended: prependPunctuations, appended: appendPunctuations)
-
-        var wordIndex = 0
+    public func updateSegmentsWithWordTimings(
+        segments: [TranscriptionSegment],
+        mergedAlignment: [WordTiming],
+        seek: Int,
+        lastSpeechTimestamp: Float,
+        constrainedMedianDuration: Float,
+        maxDuration: Float,
+        tokenizer: WhisperTokenizer
+    ) -> [TranscriptionSegment] {
         let timeOffset = Float(seek) / Float(WhisperKit.sampleRate)
+        var wordIndex = 0
+        var lastSpeechTimestamp = lastSpeechTimestamp // Create a mutable copy, it will be updated below
 
         var updatedSegments = [TranscriptionSegment]()
         for segment in segments {
@@ -526,32 +567,39 @@ open class SegmentSeeker: SegmentSeeking {
             // TODO: This section is considered a "hack" in the source repo
             // Reference: https://github.com/openai/whisper/blob/ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab/whisper/timing.py#L342
             // Truncate long words at segment boundaries
-            if let firstWord = wordsInSegment.first, let lastWord = wordsInSegment.last {
-                // Logic for the first word
+            if let firstWord = wordsInSegment.first {
+                // Ensure the first and second word after a pause is not longer than
+                // twice the median word duration.
                 if firstWord.end - lastSpeechTimestamp > constrainedMedianDuration * 4 &&
                     (firstWord.end - firstWord.start > maxDuration ||
-                        (wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2))
+                    (wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2))
                 {
+                    // First word or both words are too long
                     if wordsInSegment.count > 1 && wordsInSegment[1].end - wordsInSegment[1].start > maxDuration {
-                        let boundary = max(wordsInSegment[1].end / 2, wordsInSegment[1].end - maxDuration)
+                        // Second word is too long, set it to max duration and shorten first word to fit
+                        let boundary = min(wordsInSegment[1].start + maxDuration, wordsInSegment[1].end / 2)
                         wordsInSegment[0].end = boundary
                         wordsInSegment[1].start = boundary
                     }
-                    wordsInSegment[0].start = max(lastSpeechTimestamp, firstWord.end - maxDuration)
+
+                    // First word is too long, keep its start time and adjust its end time
+                    wordsInSegment[0].end = min(wordsInSegment[0].start + maxDuration, wordsInSegment[0].end)
                 }
 
                 // Prefer segment-level start timestamp if the first word is too long.
-                if segment.start < firstWord.end && segment.start - 0.5 > firstWord.start {
-                    wordsInSegment[0].start = max(0, min(firstWord.end - constrainedMedianDuration, segment.start))
+                if segment.start < wordsInSegment[0].end && segment.start - 0.5 > wordsInSegment[0].start {
+                    wordsInSegment[0].start = max(0, min(wordsInSegment[0].end - constrainedMedianDuration, segment.start))
                 } else {
-                    updatedSegment.start = firstWord.start
+                    updatedSegment.start = wordsInSegment[0].start
                 }
 
-                // Prefer segment-level end timestamp if the last word is too long.
-                if updatedSegment.end > lastWord.start && segment.end + 0.5 < lastWord.end {
-                    wordsInSegment[wordsInSegment.count - 1].end = max(lastWord.start + constrainedMedianDuration, segment.end)
-                } else {
-                    updatedSegment.end = lastWord.end
+                if let lastWord = wordsInSegment.last {
+                    // Prefer segment-level end timestamp if the last word is too long.
+                    if updatedSegment.end > lastWord.start && segment.end + 0.5 < lastWord.end {
+                        wordsInSegment[wordsInSegment.count - 1].end = max(lastWord.start + constrainedMedianDuration, segment.end)
+                    } else {
+                        updatedSegment.end = lastWord.end
+                    }
                 }
 
                 lastSpeechTimestamp = updatedSegment.end
diff --git a/Sources/WhisperKit/Core/TranscribeTask.swift b/Sources/WhisperKit/Core/TranscribeTask.swift
index cc3565f..e149104 100644
--- a/Sources/WhisperKit/Core/TranscribeTask.swift
+++ b/Sources/WhisperKit/Core/TranscribeTask.swift
@@ -192,8 +192,8 @@ final class TranscribeTask {
                     tokenizer: tokenizer,
                     seek: previousSeek,
                     segmentSize: segmentSize,
-                    prependPunctuations: "\"'“¿([{-",
-                    appendPunctuations: "\"'.。,,!!??::”)]}、",
+                    prependPunctuations: Constants.defaultPrependPunctuations,
+                    appendPunctuations: Constants.defaultAppendPunctuations,
                     lastSpeechTimestamp: Float(Double(previousSeek) / Double(WhisperKit.sampleRate)),
                     options: options,
                     timings: timings
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index 0fc4ba1..f23bc41 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -1924,7 +1924,7 @@ final class UnitTests: XCTestCase {
             WordTiming(word: "<|endoftext|>", tokens: [50257], start: 12, end: 13, probability: 1),
         ]
 
-        let mergedAlignmentTiming = SegmentSeeker().mergePunctuations(alignment: wordTimings, prepended: "\"'“¡¿([{-", appended: "\"'.。,,!!??::”)]}、")
+        let mergedAlignmentTiming = SegmentSeeker().mergePunctuations(alignment: wordTimings)
 
         let expectedWordTimings = [
             WordTiming(word: "<|notimestamps|>", tokens: [50363], start: 0, end: 1, probability: 1),
@@ -1968,7 +1968,7 @@ final class UnitTests: XCTestCase {
             WordTiming(word: "<|endoftext|>", tokens: [50257], start: 11, end: 12, probability: 1),
         ]
 
-        let mergedAlignmentTiming = SegmentSeeker().mergePunctuations(alignment: wordTimings, prepended: "\"'“¿([{-", appended: "\"'.。,,!!??::”)]}、")
+        let mergedAlignmentTiming = SegmentSeeker().mergePunctuations(alignment: wordTimings)
 
         let expectedWordTimings = [
             WordTiming(word: "<|0.00|>", tokens: [50364], start: 0, end: 1, probability: 1),
@@ -2080,6 +2080,192 @@ final class UnitTests: XCTestCase {
         }
     }
 
+    func testLongWordDurations() async {
+        // Test case with words spanning very long durations
+        let wordTimings = [
+            // Normal duration words
+            WordTiming(word: " The", tokens: [264], start: 0.5, end: 1.0, probability: 1),
+            WordTiming(word: " first", tokens: [4589], start: 1.0, end: 2.0, probability: 1),
+            WordTiming(word: " segment", tokens: [234], start: 2.0, end: 3.0, probability: 1),
+            WordTiming(word: " with", tokens: [567], start: 3.0, end: 4.0, probability: 1),
+            WordTiming(word: " a", tokens: [257], start: 4.0, end: 5.0, probability: 1),
+            WordTiming(word: " long", tokens: [890], start: 5.0, end: 6.0, probability: 1),
+            // Very long duration word (29 seconds) ending the sentence
+            WordTiming(word: " ending", tokens: [123], start: 6.0, end: 35.0, probability: 1),
+            WordTiming(word: ".", tokens: [13], start: 35.0, end: 35.0, probability: 1),
+        ]
+
+        let segmentSeeker = SegmentSeeker()
+
+        // Create test segments
+        let segments = [
+            TranscriptionSegment(
+                id: 0,
+                seek: 0,
+                start: 0.0,
+                end: 6.0,
+                text: "The first segment with a long",
+                tokens: [264, 4589, 234, 567, 257, 890], // Using tokens from wordTimings
+                tokenLogProbs: Array(repeating: [0: 0.0], count: 6)
+            ),
+            TranscriptionSegment(
+                id: 1,
+                seek: 0,
+                start: 6.5, // slight difference to test truncation
+                end: 30.0, // slight difference to test truncation
+                text: "ending.",
+                tokens: [123, 13],
+                tokenLogProbs: Array(repeating: [0: 0.0], count: 2)
+            )
+        ]
+
+        // Test duration constraints calculation
+        let (constrainedMedianDuration, maxDuration) = segmentSeeker.calculateWordDurationConstraints(alignment: wordTimings)
+        XCTAssertEqual(constrainedMedianDuration, 0.7, "Constrained median duration should be capped at 0.7")
+        XCTAssertEqual(maxDuration, 1.4, "Max duration should be double the constrained median duration")
+
+        // Test truncation of long words
+        let truncatedAlignment = segmentSeeker.truncateLongWordsAtSentenceBoundaries(wordTimings, maxDuration: maxDuration)
+        let mergedAlignment = segmentSeeker.mergePunctuations(alignment: truncatedAlignment)
+        let updatedSegments = segmentSeeker.updateSegmentsWithWordTimings(
+            segments: segments,
+            mergedAlignment: mergedAlignment,
+            seek: 0,
+            lastSpeechTimestamp: 0,
+            constrainedMedianDuration: constrainedMedianDuration,
+            maxDuration: maxDuration,
+            tokenizer: try! await loadTokenizer(for: .tiny)
+        )
+
+        let updatedWords = updatedSegments.compactMap { $0.words }.flatMap { $0 }
+
+        // Test that long segments are properly truncated
+        let longSegmentDuration = updatedWords.last!.end - updatedWords.last!.start
+        XCTAssertEqual(
+            longSegmentDuration,
+            maxDuration,
+            accuracy: 0.0001,
+            "Long segment duration (\(longSegmentDuration)s) should be truncated to maximum allowed duration (\(maxDuration)s)"
+        )
+
+        // Test that segments were properly updated
+        XCTAssertEqual(updatedSegments.count, segments.count, "Number of segments should remain the same")
+        XCTAssertLessThanOrEqual(updatedSegments.last!.end - updatedSegments.last!.start, 19.5,
+                                 "Segment duration should not exceed max duration")
+
+        // Test the long word
+        let longWordIndex = updatedWords.firstIndex { $0.word == " ending." }!
+        let longWordDuration = updatedWords[longWordIndex].end - updatedWords[longWordIndex].start
+        XCTAssertEqual(
+            longWordDuration,
+            maxDuration,
+            accuracy: 0.0001,
+            "Very long pause duration (\(longWordDuration)s) should be truncated to maximum allowed duration (\(maxDuration)s)"
+        )
+
+        // Test that the long word retained its start time
+        XCTAssertEqual(
+            mergedAlignment.last!.start,
+            updatedWords[longWordIndex].start,
+            accuracy: 0.0001,
+            "Long word start time should remain unchanged"
+        )
+
+        // Test that the timing sequence remains monotonic
+        for i in 1..<updatedWords.count {
+            XCTAssertGreaterThanOrEqual(
+                updatedWords[i].start,
+                updatedWords[i - 1].end,
+                "Word start times should not precede the previous word's end time"
+            )
+        }
+
+        // Test that sentence boundaries are properly handled
+        let endWordIndex = updatedWords.firstIndex { $0.word == " ending." }!
+        let endWordDuration = updatedWords[endWordIndex].end - updatedWords[endWordIndex].start
+        XCTAssertEqual(
+            endWordDuration,
+            maxDuration,
+            accuracy: 0.0001,
+            "Words at sentence boundaries should be truncated to the maximum allowed duration"
+        )
+    }
+
+    func testLongSingleTokens() async {
+        let wordTimings = [
+            WordTiming(word: "<|notimestamps|>", tokens: [50363], start: 0, end: 0.5, probability: 1),
+            // Single token with 20s duration
+            WordTiming(word: " Hello", tokens: [314], start: 0.5, end: 20.5, probability: 1),
+            WordTiming(word: "<|endoftext|>", tokens: [50257], start: 20.5, end: 30, probability: 1),
+        ]
+
+        let segmentSeeker = SegmentSeeker()
+
+        // Create test segments
+        let segments = [
+            TranscriptionSegment(
+                id: 0,
+                seek: 0,
+                start: 0.0,
+                end: 30.0,
+                text: "Hello",
+                tokens: [314],
+                tokenLogProbs: Array(repeating: [0: 0.0], count: 1)
+            )
+        ]
+
+        // Test duration constraints calculation
+        let (constrainedMedianDuration, maxDuration) = segmentSeeker.calculateWordDurationConstraints(alignment: wordTimings)
+        XCTAssertEqual(constrainedMedianDuration, 0.7, "Constrained median duration should be capped at 0.7")
+        XCTAssertEqual(maxDuration, 1.4, "Max duration should be double the constrained median duration")
+
+        // Test truncation of long words
+        let truncatedAlignment = segmentSeeker.truncateLongWordsAtSentenceBoundaries(wordTimings, maxDuration: maxDuration)
+        let mergedAlignment = segmentSeeker.mergePunctuations(alignment: truncatedAlignment)
+        let updatedSegments = segmentSeeker.updateSegmentsWithWordTimings(
+            segments: segments,
+            mergedAlignment: mergedAlignment,
+            seek: 0,
+            lastSpeechTimestamp: 0,
+            constrainedMedianDuration: constrainedMedianDuration,
+            maxDuration: maxDuration,
+            tokenizer: try! await loadTokenizer(for: .tiny)
+        )
+
+        let updatedWords = updatedSegments.first!.words!
+
+        // Test that long single tokens are properly truncated
+        let firstLongIndex = updatedWords.firstIndex { $0.word == " Hello" }!
+        let firstLongDuration = updatedWords[firstLongIndex].end - updatedWords[firstLongIndex].start
+        XCTAssertLessThanOrEqual(
+            firstLongDuration,
+            maxDuration,
+            "Long single token duration (\(firstLongDuration)s) should be truncated"
+        )
+
+        // Verify timing sequence remains valid
+        var previousEnd: Float = 0
+        for timing in updatedWords {
+            XCTAssertGreaterThanOrEqual(
+                timing.start,
+                previousEnd,
+                "Start time should not be earlier than previous end time"
+            )
+            XCTAssertLessThanOrEqual(
+                timing.end - timing.start,
+                maxDuration,
+                "No word duration should significantly exceed maximum allowed duration"
+            )
+            previousEnd = timing.end
+        }
+    }
+
     // MARK: - Streaming Timestamp Tests
 
     func testStreamingTimestamps() async throws {
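[Editor's note: the constraint arithmetic PATCH 02 introduces is easy to verify by hand. This standalone sketch mirrors the median/cap logic of `calculateWordDurationConstraints`; the sample durations are made up.]

```swift
// Durations in seconds, as produced by WordTiming end - start
let wordDurations: [Float] = [0.4, 0.6, 0.9, 1.2, 29.0].filter { $0 > 0 }

// Upper median via the sorted middle element
let medianDuration = wordDurations.sorted(by: <)[wordDurations.count / 2] // 0.9

// Cap the median at 0.7s so a few very long outliers can't inflate it,
// then allow individual words at most twice that duration.
let constrainedMedianDuration = min(0.7, medianDuration) // 0.7
let maxDuration = constrainedMedianDuration * 2          // 1.4

// The 29s outlier exceeds maxDuration, so truncateLongWordsAtSentenceBoundaries
// clamps it to 1.4s at a sentence boundary instead of trusting the raw alignment.
```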
From 1775c39a45944aea2de9741d7579682b2db4c38e Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Wed, 19 Feb 2025 22:11:18 -0800
Subject: [PATCH 03/26] Support protocols from new swift-transformers version

---
 Package.resolved                     | 26 ++++++++++++++++---
 Sources/WhisperKit/Core/Models.swift | 38 ++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/Package.resolved b/Package.resolved
index 527eff0..b135190 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -1,12 +1,30 @@
 {
   "pins" : [
+    {
+      "identity" : "jinja",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/johnmai-dev/Jinja",
+      "state" : {
+        "revision" : "bbddb92fc51ae420b87300298370fd1dfc308f73",
+        "version" : "1.1.1"
+      }
+    },
     {
       "identity" : "swift-argument-parser",
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-argument-parser.git",
       "state" : {
-        "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
-        "version" : "1.3.0"
+        "revision" : "0fbc8848e389af3bb55c182bc19ca9d5dc2f255b",
+        "version" : "1.4.0"
+      }
+    },
+    {
+      "identity" : "swift-collections",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-collections.git",
+      "state" : {
+        "revision" : "671108c96644956dddcd89dd59c203dcdb36cec7",
+        "version" : "1.1.4"
       }
     },
     {
@@ -14,8 +32,8 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/huggingface/swift-transformers.git",
       "state" : {
-        "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08",
-        "version" : "0.1.8"
+        "revision" : "55710ddfb1ae804b4b7ce973be75cf2e41272185",
+        "version" : "0.1.17"
       }
     }
   ],
diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index 8ef5ba4..7746af9 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -1325,10 +1325,18 @@ extension WhisperTokenizerWrapper: Tokenizer {
         tokenizer.encode(text: text)
     }
 
+    func encode(text: String, addSpecialTokens: Bool) -> [Int] {
+        tokenizer.encode(text: text, addSpecialTokens: addSpecialTokens)
+    }
+
     func decode(tokens: [Int]) -> String {
         tokenizer.decode(tokens: tokens)
     }
 
+    func decode(tokens: [Int], skipSpecialTokens: Bool) -> String {
+        tokenizer.decode(tokens: tokens, skipSpecialTokens: skipSpecialTokens)
+    }
+
     func convertTokenToId(_ token: String) -> Int? {
         tokenizer.convertTokenToId(token)
     }
@@ -1360,6 +1368,36 @@ extension WhisperTokenizerWrapper: Tokenizer {
     var unknownTokenId: Int? {
         tokenizer.unknownTokenId
     }
+
+    // MARK: Jinja template protocol methods
+
+    func applyChatTemplate(messages: [Tokenizers.Message]) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages)
+    }
+
+    func applyChatTemplate(messages: [Tokenizers.Message], tools: [Tokenizers.ToolSpec]?) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages, tools: tools)
+    }
+
+    func applyChatTemplate(messages: [Tokenizers.Message], tools: [Tokenizers.ToolSpec]?, additionalContext: [String : Any]?) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages, tools: tools, additionalContext: additionalContext)
+    }
+
+    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
+    }
+
+    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: String) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
+    }
+
+    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, tools: [Tokenizers.ToolSpec]?) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, tools: tools)
+    }
+
+    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, tools: [Tokenizers.ToolSpec]?, additionalContext: [String : Any]?) throws -> [Int] {
+        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, tools: tools, additionalContext: additionalContext)
+    }
 }
 
 extension WhisperTokenizerWrapper {

From 2af850c760fa45dda15ff45540058d527514eb71 Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Wed, 19 Feb 2025 22:36:16 -0800
Subject: [PATCH 04/26] Backwards support for older swift-transformers

---
 Sources/WhisperKit/Core/Models.swift | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index 7746af9..48b5338 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -1325,18 +1325,11 @@ extension WhisperTokenizerWrapper: Tokenizer {
         tokenizer.encode(text: text)
     }
 
-    func encode(text: String, addSpecialTokens: Bool) -> [Int] {
-        tokenizer.encode(text: text, addSpecialTokens: addSpecialTokens)
-    }
-
     func decode(tokens: [Int]) -> String {
         tokenizer.decode(tokens: tokens)
     }
 
-    func decode(tokens: [Int], skipSpecialTokens: Bool) -> String {
-        tokenizer.decode(tokens: tokens, skipSpecialTokens: skipSpecialTokens)
-    }
-
     func convertTokenToId(_ token: String) -> Int? {
         tokenizer.convertTokenToId(token)
     }
@@ -1371,6 +1364,15 @@ extension WhisperTokenizerWrapper: Tokenizer {
 
     // MARK: Jinja template protocol methods
 
+    #if canImport(Jinja)
+    func encode(text: String, addSpecialTokens: Bool) -> [Int] {
+        tokenizer.encode(text: text, addSpecialTokens: addSpecialTokens)
+    }
+
+    func decode(tokens: [Int], skipSpecialTokens: Bool) -> String {
+        tokenizer.decode(tokens: tokens, skipSpecialTokens: skipSpecialTokens)
+    }
+
     func applyChatTemplate(messages: [Tokenizers.Message]) throws -> [Int] {
         try tokenizer.applyChatTemplate(messages: messages)
     }
@@ -1398,6 +1400,7 @@ extension WhisperTokenizerWrapper: Tokenizer {
     func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, tools: [Tokenizers.ToolSpec]?, additionalContext: [String : Any]?) throws -> [Int] {
         try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, tools: tools, additionalContext: additionalContext)
     }
+    #endif
 }
 
 extension WhisperTokenizerWrapper {

From 9313eac60d54d178aec40d6a2739f3a8cf2f4f68 Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Wed, 19 Feb 2025 22:36:21 -0800
Subject: [PATCH 05/26] Formatting

---
 .../WhisperKit/Core/Text/SegmentSeeker.swift | 17 ++++++------
 Sources/WhisperKit/Core/Utils/Utils.swift    |  4 +--
 Tests/WhisperKitTests/UnitTests.swift        | 26 +++++++++----------
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
index ed696e6..04b7d1b 100644
--- a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
+++ b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
@@ -279,7 +279,7 @@ open class SegmentSeeker: SegmentSeeking {
 
         return (textIndices.reversed(), timeIndices.reversed())
     }
- 
+
     func mergePunctuations(
         alignment: [WordTiming],
         prepended: String = Constants.defaultPrependPunctuations,
@@ -489,22 +489,21 @@ open class SegmentSeeker: SegmentSeeking {
         return updatedSegments
     }
- 
+
     public func calculateWordDurationConstraints(alignment: [WordTiming]) -> (Float, Float) {
         var wordDurations = alignment.map { $0.end - $0.start }
         wordDurations = wordDurations.filter { $0 > 0 }
- 
+
         let medianDuration: Float = wordDurations.isEmpty ? 0.0 : wordDurations.sorted(by: <)[wordDurations.count / 2]
         let constrainedMedianDuration = min(0.7, medianDuration)
         let maxDuration = constrainedMedianDuration * 2
- 
+
         return (constrainedMedianDuration, maxDuration)
     }
- 
+
     public func truncateLongWordsAtSentenceBoundaries(_ alignment: [WordTiming], maxDuration: Float) -> [WordTiming] {
         let sentenceEndMarks = [".", "。", "!", "!", "?", "?"]
         var truncatedAlignment = alignment
- 
+
         if !truncatedAlignment.isEmpty {
             for i in 1..<truncatedAlignment.count {
                 if truncatedAlignment[i].end - truncatedAlignment[i].start > maxDuration {
                     if sentenceEndMarks.contains(truncatedAlignment[i].word) {
                         truncatedAlignment[i].end = truncatedAlignment[i].start + maxDuration
                     } else if i > 0, sentenceEndMarks.contains(truncatedAlignment[i - 1].word) {
                         truncatedAlignment[i].start = truncatedAlignment[i].end - maxDuration
                     }
                 }
             }
         }
- 
+
         return truncatedAlignment
     }
@@ -570,9 +569,9 @@ open class SegmentSeeker: SegmentSeeking {
             // Ensure the first and second word after a pause is not longer than
             // twice the median word duration.
             if firstWord.end - lastSpeechTimestamp > constrainedMedianDuration * 4 &&
                 (firstWord.end - firstWord.start > maxDuration ||
-                    (wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2))
+                (wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2))
             {
                 // First word or both words are too long
                 if wordsInSegment.count > 1 && wordsInSegment[1].end - wordsInSegment[1].start > maxDuration {
                     // Second word is too long, set it to max duration and shorten first word to fit
                     let boundary = min(wordsInSegment[1].start + maxDuration, wordsInSegment[1].end / 2)
                     wordsInSegment[0].end = boundary
                     wordsInSegment[1].start = boundary
                 }
- 
+
                 // First word is too long, keep its start time and adjust its end time
                 wordsInSegment[0].end = min(wordsInSegment[0].start + maxDuration, wordsInSegment[0].end)
             }
diff --git a/Sources/WhisperKit/Core/Utils/Utils.swift b/Sources/WhisperKit/Core/Utils/Utils.swift
index 2dafe11..12b386c 100644
--- a/Sources/WhisperKit/Core/Utils/Utils.swift
+++ b/Sources/WhisperKit/Core/Utils/Utils.swift
@@ -262,10 +262,10 @@ public extension String {
 
         // Replace multiple spaces with a single space
         let singleSpacedString = noPunctuationString.replacingOccurrences(of: " +", with: " ", options: .regularExpression)
- 
+
         // Trim whitespace and newlines
         let trimmedString = singleSpacedString.trimmingCharacters(in: .whitespacesAndNewlines)
- 
+
         return trimmedString
     }
 
     func trimmingSpecialTokenCharacters() -> String {
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index f23bc41..77952c8 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -1353,28 +1353,28 @@ final class UnitTests: XCTestCase {
         XCTAssertEqual([Int]().batched(into: 3), [])
         XCTAssertEqual([1, 2, 3, 4].batched(into: 3), [[1, 2, 3], [4]])
     }
- 
+
     func testStringNormalization() throws {
         // Basic cases
         XCTAssertEqual("Hello World!".normalized, "hello world")
         XCTAssertEqual("hello world".normalized, "hello world")
         XCTAssertEqual("HELLO WORLD".normalized, "hello world")
- 
+
         // Punctuation
         XCTAssertEqual("Hello, World!".normalized, "hello world")
         XCTAssertEqual("Hello... World???".normalized, "hello world")
World???".normalized, "hello world") XCTAssertEqual("'Hello' \"World\"".normalized, "hello world") - + // Dashes and hyphens XCTAssertEqual("hello-world".normalized, "hello world") XCTAssertEqual("hello -- world".normalized, "hello world") - + // Whitespace handling XCTAssertEqual(" hello world ".normalized, "hello world") - + // Mixed cases XCTAssertEqual("Hello!!!---World???".normalized, "hello world") - + // Numbers and special characters XCTAssertEqual("Hello: World".normalized, "hello world") XCTAssertEqual("Hello% world 100".normalized, "hello world 100") @@ -2000,20 +2000,20 @@ final class UnitTests: XCTestCase { let word2 = WordTiming(word: "hello", tokens: [2], start: 0, end: 1, probability: 1.0) let word3 = WordTiming(word: "World!", tokens: [3], start: 1, end: 2, probability: 1.0) let word4 = WordTiming(word: "WORLD", tokens: [4], start: 1, end: 2, probability: 1.0) - + // Test common prefix finding let sequence1 = [word1, word3] let sequence2 = [word2, word4] - + let commonPrefix = findLongestCommonPrefix(sequence1, sequence2) XCTAssertEqual(commonPrefix.count, 2) XCTAssertEqual(commonPrefix[0].word, "hello") XCTAssertEqual(commonPrefix[1].word, "WORLD") - + // Test different suffix finding let sequence3 = [word1, word3] let sequence4 = [word2, word4, WordTiming(word: "suffix", tokens: [5], start: 2, end: 3, probability: 1.0)] - + let differentSuffix = findLongestDifferentSuffix(sequence3, sequence4) XCTAssertEqual(differentSuffix.count, 1) XCTAssertEqual(differentSuffix[0].word, "suffix") @@ -2119,7 +2119,7 @@ final class UnitTests: XCTestCase { text: "ending.", tokens: [123, 13], tokenLogProbs: Array(repeating: [0: 0.0], count: 2) - ) + ), ] // Test duration constraints calculation @@ -2176,7 +2176,7 @@ final class UnitTests: XCTestCase { // Test that the timing sequence remains monotonic for i in 1.. 
From: ZachNagengast
Date: Wed, 19 Feb 2025 22:37:36 -0800
Subject: [PATCH 06/26] Revert default packages

---
 Package.resolved | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/Package.resolved b/Package.resolved
index b135190..1f8d409 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -1,30 +1,12 @@
 {
   "pins" : [
-    {
-      "identity" : "jinja",
-      "kind" : "remoteSourceControl",
-      "location" : "https://github.com/johnmai-dev/Jinja",
-      "state" : {
-        "revision" : "bbddb92fc51ae420b87300298370fd1dfc308f73",
-        "version" : "1.1.1"
-      }
-    },
     {
       "identity" : "swift-argument-parser",
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/apple/swift-argument-parser.git",
       "state" : {
-        "revision" : "0fbc8848e389af3bb55c182bc19ca9d5dc2f255b",
-        "version" : "1.4.0"
-      }
-    },
-    {
-      "identity" : "swift-collections",
-      "kind" : "remoteSourceControl",
-      "location" : "https://github.com/apple/swift-collections.git",
-      "state" : {
-        "revision" : "671108c96644956dddcd89dd59c203dcdb36cec7",
-        "version" : "1.1.4"
+        "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
+        "version" : "1.3.0"
       }
     },
     {
       "identity" : "swift-transformers",
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/huggingface/swift-transformers.git",
       "state" : {
-        "revision" : "55710ddfb1ae804b4b7ce973be75cf2e41272185",
-        "version" : "0.1.17"
+        "revision" : "fc6543263e4caed9bf6107466d625cfae9357f08",
+        "version" : "0.1.8"
       }
     }
   ],
-  "version" : 2
 }

From 0423be92a2800ffea3f533fb4513fbf5940e49f5 Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Wed, 19 Feb 2025 22:39:42 -0800
Subject: [PATCH 07/26] Fix package.resolved

---
 Package.resolved | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Package.resolved b/Package.resolved
index 1f8d409..aec2192 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -19,4 +19,5 @@
       }
     }
   ],
-}
+  "version" : 2
+}
\ No newline at end of file

From 2e905feb26a4274e597756c55157eedd446d5e1d Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Wed, 19 Feb 2025 22:40:12 -0800
Subject: [PATCH 08/26] Fix package.resolved

---
 Package.resolved | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Package.resolved b/Package.resolved
index aec2192..527eff0 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -20,4 +20,4 @@
     }
   ],
   "version" : 2
-}
\ No newline at end of file
+}

From 3ca28fc4c3838cb467164ff15b1ef30c82848997 Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Thu, 20 Feb 2025 13:39:07 -0800
Subject: [PATCH 09/26] Remove protocol conformance for tokenizer

- This will allow the wrapper to handle newer versions of the tokenizer
  protocol without requiring strict conformance
---
 Sources/WhisperKit/Core/Models.swift | 120 +++++++--------------------
 1 file changed, 28 insertions(+), 92 deletions(-)

diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index 48b5338..ef95f1a 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -1198,17 +1198,40 @@ public struct SpecialTokens {
     }
 }
 
-public protocol WhisperTokenizer: Tokenizer {
+public protocol WhisperTokenizer {
+    // swift-transformers pass through
+    func encode(text: String) -> [Int]
+    func decode(tokens: [Int]) -> String
+    func convertTokenToId(_ token: String) -> Int?
+    func convertIdToToken(_ id: Int) -> String?
+
+    // WhisperKit specific
     var specialTokens: SpecialTokens { get }
     var allLanguageTokens: Set<Int> { get }
 
     func splitToWordTokens(tokenIds: [Int]) -> (words: [String], wordTokens: [[Int]])
 }
 
-struct WhisperTokenizerWrapper: WhisperTokenizer {
+open class WhisperTokenizerWrapper: WhisperTokenizer {
     let tokenizer: any Tokenizer
-    let specialTokens: SpecialTokens
-    let allLanguageTokens: Set<Int>
+    public let specialTokens: SpecialTokens
+    public let allLanguageTokens: Set<Int>
+
+    public func encode(text: String) -> [Int] {
+        tokenizer.encode(text: text)
+    }
+
+    public func decode(tokens: [Int]) -> String {
+        tokenizer.decode(tokens: tokens)
+    }
+
+    public func convertTokenToId(_ token: String) -> Int? {
+        tokenizer.convertTokenToId(token)
+    }
+
+    public func convertIdToToken(_ id: Int) -> String? {
+        tokenizer.convertIdToToken(id)
+    }
 
     init(tokenizer: any Tokenizer) {
         let specialTokens = SpecialTokens(
@@ -1300,7 +1323,7 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
     /// Decodes token ids into individual words and per-word subtokens
     /// - Parameter tokenIds: Array of tokens to decode and then split
     /// - Returns: Tuple containing and array of the split words and all tokens for each word
-    func splitToWordTokens(tokenIds: [Int]) -> (words: [String], wordTokens: [[Int]]) {
+    public func splitToWordTokens(tokenIds: [Int]) -> (words: [String], wordTokens: [[Int]]) {
         let decodedWords = tokenizer.decode(tokens: tokenIds.filter { $0 < specialTokens.specialTokenBegin })
 
         // Detect language of input text
@@ -1316,93 +1339,6 @@ struct WhisperTokenizerWrapper: WhisperTokenizer {
     }
 }
 
-extension WhisperTokenizerWrapper: Tokenizer {
-    func tokenize(text: String) -> [String] {
-        tokenizer.tokenize(text: text)
-    }
-
-    func encode(text: String) -> [Int] {
-        tokenizer.encode(text: text)
-    }
-
-    func decode(tokens: [Int]) -> String {
-        tokenizer.decode(tokens: tokens)
-    }
-
-    func convertTokenToId(_ token: String) -> Int? {
-        tokenizer.convertTokenToId(token)
-    }
-
-    func convertIdToToken(_ id: Int) -> String? {
-        tokenizer.convertIdToToken(id)
-    }
-
-    var bosToken: String? {
-        tokenizer.bosToken
-    }
-
-    var bosTokenId: Int? {
-        tokenizer.bosTokenId
-    }
-
-    var eosToken: String? {
-        tokenizer.eosToken
-    }
-
-    var eosTokenId: Int? {
-        tokenizer.eosTokenId
-    }
-
-    var unknownToken: String? {
-        tokenizer.unknownToken
-    }
-
-    var unknownTokenId: Int? {
-        tokenizer.unknownTokenId
-    }
-
-    // MARK: Jinja template protocol methods
-
-    #if canImport(Jinja)
-    func encode(text: String, addSpecialTokens: Bool) -> [Int] {
-        tokenizer.encode(text: text, addSpecialTokens: addSpecialTokens)
-    }
-
-    func decode(tokens: [Int], skipSpecialTokens: Bool) -> String {
-        tokenizer.decode(tokens: tokens, skipSpecialTokens: skipSpecialTokens)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message]) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message], tools: [Tokenizers.ToolSpec]?) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages, tools: tools)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message], tools: [Tokenizers.ToolSpec]?, additionalContext: [String : Any]?) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages, tools: tools, additionalContext: additionalContext)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: String) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, tools: [Tokenizers.ToolSpec]?) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, tools: tools)
-    }
-
-    func applyChatTemplate(messages: [Tokenizers.Message], chatTemplate: Tokenizers.ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, tools: [Tokenizers.ToolSpec]?, additionalContext: [String : Any]?) throws -> [Int] {
-        try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: addGenerationPrompt, truncation: truncation, maxLength: maxLength, tools: tools, additionalContext: additionalContext)
-    }
-    #endif
-}
-
 extension WhisperTokenizerWrapper {
     /// Default values for each token, using base vocab
     static var defaultWhitespaceToken: Int { 220 }
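[Editor's note: because `WhisperTokenizerWrapper` no longer conforms to swift-transformers' `Tokenizer` protocol after PATCH 09, callers should depend only on the pass-through members plus the WhisperKit-specific ones. A hypothetical helper written against the slimmed protocol, not part of the patch:]

```swift
// Round-trips text through any WhisperTokenizer and sanity-checks the
// per-word token grouping. Works with WhisperTokenizerWrapper or a custom type.
func roundTripCheck(_ text: String, using tokenizer: any WhisperTokenizer) -> Bool {
    let ids = tokenizer.encode(text: text)
    let decoded = tokenizer.decode(tokens: ids)

    // splitToWordTokens returns parallel arrays: one entry per word
    let (words, wordTokens) = tokenizer.splitToWordTokens(tokenIds: ids)
    guard words.count == wordTokens.count else { return false }

    return decoded == text
}
```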
From ffc251583ca42c40d64556bedf231f574ef320e1 Mon Sep 17 00:00:00 2001
From: Andrey Leonov
Date: Thu, 20 Feb 2025 18:00:46 -0500
Subject: [PATCH 10/26] Add public attributes

---
 Sources/WhisperKit/Core/Models.swift      | 5 +++++
 Sources/WhisperKit/Core/Utils/Utils.swift | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index ef95f1a..794739e 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -280,6 +280,11 @@ public struct ModelSupportConfig: Codable {
 public struct AudioChunk {
     public var seekOffsetIndex: Int
     public var audioSamples: [Float]
+
+    public init(seekOffsetIndex: Int, audioSamples: [Float]) {
+        self.seekOffsetIndex = seekOffsetIndex
+        self.audioSamples = audioSamples
+    }
 }
 
 // MARK: - Decoding
diff --git a/Sources/WhisperKit/Core/Utils/Utils.swift b/Sources/WhisperKit/Core/Utils/Utils.swift
index 12b386c..7f4a6e1 100644
--- a/Sources/WhisperKit/Core/Utils/Utils.swift
+++ b/Sources/WhisperKit/Core/Utils/Utils.swift
@@ -179,7 +179,7 @@ public extension MLTensor {
 }
 #endif
 
-extension MLModel {
+public extension MLModel {
     func asyncPrediction(
         from input: MLFeatureProvider,
         options: MLPredictionOptions

From 7c98d63db53742f6d89e8837bb7c9e973f0a9785 Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Thu, 20 Feb 2025 15:16:59 -0800
Subject: [PATCH 11/26] Add public initializers

---
 .../Core/Audio/AudioProcessor.swift          |  5 ++
 Sources/WhisperKit/Core/Models.swift         | 87 ++++++++++++++++++-
 .../WhisperKit/Core/Text/TokenSampler.swift  | 10 +++
 3 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/Sources/WhisperKit/Core/Audio/AudioProcessor.swift b/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
index 052ba20..bde93a2 100644
--- a/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
+++ b/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
@@ -16,6 +16,11 @@ public typealias DeviceID = String
 public struct AudioDevice: Identifiable, Hashable {
     public let id: DeviceID
     public let name: String
+
+    public init(id: DeviceID, name: String) {
+        self.id = id
+        self.name = name
+    }
 }
 
 public protocol AudioProcessing {
diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index 794739e..a88aa67 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -176,11 +176,26 @@ public struct ModelSupport: Codable, Equatable {
     private enum CodingKeys: String, CodingKey {
         case `default`, supported
     }
+
+    public init(
+        `default`: String,
+        supported: [String],
+        disabled: [String] = []
+    ) {
+        self.`default` = `default`
+        self.supported = supported
+        self.disabled = disabled
+    }
 }
 
 public struct DeviceSupport: Codable {
     public let identifiers: [String]
     public var models: ModelSupport
+
+    public init(identifiers: [String], models: ModelSupport) {
+        self.identifiers = identifiers
+        self.models = models
+    }
 }
 
 public struct ModelSupportConfig: Codable {
@@ -356,7 +371,12 @@ public struct DecodingCache {
     public var keyCache: MLMultiArray?
     public var valueCache: MLMultiArray?
     public var alignmentWeights: MLMultiArray?
-    public init(keyCache: MLMultiArray? = nil, valueCache: MLMultiArray? = nil, alignmentWeights: MLMultiArray? = nil) {
+
+    public init(
+        keyCache: MLMultiArray? = nil,
+        valueCache: MLMultiArray? = nil,
+        alignmentWeights: MLMultiArray? = nil
+    ) {
         self.keyCache = keyCache
         self.valueCache = valueCache
         self.alignmentWeights = alignmentWeights
@@ -437,7 +457,20 @@ public struct DecodingResult {
             fallback: nil)
     }
 
-    public init(language: String, languageProbs: [String: Float], tokens: [Int], tokenLogProbs: [[Int: Float]], text: String, avgLogProb: Float, noSpeechProb: Float, temperature: Float, compressionRatio: Float, cache: DecodingCache? = nil, timings: TranscriptionTimings? = nil, fallback: DecodingFallback? = nil) {
+    public init(
+        language: String,
+        languageProbs: [String: Float],
+        tokens: [Int],
+        tokenLogProbs: [[Int: Float]],
+        text: String,
+        avgLogProb: Float,
+        noSpeechProb: Float,
+        temperature: Float,
+        compressionRatio: Float,
+        cache: DecodingCache? = nil,
+        timings: TranscriptionTimings? = nil,
+        fallback: DecodingFallback? = nil
+    ) {
         self.language = language
         self.languageProbs = languageProbs
         self.tokens = tokens
@@ -515,6 +548,20 @@ public struct TranscriptionResult: Codable {
     public var timings: TranscriptionTimings
     public var seekTime: Float?
 
+    public init(
+        text: String,
+        segments: [TranscriptionSegment],
+        language: String,
+        timings: TranscriptionTimings,
+        seekTime: Float? = nil
+    ) {
+        self.text = text
+        self.segments = segments
+        self.language = language
+        self.timings = timings
+        self.seekTime = seekTime
+    }
+
     public func logSegments() {
         for (i, segment) in segments.enumerated() {
             let start = segment.start
@@ -610,6 +657,34 @@ public struct TranscriptionSegment: Hashable, Codable {
     public var compressionRatio: Float = 1.0
     public var noSpeechProb: Float = 0.0
     public var words: [WordTiming]? = nil
+
+    public init(
+        id: Int = 0,
+        seek: Int = 0,
+        start: Float = 0.0,
+        end: Float = 0.0,
+        text: String = "",
+        tokens: [Int] = [],
+        tokenLogProbs: [[Int : Float]] = [[:]],
+        temperature: Float = 1.0,
+        avgLogprob: Float = 0.0,
+        compressionRatio: Float = 1.0,
+        noSpeechProb: Float = 0.0,
+        words: [WordTiming]? = nil
+    ) {
+        self.id = id
+        self.seek = seek
+        self.start = start
+        self.end = end
+        self.text = text
+        self.tokens = tokens
+        self.tokenLogProbs = tokenLogProbs
+        self.temperature = temperature
+        self.avgLogprob = avgLogprob
+        self.compressionRatio = compressionRatio
+        self.noSpeechProb = noSpeechProb
+        self.words = words
+    }
 }
 
 public struct WordTiming: Hashable, Codable {
@@ -618,6 +693,14 @@ public struct WordTiming: Hashable, Codable {
     public var start: Float
     public var end: Float
     public var probability: Float
+
+    public init(word: String, tokens: [Int], start: Float, end: Float, probability: Float) {
+        self.word = word
+        self.tokens = tokens
+        self.start = start
+        self.end = end
+        self.probability = probability
+    }
 }
 
 public struct TranscriptionProgress {
diff --git a/Sources/WhisperKit/Core/Text/TokenSampler.swift b/Sources/WhisperKit/Core/Text/TokenSampler.swift
index 2193161..3ece220 100644
--- a/Sources/WhisperKit/Core/Text/TokenSampler.swift
+++ b/Sources/WhisperKit/Core/Text/TokenSampler.swift
@@ -14,6 +14,16 @@ public struct SamplingResult {
     public var tokens: [Int]
     public var logProbs: [Float]
     public var completed: Bool
+
+    public init(
+        tokens: [Int],
+        logProbs: [Float],
+        completed: Bool
+    ) {
+        self.tokens = tokens
+        self.logProbs = logProbs
+        self.completed = completed
+    }
 }
 
 @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
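[Editor's note: with the memberwise initializers from PATCH 11 public, code outside the package can construct these types directly, which is handy for fixtures and previews. A small usage sketch; all values are arbitrary, and `TranscriptionSegment`'s defaulted parameters can be omitted.]

```swift
import WhisperKit

let word = WordTiming(word: " hello", tokens: [314], start: 0.0, end: 0.4, probability: 0.98)

let segment = TranscriptionSegment(
    id: 0,
    start: word.start,
    end: word.end,
    text: "hello",
    tokens: word.tokens,
    words: [word]
)

// Audio-side types gain initializers as well
let device = AudioDevice(id: "default", name: "Built-in Microphone")
let chunk = AudioChunk(seekOffsetIndex: 0, audioSamples: [Float](repeating: 0, count: 16_000))
```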
From d7258213961c4d4d3a6566b494e5aa5ae76292a0 Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Thu, 20 Feb 2025 15:22:52 -0800
Subject: [PATCH 12/26] Add computed properties for duration in TranscriptionSegment and WordTiming

---
 Sources/WhisperKit/Core/Models.swift             | 10 ++++++++++
 Sources/WhisperKit/Core/Text/SegmentSeeker.swift |  8 ++++----
 Tests/WhisperKitTests/UnitTests.swift            | 10 +++++-----
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift
index a88aa67..78af20e 100644
--- a/Sources/WhisperKit/Core/Models.swift
+++ b/Sources/WhisperKit/Core/Models.swift
@@ -658,6 +658,11 @@ public struct TranscriptionSegment: Hashable, Codable {
     public var noSpeechProb: Float = 0.0
     public var words: [WordTiming]? = nil
 
+    /// Computed property for the duration of the segment
+    public var duration: Float {
+        return end - start
+    }
+
     public init(
         id: Int = 0,
         seek: Int = 0,
@@ -694,6 +699,11 @@ public struct WordTiming: Hashable, Codable {
     public var end: Float
     public var probability: Float
 
+    /// Computed property for the duration of the word
+    public var duration: Float {
+        return end - start
+    }
+
     public init(word: String, tokens: [Int], start: Float, end: Float, probability: Float) {
         self.word = word
         self.tokens = tokens
diff --git a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
index 04b7d1b..c6144a6 100644
--- a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
+++ b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift
@@ -490,7 +490,7 @@ open class SegmentSeeker: SegmentSeeking {
     }
 
     public func calculateWordDurationConstraints(alignment: [WordTiming]) -> (Float, Float) {
-        var wordDurations = alignment.map { $0.end - $0.start }
+        var wordDurations = alignment.map { $0.duration }
         wordDurations = wordDurations.filter { $0 > 0 }
 
         let medianDuration: Float = wordDurations.isEmpty ? 0.0 : wordDurations.sorted(by: <)[wordDurations.count / 2]
@@ -506,7 +506,7 @@ open class SegmentSeeker: SegmentSeeking {
 
         if !truncatedAlignment.isEmpty {
             for i in 1..<truncatedAlignment.count {
-                if truncatedAlignment[i].end - truncatedAlignment[i].start > maxDuration {
+                if truncatedAlignment[i].duration > maxDuration {
                     if sentenceEndMarks.contains(truncatedAlignment[i].word) {
                         truncatedAlignment[i].end = truncatedAlignment[i].start + maxDuration
                     } else if i > 0, sentenceEndMarks.contains(truncatedAlignment[i - 1].word) {
@@ -570,11 +570,11 @@ open class SegmentSeeker: SegmentSeeking {
             // Ensure the first and second word after a pause is not longer than
             // twice the median word duration.
             if firstWord.end - lastSpeechTimestamp > constrainedMedianDuration * 4 &&
-                (firstWord.end - firstWord.start > maxDuration ||
+                (firstWord.duration > maxDuration ||
                 (wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2))
             {
                 // First word or both words are too long
-                if wordsInSegment.count > 1 && wordsInSegment[1].end - wordsInSegment[1].start > maxDuration {
+                if wordsInSegment.count > 1 && wordsInSegment[1].duration > maxDuration {
                     // Second word is too long, set it to max duration and shorten first word to fit
                     let boundary = min(wordsInSegment[1].start + maxDuration, wordsInSegment[1].end / 2)
                     wordsInSegment[0].end = boundary
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index 77952c8..7293f36 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -2143,7 +2143,7 @@ final class UnitTests: XCTestCase {
         let updatedWords = updatedSegments.compactMap { $0.words }.flatMap { $0 }
 
         // Test that long segments are properly truncated
-        let longSegmentDuration = updatedWords.last!.end - updatedWords.last!.start
+        let longSegmentDuration = updatedWords.last!.duration
         XCTAssertEqual(
             longSegmentDuration,
             maxDuration,
@@ -2158,7 +2158,7 @@ final class UnitTests: XCTestCase {
 
         // Test the long word
         let longWordIndex = updatedWords.firstIndex { $0.word == " ending." }!
-        let longWordDuration = updatedWords[longWordIndex].end - updatedWords[longWordIndex].start
+        let longWordDuration = updatedWords[longWordIndex].duration
         XCTAssertEqual(
             longWordDuration,
             maxDuration,
@@ -2187,7 +2187,7 @@ final class UnitTests: XCTestCase {
 
         // Test that sentence boundaries are properly handled
         let endWordIndex = updatedWords.firstIndex { $0.word == " ending." }!
-        let endWordDuration = updatedWords[endWordIndex].end - updatedWords[endWordIndex].start
+        let endWordDuration = updatedWords[endWordIndex].duration
         XCTAssertEqual(
             endWordDuration,
             maxDuration,
@@ -2242,7 +2242,7 @@ final class UnitTests: XCTestCase {
 
         // Test that long single tokens are properly truncated
         let firstLongIndex = updatedWords.firstIndex { $0.word == " Hello" }!
-        let firstLongDuration = updatedWords[firstLongIndex].end - updatedWords[firstLongIndex].start
+        let firstLongDuration = updatedWords[firstLongIndex].duration
         XCTAssertLessThanOrEqual(
             firstLongDuration,
             maxDuration,
@@ -2258,7 +2258,7 @@ final class UnitTests: XCTestCase {
             "Start time should not be earlier than previous end time"
         )
         XCTAssertLessThanOrEqual(
-            timing.end - timing.start,
+            timing.duration,
             maxDuration,
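[Editor's note: the computed `duration` properties from PATCH 12 are a pure convenience over the `end - start` arithmetic the call sites were repeating; a minimal sketch, with `maxDuration` standing in for a value from `calculateWordDurationConstraints`:]

```swift
let maxDuration: Float = 1.4
let timing = WordTiming(word: " the", tokens: [264], start: 1.0, end: 1.3, probability: 1.0)

// Previously written as `timing.end - timing.start` at every call site
let isTooLong = timing.duration > maxDuration // false: duration == 0.3
```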
From d4c62e60d64b9f8e5f7228f035a9f1ee2f96cf6c Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Thu, 20 Feb 2025 15:48:47 -0800
Subject: [PATCH 13/26] Lower token count for early stopping test

---
 Tests/WhisperKitTests/UnitTests.swift | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index 7293f36..9f42466 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -1285,9 +1285,9 @@ final class UnitTests: XCTestCase {
                 "Audio file not found"
             )
 
-            let earlyStopTokenCount = 10
+            let earlyStopTokenCount = 5
             let continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in
-                // Stop after only 10 tokens (full test audio contains ~30)
+                // Stop after only 5 tokens (full test audio contains ~30)
                 progress.tokens.count <= earlyStopTokenCount
             }
 

From 64938b70d6b742ed6004a971627adf66cdc7444b Mon Sep 17 00:00:00 2001
From: ZachNagengast
Date: Thu, 20 Feb 2025 16:49:13 -0800
Subject: [PATCH 14/26] Attempt to fix early stopping test using detached tasks

---
 Tests/WhisperKitTests/UnitTests.swift | 99 ++++++++++++++-------------
 1 file changed, 50 insertions(+), 49 deletions(-)

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index 9f42466..0359455 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -1260,67 +1260,68 @@ final class UnitTests: XCTestCase {
         await fulfillment(of: [modelStateExpectation, segmentDiscoveryExpectation, transcriptionStateExpectation], timeout: 1)
     }
 
-    #if !os(watchOS) // FIXME: watchOS ignores the priority here for some reason
     func testCallbackWithEarlyStopping() async throws {
-        let callbackTestTask = Task(priority: .userInitiated) {
-            let computeOptions = ModelComputeOptions(
-                melCompute: .cpuOnly,
-                audioEncoderCompute: .cpuOnly,
-                textDecoderCompute: .cpuOnly,
-                prefillCompute: .cpuOnly
-            )
-
-            let config = try WhisperKitConfig(
-                modelFolder: tinyModelPath(),
-                computeOptions: computeOptions,
-                verbose: true,
-                logLevel: .debug,
-                load: false
-            )
-            let whisperKit = try await WhisperKit(config)
+        let computeOptions = ModelComputeOptions(
+            melCompute: .cpuOnly,
+            audioEncoderCompute: .cpuOnly,
+            textDecoderCompute: .cpuOnly,
+            prefillCompute: .cpuOnly
+        )
 
-            try await whisperKit.loadModels()
-            let audioFilePath = try XCTUnwrap(
-                Bundle.current.path(forResource: "jfk", ofType: "wav"),
-                "Audio file not found"
-            )
+        let config = try WhisperKitConfig(
+            modelFolder: self.tinyModelPath(),
+            computeOptions: computeOptions,
+            verbose: true,
+            logLevel: .debug,
+            load: false
+        )
+        let whisperKit = try await WhisperKit(config)
 
-            let earlyStopTokenCount = 5
-            let continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in
in - // Stop after only 5 tokens (full test audio contains ~30) - progress.tokens.count <= earlyStopTokenCount - } + try await whisperKit.loadModels() + let audioFilePath = try XCTUnwrap( + Bundle.current.path(forResource: "jfk", ofType: "wav"), + "Audio file not found" + ) - let result = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! + let earlyStopTokenCount = 5 + let continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in + // Stop after only 5 tokens (full test audio contains ~30) + progress.tokens.count <= earlyStopTokenCount + } - XCTAssertNotNil(result) - let tokenCountWithEarlyStop = result.segments.flatMap { $0.tokens }.count - let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop) + let result = try await Task.detached(priority: .userInitiated) { + try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! + }.value - // Work done in the callback should not block the decoding loop - let continuationCallbackWithWait: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in - Thread.sleep(forTimeInterval: 5) - return false - } + XCTAssertNotNil(result) + let tokenCountWithEarlyStop = result.segments.flatMap { $0.tokens }.count + let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop) - let resultWithWait = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallbackWithWait).first! + // Work done in the callback should not block the decoding loop + let continuationCallbackWithWait: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in + Thread.sleep(forTimeInterval: 5) + return false + } - XCTAssertNotNil(resultWithWait) - let tokenCountWithWait = resultWithWait.segments.flatMap { $0.tokens }.count - let decodingTimePerTokenWithWait = resultWithWait.timings.decodingLoop / Double(tokenCountWithWait) - Logging.debug("Decoding loop without wait: \(result.timings.decodingLoop), with wait: \(resultWithWait.timings.decodingLoop)") + // Explicitly create a new task for the second transcription + let resultWithWait = try await Task.detached(priority: .userInitiated) { + try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallbackWithWait).first! 
+ }.value XCTAssertNotNil(resultWithWait) let tokenCountWithWait = resultWithWait.segments.flatMap { $0.tokens }.count let decodingTimePerTokenWithWait = resultWithWait.timings.decodingLoop / Double(tokenCountWithWait) Logging.debug("Decoding loop without wait: \(result.timings.decodingLoop), with wait: \(resultWithWait.timings.decodingLoop)") Logging.debug("Token count without wait: \(tokenCountWithEarlyStop)") Logging.debug("Token count with wait: \(tokenCountWithWait)") // Assert that the decoding predictions per token are not slower with the waiting XCTAssertEqual(decodingTimePerTokenWithWait, decodingTimePerTokenWithEarlyStop, accuracy: decodingTimePerTokenWithEarlyStop, "Decoding predictions per token should not be significantly slower with waiting") // Assert that more tokens are returned in the callback with waiting XCTAssertGreaterThanOrEqual(tokenCountWithWait, 200, "Tokens for callback with wait should contain the full audio file") XCTAssertGreaterThan(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") }

From 287d0ab390c7f06b382bc33b1edabd92c80fcc19 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Thu, 20 Feb 2025 16:51:24 -0800 Subject: [PATCH 15/26] Use file with more tokens for early stopping test --- Tests/WhisperKitTests/UnitTests.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 0359455..2357df9 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1279,7 +1279,7 @@ final class UnitTests: XCTestCase { try await whisperKit.loadModels() let audioFilePath = try XCTUnwrap( - Bundle.current.path(forResource: "jfk", ofType: "wav"), + Bundle.current.path(forResource: "ted_60", ofType: "m4a"), "Audio file not found" ) @@ -1319,7 +1319,7 @@ final class UnitTests: XCTestCase { XCTAssertEqual(decodingTimePerTokenWithWait, decodingTimePerTokenWithEarlyStop, accuracy: decodingTimePerTokenWithEarlyStop, "Decoding predictions per token should not be significantly slower with waiting") // Assert that more tokens are returned in the callback with waiting - XCTAssertGreaterThanOrEqual(tokenCountWithWait, 30, "Tokens for callback with wait should contain the full audio file") + XCTAssertGreaterThanOrEqual(tokenCountWithWait, 200, "Tokens for callback with wait should contain the full audio file") XCTAssertGreaterThan(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") }
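For context on the tests in the patches above: WhisperKit's TranscriptionCallback is a closure that receives a TranscriptionProgress and returns an optional Bool. As these assertions rely on, returning an explicit false stops the decoding loop early, while returning true (or making no decision) lets it continue. A minimal caller-side sketch of that contract follows; only the callback signature is taken from the diffs, and makeEarlyStopCallback, maxTokens, and deadline are illustrative names, not WhisperKit API:

    import Foundation
    import WhisperKit

    // Sketch only: stop transcription once either a token budget or a
    // wall-clock deadline is exhausted. Only the callback contract
    // (return false to stop, true/nil to continue) is assumed here.
    func makeEarlyStopCallback(maxTokens: Int, deadline: Date) -> TranscriptionCallback {
        return { (progress: TranscriptionProgress) -> Bool? in
            // Keep decoding only while both limits still hold.
            progress.tokens.count <= maxTokens && Date() < deadline
        }
    }

From 15d9282d0b47703213818bf28e5ef38b1a02222d Mon Sep 17 00:00:00 2001 From: ZachNagengast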
Date: Thu, 20 Feb 2025 17:18:56 -0800 Subject: [PATCH 16/26] Add timeout to early stopping callback test to prevent blocking --- Tests/WhisperKitTests/UnitTests.swift | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 2357df9..3f47557 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1298,8 +1298,16 @@ final class UnitTests: XCTestCase { let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop) // Work done in the callback should not block the decoding loop + let queue = DispatchQueue(label: "EarlyStoppingQueue") + let semaphore = DispatchSemaphore(value: 0) let continuationCallbackWithWait: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in - Thread.sleep(forTimeInterval: 5) + // Wait for 5 seconds before returning false + queue.async { + DispatchQueue.main.asyncAfter(deadline: .now() + 5) { + semaphore.signal() + } + } + _ = semaphore.wait(timeout: .now() + 5) return false }

From 1d79254a6654240ecc3c629a1bd7df70b6048b89 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Thu, 20 Feb 2025 17:27:45 -0800 Subject: [PATCH 17/26] Revert watchOS early stopping test check --- Tests/WhisperKitTests/UnitTests.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 3f47557..715321d 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1260,6 +1260,7 @@ final class UnitTests: XCTestCase { await fulfillment(of: [modelStateExpectation, segmentDiscoveryExpectation, transcriptionStateExpectation], timeout: 1) } + #if !os(watchOS) // FIXME: watchOS ignores the priority here for some reason func testCallbackWithEarlyStopping() async throws { let computeOptions = ModelComputeOptions( melCompute: .cpuOnly, @@ -1330,7 +1331,7 @@ final class UnitTests: XCTestCase { XCTAssertGreaterThanOrEqual(tokenCountWithWait, 200, "Tokens for callback with wait should contain the full audio file") XCTAssertGreaterThan(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") } - + #endif // MARK: - Utils Tests func testFillIndexesWithValue() throws {

From eb4ba4513e26ce943d06e901adaba023fee28718 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Thu, 20 Feb 2025 18:28:23 -0800 Subject: [PATCH 18/26] Update more standard package versioning --- Package.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Package.swift b/Package.swift index b5c04b7..315a27b 100644 --- a/Package.swift +++ b/Package.swift @@ -20,8 +20,8 @@ let package = Package( ), ], dependencies: [ - .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMajor(from: "0.1.8")), - .package(url: "https://github.com/apple/swift-argument-parser.git", .upToNextMajor(from: "1.3.0")), + .package(url: "https://github.com/huggingface/swift-transformers.git", .upToNextMinor(from: "0.1.8")), + .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"), ], targets: [ .target(
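A note on the Package.swift change above: in Swift Package Manager, the bare from: form is shorthand for .upToNextMajor(from:), so from: "1.3.0" resolves anything from 1.3.0 up to (but not including) 2.0.0, while .upToNextMinor(from: "0.1.8") resolves only 0.1.8 up to 0.2.0, a sensible guard for a pre-1.0 dependency like swift-transformers, where minor releases may still break API. A sketch of the three requirement styles (the package URLs below are placeholders, not real dependencies):

    // Package.swift excerpt, illustrative only, not part of the patch.
    dependencies: [
        // Pins a single version; even bug-fix releases are not picked up.
        .package(url: "https://github.com/example/pinned.git", exact: "1.2.3"),
        // Shorthand for .upToNextMajor(from:): resolves 1.3.0 ..< 2.0.0.
        .package(url: "https://github.com/example/stable.git", from: "1.3.0"),
        // Safer for 0.x dependencies: resolves 0.1.8 ..< 0.2.0.
        .package(url: "https://github.com/example/prerelease.git", .upToNextMinor(from: "0.1.8")),
    ],

From 402a86e31136f419e8d7eff12042dc3984c06455 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 02:28:56 -0800 Subject: [PATCH 19/26] Improve short duration word timestamps --- .../WhisperKit/Core/Text/SegmentSeeker.swift | 61 +++++++++++++++----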
Sources/WhisperKit/Core/TextDecoder.swift | 15 ++++- Tests/WhisperKitTests/UnitTests.swift | 7 +-- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift index c6144a6..4501c9b 100644 --- a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift +++ b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift @@ -473,12 +473,14 @@ open class SegmentSeeker: SegmentSeeking { alignment = truncateLongWordsAtSentenceBoundaries(alignment, maxDuration: maxDuration) // Process alignment for punctuations - let mergedAlignment = mergePunctuations(alignment: alignment, prepended: prependPunctuations, appended: appendPunctuations) + if !alignment.isEmpty { + alignment = mergePunctuations(alignment: alignment, prepended: prependPunctuations, appended: appendPunctuations) + } // Update segments based on more accurate word timings let updatedSegments = updateSegmentsWithWordTimings( segments: segments, - mergedAlignment: mergedAlignment, + mergedAlignment: alignment, seek: seek, lastSpeechTimestamp: lastSpeechTimestamp, constrainedMedianDuration: constrainedMedianDuration, @@ -533,7 +535,7 @@ open class SegmentSeeker: SegmentSeeking { var lastSpeechTimestamp = lastSpeechTimestamp // Create a mutable copy, it will be updated below var updatedSegments = [TranscriptionSegment]() - for segment in segments { + for (segmentIndex, segment) in segments.enumerated() { var savedTokens = 0 let textTokens = segment.tokens.filter { $0 < tokenizer.specialTokens.specialTokenBegin } var wordsInSegment = [WordTiming]() @@ -547,10 +549,42 @@ open class SegmentSeeker: SegmentSeeking { continue } - let start = (timeOffset + timing.start).rounded(2) + // Retokenize the word if some tokens were filtered out + let word = timingTokens.count < timing.tokens.count ? + tokenizer.decode(tokens: timingTokens) : + timing.word + + var start = (timeOffset + timing.start).rounded(2) let end = (timeOffset + timing.end).rounded(2) + + // Handle short duration words by moving their start time back if there's space + if end - start < constrainedMedianDuration / 4 { + if wordsInSegment.count >= 1, + let previousTiming = wordsInSegment.last + { + let previousEnd = previousTiming.end + + // If there's space between this word and the previous word + if start > previousEnd { + // Move the word back, using either median duration or space available + let spaceAvailable = start - previousEnd + let desiredDuration = min(spaceAvailable, constrainedMedianDuration / 2) + start = (start - desiredDuration).rounded(2) + } + } else if wordsInSegment.isEmpty, + segmentIndex > 0, + updatedSegments.count > segmentIndex - 1, + start > updatedSegments[segmentIndex - 1].end + { + // First word of segment - check space from last segment + let spaceAvailable = start - updatedSegments[segmentIndex - 1].end + let desiredDuration = min(spaceAvailable, constrainedMedianDuration / 2) + start = (start - desiredDuration).rounded(2) + } + } + let probability = timing.probability.rounded(2) - let wordTiming = WordTiming(word: timing.word, + let wordTiming = WordTiming(word: word, tokens: timingTokens, start: start, end: end, @@ -569,20 +603,21 @@ open class SegmentSeeker: SegmentSeeking { if let firstWord = wordsInSegment.first { // Ensure the first and second word after a pause is not longer than // twice the median word duration. 
- if firstWord.end - lastSpeechTimestamp > constrainedMedianDuration * 4 && - (firstWord.duration > maxDuration || - (wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2)) + let pauseLength = firstWord.end - lastSpeechTimestamp + let firstWordTooLong = firstWord.duration > maxDuration + let bothWordsTooLong = wordsInSegment.count > 1 && wordsInSegment[1].end - firstWord.start > maxDuration * 2 + if pauseLength > constrainedMedianDuration * 4 && + (firstWordTooLong || bothWordsTooLong) { // First word or both words are too long if wordsInSegment.count > 1 && wordsInSegment[1].duration > maxDuration { - // Second word is too long, set it to max duration and shorten first word to fit - let boundary = min(wordsInSegment[1].start + maxDuration, wordsInSegment[1].end / 2) + // Second word is too long, find a good boundary to readjust the words + let boundary = max(wordsInSegment[1].end / 2, wordsInSegment[1].end - maxDuration) wordsInSegment[0].end = boundary wordsInSegment[1].start = boundary } - - // First word is too long, keep its start time and adjust its end time - wordsInSegment[0].end = min(wordsInSegment[0].start + maxDuration, wordsInSegment[0].end) + // In either case, make sure the first word is not too long + wordsInSegment[0].start = max(lastSpeechTimestamp, wordsInSegment[0].end - maxDuration) } // Prefer segment-level start timestamp if the first word is too long.

diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift index f0e8219..a088a90 100644 --- a/Sources/WhisperKit/Core/TextDecoder.swift +++ b/Sources/WhisperKit/Core/TextDecoder.swift @@ -686,12 +686,23 @@ open class TextDecoder: TextDecoding, WhisperMLModel { let loopStart = Date() let isPrefill = tokenIndex < intialPromptIndex - 1 // Prefill stops at the last token of the initial prompt + let isLastPrefillToken = tokenIndex == intialPromptIndex - 1 let isFirstToken = tokenIndex == prefilledIndex // Check if current index is part of the initial prompt if tokenIndex < intialPromptIndex { + let isTimestampToken = currentTokens[tokenIndex] >= tokenizer.specialTokens.timeTokenBegin + let modelPredictedTimestamp = nextToken >= tokenizer.specialTokens.timeTokenBegin + + // Force the token unless it's the last prefill token and both are timestamps + if !(isLastPrefillToken && isTimestampToken && modelPredictedTimestamp) { nextToken = currentTokens[tokenIndex] Logging.debug("Forcing prompt tokenIndex: \(tokenIndex), token: \(nextToken), text: \(tokenizer.decode(tokens: [nextToken]))") + } else { + // The last prefill token and the model's prediction are both timestamps, so keep the model's prediction + currentTokens[tokenIndex] = nextToken + Logging.debug("Skipping prompt tokenIndex: \(tokenIndex), token: \(nextToken), text: \(tokenizer.decode(tokens: [nextToken]))") + } } // Set the current token as model input

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 715321d..7dc9f2b 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -2176,11 +2176,10 @@ final class UnitTests: XCTestCase { "Very long pause duration (\(longWordDuration)s) should be truncated to maximum allowed duration (\(maxDuration)s)" ) - // Test that the long word retained it's start time + // Test that the long word is truncated after pause XCTAssertEqual( 
mergedAlignment.last!.start, updatedWords[longWordIndex].start, - accuracy: 0.0001, + 33.6, "Long word start time should remain unchanged" ) @@ -2188,7 +2187,7 @@ final class UnitTests: XCTestCase { for i in 1.. Date: Fri, 21 Feb 2025 13:44:48 -0800 Subject: [PATCH 20/26] PR review --- Makefile | 2 +- Sources/WhisperKit/Core/Models.swift | 34 +++++++++---------- .../WhisperKit/Core/Text/SegmentSeeker.swift | 18 +++++++--- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index d575375..f9165a6 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,6 @@ upload-benchmark-results: @fastlane upload_results clean-package-caches: - @trash ~/Library/Developer/Xcode/DerivedData/WhisperKit* + @trash ~/Library/Developer/Xcode/DerivedData/WhisperKit* || true @swift package purge-cache @swift package reset \ No newline at end of file diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift index 78af20e..9f2fef4 100644 --- a/Sources/WhisperKit/Core/Models.swift +++ b/Sources/WhisperKit/Core/Models.swift @@ -178,11 +178,11 @@ public struct ModelSupport: Codable, Equatable { } public init( - `default`: String, + default: String, supported: [String], disabled: [String] = [] ) { - self.`default` = `default` + self.default = `default` self.supported = supported self.disabled = disabled } @@ -645,18 +645,18 @@ public extension TranscriptionResult { } public struct TranscriptionSegment: Hashable, Codable { - public var id: Int = 0 - public var seek: Int = 0 - public var start: Float = 0.0 - public var end: Float = 0.0 - public var text: String = "" - public var tokens: [Int] = [] - public var tokenLogProbs: [[Int: Float]] = [[:]] - public var temperature: Float = 1.0 - public var avgLogprob: Float = 0.0 - public var compressionRatio: Float = 1.0 - public var noSpeechProb: Float = 0.0 - public var words: [WordTiming]? = nil + public var id: Int + public var seek: Int + public var start: Float + public var end: Float + public var text: String + public var tokens: [Int] + public var tokenLogProbs: [[Int: Float]] + public var temperature: Float + public var avgLogprob: Float + public var compressionRatio: Float + public var noSpeechProb: Float + public var words: [WordTiming]? /// Computed property for the duration of the segment public var duration: Float { @@ -670,7 +670,7 @@ public struct TranscriptionSegment: Hashable, Codable { end: Float = 0.0, text: String = "", tokens: [Int] = [], - tokenLogProbs: [[Int : Float]] = [[:]], + tokenLogProbs: [[Int: Float]] = [[:]], temperature: Float = 1.0, avgLogprob: Float = 0.0, compressionRatio: Float = 1.0, @@ -1297,13 +1297,13 @@ public struct SpecialTokens { } public protocol WhisperTokenizer { - // swift-transformers pass through + /// swift-transformers pass through func encode(text: String) -> [Int] func decode(tokens: [Int]) -> String func convertTokenToId(_ token: String) -> Int? func convertIdToToken(_ id: Int) -> String? 
- // WhisperKit specific + /// WhisperKit specific var specialTokens: SpecialTokens { get } var allLanguageTokens: Set<Int> { get }

diff --git a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift index 4501c9b..a263c7d 100644 --- a/Sources/WhisperKit/Core/Text/SegmentSeeker.swift +++ b/Sources/WhisperKit/Core/Text/SegmentSeeker.swift @@ -469,8 +469,8 @@ open class SegmentSeeker: SegmentSeeking { // TODO: This section is considered a "hack" in the source repo // Reference: https://github.com/openai/whisper/blob/ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab/whisper/timing.py#L305 - let (constrainedMedianDuration, maxDuration) = calculateWordDurationConstraints(alignment: alignment) - alignment = truncateLongWordsAtSentenceBoundaries(alignment, maxDuration: maxDuration) + let wordDurations = calculateWordDurationConstraints(alignment: alignment) + alignment = truncateLongWordsAtSentenceBoundaries(alignment, maxDuration: wordDurations.max) // Process alignment for punctuations if !alignment.isEmpty { @@ -483,15 +483,15 @@ open class SegmentSeeker: SegmentSeeking { mergedAlignment: alignment, seek: seek, lastSpeechTimestamp: lastSpeechTimestamp, - constrainedMedianDuration: constrainedMedianDuration, - maxDuration: maxDuration, + constrainedMedianDuration: wordDurations.median, + maxDuration: wordDurations.max, tokenizer: tokenizer ) return updatedSegments } - public func calculateWordDurationConstraints(alignment: [WordTiming]) -> (Float, Float) { + public func calculateWordDurationConstraints(alignment: [WordTiming]) -> (median: Float, max: Float) { var wordDurations = alignment.map { $0.duration } wordDurations = wordDurations.filter { $0 > 0 } @@ -558,6 +558,7 @@ open class SegmentSeeker: SegmentSeeking { let end = (timeOffset + timing.end).rounded(2) // Handle short duration words by moving their start time back if there's space + // There is most commonly space when there are timestamp tokens or merged punctuations between words if end - start < constrainedMedianDuration / 4 { if wordsInSegment.count >= 1, let previousTiming = wordsInSegment.last @@ -567,6 +568,11 @@ open class SegmentSeeker: SegmentSeeking { // If there's space between this word and the previous word if start > previousEnd { // Move the word back, using either median duration or space available + // E.g. [[0.5 - 2.0], [3.0 - 3.0]] (two word timings with one second gap) + // -> [[0.5 - 2.0], [2.7 - 3.0]] (second word start moved back) + // However, if there is no space, it will not be adjusted + // E.g. [[0.5 - 2.0], [2.0 - 2.0]] (two word timings with no gap) + // -> [[0.5 - 2.0], [2.0 - 2.0]] (second word start not moved back) let spaceAvailable = start - previousEnd let desiredDuration = min(spaceAvailable, constrainedMedianDuration / 2) start = (start - desiredDuration).rounded(2) @@ -577,6 +583,8 @@ open class SegmentSeeker: SegmentSeeking { start > updatedSegments[segmentIndex - 1].end { // First word of segment - check space from last segment + // E.g. [[0.5 - 1.5], [1.5 - 2.0]], [[3.0 - 3.0]] (two segments with one second gap) + // -> [[0.5 - 1.5], [1.5 - 2.0]], [[2.7 - 3.0]] (first word start of new segment moved back) let spaceAvailable = start - updatedSegments[segmentIndex - 1].end let desiredDuration = min(spaceAvailable, constrainedMedianDuration / 2) start = (start - desiredDuration).rounded(2)
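The hunk above moves a short word's start time back into the silent gap between it and the previous word. A self-contained sketch of that rule, using the numbers from the diff's own example (a constrained median duration of 0.6s reproduces the 2.7s start shown in the comments); the function and parameter names here are illustrative, not WhisperKit API:

    import Foundation

    // Sketch of the short-word adjustment above. If a word is shorter than a
    // quarter of the constrained median duration and there is a gap after the
    // previous word, pull its start back by up to half the median duration,
    // never crossing the previous word's end.
    func adjustedStart(start: Float, end: Float, previousEnd: Float,
                       constrainedMedianDuration: Float) -> Float {
        guard end - start < constrainedMedianDuration / 4, start > previousEnd else {
            return start // long enough already, or no gap to grow into
        }
        let spaceAvailable = start - previousEnd
        let desiredDuration = min(spaceAvailable, constrainedMedianDuration / 2)
        return ((start - desiredDuration) * 100).rounded() / 100 // round to 2 decimals
    }

    // Reproduces the example in the comments: [[0.5 - 2.0], [3.0 - 3.0]] with a
    // 0.6s median moves the second word's start from 3.0 back to 2.7, while
    // [[0.5 - 2.0], [2.0 - 2.0]] (no gap) is left unchanged.
    let newStart = adjustedStart(start: 3.0, end: 3.0, previousEnd: 2.0,
                                 constrainedMedianDuration: 0.6)

From 17f8f8ca8c6acaed3d47d3fc141af35314007ab7 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 15:44:22 -0800 Subject: [PATCH 21/26] Amend early callback tests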
on some OS's --- Tests/WhisperKitTests/UnitTests.swift | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 7dc9f2b..39228a1 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1260,7 +1260,6 @@ final class UnitTests: XCTestCase { await fulfillment(of: [modelStateExpectation, segmentDiscoveryExpectation, transcriptionStateExpectation], timeout: 1) } - #if !os(watchOS) // FIXME: watchOS ignores the priority here for some reason func testCallbackWithEarlyStopping() async throws { let computeOptions = ModelComputeOptions( melCompute: .cpuOnly, @@ -1276,16 +1275,16 @@ final class UnitTests: XCTestCase { logLevel: .debug, load: false ) - let whisperKit = try await WhisperKit(config) - + var whisperKit = try await WhisperKit(config) try await whisperKit.loadModels() + let audioFilePath = try XCTUnwrap( Bundle.current.path(forResource: "ted_60", ofType: "m4a"), "Audio file not found" ) let earlyStopTokenCount = 5 - let continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in + var continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in // Stop after only 5 tokens (full test audio contains ~30) progress.tokens.count <= earlyStopTokenCount } @@ -1293,7 +1292,11 @@ final class UnitTests: XCTestCase { let result = try await Task.detached(priority: .userInitiated) { try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! }.value + continuationCallback = nil + // Reset whisperkit + whisperKit = try await WhisperKit(config) + try await whisperKit.loadModels() XCTAssertNotNil(result) let tokenCountWithEarlyStop = result.segments.flatMap { $0.tokens }.count let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop) @@ -1329,9 +1332,13 @@ final class UnitTests: XCTestCase { // Assert that more tokens are returned in the callback with waiting XCTAssertGreaterThanOrEqual(tokenCountWithWait, 200, "Tokens for callback with wait should contain the full audio file") + + #if os(watchOS) || os(iOS) // FIXME: Some OS ignore the priority here on github action runners for some reason + XCTAssertGreaterThanOrEqual(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") + #else XCTAssertGreaterThan(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") + #endif } - #endif // MARK: - Utils Tests func testFillIndexesWithValue() throws { From 95eed327b5bb193c96a0b67df95e9f38eca618df Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 15:56:35 -0800 Subject: [PATCH 22/26] Remove captured var from test --- Tests/WhisperKitTests/UnitTests.swift | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 39228a1..7bde6e3 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1275,7 +1275,7 @@ final class UnitTests: XCTestCase { logLevel: .debug, load: false ) - var whisperKit = try await WhisperKit(config) + let whisperKit = try await WhisperKit(config) try await whisperKit.loadModels() let audioFilePath = 
try XCTUnwrap( @@ -1294,9 +1294,6 @@ final class UnitTests: XCTestCase { }.value continuationCallback = nil - // Reset whisperkit - whisperKit = try await WhisperKit(config) - try await whisperKit.loadModels() XCTAssertNotNil(result) let tokenCountWithEarlyStop = result.segments.flatMap { $0.tokens }.count let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop)

From 4b46ebbea621d8ac2fde772038edb0460fc2f343 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 16:09:30 -0800 Subject: [PATCH 23/26] Remove unnecessary tasks from test --- Tests/WhisperKitTests/UnitTests.swift | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 7bde6e3..d01d5fd 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1289,9 +1289,7 @@ final class UnitTests: XCTestCase { progress.tokens.count <= earlyStopTokenCount } - let result = try await Task.detached(priority: .userInitiated) { - try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! - }.value + let result = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! continuationCallback = nil @@ -1312,10 +1310,7 @@ final class UnitTests: XCTestCase { return false } - // Explicitly create a new task for the second transcription - let resultWithWait = try await Task.detached(priority: .userInitiated) { - try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallbackWithWait).first! - }.value + let resultWithWait = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallbackWithWait).first! 
XCTAssertNotNil(resultWithWait) let tokenCountWithWait = resultWithWait.segments.flatMap { $0.tokens }.count From 9ce99a5bb64aa79bb267497f5b98de72a1c081f9 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 16:24:22 -0800 Subject: [PATCH 24/26] Revert test changes keeping os filter --- Tests/WhisperKitTests/UnitTests.swift | 111 ++++++++++++-------------- 1 file changed, 52 insertions(+), 59 deletions(-) diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index d01d5fd..bb803b2 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1261,75 +1261,68 @@ final class UnitTests: XCTestCase { } func testCallbackWithEarlyStopping() async throws { - let computeOptions = ModelComputeOptions( - melCompute: .cpuOnly, - audioEncoderCompute: .cpuOnly, - textDecoderCompute: .cpuOnly, - prefillCompute: .cpuOnly - ) + let callbackTestTask = Task(priority: .userInitiated) { + let computeOptions = ModelComputeOptions( + melCompute: .cpuOnly, + audioEncoderCompute: .cpuOnly, + textDecoderCompute: .cpuOnly, + prefillCompute: .cpuOnly + ) - let config = try WhisperKitConfig( - modelFolder: self.tinyModelPath(), - computeOptions: computeOptions, - verbose: true, - logLevel: .debug, - load: false - ) - let whisperKit = try await WhisperKit(config) - try await whisperKit.loadModels() + let config = try WhisperKitConfig( + modelFolder: tinyModelPath(), + computeOptions: computeOptions, + verbose: true, + logLevel: .debug, + load: false + ) + let whisperKit = try await WhisperKit(config) - let audioFilePath = try XCTUnwrap( - Bundle.current.path(forResource: "ted_60", ofType: "m4a"), - "Audio file not found" - ) + try await whisperKit.loadModels() + let audioFilePath = try XCTUnwrap( + Bundle.current.path(forResource: "jfk", ofType: "wav"), + "Audio file not found" + ) - let earlyStopTokenCount = 5 - var continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in - // Stop after only 5 tokens (full test audio contains ~30) - progress.tokens.count <= earlyStopTokenCount - } + let earlyStopTokenCount = 10 + let continuationCallback: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in + // Stop after only 10 tokens (full test audio contains ~30) + progress.tokens.count <= earlyStopTokenCount + } - let result = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! - continuationCallback = nil - - XCTAssertNotNil(result) - let tokenCountWithEarlyStop = result.segments.flatMap { $0.tokens }.count - let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop) - - // Work done in the callback should not block the decoding loop - let queue = DispatchQueue(label: "EarlyStoppingQueue") - let semaphore = DispatchSemaphore(value: 0) - let continuationCallbackWithWait: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in - // Wait for 5 seconds before returning false - queue.async { - DispatchQueue.main.asyncAfter(deadline: .now() + 5) { - semaphore.signal() - } + let result = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallback).first! 
+ + XCTAssertNotNil(result) + let tokenCountWithEarlyStop = result.segments.flatMap { $0.tokens }.count + let decodingTimePerTokenWithEarlyStop = result.timings.decodingLoop / Double(tokenCountWithEarlyStop) + + // Work done in the callback should not block the decoding loop + let continuationCallbackWithWait: TranscriptionCallback = { (progress: TranscriptionProgress) -> Bool? in + Thread.sleep(forTimeInterval: 5) + return false } - _ = semaphore.wait(timeout: .now() + 5) - return false - } - let resultWithWait = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallbackWithWait).first! + let resultWithWait = try await whisperKit.transcribe(audioPath: audioFilePath, callback: continuationCallbackWithWait).first! + + XCTAssertNotNil(resultWithWait) + let tokenCountWithWait = resultWithWait.segments.flatMap { $0.tokens }.count + let decodingTimePerTokenWithWait = resultWithWait.timings.decodingLoop / Double(tokenCountWithWait) + Logging.debug("Decoding loop without wait: \(result.timings.decodingLoop), with wait: \(resultWithWait.timings.decodingLoop)") - XCTAssertNotNil(resultWithWait) - let tokenCountWithWait = resultWithWait.segments.flatMap { $0.tokens }.count - let decodingTimePerTokenWithWait = resultWithWait.timings.decodingLoop / Double(tokenCountWithWait) - Logging.debug("Decoding loop without wait: \(result.timings.decodingLoop), with wait: \(resultWithWait.timings.decodingLoop)") - Logging.debug("Token count without wait: \(tokenCountWithEarlyStop)") - Logging.debug("Token count with wait: \(tokenCountWithWait)") + // Assert that the decoding predictions per token are not slower with the waiting + XCTAssertEqual(decodingTimePerTokenWithWait, decodingTimePerTokenWithEarlyStop, accuracy: decodingTimePerTokenWithEarlyStop, "Decoding predictions per token should not be significantly slower with waiting") - // Assert that the decoding predictions per token are not slower with the waiting - XCTAssertEqual(decodingTimePerTokenWithWait, decodingTimePerTokenWithEarlyStop, accuracy: decodingTimePerTokenWithEarlyStop, "Decoding predictions per token should not be significantly slower with waiting") + // Assert that more tokens are returned in the callback with waiting + XCTAssertGreaterThanOrEqual(tokenCountWithWait, 30, "Tokens for callback with wait should contain the full audio file") - // Assert that more tokens are returned in the callback with waiting - XCTAssertGreaterThanOrEqual(tokenCountWithWait, 200, "Tokens for callback with wait should contain the full audio file") + #if os(watchOS) || os(iOS) // FIXME: Some OS ignore the priority here on github action runners for some reason + XCTAssertGreaterThanOrEqual(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") + #else + XCTAssertGreaterThan(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") + #endif + } - #if os(watchOS) || os(iOS) // FIXME: Some OS ignore the priority here on github action runners for some reason - XCTAssertGreaterThanOrEqual(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") - #else - XCTAssertGreaterThan(tokenCountWithWait, tokenCountWithEarlyStop, "More tokens should be returned in the callback 
with waiting (early stop: \(tokenCountWithEarlyStop), with wait: \(tokenCountWithWait))") - #endif + try await callbackTestTask.value } // MARK: - Utils Tests From 3d1a70987e3cabe4769d5978679995abe1499116 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 16:37:17 -0800 Subject: [PATCH 25/26] Formatting --- Sources/WhisperKit/Core/FeatureExtractor.swift | 3 +-- Sources/WhisperKit/Core/TextDecoder.swift | 2 +- Tests/WhisperKitTests/UnitTests.swift | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Sources/WhisperKit/Core/FeatureExtractor.swift b/Sources/WhisperKit/Core/FeatureExtractor.swift index 0fb0f68..3025e1f 100644 --- a/Sources/WhisperKit/Core/FeatureExtractor.swift +++ b/Sources/WhisperKit/Core/FeatureExtractor.swift @@ -37,7 +37,7 @@ open class FeatureExtractor: FeatureExtracting, WhisperMLModel { guard inputDescription.type == .multiArray else { return nil } guard let shapeConstraint = inputDescription.multiArrayConstraint else { return nil } let shape = shapeConstraint.shape.map { $0.intValue } - return shape[0] // The audio input is a 1D array + return shape[0] // The audio input is a 1D array } public func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> MLMultiArray? { @@ -54,5 +54,4 @@ open class FeatureExtractor: FeatureExtracting, WhisperMLModel { let output = MelSpectrogramOutput(features: outputFeatures) return output.melspectrogramFeatures } - } diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift index a088a90..f8f79b8 100644 --- a/Sources/WhisperKit/Core/TextDecoder.swift +++ b/Sources/WhisperKit/Core/TextDecoder.swift @@ -693,7 +693,7 @@ open class TextDecoder: TextDecoding, WhisperMLModel { if tokenIndex < intialPromptIndex { let isTimestampToken = currentTokens[tokenIndex] >= tokenizer.specialTokens.timeTokenBegin let modelPredictedTimestamp = nextToken >= tokenizer.specialTokens.timeTokenBegin - + // Force the token unless it's the last prefill token and both are timestamps if !(isLastPrefillToken && isTimestampToken && modelPredictedTimestamp) { nextToken = currentTokens[tokenIndex] diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index bb803b2..e0ca533 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -1324,6 +1324,7 @@ final class UnitTests: XCTestCase { try await callbackTestTask.value } + // MARK: - Utils Tests func testFillIndexesWithValue() throws { From d6ff6ce18d01d82c5bdc2b72b4776c9a179f0884 Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 21 Feb 2025 17:20:31 -0800 Subject: [PATCH 26/26] Cleanup --- Tests/WhisperKitTests/UnitTests.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index e0ca533..0dc58d6 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -2089,12 +2089,12 @@ final class UnitTests: XCTestCase { // Normal duration words WordTiming(word: " The", tokens: [264], start: 0.5, end: 1.0, probability: 1), WordTiming(word: " first", tokens: [4589], start: 1.0, end: 2.0, probability: 1), - // Long duration word (30 seconds) + // Long duration word WordTiming(word: " segment", tokens: [234], start: 2.0, end: 3.0, probability: 1), // Normal words WordTiming(word: " with", tokens: [567], start: 3.0, end: 4.0, probability: 1), WordTiming(word: " a", tokens: [257], start: 4.0, end: 5.0, probability: 1), - // Very 
long duration word (100 seconds) + // Very long duration word WordTiming(word: " long", tokens: [890], start: 5.0, end: 6.0, probability: 1), // Normal duration ending WordTiming(word: " ending", tokens: [123], start: 6.0, end: 35.0, probability: 1),