Fix audio processing edge case (#237)

ZachNagengast · web-flow · commit dd2eb733e9d1 · 2024-10-31T19:38:40.000-07:00
* Fix xcconfig tracking

* Add Package.swift docs to README

* Fix edge case where framePosition does not align with actual frame count of AVAudioFile

* Upgrade GitHub runner macOS version

* Update remaining GitHub runner versions

* Use WERUtils to check vad accuracy

* Reduce calls to framePosition

* Fix Xcode version for runner
diff --git a/.github/workflows/development-tests.yml b/.github/workflows/development-tests.yml
@@ -16,8 +16,8 @@ jobs:
     name: "Build and Test"
     uses: ./.github/workflows/unit-tests.yml
     with:
-      ios-version: "17.2"
-      macos-runner: "macos-14"
+      ios-version: "18.1"
+      macos-runner: "macos-15"
 
   check-approvals:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/pre-release-tests.yml b/.github/workflows/pre-release-tests.yml
@@ -13,8 +13,8 @@ jobs:
         include:
           - os: macos-13-xlarge
             ios-version: "16.1" # Oldest available version
-          - os: macos-14
-            ios-version: "17.2" # Latest available version
+          - os: macos-15
+            ios-version: "18.1" # Latest available version
     uses: ./.github/workflows/unit-tests.yml
     with:
       ios-version: ${{ matrix.ios-version }}
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -27,17 +27,17 @@ jobs:
               name: "iOS",
               condition: true,
               clean-destination: "generic/platform=iOS",
-              test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 15",
+              test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 16",
             }
           - {
               name: "watchOS",
-              condition: "${{ inputs.macos-runner == 'macos-14' }}",
+              condition: "${{ inputs.macos-runner == 'macos-15' }}",
               clean-destination: "generic/platform=watchOS",
-              test-destination: "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)",
+              test-destination: "platform=watchOS Simulator,OS=11.1,name=Apple Watch Ultra 2 (49mm)",
             }
           - {
               name: "visionOS",
-              condition: "${{ inputs.macos-runner == 'macos-14' }}",
+              condition: "${{ inputs.macos-runner == 'macos-15' }}",
               clean-destination: "generic/platform=visionOS",
               test-destination: "platform=visionOS Simulator,name=Apple Vision Pro",
             }
@@ -46,7 +46,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: maxim-lobanov/setup-xcode@v1
         with:
-          xcode-version: "15.2"
+          xcode-version: latest-stable
       - name: Setup environment
         run: make setup
       - name: Setup Cache
@@ -66,7 +66,7 @@ jobs:
           echo "Destinations for testing:"
           xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
       - name: Boot Simulator and Wait
-        if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-14' }}
+        if: ${{ matrix.run-config['name'] != 'macOS' && inputs.macos-runner == 'macos-15' }} # single expression; "${{ a }} && ${{ b }}" is a non-empty string and always truthy
         # Slower runners require some time to fully boot the simulator
         # Parse the simulator name from the destination string, boot it, and wait
         run: |
diff --git a/Sources/WhisperKit/Core/Audio/AudioProcessor.swift b/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
@@ -349,13 +349,15 @@ public class AudioProcessor: NSObject, AudioProcessing {
         }
 
         let inputBuffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: maxReadFrameSize)!
-
-        while audioFile.framePosition < endFramePosition {
-            let remainingFrames = AVAudioFrameCount(endFramePosition - audioFile.framePosition)
+        var nextPosition = inputStartFrame
+        while nextPosition < endFramePosition {
+            let framePosition = audioFile.framePosition
+            let remainingFrames = AVAudioFrameCount(endFramePosition - framePosition)
             let framesToRead = min(remainingFrames, maxReadFrameSize)
+            nextPosition = framePosition + Int64(framesToRead)
 
-            let currentPositionInSeconds = Double(audioFile.framePosition) / inputSampleRate
-            let nextPositionInSeconds = (Double(audioFile.framePosition) + Double(framesToRead)) / inputSampleRate
+            let currentPositionInSeconds = Double(framePosition) / inputSampleRate
+            let nextPositionInSeconds = Double(nextPosition) / inputSampleRate
             Logging.debug("Resampling \(String(format: "%.2f", currentPositionInSeconds))s - \(String(format: "%.2f", nextPositionInSeconds))s")
 
             do {
diff --git a/Tests/WhisperKitTests/Evaluate/WERUtils.swift b/Tests/WhisperKitTests/Evaluate/WERUtils.swift
@@ -123,4 +123,14 @@ enum WERUtils {
         let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
         return diff
     }
+
+    static func diffString(from diff: [[String?]]) -> String {
+        return diff.compactMap { entry -> String? in
+            guard let word = entry[0], word != " " else { return nil }
+            if let changeType = entry[1] {
+                return "\(changeType)\(word)"
+            }
+            return word
+        }.joined(separator: " ")
+    }
 }
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
@@ -1416,31 +1416,31 @@ final class UnitTests: XCTestCase {
     }
 
     func testVADAudioChunkerAccuracy() async throws {
-        let testResult = try await XCTUnwrapAsync(
-            await transcribe(with: .tiny, options: DecodingOptions(), audioFile: "ted_60.m4a"),
+        let options = DecodingOptions(temperatureFallbackCount: 0, chunkingStrategy: .vad)
+
+        let chunkedResult = try await XCTUnwrapAsync(
+            await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
             "Failed to transcribe"
         )
 
-        let options = DecodingOptions(chunkingStrategy: .vad)
+        let clipTimestamps = chunkedResult.compactMap(\.seekTime)
+        XCTAssertEqual(clipTimestamps, [0, 22.9, 39], "Clip timestamps should match the expected values, found \(clipTimestamps)")
 
-        let chunkedResult = try await XCTUnwrapAsync(
-            await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
+        // Run the test using same seek values for accuracy comparison
+        let testResult = try await XCTUnwrapAsync(
+            await transcribe(with: .tiny, options: DecodingOptions(temperatureFallbackCount: 0, clipTimestamps: [0, 22.9, 22.9, 39, 39, 60]), audioFile: "ted_60.m4a"),
             "Failed to transcribe"
         )
 
         XCTAssertFalse(testResult.text.isEmpty, "The test text should not be empty")
         XCTAssertFalse(chunkedResult.text.isEmpty, "The chunked text should not be empty")
 
-        // Select few sentences to compare at VAD border
-        // TODO: test that WER is in acceptable range
-//        XCTAssertTrue(testResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(testResult.text.normalized)")
-//        XCTAssertTrue(chunkedResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
-//
-//        XCTAssertTrue(testResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(testResult.text.normalized)")
-//        XCTAssertTrue(chunkedResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
+        // Check WER for the full audio and the chunked audio
+        let (wer, diff) = WERUtils.evaluate(originalTranscript: testResult.text, generatedTranscript: chunkedResult.text)
+
+        let diffDescription = WERUtils.diffString(from: diff)
 
-        XCTAssertTrue(testResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(testResult.text.normalized)")
-        XCTAssertTrue(chunkedResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
+        XCTAssertEqual(wer, 0.0, "Transcripts should match with a WER of 0, found \(wer). Full diff: \(diffDescription)")
     }
 
     #if !os(watchOS) // FIXME: This test times out on watchOS when run on low compute runners