diff --git a/.github/workflows/development-tests.yml b/.github/workflows/development-tests.yml
index cb7e284..ae84251 100644
--- a/.github/workflows/development-tests.yml
+++ b/.github/workflows/development-tests.yml
@@ -16,8 +16,8 @@ jobs:
     name: "Build and Test"
     uses: ./.github/workflows/unit-tests.yml
     with:
-      ios-version: "17.2"
-      macos-runner: "macos-14"
+      ios-version: "18.1"
+      macos-runner: "macos-15"
 
   check-approvals:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/pre-release-tests.yml b/.github/workflows/pre-release-tests.yml
index 4f3ae2c..20c1696 100644
--- a/.github/workflows/pre-release-tests.yml
+++ b/.github/workflows/pre-release-tests.yml
@@ -13,8 +13,8 @@ jobs:
         include:
           - os: macos-13-xlarge
             ios-version: "16.1" # Oldest available version
-          - os: macos-14
-            ios-version: "17.2" # Latest available version
+          - os: macos-15
+            ios-version: "18.1" # Latest available version
     uses: ./.github/workflows/unit-tests.yml
     with:
       ios-version: ${{ matrix.ios-version }}
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index ae3695d..557ab22 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -27,17 +27,17 @@ jobs:
               name: "iOS",
               condition: true,
               clean-destination: "generic/platform=iOS",
-              test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 15",
+              test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 16",
             }
           - {
               name: "watchOS",
-              condition: "${{ inputs.macos-runner == 'macos-14' }}",
+              condition: "${{ inputs.macos-runner == 'macos-15' }}",
               clean-destination: "generic/platform=watchOS",
-              test-destination: "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)",
+              test-destination: "platform=watchOS Simulator,OS=11.1,name=Apple Watch Ultra 2 (49mm)",
             }
           - {
               name: "visionOS",
-              condition: "${{ inputs.macos-runner == 'macos-14' }}",
+              condition: "${{ inputs.macos-runner == 'macos-15' }}",
               clean-destination: "generic/platform=visionOS",
               test-destination: "platform=visionOS Simulator,name=Apple Vision Pro",
             }
@@ -46,7 +46,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: maxim-lobanov/setup-xcode@v1
         with:
-          xcode-version: "15.2"
+          xcode-version: latest-stable
       - name: Setup environment
         run: make setup
       - name: Setup Cache
@@ -66,7 +66,7 @@ jobs:
           echo "Destinations for testing:"
           xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
       - name: Boot Simulator and Wait
-        if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-14' }}
+        if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-15' }}
         # Slower runners require some time to fully boot the simulator
         # Parse the simulator name from the destination string, boot it, and wait
         run: |
diff --git a/Sources/WhisperKit/Core/Audio/AudioProcessor.swift b/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
index 89d3132..a9dfe63 100644
--- a/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
+++ b/Sources/WhisperKit/Core/Audio/AudioProcessor.swift
@@ -349,13 +349,15 @@ public class AudioProcessor: NSObject, AudioProcessing {
         }
         let inputBuffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: maxReadFrameSize)!
 
-
-        while audioFile.framePosition < endFramePosition {
-            let remainingFrames = AVAudioFrameCount(endFramePosition - audioFile.framePosition)
+        var nextPosition = inputStartFrame
+        while nextPosition < endFramePosition {
+            let framePosition = audioFile.framePosition
+            let remainingFrames = AVAudioFrameCount(endFramePosition - framePosition)
             let framesToRead = min(remainingFrames, maxReadFrameSize)
+            nextPosition = framePosition + Int64(framesToRead)
 
-            let currentPositionInSeconds = Double(audioFile.framePosition) / inputSampleRate
-            let nextPositionInSeconds = (Double(audioFile.framePosition) + Double(framesToRead)) / inputSampleRate
+            let currentPositionInSeconds = Double(framePosition) / inputSampleRate
+            let nextPositionInSeconds = Double(nextPosition) / inputSampleRate
             Logging.debug("Resampling \(String(format: "%.2f", currentPositionInSeconds))s - \(String(format: "%.2f", nextPositionInSeconds))s")
 
             do {
diff --git a/Tests/WhisperKitTests/Evaluate/WERUtils.swift b/Tests/WhisperKitTests/Evaluate/WERUtils.swift
index 6e20e98..7c94689 100644
--- a/Tests/WhisperKitTests/Evaluate/WERUtils.swift
+++ b/Tests/WhisperKitTests/Evaluate/WERUtils.swift
@@ -123,4 +123,14 @@ enum WERUtils {
         let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
         return diff
     }
+
+    static func diffString(from diff: [[String?]]) -> String {
+        return diff.compactMap { entry -> String? in
+            guard let word = entry[0], word != " " else { return nil }
+            if let changeType = entry[1] {
+                return "\(changeType)\(word)"
+            }
+            return word
+        }.joined(separator: " ")
+    }
 }
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index e633558..df258a4 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -1416,31 +1416,31 @@ final class UnitTests: XCTestCase {
     }
 
     func testVADAudioChunkerAccuracy() async throws {
-        let testResult = try await XCTUnwrapAsync(
-            await transcribe(with: .tiny, options: DecodingOptions(), audioFile: "ted_60.m4a"),
+        let options = DecodingOptions(temperatureFallbackCount: 0, chunkingStrategy: .vad)
+
+        let chunkedResult = try await XCTUnwrapAsync(
+            await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
             "Failed to transcribe"
         )
 
-        let options = DecodingOptions(chunkingStrategy: .vad)
+        let clipTimestamps = chunkedResult.compactMap(\.seekTime)
+        XCTAssertEqual(clipTimestamps, [0, 22.9, 39], "Clip timestamps should match the expected values, found \(clipTimestamps)")
 
-        let chunkedResult = try await XCTUnwrapAsync(
-            await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
+        // Run the test using same seek values for accuracy comparison
+        let testResult = try await XCTUnwrapAsync(
+            await transcribe(with: .tiny, options: DecodingOptions(temperatureFallbackCount: 0, clipTimestamps: [0, 22.9, 22.9, 39, 39, 60]), audioFile: "ted_60.m4a"),
             "Failed to transcribe"
         )
 
         XCTAssertFalse(testResult.text.isEmpty, "The test text should not be empty")
         XCTAssertFalse(chunkedResult.text.isEmpty, "The chunked text should not be empty")
 
-        // Select few sentences to compare at VAD border
-        // TODO: test that WER is in acceptable range
-//        XCTAssertTrue(testResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(testResult.text.normalized)")
-//        XCTAssertTrue(chunkedResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
-//
-//        XCTAssertTrue(testResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(testResult.text.normalized)")
-//        XCTAssertTrue(chunkedResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
+        // Check WER for the full audio and the chunked audio
+        let (wer, diff) = WERUtils.evaluate(originalTranscript: testResult.text, generatedTranscript: chunkedResult.text)
+
+        let diffDescription = WERUtils.diffString(from: diff)
 
-        XCTAssertTrue(testResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(testResult.text.normalized)")
-        XCTAssertTrue(chunkedResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
+        XCTAssertEqual(wer, 0.0, "Transcripts should match with a WER of 0, found \(wer). Full diff: \(diffDescription)")
     }
 
 #if !os(watchOS) // FIXME: This test times out on watchOS when run on low compute runners
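For reference, a minimal usage sketch of the new `WERUtils.diffString(from:)` helper introduced in this patch. The `[[String?]]` layout (word at index 0, optional change marker at index 1) is taken directly from the function body above; the concrete marker strings (`"-"`, `"+"`) and the test class name are illustrative assumptions, not values guaranteed by `WERUtils.evaluate`.

```swift
import XCTest

// Hypothetical sketch; would live in the WhisperKitTests target alongside WERUtils.
final class WERUtilsDiffStringSketch: XCTestCase {
    func testDiffStringFormatting() {
        // entry[0] = word, entry[1] = optional change marker (nil = unchanged).
        let diff: [[String?]] = [
            ["but", nil],
            ["then", nil],
            ["came", "-"], // assumed deletion marker
            ["come", "+"], // assumed insertion marker
            [" ", nil],    // entries whose word is a single space are dropped by the helper
            ["my", nil],
        ]

        // Unchanged words pass through as-is; marked words get their marker prefixed.
        XCTAssertEqual(WERUtils.diffString(from: diff), "but then -came +come my")
    }
}
```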