Skip to content

Commit dd2eb73

Browse files
Fix audio processing edge case (#237)
* Fix xcconfig tracking
* Add Package.swift docs to README
* Fix edge case where framePosition does not align with actual frame count of AVAudioFile
* Upgrade GitHub runner macOS version
* Update remaining GitHub runner versions
* Use WERUtils to check VAD accuracy
* Reduce calls to framePosition
* Fix Xcode version for runner
1 parent a9b92c4 commit dd2eb73

File tree

6 files changed

+41
-29
lines changed

6 files changed

+41
-29
lines changed

.github/workflows/development-tests.yml

+2-2
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,8 @@ jobs:
1616
name: "Build and Test"
1717
uses: ./.github/workflows/unit-tests.yml
1818
with:
19-
ios-version: "17.2"
20-
macos-runner: "macos-14"
19+
ios-version: "18.1"
20+
macos-runner: "macos-15"
2121

2222
check-approvals:
2323
runs-on: ubuntu-latest

.github/workflows/pre-release-tests.yml

+2-2
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,8 @@ jobs:
1313
include:
1414
- os: macos-13-xlarge
1515
ios-version: "16.1" # Oldest available version
16-
- os: macos-14
17-
ios-version: "17.2" # Latest available version
16+
- os: macos-15
17+
ios-version: "18.1" # Latest available version
1818
uses: ./.github/workflows/unit-tests.yml
1919
with:
2020
ios-version: ${{ matrix.ios-version }}

.github/workflows/unit-tests.yml

+6-6
Original file line number | Diff line number | Diff line change
@@ -27,17 +27,17 @@ jobs:
2727
name: "iOS",
2828
condition: true,
2929
clean-destination: "generic/platform=iOS",
30-
test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 15",
30+
test-destination: "platform=iOS Simulator,OS=${{ inputs.ios-version }},name=iPhone 16",
3131
}
3232
- {
3333
name: "watchOS",
34-
condition: "${{ inputs.macos-runner == 'macos-14' }}",
34+
condition: "${{ inputs.macos-runner == 'macos-15' }}",
3535
clean-destination: "generic/platform=watchOS",
36-
test-destination: "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)",
36+
test-destination: "platform=watchOS Simulator,OS=11.1,name=Apple Watch Ultra 2 (49mm)",
3737
}
3838
- {
3939
name: "visionOS",
40-
condition: "${{ inputs.macos-runner == 'macos-14' }}",
40+
condition: "${{ inputs.macos-runner == 'macos-15' }}",
4141
clean-destination: "generic/platform=visionOS",
4242
test-destination: "platform=visionOS Simulator,name=Apple Vision Pro",
4343
}
@@ -46,7 +46,7 @@ jobs:
4646
- uses: actions/checkout@v4
4747
- uses: maxim-lobanov/setup-xcode@v1
4848
with:
49-
xcode-version: "15.2"
49+
xcode-version: latest-stable
5050
- name: Setup environment
5151
run: make setup
5252
- name: Setup Cache
@@ -66,7 +66,7 @@ jobs:
6666
echo "Destinations for testing:"
6767
xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
6868
- name: Boot Simulator and Wait
69-
if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-14' }}
69+
if: ${{ matrix.run-config['name'] != 'macOS' }} && ${{ inputs.macos-runner == 'macos-15' }}
7070
# Slower runners require some time to fully boot the simulator
7171
# Parse the simulator name from the destination string, boot it, and wait
7272
run: |

Sources/WhisperKit/Core/Audio/AudioProcessor.swift

+7-5
Original file line number | Diff line number | Diff line change
@@ -349,13 +349,15 @@ public class AudioProcessor: NSObject, AudioProcessing {
349349
}
350350

351351
let inputBuffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: maxReadFrameSize)!
352-
353-
while audioFile.framePosition < endFramePosition {
354-
let remainingFrames = AVAudioFrameCount(endFramePosition - audioFile.framePosition)
352+
var nextPosition = inputStartFrame
353+
while nextPosition < endFramePosition {
354+
let framePosition = audioFile.framePosition
355+
let remainingFrames = AVAudioFrameCount(endFramePosition - framePosition)
355356
let framesToRead = min(remainingFrames, maxReadFrameSize)
357+
nextPosition = framePosition + Int64(framesToRead)
356358

357-
let currentPositionInSeconds = Double(audioFile.framePosition) / inputSampleRate
358-
let nextPositionInSeconds = (Double(audioFile.framePosition) + Double(framesToRead)) / inputSampleRate
359+
let currentPositionInSeconds = Double(framePosition) / inputSampleRate
360+
let nextPositionInSeconds = Double(nextPosition) / inputSampleRate
359361
Logging.debug("Resampling \(String(format: "%.2f", currentPositionInSeconds))s - \(String(format: "%.2f", nextPositionInSeconds))s")
360362

361363
do {

Tests/WhisperKitTests/Evaluate/WERUtils.swift

+10
Original file line number | Diff line number | Diff line change
@@ -123,4 +123,14 @@ enum WERUtils {
123123
let (_, diff) = evaluate(originalTranscript: originalTranscript, generatedTranscript: generatedTranscript)
124124
return diff
125125
}
126+
127+
static func diffString(from diff: [[String?]]) -> String {
128+
return diff.compactMap { entry -> String? in
129+
guard let word = entry[0], word != " " else { return nil }
130+
if let changeType = entry[1] {
131+
return "\(changeType)\(word)"
132+
}
133+
return word
134+
}.joined(separator: " ")
135+
}
126136
}

Tests/WhisperKitTests/UnitTests.swift

+14-14
Original file line number | Diff line number | Diff line change
@@ -1416,31 +1416,31 @@ final class UnitTests: XCTestCase {
14161416
}
14171417

14181418
func testVADAudioChunkerAccuracy() async throws {
1419-
let testResult = try await XCTUnwrapAsync(
1420-
await transcribe(with: .tiny, options: DecodingOptions(), audioFile: "ted_60.m4a"),
1419+
let options = DecodingOptions(temperatureFallbackCount: 0, chunkingStrategy: .vad)
1420+
1421+
let chunkedResult = try await XCTUnwrapAsync(
1422+
await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
14211423
"Failed to transcribe"
14221424
)
14231425

1424-
let options = DecodingOptions(chunkingStrategy: .vad)
1426+
let clipTimestamps = chunkedResult.compactMap(\.seekTime)
1427+
XCTAssertEqual(clipTimestamps, [0, 22.9, 39], "Clip timestamps should match the expected values, found \(clipTimestamps)")
14251428

1426-
let chunkedResult = try await XCTUnwrapAsync(
1427-
await transcribe(with: .tiny, options: options, audioFile: "ted_60.m4a"),
1429+
// Run the test using the same seek values for accuracy comparison
1430+
let testResult = try await XCTUnwrapAsync(
1431+
await transcribe(with: .tiny, options: DecodingOptions(temperatureFallbackCount: 0, clipTimestamps: [0, 22.9, 22.9, 39, 39, 60]), audioFile: "ted_60.m4a"),
14281432
"Failed to transcribe"
14291433
)
14301434

14311435
XCTAssertFalse(testResult.text.isEmpty, "The test text should not be empty")
14321436
XCTAssertFalse(chunkedResult.text.isEmpty, "The chunked text should not be empty")
14331437

1434-
// Select few sentences to compare at VAD border
1435-
// TODO: test that WER is in acceptable range
1436-
// XCTAssertTrue(testResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(testResult.text.normalized)")
1437-
// XCTAssertTrue(chunkedResult.text.normalized.contains("I would kind".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
1438-
//
1439-
// XCTAssertTrue(testResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(testResult.text.normalized)")
1440-
// XCTAssertTrue(chunkedResult.text.normalized.contains("every single paper".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
1438+
// Check WER for the full audio and the chunked audio
1439+
let (wer, diff) = WERUtils.evaluate(originalTranscript: testResult.text, generatedTranscript: chunkedResult.text)
1440+
1441+
let diffDescription = WERUtils.diffString(from: diff)
14411442

1442-
XCTAssertTrue(testResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(testResult.text.normalized)")
1443-
XCTAssertTrue(chunkedResult.text.normalized.contains("But then came my 90 page senior".normalized), "Expected text not found in \(chunkedResult.text.normalized)")
1443+
XCTAssertEqual(wer, 0.0, "Transcripts should match with a WER of 0, found \(wer). Full diff: \(diffDescription)")
14441444
}
14451445

14461446
#if !os(watchOS) // FIXME: This test times out on watchOS when run on low compute runners

0 commit comments

Comments
 (0)