diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index fe92120..f53d934 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -36,8 +36,9 @@ jobs:
path: Models
key: ${{ runner.os }}-models
- name: Build
- run: xcodebuild build-for-testing -scheme whisperkit-Package -destination 'platform=macOS'
+ run: xcodebuild clean build-for-testing -scheme whisperkit-Package -destination 'platform=macOS'
- name: Run tests
run: |
+ set -o pipefail
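+ # pipefail propagates xcodebuild's exit code through the xcpretty pipe so failures fail the job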
xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -showdestinations
xcodebuild test-without-building -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=macOS,arch=arm64" | xcpretty
\ No newline at end of file
diff --git a/Examples/WhisperAX/WhisperAX/Views/ContentView.swift b/Examples/WhisperAX/WhisperAX/Views/ContentView.swift
index 32e5575..afbfe81 100644
--- a/Examples/WhisperAX/WhisperAX/Views/ContentView.swift
+++ b/Examples/WhisperAX/WhisperAX/Views/ContentView.swift
@@ -157,11 +157,11 @@ struct ContentView: View {
#if os(macOS)
selectedCategoryId = menu.first(where: { $0.name == selectedTab })?.id
#endif
-
fetchModels()
}
}
+
// MARK: - Transcription
var transcriptionView: some View {
@@ -169,7 +169,7 @@ struct ContentView: View {
ScrollView(.horizontal) {
HStack(spacing: 1) {
let startIndex = max(bufferEnergy.count - 300, 0)
- ForEach(Array(bufferEnergy.enumerated())[startIndex...], id: \.offset) { index, energy in
+ ForEach(Array(bufferEnergy.enumerated())[startIndex...], id: \.element) { index, energy in
ZStack {
RoundedRectangle(cornerRadius: 2)
.frame(width: 2, height: CGFloat(energy) * 24)
@@ -660,7 +660,12 @@ struct ContentView: View {
}
localModels = WhisperKit.formatModelFiles(localModels)
- availableModels = localModels
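+ // Append newly discovered local models, skipping duplicates and any disabled for this device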
+ for model in localModels {
+ if !availableModels.contains(model),
+ !disabledModels.contains(model) {
+ availableModels.append(model)
+ }
+ }
print("Found locally: \(localModels)")
print("Previously selected model: \(selectedModel)")
diff --git a/Makefile b/Makefile
index a1a8b56..e329842 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ setup-model-repo:
cd $(MODEL_REPO_DIR) && git fetch --all && git reset --hard origin/main && git clean -fdx; \
else \
echo "Repository not found, initializing..."; \
- GIT_LFS_SKIP_SMUDGE=1 git clone https://hf.co/$(MODEL_REPO) $(MODEL_REPO_DIR); \
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/$(MODEL_REPO) $(MODEL_REPO_DIR); \
fi
# Download all models
diff --git a/README.md b/README.md
index 4a57f41..41c405e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,19 @@
+
+
+
+
+
+
+
# WhisperKit
-WhisperKit is a Swift package that integrates OpenAI's popular [Whisper](https://github.com/openai/whisper) speech recognition model with Apple's CoreML framework for efficient, local inference on Apple devices.
+[](https://github.com/argmaxinc/whisperkit/actions/workflows/unit-tests.yml)
+[](https://swiftpackageindex.com/argmaxinc/WhisperKit) [](https://swiftpackageindex.com/argmaxinc/WhisperKit)
+[](LICENSE.md)
+
+
+
+WhisperKit is a Swift package that integrates OpenAI's popular [Whisper](https://github.com/openai/whisper) speech recognition model with Apple's CoreML framework for efficient, local inference on Apple devices.
Check out the demo app on [TestFlight](https://testflight.apple.com/join/LPVOyJZW).
@@ -21,13 +34,16 @@ Check out the demo app on [TestFlight](https://testflight.apple.com/join/LPVOyJZ
- [Citation](#citation)
## Installation
+
WhisperKit can be integrated into your Swift project using the Swift Package Manager.
### Prerequisites
+
- macOS 14.0 or later.
- Xcode 15.0 or later.
### Steps
+
1. Open your Swift project in Xcode.
2. Navigate to `File` > `Add Package Dependencies...`.
3. Enter the package repository URL: `https://github.com/argmaxinc/whisperkit`.
@@ -35,9 +51,11 @@ WhisperKit can be integrated into your Swift project using the Swift Package Man
5. Click `Finish` to add WhisperKit to your project.
## Getting Started
+
To get started with WhisperKit, you need to initialize it in your project.
### Quick Example
+
This example demonstrates how to transcribe a local audio file:
```swift
@@ -52,7 +70,9 @@ Task {
```
### Model Selection
+
WhisperKit automatically downloads the recommended model for the device if not specified. You can also select a specific model by passing in the model name:
+
```swift
let pipe = try? await WhisperKit(model: "large-v3")
```
@@ -76,18 +96,25 @@ git clone https://github.com/argmaxinc/whisperkit.git
cd whisperkit
```
-Then, setup the environment and download the models.
+Then, set up the environment and download your desired model.
+
+```bash
+make setup
+make download-model MODEL=large-v3
+```
**Note**:
-1. this will download all available models to your local folder, if you only want to download a specific model, see our [HuggingFace repo](https://huggingface.co/argmaxinc/whisperkit-coreml))
-2. before running `download-models`, make sure [git-lfs](https://git-lfs.com) is installed
+
+1. This will download only the model specified by `MODEL` (see what's available in our [HuggingFace repo](https://huggingface.co/argmaxinc/whisperkit-coreml), where we use the prefix `openai_whisper-{MODEL}`)
+2. Before running `download-model`, make sure [git-lfs](https://git-lfs.com) is installed
+
+If you would like to download all available models to your local folder, use this command instead:
```bash
-make setup
make download-models
```
-You can then run the CLI with:
+You can then run them via the CLI with:
```bash
swift run transcribe --model-path "Models/whisperkit-coreml/openai_whisper-large-v3" --audio-path "path/to/your/audio.{wav,mp3,m4a,flac}"
@@ -95,19 +122,21 @@ swift run transcribe --model-path "Models/whisperkit-coreml/openai_whisper-large
Which should print a transcription of the audio file.
-
## Contributing & Roadmap
+
Our goal is to make WhisperKit better and better over time and we'd love your help! Just search the code for "TODO" for a variety of features that are yet to be built. Please refer to our [contribution guidelines](CONTRIBUTING.md) for submitting issues, pull requests, and coding standards, where we also have a public roadmap of features we are looking forward to building in the future.
## License
+
WhisperKit is released under the MIT License. See [LICENSE.md](LICENSE.md) for more details.
## Citation
+
If you use WhisperKit for something cool or just find it useful, please drop us a note at [info@takeargmax.com](mailto:info@takeargmax.com)!
If you use WhisperKit for academic work, here is the BibTeX:
-```
+```bibtex
@misc{whisperkit-argmax,
title = {WhisperKit},
author = {Argmax, Inc.},
diff --git a/Sources/WhisperKit/Core/AudioProcessor.swift b/Sources/WhisperKit/Core/AudioProcessor.swift
index 48dc402..12ad63d 100644
--- a/Sources/WhisperKit/Core/AudioProcessor.swift
+++ b/Sources/WhisperKit/Core/AudioProcessor.swift
@@ -40,7 +40,7 @@ public protocol AudioProcessing {
var relativeEnergyWindow: Int { get set }
/// Starts recording audio from the specified input device, resetting the previous state
- func startRecordingLive(from inputDevice: AVCaptureDevice?, callback: (([Float]) -> Void)?) throws
+ func startRecordingLive(callback: (([Float]) -> Void)?) throws
/// Pause recording
func pauseRecording()
@@ -53,7 +53,3 @@ public protocol AudioProcessing {
public extension AudioProcessing {
- // Use default recording device
- func startRecordingLive(callback: (([Float]) -> Void)?) throws {
- try startRecordingLive(from: nil, callback: callback)
- }
static func padOrTrimAudio(fromArray audioArray: [Float], startAt startIndex: Int = 0, toLength frameLength: Int = 480_000, saveSegment: Bool = false) -> MLMultiArray? {
@@ -382,14 +382,11 @@ public extension AudioProcessor {
}
}
- func startRecordingLive(from inputDevice: AVCaptureDevice? = nil, callback: (([Float]) -> Void)? = nil) throws {
+ func startRecordingLive(callback: (([Float]) -> Void)? = nil) throws {
audioSamples = []
audioEnergy = []
- if inputDevice != nil {
- // TODO: implement selecting input device
- Logging.debug("Input device selection not yet supported")
- }
+ // TODO: implement selecting input device
audioEngine = try setupEngine()
diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift
index a7b12b7..289e15f 100644
--- a/Sources/WhisperKit/Core/TextDecoder.swift
+++ b/Sources/WhisperKit/Core/TextDecoder.swift
@@ -45,7 +45,7 @@ public protocol TextDecoding {
@available(macOS 14, iOS 17, tvOS 14, watchOS 10, *)
public extension TextDecoding {
- func prepareDecoderInputs(withPrompt initialPrompt: [Int]) -> DecodingInputs {
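+ /// Returns nil when the model's KV cache dimensions cannot be determined.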
+ func prepareDecoderInputs(withPrompt initialPrompt: [Int]) -> DecodingInputs? {
let tokenShape = [NSNumber(value: 1), NSNumber(value: initialPrompt.count)]
// Initialize MLMultiArray for tokens
@@ -59,11 +59,13 @@ public extension TextDecoding {
}
guard let kvCacheEmbedDim = self.kvCacheEmbedDim else {
- fatalError("Unable to determine kvCacheEmbedDim")
+ Logging.error("Unable to determine kvCacheEmbedDim")
+ return nil
}
guard let kvCacheMaxSequenceLength = self.kvCacheMaxSequenceLength else {
- fatalError("Unable to determine kvCacheMaxSequenceLength")
+ Logging.error("Unable to determine kvCacheMaxSequenceLength")
+ return nil
}
// Initialize each MLMultiArray
diff --git a/Sources/WhisperKit/Core/Utils.swift b/Sources/WhisperKit/Core/Utils.swift
index 8dac7f4..fbccae8 100644
--- a/Sources/WhisperKit/Core/Utils.swift
+++ b/Sources/WhisperKit/Core/Utils.swift
@@ -153,14 +153,65 @@ public func modelSupport(for deviceName: String) -> (default: String, disabled:
let model where model.hasPrefix("iPhone16"): // A17
return ("base", ["large-v3_turbo", "large-v3", "large-v2_turbo", "large-v2"])
- // TODO: Disable turbo variants for M1
- case let model where model.hasPrefix("arm64"): // Mac
- return ("base", [""])
-
- // Catch-all for unhandled models or macs
+ // Defer unmatched devices to the macOS checks below
default:
- return ("base", [""])
+ break
+ }
+
+#if os(macOS)
+ if deviceName.hasPrefix("arm64") {
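+ // Macs report the device name as "arm64", so use the CPU brand string to detect the chip generation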
+ if Process.processor.contains("Apple M1") {
+ // Disable turbo variants for M1
+ return ("base", ["large-v3_turbo", "large-v3_turbo_1049MB", "large-v3_turbo_1307MB", "large-v2_turbo", "large-v2_turbo_1116MB", "large-v2_turbo_1430MB"])
+ } else {
+ // Enable all variants for M2 or M3, none disabled
+ return ("base", [])
+ }
+ }
+#endif
+
+ // Unhandled devices default to the base variant
+ return ("base", [""])
+}
+
+#if os(macOS)
+// From: https://stackoverflow.com/a/71726663
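+// Reads CPU info via sysctl; the processor string is used above to tell M1 apart from newer Apple Silicon.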
+extension Process {
+ static func stringFromTerminal(command: String) -> String {
+ let task = Process()
+ let pipe = Pipe()
+ task.standardOutput = pipe
+ task.launchPath = "/bin/bash"
+ task.arguments = ["-c", "sysctl -n " + command]
+ task.launch()
+ return String(bytes: pipe.fileHandleForReading.availableData, encoding: .utf8) ?? ""
}
+ static let processor = stringFromTerminal(command: "machdep.cpu.brand_string")
+ static let cores = stringFromTerminal(command: "machdep.cpu.core_count")
+ static let threads = stringFromTerminal(command: "machdep.cpu.thread_count")
+ static let vendor = stringFromTerminal(command: "machdep.cpu.vendor")
+ static let family = stringFromTerminal(command: "machdep.cpu.family")
+}
+#endif
+
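+/// Resolves a relative or tilde-prefixed path against the current working directory and returns an absolute path.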
+public func resolveAbsolutePath(_ inputPath: String) -> String {
+ let fileManager = FileManager.default
+
+ // Expanding tilde if present
+ let pathWithTildeExpanded = NSString(string: inputPath).expandingTildeInPath
+
+ // If the path is already absolute, return it
+ if pathWithTildeExpanded.hasPrefix("/") {
+ return pathWithTildeExpanded
+ }
+
+ // Resolving relative path based on the current working directory
+ if let cwd = fileManager.currentDirectoryPath as String? {
+ let resolvedPath = URL(fileURLWithPath: cwd).appendingPathComponent(pathWithTildeExpanded).path
+ return resolvedPath
+ }
+
+ return inputPath
}
func loadTokenizer(for pretrained: ModelVariant) async throws -> Tokenizer {
diff --git a/Sources/WhisperKit/Core/WhisperKit.swift b/Sources/WhisperKit/Core/WhisperKit.swift
index 3982315..62e2edb 100644
--- a/Sources/WhisperKit/Core/WhisperKit.swift
+++ b/Sources/WhisperKit/Core/WhisperKit.swift
@@ -141,7 +141,7 @@ public class WhisperKit {
return (modelInfo + additionalInfo).trimmingFromEnd(character: "/", upto: 1)
}
- // Custom sorting order
+ // Sorting order based on enum
let sizeOrder = ModelVariant.allCases.map { $0.description }
let sortedModels = availableModels.sorted { firstModel, secondModel in
diff --git a/Sources/WhisperKitCLI/transcribe.swift b/Sources/WhisperKitCLI/transcribe.swift
index e2922c8..3ad862c 100644
--- a/Sources/WhisperKitCLI/transcribe.swift
+++ b/Sources/WhisperKitCLI/transcribe.swift
@@ -11,10 +11,10 @@ import WhisperKit
@main
struct WhisperKitCLI: AsyncParsableCommand {
@Option(help: "Path to audio file")
- var audioPath: String = "./Tests/WhisperKitTests/Resources/jfk.wav"
+ var audioPath: String = "Tests/WhisperKitTests/Resources/jfk.wav"
@Option(help: "Path of model files")
- var modelPath: String = "./Models/whisperkit-coreml/openai_whisper-tiny"
+ var modelPath: String = "Models/whisperkit-coreml/openai_whisper-tiny"
@Option(help: "Compute units for audio encoder model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine,random}")
var audioEncoderComputeUnits: ComputeUnits = .cpuAndNeuralEngine
@@ -71,10 +71,17 @@ struct WhisperKitCLI: AsyncParsableCommand {
var reportPath: String = "."
func transcribe(audioPath: String, modelPath: String) async throws {
- guard FileManager.default.fileExists(atPath: modelPath) else {
- fatalError("Resource path does not exist \(modelPath)")
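+ // Resolve relative and tilde-prefixed paths before checking that the files exist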
+ let resolvedModelPath = resolveAbsolutePath(modelPath)
+ guard FileManager.default.fileExists(atPath: resolvedModelPath) else {
+ fatalError("Model path does not exist \(resolvedModelPath)")
}
+ let resolvedAudioPath = resolveAbsolutePath(audioPath)
+ guard FileManager.default.fileExists(atPath: resolvedAudioPath) else {
+ fatalError("Audio file path does not exist \(resolvedAudioPath)")
+ }
+
let computeOptions = ModelComputeOptions(
audioEncoderCompute: audioEncoderComputeUnits.asMLComputeUnits,
textDecoderCompute: textDecoderComputeUnits.asMLComputeUnits
@@ -104,7 +111,7 @@ struct WhisperKitCLI: AsyncParsableCommand {
noSpeechThreshold: noSpeechThreshold
)
- let transcribeResult = try await whisperKit.transcribe(audioPath: audioPath, decodeOptions: options)
+ let transcribeResult = try await whisperKit.transcribe(audioPath: resolvedAudioPath, decodeOptions: options)
let transcription = transcribeResult?.text ?? "Transcription failed"
diff --git a/Tests/WhisperKitTests/FunctionalTests.swift b/Tests/WhisperKitTests/FunctionalTests.swift
index 9d16b05..a1d49fd 100644
--- a/Tests/WhisperKitTests/FunctionalTests.swift
+++ b/Tests/WhisperKitTests/FunctionalTests.swift
@@ -7,6 +7,12 @@ import XCTest
@available(macOS 14, iOS 17, *)
final class FunctionalTests: XCTestCase {
+ func testInitLarge() async {
+ let modelPath = largev3ModelPath()
+ let whisperKit = try? await WhisperKit(modelFolder: modelPath, logLevel: .error)
+ XCTAssertNotNil(whisperKit)
+ }
+
func testOutputAll() async throws {
let modelPaths = allModelPaths()
diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift
index ccaaf9f..3691cc5 100644
--- a/Tests/WhisperKitTests/UnitTests.swift
+++ b/Tests/WhisperKitTests/UnitTests.swift
@@ -15,7 +15,7 @@ final class UnitTests: XCTestCase {
XCTAssertNotNil(whisperKit)
}
- // MARK: - Model Loading Tests
+ // MARK: - Model Loading Test
func testInitTiny() async {
let modelPath = tinyModelPath()
@@ -23,12 +23,6 @@ final class UnitTests: XCTestCase {
XCTAssertNotNil(whisperKit)
}
- func testInitLarge() async {
- let modelPath = largev3ModelPath()
- let whisperKit = try? await WhisperKit(modelFolder: modelPath, logLevel: .error)
- XCTAssertNotNil(whisperKit)
- }
-
// MARK: - Audio Tests
func testAudioFileLoading() {
@@ -161,7 +155,12 @@ final class UnitTests: XCTestCase {
let decoderInputs = textDecoder.prepareDecoderInputs(withPrompt: [textDecoder.tokenizer!.startOfTranscriptToken])
let expectedShape: Int = 1
- let decoderOutput = try! await textDecoder.decodeText(from: encoderInput, using: decoderInputs, sampler: tokenSampler, options: decodingOptions)
+ guard let inputs = decoderInputs else {
+ XCTFail("Failed to prepare decoder inputs")
+ return
+ }
+
+ let decoderOutput = try! await textDecoder.decodeText(from: encoderInput, using: inputs, sampler: tokenSampler, options: decodingOptions)
XCTAssertNotNil(decoderOutput, "Failed to decode text")
XCTAssertEqual(decoderOutput.count, expectedShape, "Decoder output shape is not as expected")
}