From ade5f0860c3873706c8d5480d3af7fbeb80fade4 Mon Sep 17 00:00:00 2001
From: Andrey Leonov
Date: Thu, 5 Sep 2024 16:13:26 -0400
Subject: [PATCH 1/2] Add VoiceActivityDetector base class

Add base class to allow different VAD implementations
---
 Sources/WhisperKit/Core/AudioChunker.swift  |   5 +-
 Sources/WhisperKit/Core/VAD/EnergyVAD.swift |  58 +++++++
 .../VoiceActivityDetector.swift}            | 160 ++++++++----
 3 files changed, 135 insertions(+), 88 deletions(-)
 create mode 100644 Sources/WhisperKit/Core/VAD/EnergyVAD.swift
 rename Sources/WhisperKit/Core/{EnergyVAD.swift => VAD/VoiceActivityDetector.swift} (65%)

diff --git a/Sources/WhisperKit/Core/AudioChunker.swift b/Sources/WhisperKit/Core/AudioChunker.swift
index f71be97..7db4177 100644
--- a/Sources/WhisperKit/Core/AudioChunker.swift
+++ b/Sources/WhisperKit/Core/AudioChunker.swift
@@ -46,10 +46,11 @@ public extension AudioChunking {
 open class VADAudioChunker: AudioChunking {
     /// prevent hallucinations at the end of the clip by stopping up to 1.0s early
     private let windowPadding: Int
-    private let vad = EnergyVAD()
+    private let vad: VoiceActivityDetector
 
-    init(windowPadding: Int = 16000) {
+    init(windowPadding: Int = 16000, vad: VoiceActivityDetector = EnergyVAD()) {
         self.windowPadding = windowPadding
+        self.vad = vad
     }
 
     private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
diff --git a/Sources/WhisperKit/Core/VAD/EnergyVAD.swift b/Sources/WhisperKit/Core/VAD/EnergyVAD.swift
new file mode 100644
index 0000000..3c8f0e7
--- /dev/null
+++ b/Sources/WhisperKit/Core/VAD/EnergyVAD.swift
@@ -0,0 +1,58 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright © 2024 Argmax, Inc. All rights reserved.
+
+import Foundation
+
+/// Voice activity detection based on energy threshold
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
+final class EnergyVAD: VoiceActivityDetector {
+    var energyThreshold: Float
+
+    /// Initialize a new EnergyVAD instance
+    /// - Parameters:
+    ///   - sampleRate: Audio sample rate
+    ///   - frameLength: Frame length in seconds
+    ///   - frameOverlap: Frame overlap in seconds; this includes `frameOverlap` extra audio in each `frameLength` and helps catch audio that starts exactly at chunk boundaries
+    ///   - energyThreshold: Minimum energy threshold
+    convenience init(
+        sampleRate: Int = WhisperKit.sampleRate,
+        frameLength: Float = 0.1,
+        frameOverlap: Float = 0.0,
+        energyThreshold: Float = 0.02
+    ) {
+        self.init(
+            sampleRate: sampleRate,
+            // Compute frame length and overlap in number of samples
+            frameLengthSamples: Int(frameLength * Float(sampleRate)),
+            frameOverlapSamples: Int(frameOverlap * Float(sampleRate)),
+            energyThreshold: energyThreshold
+        )
+    }
+
+    required init(
+        sampleRate: Int = 16000,
+        frameLengthSamples: Int,
+        frameOverlapSamples: Int = 0,
+        energyThreshold: Float = 0.02
+    ) {
+        self.energyThreshold = energyThreshold
+        super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples)
+    }
+
+    override func voiceActivity(in waveform: [Float]) -> [Bool] {
+        let chunkRatio = Double(waveform.count) / Double(frameLengthSamples)
+
+        // Round up if uneven; the final chunk will not be a full `frameLengthSamples` long
+        let count = Int(chunkRatio.rounded(.up))
+
+        let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks(
+            of: waveform,
+            chunkCount: count,
+            frameLengthSamples: frameLengthSamples,
+            frameOverlapSamples: frameOverlapSamples,
+            energyThreshold: energyThreshold
+        )
+
+        return chunkedVoiceActivity
+    }
+}
diff --git a/Sources/WhisperKit/Core/EnergyVAD.swift b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
similarity index 65%
rename from Sources/WhisperKit/Core/EnergyVAD.swift
rename to Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
index 15f152f..f0225cf 100644
--- a/Sources/WhisperKit/Core/EnergyVAD.swift
+++ b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
@@ -1,79 +1,59 @@
 // For licensing see accompanying LICENSE.md file.
 // Copyright © 2024 Argmax, Inc. All rights reserved.
 
-import Accelerate
 import Foundation
 
-/// Voice activity detection based on energy threshold
+/// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
+///Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
 @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
-final class EnergyVAD {
+class VoiceActivityDetector {
+    /// The sample rate of the audio signal, in samples per second.
     var sampleRate: Int
+
+    /// The length of each frame in samples.
     var frameLengthSamples: Int
+
+    /// The number of samples overlapping between consecutive frames.
     var frameOverlapSamples: Int
-    var energyThreshold: Float
-
-    /// Initialize a new EnergyVAD instance
+
+    /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
     /// - Parameters:
-    ///   - sampleRate: Audio sample rate
-    ///   - frameLength: Frame length in seconds
-    ///   - frameOverlap: frame overlap in seconds, this will include `frameOverlap` length audio into the `frameLength` and is helpful to catch audio that starts exactly at chunk boundaries
-    ///   - energyThreshold: minimal energy threshold
-    convenience init(
-        sampleRate: Int = WhisperKit.sampleRate,
-        frameLength: Float = 0.1,
-        frameOverlap: Float = 0.0,
-        energyThreshold: Float = 0.02
-    ) {
-        self.init(
-            sampleRate: sampleRate,
-            // Compute frame length and overlap in number of samples
-            frameLengthSamples: Int(frameLength * Float(sampleRate)),
-            frameOverlapSamples: Int(frameOverlap * Float(sampleRate)),
-            energyThreshold: energyThreshold
-        )
-    }
-
-    required init(
+    ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000. 
+    ///   - frameLengthSamples: The length of each frame in samples. 
+    ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0. 
+    /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
+    init(
         sampleRate: Int = 16000,
         frameLengthSamples: Int,
-        frameOverlapSamples: Int = 0,
-        energyThreshold: Float = 0.02
+        frameOverlapSamples: Int = 0
     ) {
         self.sampleRate = sampleRate
         self.frameLengthSamples = frameLengthSamples
         self.frameOverlapSamples = frameOverlapSamples
-        self.energyThreshold = energyThreshold
     }
-
+    
+    /// Analyzes the provided audio waveform to determine which segments contain voice activity.
+    /// - Parameter waveform: An array of `Float` values representing the audio waveform.
+    /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
     func voiceActivity(in waveform: [Float]) -> [Bool] {
-        let chunkRatio = Double(waveform.count) / Double(frameLengthSamples)
-
-        // Round up if uneven, the final chunk will not be a full `frameLengthSamples` long
-        let count = Int(chunkRatio.rounded(.up))
-
-        let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks(
-            of: waveform,
-            chunkCount: count,
-            frameLengthSamples: frameLengthSamples,
-            frameOverlapSamples: frameOverlapSamples,
-            energyThreshold: energyThreshold
-        )
-
-        return chunkedVoiceActivity
+        fatalError("`voiceActivity` must be implemented by subclass")
     }
 
+    /// Calculates and returns a list of active audio chunks, each represented by a start and end index.
+    /// - Parameter waveform: An array of `Float` values representing the audio waveform.
+    /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk.
     func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
         let vad: [Bool] = voiceActivity(in: waveform)
         var result = [(startIndex: Int, endIndex: Int)]()
-
+    
         // Temporary variables to hold the start of the current non-silent segment
         var currentStartIndex: Int?
-
+    
         for (index, vadChunk) in vad.enumerated() {
             if vadChunk {
                 let chunkStart = index * frameLengthSamples
                 let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count)
-
+    
                 if currentStartIndex != nil {
                     // If we already have a starting point, just update the end point in the last added segment
                     result[result.count - 1].endIndex = chunkEnd
@@ -87,53 +67,24 @@ final class EnergyVAD {
                 currentStartIndex = nil
             }
         }
-
+    
         return result
     }
-
-    func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
-        let nonSilentChunks = calculateActiveChunks(in: waveform)
-        var clipTimestamps = [Float]()
-
-        for chunk in nonSilentChunks {
-            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
-            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-
-            clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
-        }
-
-        return clipTimestamps
-    }
-
-    func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
-        let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
-        let options = DecodingOptions(clipTimestamps: clipTimestamps)
-        let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
-        return seekClips
-    }
-
-    func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
-        let nonSilentChunks = calculateActiveChunks(in: waveform)
-        var seekTimestamps = [(startTime: Float, endTime: Float)]()
-
-        for chunk in nonSilentChunks {
-            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
-            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-
-            seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
-        }
-
-        return seekTimestamps
-    }
-
+    
+    /// Converts a voice activity index to the corresponding audio sample index.
+    /// - Parameter index: The voice activity index to convert.
+    /// - Returns: The corresponding audio sample index.
     func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
         return index * frameLengthSamples
     }
-
+    
     func voiceActivityIndexToSeconds(_ index: Int) -> Float {
         return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
     }
-
+    
+    /// Identifies the longest continuous period of silence within the provided voice activity detection results.
+    /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
+    /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
     func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? {
         var longestStartIndex: Int?
         var longestEndIndex: Int?
@@ -165,4 +116,41 @@ final class EnergyVAD {
             return nil
         }
     }
+    
+    // MARK: - Utility
+    
+    func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
+        let nonSilentChunks = calculateActiveChunks(in: waveform)
+        var clipTimestamps = [Float]()
+    
+        for chunk in nonSilentChunks {
+            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
+            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
+    
+            clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
+        }
+    
+        return clipTimestamps
+    }
+    
+    func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
+        let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
+        let options = DecodingOptions(clipTimestamps: clipTimestamps)
+        let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
+        return seekClips
+    }
+    
+    func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
+        let nonSilentChunks = calculateActiveChunks(in: waveform)
+        var seekTimestamps = [(startTime: Float, endTime: Float)]()
+    
+        for chunk in nonSilentChunks {
+            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
+            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
+    
+            seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
+        }
+    
+        return seekTimestamps
+    }
 }

From 940dfc1cac3c1de9aa4a543500f83a884296191a Mon Sep 17 00:00:00 2001
From: Andrey Leonov
Date: Thu, 5 Sep 2024 16:43:59 -0400
Subject: [PATCH 2/2] fix spaces

---
 .../WhisperKit/Core/VAD/VoiceActivityDetector.swift | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
index f0225cf..05ffba0 100644
--- a/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
+++ b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
@@ -4,23 +4,23 @@
 import Foundation
 
 /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
-///Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
+/// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
 @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 class VoiceActivityDetector {
     /// The sample rate of the audio signal, in samples per second.
     var sampleRate: Int
-    
+
     /// The length of each frame in samples.
     var frameLengthSamples: Int
-    
+
     /// The number of samples overlapping between consecutive frames.
     var frameOverlapSamples: Int
-    
+
     /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
     /// - Parameters:
-    ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000. 
-    ///   - frameLengthSamples: The length of each frame in samples. 
-    ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0. 
+    ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000.
+    ///   - frameLengthSamples: The length of each frame in samples.
+    ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0.
     /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
     init(
         sampleRate: Int = 16000,
@@ -31,7 +31,7 @@ class VoiceActivityDetector {
         self.frameLengthSamples = frameLengthSamples
         self.frameOverlapSamples = frameOverlapSamples
     }
-    
+
     /// Analyzes the provided audio waveform to determine which segments contain voice activity.
     /// - Parameter waveform: An array of `Float` values representing the audio waveform.
     /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
@@ -45,15 +45,15 @@ class VoiceActivityDetector {
     func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
         let vad: [Bool] = voiceActivity(in: waveform)
         var result = [(startIndex: Int, endIndex: Int)]()
-    
+
         // Temporary variables to hold the start of the current non-silent segment
         var currentStartIndex: Int?
-    
+
         for (index, vadChunk) in vad.enumerated() {
             if vadChunk {
                 let chunkStart = index * frameLengthSamples
                 let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count)
-    
+
                 if currentStartIndex != nil {
                     // If we already have a starting point, just update the end point in the last added segment
                     result[result.count - 1].endIndex = chunkEnd
@@ -67,21 +67,21 @@ class VoiceActivityDetector {
                 currentStartIndex = nil
             }
         }
-    
+
         return result
     }
-    
+
     /// Converts a voice activity index to the corresponding audio sample index.
     /// - Parameter index: The voice activity index to convert.
     /// - Returns: The corresponding audio sample index.
     func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
         return index * frameLengthSamples
     }
-    
+
     func voiceActivityIndexToSeconds(_ index: Int) -> Float {
         return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
     }
-    
+
     /// Identifies the longest continuous period of silence within the provided voice activity detection results.
     /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
    /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
@@ -116,41 +116,41 @@ class VoiceActivityDetector {
             return nil
         }
     }
-    
+
     // MARK: - Utility
-    
+
     func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
         let nonSilentChunks = calculateActiveChunks(in: waveform)
         var clipTimestamps = [Float]()
-    
+
         for chunk in nonSilentChunks {
             let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
             let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-    
+
             clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
         }
-    
+
        return clipTimestamps
     }
-    
+
     func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
         let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
         let options = DecodingOptions(clipTimestamps: clipTimestamps)
         let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
         return seekClips
     }
-    
+
     func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
         let nonSilentChunks = calculateActiveChunks(in: waveform)
         var seekTimestamps = [(startTime: Float, endTime: Float)]()
-    
+
         for chunk in nonSilentChunks {
             let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
             let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-    
+
             seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
         }
-    
+
         return seekTimestamps
     }
 }
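
For reviewers, a minimal sketch of how the new base class is meant to be extended and injected: any subclass only needs to override voiceActivity(in:), inheriting the chunking and timestamp utilities, and can then be passed to VADAudioChunker through the initializer added in PATCH 1/2. PeakAmplitudeVAD below is a hypothetical detector invented here for illustration, and the sketch assumes same-module (internal) access, since neither VoiceActivityDetector nor the chunker initializer is declared public in these patches.

    // A minimal sketch, assuming it is compiled inside the WhisperKit target
    // (the new APIs are internal). Hypothetical detector: a frame counts as
    // voiced when any sample exceeds an amplitude threshold. Only
    // voiceActivity(in:) is overridden; everything else is inherited.
    @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
    final class PeakAmplitudeVAD: VoiceActivityDetector {
        let amplitudeThreshold: Float

        init(sampleRate: Int = 16000, frameLengthSamples: Int, amplitudeThreshold: Float = 0.05) {
            self.amplitudeThreshold = amplitudeThreshold
            super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples)
        }

        override func voiceActivity(in waveform: [Float]) -> [Bool] {
            // One Bool per frame, matching EnergyVAD's layout; the trailing
            // partial frame is included, mirroring the rounded-up chunk count.
            stride(from: 0, to: waveform.count, by: frameLengthSamples).map { start in
                let end = min(start + frameLengthSamples, waveform.count)
                return waveform[start..<end].contains { abs($0) > amplitudeThreshold }
            }
        }
    }

    @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
    func makeChunker() -> VADAudioChunker {
        // Injection point added in PATCH 1/2; 1600 samples = 0.1 s at 16 kHz,
        // the same frame length EnergyVAD uses by default.
        VADAudioChunker(vad: PeakAmplitudeVAD(frameLengthSamples: 1600))
    }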