From ade5f0860c3873706c8d5480d3af7fbeb80fade4 Mon Sep 17 00:00:00 2001
From: Andrey Leonov
Date: Thu, 5 Sep 2024 16:13:26 -0400
Subject: [PATCH 1/2] Add VoiceActivityDetector base class

Add base class to allow different VAD implementations
---
 Sources/WhisperKit/Core/AudioChunker.swift  |   5 +-
 Sources/WhisperKit/Core/VAD/EnergyVAD.swift |  58 +++++++
 .../VoiceActivityDetector.swift}            | 160 ++++++++----
 3 files changed, 135 insertions(+), 88 deletions(-)
 create mode 100644 Sources/WhisperKit/Core/VAD/EnergyVAD.swift
 rename Sources/WhisperKit/Core/{EnergyVAD.swift => VAD/VoiceActivityDetector.swift} (65%)

diff --git a/Sources/WhisperKit/Core/AudioChunker.swift b/Sources/WhisperKit/Core/AudioChunker.swift
index f71be97..7db4177 100644
--- a/Sources/WhisperKit/Core/AudioChunker.swift
+++ b/Sources/WhisperKit/Core/AudioChunker.swift
@@ -46,10 +46,11 @@ public extension AudioChunking {
 open class VADAudioChunker: AudioChunking {
     /// prevent hallucinations at the end of the clip by stopping up to 1.0s early
     private let windowPadding: Int
-    private let vad = EnergyVAD()
+    private let vad: VoiceActivityDetector
 
-    init(windowPadding: Int = 16000) {
+    init(windowPadding: Int = 16000, vad: VoiceActivityDetector = EnergyVAD()) {
         self.windowPadding = windowPadding
+        self.vad = vad
     }
 
     private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
diff --git a/Sources/WhisperKit/Core/VAD/EnergyVAD.swift b/Sources/WhisperKit/Core/VAD/EnergyVAD.swift
new file mode 100644
index 0000000..3c8f0e7
--- /dev/null
+++ b/Sources/WhisperKit/Core/VAD/EnergyVAD.swift
@@ -0,0 +1,58 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright © 2024 Argmax, Inc. All rights reserved.
+
+import Foundation
+
+/// Voice activity detection based on energy threshold
+@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
+final class EnergyVAD: VoiceActivityDetector {
+    var energyThreshold: Float
+
+    /// Initialize a new EnergyVAD instance
+    /// - Parameters:
+    ///   - sampleRate: Audio sample rate
+    ///   - frameLength: Frame length in seconds
+    ///   - frameOverlap: Frame overlap in seconds; this includes `frameOverlap` extra audio in each `frameLength` and helps catch audio that starts exactly at chunk boundaries
+    ///   - energyThreshold: Minimum energy threshold
+    convenience init(
+        sampleRate: Int = WhisperKit.sampleRate,
+        frameLength: Float = 0.1,
+        frameOverlap: Float = 0.0,
+        energyThreshold: Float = 0.02
+    ) {
+        self.init(
+            sampleRate: sampleRate,
+            // Compute frame length and overlap in number of samples
+            frameLengthSamples: Int(frameLength * Float(sampleRate)),
+            frameOverlapSamples: Int(frameOverlap * Float(sampleRate)),
+            energyThreshold: energyThreshold
+        )
+    }
+
+    required init(
+        sampleRate: Int = 16000,
+        frameLengthSamples: Int,
+        frameOverlapSamples: Int = 0,
+        energyThreshold: Float = 0.02
+    ) {
+        self.energyThreshold = energyThreshold
+        super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples, frameOverlapSamples: frameOverlapSamples)
+    }
+
+    override func voiceActivity(in waveform: [Float]) -> [Bool] {
+        let chunkRatio = Double(waveform.count) / Double(frameLengthSamples)
+
+        // Round up if uneven; the final chunk will not be a full `frameLengthSamples` long
+        let count = Int(chunkRatio.rounded(.up))
+
+        let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks(
+            of: waveform,
+            chunkCount: count,
+            frameLengthSamples: frameLengthSamples,
+            frameOverlapSamples: frameOverlapSamples,
+            energyThreshold: energyThreshold
+        )
+
+        return chunkedVoiceActivity
+    }
+}
diff --git a/Sources/WhisperKit/Core/EnergyVAD.swift b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
similarity index 65%
rename from Sources/WhisperKit/Core/EnergyVAD.swift
rename to Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
index 15f152f..f0225cf 100644
--- a/Sources/WhisperKit/Core/EnergyVAD.swift
+++ b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
@@ -1,79 +1,59 @@
 // For licensing see accompanying LICENSE.md file.
 // Copyright © 2024 Argmax, Inc. All rights reserved.
 
-import Accelerate
 import Foundation
 
-/// Voice activity detection based on energy threshold
+/// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
+///Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
 @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
-final class EnergyVAD {
+class VoiceActivityDetector {
+    /// The sample rate of the audio signal, in samples per second.
     var sampleRate: Int
+
+    /// The length of each frame in samples.
     var frameLengthSamples: Int
+
+    /// The number of samples overlapping between consecutive frames.
     var frameOverlapSamples: Int
-    var energyThreshold: Float
-
-    /// Initialize a new EnergyVAD instance
+
+    /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
     /// - Parameters:
-    ///   - sampleRate: Audio sample rate
-    ///   - frameLength: Frame length in seconds
-    ///   - frameOverlap: frame overlap in seconds, this will include `frameOverlap` length audio into the `frameLength` and is helpful to catch audio that starts exactly at chunk boundaries
-    ///   - energyThreshold: minimal energy threshold
-    convenience init(
-        sampleRate: Int = WhisperKit.sampleRate,
-        frameLength: Float = 0.1,
-        frameOverlap: Float = 0.0,
-        energyThreshold: Float = 0.02
-    ) {
-        self.init(
-            sampleRate: sampleRate,
-            // Compute frame length and overlap in number of samples
-            frameLengthSamples: Int(frameLength * Float(sampleRate)),
-            frameOverlapSamples: Int(frameOverlap * Float(sampleRate)),
-            energyThreshold: energyThreshold
-        )
-    }
-
-    required init(
+    ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000. 
+    ///   - frameLengthSamples: The length of each frame in samples. 
+    ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0. 
+    /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
+    init(
         sampleRate: Int = 16000,
         frameLengthSamples: Int,
-        frameOverlapSamples: Int = 0,
-        energyThreshold: Float = 0.02
+        frameOverlapSamples: Int = 0
     ) {
         self.sampleRate = sampleRate
         self.frameLengthSamples = frameLengthSamples
         self.frameOverlapSamples = frameOverlapSamples
-        self.energyThreshold = energyThreshold
     }
-
+    
+    /// Analyzes the provided audio waveform to determine which segments contain voice activity.
+    /// - Parameter waveform: An array of `Float` values representing the audio waveform.
+    /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
     func voiceActivity(in waveform: [Float]) -> [Bool] {
-        let chunkRatio = Double(waveform.count) / Double(frameLengthSamples)
-
-        // Round up if uneven, the final chunk will not be a full `frameLengthSamples` long
-        let count = Int(chunkRatio.rounded(.up))
-
-        let chunkedVoiceActivity = AudioProcessor.calculateVoiceActivityInChunks(
-            of: waveform,
-            chunkCount: count,
-            frameLengthSamples: frameLengthSamples,
-            frameOverlapSamples: frameOverlapSamples,
-            energyThreshold: energyThreshold
-        )
-
-        return chunkedVoiceActivity
+        fatalError("`voiceActivity` must be implemented by subclass")
     }
 
+    /// Calculates and returns a list of active audio chunks, each represented by a start and end index.
+    /// - Parameter waveform: An array of `Float` values representing the audio waveform.
+    /// - Returns: An array of tuples where each tuple contains the start and end indices of an active audio chunk.
     func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
         let vad: [Bool] = voiceActivity(in: waveform)
         var result = [(startIndex: Int, endIndex: Int)]()
-
+    
         // Temporary variables to hold the start of the current non-silent segment
         var currentStartIndex: Int?
-
+    
         for (index, vadChunk) in vad.enumerated() {
             if vadChunk {
                 let chunkStart = index * frameLengthSamples
                 let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count)
-
+    
                 if currentStartIndex != nil {
                     // If we already have a starting point, just update the end point in the last added segment
                     result[result.count - 1].endIndex = chunkEnd
@@ -87,53 +67,24 @@ final class EnergyVAD {
                 currentStartIndex = nil
             }
         }
-
+    
         return result
     }
-
-    func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
-        let nonSilentChunks = calculateActiveChunks(in: waveform)
-        var clipTimestamps = [Float]()
-
-        for chunk in nonSilentChunks {
-            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
-            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-
-            clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
-        }
-
-        return clipTimestamps
-    }
-
-    func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
-        let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
-        let options = DecodingOptions(clipTimestamps: clipTimestamps)
-        let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
-        return seekClips
-    }
-
-    func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
-        let nonSilentChunks = calculateActiveChunks(in: waveform)
-        var seekTimestamps = [(startTime: Float, endTime: Float)]()
-
-        for chunk in nonSilentChunks {
-            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
-            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-
-            seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
-        }
-
-        return seekTimestamps
-    }
-
+    
+    /// Converts a voice activity index to the corresponding audio sample index.
+    /// - Parameter index: The voice activity index to convert.
+    /// - Returns: The corresponding audio sample index.
     func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
         return index * frameLengthSamples
     }
-
+    
     func voiceActivityIndexToSeconds(_ index: Int) -> Float {
         return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
     }
-
+    
+    /// Identifies the longest continuous period of silence within the provided voice activity detection results.
+    /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
+    /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
     func findLongestSilence(in vadResult: [Bool]) -> (startIndex: Int, endIndex: Int)? {
         var longestStartIndex: Int?
         var longestEndIndex: Int?
@@ -165,4 +116,41 @@ final class EnergyVAD {
             return nil
         }
     }
+    
+    // MARK: - Utility
+    
+    func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
+        let nonSilentChunks = calculateActiveChunks(in: waveform)
+        var clipTimestamps = [Float]()
+    
+        for chunk in nonSilentChunks {
+            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
+            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
+    
+            clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
+        }
+    
+        return clipTimestamps
+    }
+    
+    func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
+        let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
+        let options = DecodingOptions(clipTimestamps: clipTimestamps)
+        let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
+        return seekClips
+    }
+    
+    func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
+        let nonSilentChunks = calculateActiveChunks(in: waveform)
+        var seekTimestamps = [(startTime: Float, endTime: Float)]()
+    
+        for chunk in nonSilentChunks {
+            let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
+            let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
+    
+            seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
+        }
+    
+        return seekTimestamps
+    }
 }

From 940dfc1cac3c1de9aa4a543500f83a884296191a Mon Sep 17 00:00:00 2001
From: Andrey Leonov
Date: Thu, 5 Sep 2024 16:43:59 -0400
Subject: [PATCH 2/2] fix spaces

---
 .../WhisperKit/Core/VAD/VoiceActivityDetector.swift | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
index f0225cf..05ffba0 100644
--- a/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
+++ b/Sources/WhisperKit/Core/VAD/VoiceActivityDetector.swift
@@ -4,23 +4,23 @@
 import Foundation
 
 /// A base class for Voice Activity Detection (VAD), used to identify and separate segments of audio that contain human speech from those that do not.
-///Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
+/// Subclasses must implement the `voiceActivity(in:)` method to provide specific voice activity detection functionality.
 @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
 class VoiceActivityDetector {
     /// The sample rate of the audio signal, in samples per second.
     var sampleRate: Int
-    
+
     /// The length of each frame in samples.
     var frameLengthSamples: Int
-    
+
     /// The number of samples overlapping between consecutive frames.
     var frameOverlapSamples: Int
-    
+
     /// Initializes a new `VoiceActivityDetector` instance with the specified parameters.
     /// - Parameters:
-    ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000. 
-    ///   - frameLengthSamples: The length of each frame in samples. 
-    ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0. 
+    ///   - sampleRate: The sample rate of the audio signal in samples per second. Defaults to 16000.
+    ///   - frameLengthSamples: The length of each frame in samples.
+    ///   - frameOverlapSamples: The number of samples overlapping between consecutive frames. Defaults to 0.
     /// - Note: Subclasses should override the `voiceActivity(in:)` method to provide specific VAD functionality.
     init(
         sampleRate: Int = 16000,
@@ -31,7 +31,7 @@ class VoiceActivityDetector {
         self.frameLengthSamples = frameLengthSamples
         self.frameOverlapSamples = frameOverlapSamples
     }
-    
+
     /// Analyzes the provided audio waveform to determine which segments contain voice activity.
     /// - Parameter waveform: An array of `Float` values representing the audio waveform.
     /// - Returns: An array of `Bool` values where `true` indicates the presence of voice activity and `false` indicates silence.
@@ -45,15 +45,15 @@ class VoiceActivityDetector {
     func calculateActiveChunks(in waveform: [Float]) -> [(startIndex: Int, endIndex: Int)] {
         let vad: [Bool] = voiceActivity(in: waveform)
         var result = [(startIndex: Int, endIndex: Int)]()
-    
+
         // Temporary variables to hold the start of the current non-silent segment
         var currentStartIndex: Int?
-    
+
         for (index, vadChunk) in vad.enumerated() {
             if vadChunk {
                 let chunkStart = index * frameLengthSamples
                 let chunkEnd = min(chunkStart + frameLengthSamples, waveform.count)
-    
+
                 if currentStartIndex != nil {
                     // If we already have a starting point, just update the end point in the last added segment
                     result[result.count - 1].endIndex = chunkEnd
@@ -67,21 +67,21 @@ class VoiceActivityDetector {
                 currentStartIndex = nil
             }
         }
-    
+
         return result
     }
-    
+
     /// Converts a voice activity index to the corresponding audio sample index.
     /// - Parameter index: The voice activity index to convert.
     /// - Returns: The corresponding audio sample index.
     func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
         return index * frameLengthSamples
     }
-    
+
     func voiceActivityIndexToSeconds(_ index: Int) -> Float {
         return Float(voiceActivityIndexToAudioSampleIndex(index)) / Float(sampleRate)
     }
-    
+
     /// Identifies the longest continuous period of silence within the provided voice activity detection results.
     /// - Parameter vadResult: An array of `Bool` values representing voice activity detection results.
    /// - Returns: A tuple containing the start and end indices of the longest silence period, or `nil` if no silence is found.
@@ -116,41 +116,41 @@ class VoiceActivityDetector {
             return nil
         }
     }
-    
+
     // MARK: - Utility
-    
+
     func voiceActivityClipTimestamps(in waveform: [Float]) -> [Float] {
         let nonSilentChunks = calculateActiveChunks(in: waveform)
         var clipTimestamps = [Float]()
-    
+
         for chunk in nonSilentChunks {
             let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
             let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-    
+
             clipTimestamps.append(contentsOf: [startTimestamp, endTimestamp])
         }
-    
+
        return clipTimestamps
     }
-    
+
     func calculateNonSilentSeekClips(in waveform: [Float]) -> [(start: Int, end: Int)] {
         let clipTimestamps = voiceActivityClipTimestamps(in: waveform)
         let options = DecodingOptions(clipTimestamps: clipTimestamps)
         let seekClips = prepareSeekClips(contentFrames: waveform.count, decodeOptions: options)
         return seekClips
     }
-    
+
     func calculateSeekTimestamps(in waveform: [Float]) -> [(startTime: Float, endTime: Float)] {
         let nonSilentChunks = calculateActiveChunks(in: waveform)
         var seekTimestamps = [(startTime: Float, endTime: Float)]()
-    
+
         for chunk in nonSilentChunks {
             let startTimestamp = Float(chunk.startIndex) / Float(sampleRate)
             let endTimestamp = Float(chunk.endIndex) / Float(sampleRate)
-    
+
             seekTimestamps.append(contentsOf: [(startTime: startTimestamp, endTime: endTimestamp)])
         }
-    
+
         return seekTimestamps
     }
 }
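
For reviewers, a minimal sketch of how the new base class is meant to be extended and injected: any subclass only needs to override voiceActivity(in:), inheriting the chunking and timestamp utilities, and can then be passed to VADAudioChunker through the initializer added in PATCH 1/2. PeakAmplitudeVAD below is a hypothetical detector invented here for illustration, and the sketch assumes same-module (internal) access, since neither VoiceActivityDetector nor the chunker initializer is declared public in these patches.

    // A minimal sketch, assuming it is compiled inside the WhisperKit target
    // (the new APIs are internal). Hypothetical detector: a frame counts as
    // voiced when any sample exceeds an amplitude threshold. Only
    // voiceActivity(in:) is overridden; everything else is inherited.
    @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
    final class PeakAmplitudeVAD: VoiceActivityDetector {
        let amplitudeThreshold: Float

        init(sampleRate: Int = 16000, frameLengthSamples: Int, amplitudeThreshold: Float = 0.05) {
            self.amplitudeThreshold = amplitudeThreshold
            super.init(sampleRate: sampleRate, frameLengthSamples: frameLengthSamples)
        }

        override func voiceActivity(in waveform: [Float]) -> [Bool] {
            // One Bool per frame, matching EnergyVAD's layout; the trailing
            // partial frame is included, mirroring the rounded-up chunk count.
            stride(from: 0, to: waveform.count, by: frameLengthSamples).map { start in
                let end = min(start + frameLengthSamples, waveform.count)
                return waveform[start..<end].contains { abs($0) > amplitudeThreshold }
            }
        }
    }

    @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
    func makeChunker() -> VADAudioChunker {
        // Injection point added in PATCH 1/2; 1600 samples = 0.1 s at 16 kHz,
        // the same frame length EnergyVAD uses by default.
        VADAudioChunker(vad: PeakAmplitudeVAD(frameLengthSamples: 1600))
    }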