diff --git a/Cargo.lock b/Cargo.lock index bd6cbfa..74a16b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "autocfg" @@ -12,10 +12,82 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "convolution" version = "0.1.0" dependencies = [ + "ipp-sys", + "num-complex", "realfft", "rustfft", ] +[[package]] +name = "ipp-headers-sys" +version = "0.4.5" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" + +[[package]] +name = "ipp-sys" +version = "0.4.5" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-headers-sys", + "link-ippcore", + "link-ippcv", + "link-ippi", + "link-ipps", + "link-ippvm", +] + +[[package]] +name = "ipp-sys-build-help" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" + +[[package]] +name = "link-ippcore" +version = "0.1.1" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", +] + +[[package]] +name = "link-ippcv" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ipps", +] + +[[package]] +name = "link-ippi" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ipps", +] + +[[package]] +name = "link-ipps" +version = "0.1.2" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", + "link-ippvm", +] + +[[package]] +name = "link-ippvm" +version = "0.1.0" +source = "git+ssh://git@github.com/holoplot/ipp-sys.git?rev=3e69ca21c1e24b141ab225a743910d999e591a4c#3e69ca21c1e24b141ab225a743910d999e591a4c" +dependencies = [ + "ipp-sys-build-help", + "link-ippcore", +] + [[package]] name = "num-complex" version = "0.4.6" diff --git a/Cargo.toml b/Cargo.toml index 1b2270d..caad44c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,9 @@ edition = "2021" [dependencies] realfft = "3.3.0" rustfft = "6.1.0" +ipp-sys = { git = "ssh://git@github.com/holoplot/ipp-sys.git", rev = "97e758c4b87b0b16800a76dd4e4603b61fc4eb09", features = ["2022"], optional = true } +num-complex = "0.4.6" + +[features] +default = [] +ipp = ["ipp-sys"] diff --git a/README.md b/README.md index 2064f04..7594ba6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ As the original C++ implementation, this library implements: - Partitioned convolution algorithm (using uniform block sizes) - Optional support for non-uniform block sizes (`TwoStageFFTConvolver`) +- An optimised version of the convolvers using Intel IPP (if the `ipp` feature is enabled) On top of that it implements: @@ -16,9 +17,19 @@ On top of that it implements: Compared to the original C++ implementation, this implementation does _not_ provide: -- Its own FFT implementation (it currently uses the rustfft crate) +- Its own FFT implementation (it currently uses the rustfft crate by default or the IPP library if the `ipp` feature is enabled) - The option to use SSE instructions in the `FFTConvolver` ## Prerequisites: - rust >=1.72.0 + +#### Optional dependencies: + +- Intel IPP (if the `ipp` feature is enabled) + +When building with the `ipp` feature, the `ipp-sys` crate is used to link against the IPP library. To build with the `ipp` feature (`cargo build --features ipp`), you need to have the IPP library installed and the `IPPROOT` environment variable set to the root directory of the IPP installation. On Linux this is achieved by running: + +- `source /opt/intel/oneapi/setvars.sh` + +If you want `ipp` to be statically linked, export `IPP_STATIC=1` before running `cargo build`. \ No newline at end of file diff --git a/src/fft_convolver.rs b/src/fft_convolver.rs deleted file mode 100644 index b3b1818..0000000 --- a/src/fft_convolver.rs +++ /dev/null @@ -1,498 +0,0 @@ -use realfft::{ComplexToReal, FftError, RealFftPlanner, RealToComplex}; -use rustfft::num_complex::Complex; -use std::sync::Arc; - -use crate::{Convolution, Sample}; - -#[derive(Clone)] -pub struct Fft { - fft_forward: Arc>, - fft_inverse: Arc>, -} - -impl Default for Fft { - fn default() -> Self { - let mut planner = RealFftPlanner::::new(); - Self { - fft_forward: planner.plan_fft_forward(0), - fft_inverse: planner.plan_fft_inverse(0), - } - } -} - -impl std::fmt::Debug for Fft { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "") - } -} - -impl Fft { - pub fn init(&mut self, length: usize) { - let mut planner = RealFftPlanner::::new(); - self.fft_forward = planner.plan_fft_forward(length); - self.fft_inverse = planner.plan_fft_inverse(length); - } - - pub fn forward(&self, input: &mut [f32], output: &mut [Complex]) -> Result<(), FftError> { - self.fft_forward.process(input, output)?; - Ok(()) - } - - pub fn inverse(&self, input: &mut [Complex], output: &mut [f32]) -> Result<(), FftError> { - self.fft_inverse.process(input, output)?; - - // FFT Normalization - let len = output.len(); - output.iter_mut().for_each(|bin| *bin /= len as f32); - - Ok(()) - } -} - -pub fn complex_size(size: usize) -> usize { - (size / 2) + 1 -} - -pub fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { - assert!(dst.len() >= src_size); - dst[0..src_size].clone_from_slice(&src[0..src_size]); - dst[src_size..].iter_mut().for_each(|value| *value = 0.); -} - -pub fn complex_multiply_accumulate( - result: &mut [Complex], - a: &[Complex], - b: &[Complex], -) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 4 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; - result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; - result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; - result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; - result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; - result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; - result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; - result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; - } - for i in end4..len { - result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; - result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; - } -} - -pub fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { - assert_eq!(result.len(), a.len()); - assert_eq!(result.len(), b.len()); - let len = result.len(); - let end4 = 3 * (len / 4); - for i in (0..end4).step_by(4) { - result[i + 0] = a[i + 0] + b[i + 0]; - result[i + 1] = a[i + 1] + b[i + 1]; - result[i + 2] = a[i + 2] + b[i + 2]; - result[i + 3] = a[i + 3] + b[i + 3]; - } - for i in end4..len { - result[i] = a[i] + b[i]; - } -} -#[derive(Default, Clone)] -pub struct FFTConvolver { - ir_len: usize, - block_size: usize, - _seg_size: usize, - seg_count: usize, - active_seg_count: usize, - _fft_complex_size: usize, - segments: Vec>>, - segments_ir: Vec>>, - fft_buffer: Vec, - fft: Fft, - pre_multiplied: Vec>, - conv: Vec>, - overlap: Vec, - current: usize, - input_buffer: Vec, - input_buffer_fill: usize, -} - -impl Convolution for FFTConvolver { - fn init(impulse_response: &[Sample], block_size: usize, max_response_length: usize) -> Self { - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - let ir_len = padded_ir.len(); - - let block_size = block_size.next_power_of_two(); - let seg_size = 2 * block_size; - let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; - let active_seg_count = seg_count; - let fft_complex_size = complex_size(seg_size); - - // FFT - let mut fft = Fft::default(); - fft.init(seg_size); - let mut fft_buffer = vec![0.; seg_size]; - - // prepare segments - let segments = vec![vec![Complex::new(0., 0.); fft_complex_size]; seg_count]; - let mut segments_ir = Vec::new(); - - // prepare ir - for i in 0..seg_count { - let mut segment = vec![Complex::new(0., 0.); fft_complex_size]; - let remaining = ir_len - (i * block_size); - let size_copy = if remaining >= block_size { - block_size - } else { - remaining - }; - copy_and_pad(&mut fft_buffer, &padded_ir[i * block_size..], size_copy); - fft.forward(&mut fft_buffer, &mut segment).unwrap(); - segments_ir.push(segment); - } - - // prepare convolution buffers - let pre_multiplied = vec![Complex::new(0., 0.); fft_complex_size]; - let conv = vec![Complex::new(0., 0.); fft_complex_size]; - let overlap = vec![0.; block_size]; - - // prepare input buffer - let input_buffer = vec![0.; block_size]; - let input_buffer_fill = 0; - - // reset current position - let current = 0; - - Self { - ir_len, - block_size, - _seg_size: seg_size, - seg_count, - active_seg_count, - _fft_complex_size: fft_complex_size, - segments, - segments_ir, - fft_buffer, - fft, - pre_multiplied, - conv, - overlap, - current, - input_buffer, - input_buffer_fill, - } - } - - fn update(&mut self, response: &[Sample]) { - let new_ir_len = response.len(); - - if new_ir_len > self.ir_len { - panic!("New impulse response is longer than initialized length"); - } - - if self.ir_len == 0 { - return; - } - - self.fft_buffer.fill(0.); - self.conv.fill(Complex::new(0., 0.)); - self.pre_multiplied.fill(Complex::new(0., 0.)); - self.overlap.fill(0.); - - self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; - - // Prepare IR - for i in 0..self.active_seg_count { - let segment = &mut self.segments_ir[i]; - let remaining = new_ir_len - (i * self.block_size); - let size_copy = if remaining >= self.block_size { - self.block_size - } else { - remaining - }; - copy_and_pad( - &mut self.fft_buffer, - &response[i * self.block_size..], - size_copy, - ); - self.fft.forward(&mut self.fft_buffer, segment).unwrap(); - } - - // Clear remaining segments - for i in self.active_seg_count..self.seg_count { - self.segments_ir[i].fill(Complex::new(0., 0.)); - } - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - if self.active_seg_count == 0 { - output.fill(0.); - return; - } - - let mut processed = 0; - while processed < output.len() { - let input_buffer_was_empty = self.input_buffer_fill == 0; - let processing = std::cmp::min( - output.len() - processed, - self.block_size - self.input_buffer_fill, - ); - - let input_buffer_pos = self.input_buffer_fill; - self.input_buffer[input_buffer_pos..input_buffer_pos + processing] - .clone_from_slice(&input[processed..processed + processing]); - - // Forward FFT - copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); - if let Err(_err) = self - .fft - .forward(&mut self.fft_buffer, &mut self.segments[self.current]) - { - output.fill(0.); - return; // error! - } - - // complex multiplication - if input_buffer_was_empty { - self.pre_multiplied.fill(Complex { re: 0., im: 0. }); - for i in 1..self.active_seg_count { - let index_ir = i; - let index_audio = (self.current + i) % self.active_seg_count; - complex_multiply_accumulate( - &mut self.pre_multiplied, - &self.segments_ir[index_ir], - &self.segments[index_audio], - ); - } - } - self.conv.clone_from_slice(&self.pre_multiplied); - complex_multiply_accumulate( - &mut self.conv, - &self.segments[self.current], - &self.segments_ir[0], - ); - - // Backward FFT - if let Err(_err) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { - output.fill(0.); - return; // error! - } - - // Add overlap - sum( - &mut output[processed..processed + processing], - &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], - &self.overlap[input_buffer_pos..input_buffer_pos + processing], - ); - - // Input buffer full => Next block - self.input_buffer_fill += processing; - if self.input_buffer_fill == self.block_size { - // Input buffer is empty again now - self.input_buffer.fill(0.); - self.input_buffer_fill = 0; - // Save the overlap - self.overlap - .clone_from_slice(&self.fft_buffer[self.block_size..self.block_size * 2]); - - // Update the current segment - self.current = if self.current > 0 { - self.current - 1 - } else { - self.active_seg_count - 1 - }; - } - processed += processing; - } - } -} - -#[test] -fn test_fft_convolver_passthrough() { - let mut response = [0.0; 1024]; - response[0] = 1.0; - let mut convolver = FFTConvolver::init(&response, 1024, response.len()); - let input = vec![1.0; 1024]; - let mut output = vec![0.0; 1024]; - convolver.process(&input, &mut output); - - for i in 0..1024 { - assert!((output[i] - 1.0).abs() < 1e-6); - } -} - -#[derive(Clone)] -pub struct TwoStageFFTConvolver { - head_convolver: FFTConvolver, - tail_convolver0: FFTConvolver, - tail_output0: Vec, - tail_precalculated0: Vec, - tail_convolver: FFTConvolver, - tail_output: Vec, - tail_precalculated: Vec, - tail_input: Vec, - tail_input_fill: usize, - precalculated_pos: usize, -} - -const HEAD_BLOCK_SIZE: usize = 128; -const TAIL_BLOCK_SIZE: usize = 1024; - -impl Convolution for TwoStageFFTConvolver { - fn init(impulse_response: &[Sample], _block_size: usize, max_response_length: usize) -> Self { - let head_block_size = HEAD_BLOCK_SIZE; - let tail_block_size = TAIL_BLOCK_SIZE; - - if max_response_length < impulse_response.len() { - panic!( - "max_response_length must be at least the length of the initial impulse response" - ); - } - let mut padded_ir = impulse_response.to_vec(); - padded_ir.resize(max_response_length, 0.); - - let head_ir_len = std::cmp::min(max_response_length, tail_block_size); - let head_convolver = FFTConvolver::init( - &padded_ir[0..head_ir_len], - head_block_size, - max_response_length, - ); - - let tail_convolver0 = (max_response_length > tail_block_size) - .then(|| { - let tail_ir_len = - std::cmp::min(max_response_length - tail_block_size, tail_block_size); - FFTConvolver::init( - &padded_ir[tail_block_size..tail_block_size + tail_ir_len], - head_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output0 = vec![0.0; tail_block_size]; - let tail_precalculated0 = vec![0.0; tail_block_size]; - - let tail_convolver = (max_response_length > 2 * tail_block_size) - .then(|| { - let tail_ir_len = max_response_length - 2 * tail_block_size; - FFTConvolver::init( - &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], - tail_block_size, - max_response_length, - ) - }) - .unwrap_or_default(); - - let tail_output = vec![0.0; tail_block_size]; - let tail_precalculated = vec![0.0; tail_block_size]; - let tail_input = vec![0.0; tail_block_size]; - let tail_input_fill = 0; - let precalculated_pos = 0; - - TwoStageFFTConvolver { - head_convolver, - tail_convolver0, - tail_output0, - tail_precalculated0, - tail_convolver, - tail_output, - tail_precalculated, - tail_input, - tail_input_fill, - precalculated_pos, - } - } - - fn update(&mut self, _response: &[Sample]) { - todo!() - } - - fn process(&mut self, input: &[Sample], output: &mut [Sample]) { - // Head - self.head_convolver.process(input, output); - - // Tail - if self.tail_input.is_empty() { - return; - } - - let len = input.len(); - let mut processed = 0; - - while processed < len { - let remaining = len - processed; - let processing = std::cmp::min( - remaining, - HEAD_BLOCK_SIZE - (self.tail_input_fill % HEAD_BLOCK_SIZE), - ); - - // Sum head and tail - let sum_begin = processed; - let sum_end = processed + processing; - - // Sum: 1st tail block - if self.tail_precalculated0.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated0[precalculated_pos]; - precalculated_pos += 1; - } - } - - // Sum: 2nd-Nth tail block - if self.tail_precalculated.len() > 0 { - let mut precalculated_pos = self.precalculated_pos; - for i in sum_begin..sum_end { - output[i] += self.tail_precalculated[precalculated_pos]; - precalculated_pos += 1; - } - } - - self.precalculated_pos += processing; - - // Fill input buffer for tail convolution - self.tail_input[self.tail_input_fill..self.tail_input_fill + processing] - .copy_from_slice(&input[processed..processed + processing]); - self.tail_input_fill += processing; - - // Convolution: 1st tail block - if self.tail_precalculated0.len() > 0 && self.tail_input_fill % HEAD_BLOCK_SIZE == 0 { - assert!(self.tail_input_fill >= HEAD_BLOCK_SIZE); - let block_offset = self.tail_input_fill - HEAD_BLOCK_SIZE; - self.tail_convolver0.process( - &self.tail_input[block_offset..block_offset + HEAD_BLOCK_SIZE], - &mut self.tail_output0[block_offset..block_offset + HEAD_BLOCK_SIZE], - ); - if self.tail_input_fill == TAIL_BLOCK_SIZE { - std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); - } - } - - // Convolution: 2nd-Nth tail block (might be done in some background thread) - if self.tail_precalculated.len() > 0 - && self.tail_input_fill == TAIL_BLOCK_SIZE - && self.tail_output.len() == TAIL_BLOCK_SIZE - { - std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); - self.tail_convolver - .process(&self.tail_input, &mut self.tail_output); - } - - if self.tail_input_fill == TAIL_BLOCK_SIZE { - self.tail_input_fill = 0; - self.precalculated_pos = 0; - } - - processed += processing; - } - } -} diff --git a/src/fft_convolver/fft_convolver.rs b/src/fft_convolver/fft_convolver.rs new file mode 100644 index 0000000..336ae93 --- /dev/null +++ b/src/fft_convolver/fft_convolver.rs @@ -0,0 +1,253 @@ +use crate::fft_convolver::traits::{ComplexOps, FftBackend}; +use crate::Convolution; + +#[derive(Default, Clone)] +pub struct GenericFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + ir_len: usize, + block_size: usize, + _seg_size: usize, + seg_count: usize, + active_seg_count: usize, + _fft_complex_size: usize, + segments: Vec>, + segments_ir: Vec>, + fft_buffer: Vec, + fft: F, + pre_multiplied: Vec, + conv: Vec, + overlap: Vec, + current: usize, + input_buffer: Vec, + input_buffer_fill: usize, + _complex_ops: std::marker::PhantomData, + temp_buffer: Option>, +} + +impl> Convolution for GenericFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], block_size: usize, max_response_length: usize) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + let ir_len = padded_ir.len(); + + let block_size = block_size.next_power_of_two(); + let seg_size = 2 * block_size; + let seg_count = (ir_len as f64 / block_size as f64).ceil() as usize; + let active_seg_count = seg_count; + let fft_complex_size = C::complex_size(seg_size); + + // FFT + let mut fft = F::default(); + fft.init(seg_size); + let fft_buffer = vec![0.; seg_size]; + + // prepare segments + let mut segments = Vec::with_capacity(seg_count); + for _ in 0..seg_count { + segments.push(vec![F::Complex::default(); fft_complex_size]); + } + + let mut segments_ir = Vec::with_capacity(seg_count); + + // prepare ir + let mut ir_buffer = vec![0.0; seg_size]; + for i in 0..seg_count { + let mut segment = vec![F::Complex::default(); fft_complex_size]; + let remaining = ir_len - (i * block_size); + let size_copy = if remaining >= block_size { + block_size + } else { + remaining + }; + C::copy_and_pad(&mut ir_buffer, &padded_ir[i * block_size..], size_copy); + fft.forward(&mut ir_buffer, &mut segment).unwrap(); + segments_ir.push(segment); + } + + // prepare convolution buffers + let pre_multiplied = vec![F::Complex::default(); fft_complex_size]; + let conv = vec![F::Complex::default(); fft_complex_size]; + let overlap = vec![0.; block_size]; + + // prepare input buffer + let input_buffer = vec![0.; block_size]; + let input_buffer_fill = 0; + + // reset current position + let current = 0; + + // For IPP backend we need a temp buffer + let temp_buffer = Some(vec![F::Complex::default(); fft_complex_size]); + + Self { + ir_len, + block_size, + _seg_size: seg_size, + seg_count, + active_seg_count, + _fft_complex_size: fft_complex_size, + segments, + segments_ir, + fft_buffer, + fft, + pre_multiplied, + conv, + overlap, + current, + input_buffer, + input_buffer_fill, + _complex_ops: std::marker::PhantomData, + temp_buffer, + } + } + + fn update(&mut self, response: &[f32]) { + let new_ir_len = response.len(); + + if new_ir_len > self.ir_len { + panic!("New impulse response is longer than initialized length"); + } + + if self.ir_len == 0 { + return; + } + + C::zero_real(&mut self.fft_buffer); + C::zero_complex(&mut self.conv); + C::zero_complex(&mut self.pre_multiplied); + C::zero_real(&mut self.overlap); + + self.active_seg_count = ((new_ir_len as f64 / self.block_size as f64).ceil()) as usize; + + // Prepare IR + for i in 0..self.active_seg_count { + let segment = &mut self.segments_ir[i]; + let remaining = new_ir_len - (i * self.block_size); + let size_copy = if remaining >= self.block_size { + self.block_size + } else { + remaining + }; + C::copy_and_pad( + &mut self.fft_buffer, + &response[i * self.block_size..], + size_copy, + ); + self.fft.forward(&mut self.fft_buffer, segment).unwrap(); + } + + // Clear remaining segments + for i in self.active_seg_count..self.seg_count { + C::zero_complex(&mut self.segments_ir[i]); + } + } + + fn process(&mut self, input: &[f32], output: &mut [f32]) { + if self.active_seg_count == 0 { + C::zero_real(output); + return; + } + + let mut processed = 0; + while processed < output.len() { + let input_buffer_was_empty = self.input_buffer_fill == 0; + let processing = std::cmp::min( + output.len() - processed, + self.block_size - self.input_buffer_fill, + ); + + // Copy input to input buffer + let input_buffer_pos = self.input_buffer_fill; + C::copy_and_pad( + &mut self.input_buffer[input_buffer_pos..], + &input[processed..processed + processing], + processing, + ); + + // Forward FFT + C::copy_and_pad(&mut self.fft_buffer, &self.input_buffer, self.block_size); + if let Err(_) = self + .fft + .forward(&mut self.fft_buffer, &mut self.segments[self.current]) + { + C::zero_real(output); + return; + } + + // complex multiplication + if input_buffer_was_empty { + C::zero_complex(&mut self.pre_multiplied); + + for i in 1..self.active_seg_count { + let index_ir = i; + let index_audio = (self.current + i) % self.active_seg_count; + C::complex_multiply_accumulate( + &mut self.pre_multiplied, + &self.segments_ir[index_ir], + &self.segments[index_audio], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + } + } + + C::copy_complex(&mut self.conv, &self.pre_multiplied); + + C::complex_multiply_accumulate( + &mut self.conv, + &self.segments[self.current], + &self.segments_ir[0], + self.temp_buffer.as_mut().map(|v| v.as_mut_slice()), + ); + + // Backward FFT + if let Err(_) = self.fft.inverse(&mut self.conv, &mut self.fft_buffer) { + C::zero_real(output); + return; + } + + // Add overlap + C::sum( + &mut output[processed..processed + processing], + &self.fft_buffer[input_buffer_pos..input_buffer_pos + processing], + &self.overlap[input_buffer_pos..input_buffer_pos + processing], + ); + + // Input buffer full => Next block + self.input_buffer_fill += processing; + if self.input_buffer_fill == self.block_size { + // Input buffer is empty again now + C::zero_real(&mut self.input_buffer); + self.input_buffer_fill = 0; + + // Save the overlap + C::copy_and_pad( + &mut self.overlap, + &self.fft_buffer[self.block_size..], + self.block_size, + ); + + // Update the current segment + self.current = if self.current > 0 { + self.current - 1 + } else { + self.active_seg_count - 1 + }; + } + processed += processing; + } + } +} diff --git a/src/fft_convolver/ipp_fft.rs b/src/fft_convolver/ipp_fft.rs new file mode 100644 index 0000000..43071e6 --- /dev/null +++ b/src/fft_convolver/ipp_fft.rs @@ -0,0 +1,242 @@ +use crate::fft_convolver::{ComplexOps, FftBackend}; +use ipp_sys::*; +use num_complex::Complex32; + +pub struct Fft { + size: usize, + spec: Vec, + scratch_buffer: Vec, +} + +impl Default for Fft { + fn default() -> Self { + Self { + size: 0, + spec: Vec::new(), + scratch_buffer: Vec::new(), + } + } +} + +impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "IppFft(size: {})", self.size) + } +} + +impl Clone for Fft { + fn clone(&self) -> Self { + let mut new_fft = Fft::default(); + if self.size > 0 { + new_fft.init(self.size); + } + new_fft + } +} + +impl FftBackend for Fft { + type Complex = Complex32; + + fn init(&mut self, size: usize) { + assert!(size > 0, "FFT size must be greater than 0"); + assert!(size.is_power_of_two(), "FFT size must be a power of 2"); + + let mut spec_size = 0; + let mut init_size = 0; + let mut work_buf_size = 0; + + unsafe { + ippsDFTGetSize_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + &mut spec_size, + &mut init_size, + &mut work_buf_size, + ); + } + + let mut init_buffer = vec![0; init_size as usize]; + let scratch_buffer = vec![0; work_buf_size as usize]; + let mut spec = vec![0; spec_size as usize]; + + unsafe { + ippsDFTInit_R_32f( + size as i32, + 8, // IPP_FFT_NODIV_BY_ANY + 0, // No special hint + spec.as_mut_ptr() as *mut ipp_sys::IppsDFTSpec_R_32f, + init_buffer.as_mut_ptr(), + ); + } + + self.size = size; + self.spec = spec; + self.scratch_buffer = scratch_buffer; + } + + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box> { + if self.size == 0 { + return Err("FFT not initialized".into()); + } + + assert_eq!(input.len(), self.size, "Input length must match FFT size"); + assert_eq!( + output.len(), + self.size / 2 + 1, + "Output length must be size/2 + 1" + ); + + unsafe { + ippsDFTFwd_RToCCS_32f( + input.as_ptr(), + output.as_mut_ptr() as *mut Ipp32f, // the API takes the pointer to the first float member of the complex array + self.spec.as_ptr() as *const DFTSpec_R_32f, + self.scratch_buffer.as_mut_ptr(), + ); + } + + Ok(()) + } + + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box> { + if self.size == 0 { + return Err("FFT not initialized".into()); + } + + assert_eq!( + input.len(), + self.size / 2 + 1, + "Input length must be size/2 + 1" + ); + assert_eq!(output.len(), self.size, "Output length must match FFT size"); + + unsafe { + ippsDFTInv_CCSToR_32f( + input.as_ptr() as *const Ipp32f, // the API takes the pointer to the first float member of the complex array + output.as_mut_ptr(), + self.spec.as_ptr() as *const DFTSpec_R_32f, + self.scratch_buffer.as_mut_ptr(), + ); + } + + // Normalization + unsafe { + ippsDivC_32f_I(self.size as f32, output.as_mut_ptr(), self.size as i32); + } + + Ok(()) + } +} + +#[derive(Clone, Default)] +pub struct IppComplexOps; + +impl ComplexOps for IppComplexOps { + type Complex = Complex32; + fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size, "Destination buffer too small"); + + unsafe { + ippsCopy_32f(src.as_ptr(), dst.as_mut_ptr(), src_size as i32); + + if dst.len() > src_size { + ippsZero_32f( + dst.as_mut_ptr().add(src_size), + (dst.len() - src_size) as i32, + ); + } + } + } + + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ) { + let temp_buffer = temp_buffer.expect("IPP implementation requires a temp buffer"); + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + assert_eq!(temp_buffer.len(), a.len()); + + unsafe { + let len = result.len() as i32; + + ippsMul_32fc( + a.as_ptr() as *const ipp_sys::Ipp32fc, + b.as_ptr() as *const ipp_sys::Ipp32fc, + temp_buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); + + ippsAdd_32fc( + temp_buffer.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_ptr() as *const ipp_sys::Ipp32fc, + result.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + len, + ); + } + } + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + + unsafe { + ippsAdd_32f( + a.as_ptr(), + b.as_ptr(), + result.as_mut_ptr(), + result.len() as i32, + ); + } + } + + fn zero_complex(buffer: &mut [Self::Complex]) { + unsafe { + ippsZero_32fc( + buffer.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + buffer.len() as i32, + ); + } + } + + fn zero_real(buffer: &mut [f32]) { + unsafe { + ippsZero_32f(buffer.as_mut_ptr(), buffer.len() as i32); + } + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + unsafe { + ippsCopy_32fc( + src.as_ptr() as *const ipp_sys::Ipp32fc, + dst.as_mut_ptr() as *mut ipp_sys::Ipp32fc, + dst.len() as i32, + ); + } + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + unsafe { + ippsAdd_32f_I(src.as_ptr(), dst.as_mut_ptr(), dst.len() as i32); + } + } +} + +pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = + crate::fft_convolver::two_stage_convolver::GenericTwoStageFFTConvolver; diff --git a/src/fft_convolver/mod.rs b/src/fft_convolver/mod.rs new file mode 100644 index 0000000..68f8436 --- /dev/null +++ b/src/fft_convolver/mod.rs @@ -0,0 +1,15 @@ +pub mod fft_convolver; +#[cfg(feature = "ipp")] +pub mod ipp_fft; +pub mod rust_fft; +pub mod traits; +pub mod two_stage_convolver; + +use fft_convolver::GenericFFTConvolver; +use traits::{ComplexOps, FftBackend}; + +#[cfg(not(feature = "ipp"))] +pub use rust_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; + +#[cfg(feature = "ipp")] +pub use ipp_fft::{FFTConvolver, Fft, TwoStageFFTConvolver}; diff --git a/src/fft_convolver/rust_fft.rs b/src/fft_convolver/rust_fft.rs new file mode 100644 index 0000000..30ccb87 --- /dev/null +++ b/src/fft_convolver/rust_fft.rs @@ -0,0 +1,167 @@ +use crate::fft_convolver::{ComplexOps, FftBackend}; +use realfft::{ComplexToReal, RealFftPlanner, RealToComplex}; +use rustfft::num_complex::Complex; +use std::sync::Arc; + +#[derive(Clone)] +pub struct Fft { + fft_forward: Arc>, + fft_inverse: Arc>, +} + +impl Default for Fft { + fn default() -> Self { + let mut planner = RealFftPlanner::::new(); + Self { + fft_forward: planner.plan_fft_forward(0), + fft_inverse: planner.plan_fft_inverse(0), + } + } +} + +impl std::fmt::Debug for Fft { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "") + } +} + +impl FftBackend for Fft { + type Complex = Complex; + + fn init(&mut self, length: usize) { + let mut planner = RealFftPlanner::::new(); + self.fft_forward = planner.plan_fft_forward(length); + self.fft_inverse = planner.plan_fft_inverse(length); + } + + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box> { + self.fft_forward + .process(input, output) + .map_err(|e| e.into()) + } + + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box> { + self.fft_inverse.process(input, output)?; + + // FFT Normalization + let len = output.len(); + output.iter_mut().for_each(|bin| *bin /= len as f32); + + Ok(()) + } +} + +#[derive(Clone, Default)] +pub struct RustComplexOps; + +impl ComplexOps for RustComplexOps { + type Complex = Complex; + + fn complex_size(size: usize) -> usize { + (size / 2) + 1 + } + + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize) { + assert!(dst.len() >= src_size); + dst[0..src_size].clone_from_slice(&src[0..src_size]); + dst[src_size..].iter_mut().for_each(|value| *value = 0.); + } + + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + _temp_buffer: Option<&mut [Self::Complex]>, + ) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 4 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0].re += a[i + 0].re * b[i + 0].re - a[i + 0].im * b[i + 0].im; + result[i + 1].re += a[i + 1].re * b[i + 1].re - a[i + 1].im * b[i + 1].im; + result[i + 2].re += a[i + 2].re * b[i + 2].re - a[i + 2].im * b[i + 2].im; + result[i + 3].re += a[i + 3].re * b[i + 3].re - a[i + 3].im * b[i + 3].im; + result[i + 0].im += a[i + 0].re * b[i + 0].im + a[i + 0].im * b[i + 0].re; + result[i + 1].im += a[i + 1].re * b[i + 1].im + a[i + 1].im * b[i + 1].re; + result[i + 2].im += a[i + 2].re * b[i + 2].im + a[i + 2].im * b[i + 2].re; + result[i + 3].im += a[i + 3].re * b[i + 3].im + a[i + 3].im * b[i + 3].re; + } + for i in end4..len { + result[i].re += a[i].re * b[i].re - a[i].im * b[i].im; + result[i].im += a[i].re * b[i].im + a[i].im * b[i].re; + } + } + + fn sum(result: &mut [f32], a: &[f32], b: &[f32]) { + assert_eq!(result.len(), a.len()); + assert_eq!(result.len(), b.len()); + let len = result.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + result[i + 0] = a[i + 0] + b[i + 0]; + result[i + 1] = a[i + 1] + b[i + 1]; + result[i + 2] = a[i + 2] + b[i + 2]; + result[i + 3] = a[i + 3] + b[i + 3]; + } + for i in end4..len { + result[i] = a[i] + b[i]; + } + } + + fn add_to_buffer(dst: &mut [f32], src: &[f32]) { + assert_eq!(dst.len(), src.len()); + let len = dst.len(); + let end4 = 3 * (len / 4); + for i in (0..end4).step_by(4) { + dst[i + 0] += src[i + 0]; + dst[i + 1] += src[i + 1]; + dst[i + 2] += src[i + 2]; + dst[i + 3] += src[i + 3]; + } + for i in end4..len { + dst[i] += src[i]; + } + } + + fn zero_complex(buffer: &mut [Self::Complex]) { + buffer.fill(Complex { re: 0.0, im: 0.0 }); + } + + fn zero_real(buffer: &mut [f32]) { + buffer.fill(0.0); + } + + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]) { + dst.clone_from_slice(src); + } +} + +pub type FFTConvolver = crate::fft_convolver::GenericFFTConvolver; +pub type TwoStageFFTConvolver = + crate::fft_convolver::two_stage_convolver::GenericTwoStageFFTConvolver; + +// impl TwoStageFFTConvolver { +// pub fn with_block_sizes( +// impulse_response: &[f32], +// max_response_length: usize, +// head_block_size: usize, +// tail_block_size: usize, +// ) -> Self { +// TwoStageFFTConvolver::init( +// impulse_response, +// head_block_size, +// max_response_length, +// head_block_size, +// tail_block_size, +// ) +// } +// } diff --git a/src/fft_convolver/traits.rs b/src/fft_convolver/traits.rs new file mode 100644 index 0000000..3d7e835 --- /dev/null +++ b/src/fft_convolver/traits.rs @@ -0,0 +1,33 @@ +pub trait FftBackend: Clone { + type Complex; + + fn init(&mut self, size: usize); + fn forward( + &mut self, + input: &mut [f32], + output: &mut [Self::Complex], + ) -> Result<(), Box>; + fn inverse( + &mut self, + input: &mut [Self::Complex], + output: &mut [f32], + ) -> Result<(), Box>; +} + +pub trait ComplexOps: Clone { + type Complex: Clone + Default; + + fn complex_size(size: usize) -> usize; + fn copy_and_pad(dst: &mut [f32], src: &[f32], src_size: usize); + fn complex_multiply_accumulate( + result: &mut [Self::Complex], + a: &[Self::Complex], + b: &[Self::Complex], + temp_buffer: Option<&mut [Self::Complex]>, + ); + fn sum(result: &mut [f32], a: &[f32], b: &[f32]); + fn zero_complex(buffer: &mut [Self::Complex]); + fn zero_real(buffer: &mut [f32]); + fn copy_complex(dst: &mut [Self::Complex], src: &[Self::Complex]); + fn add_to_buffer(dst: &mut [f32], src: &[f32]); +} diff --git a/src/fft_convolver/two_stage_convolver.rs b/src/fft_convolver/two_stage_convolver.rs new file mode 100644 index 0000000..68e597f --- /dev/null +++ b/src/fft_convolver/two_stage_convolver.rs @@ -0,0 +1,209 @@ +use crate::fft_convolver::fft_convolver::GenericFFTConvolver; +use crate::fft_convolver::traits::{ComplexOps, FftBackend}; +use crate::Convolution; + +const DEFAULT_HEAD_BLOCK_SIZE: usize = 128; +const DEFAULT_TAIL_BLOCK_SIZE: usize = 1024; + +#[derive(Clone)] +pub struct GenericTwoStageFFTConvolver> +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + head_convolver: GenericFFTConvolver, + tail_convolver0: GenericFFTConvolver, + tail_output0: Vec, + tail_precalculated0: Vec, + tail_convolver: GenericFFTConvolver, + tail_output: Vec, + tail_precalculated: Vec, + tail_input: Vec, + tail_input_fill: usize, + precalculated_pos: usize, + head_block_size: usize, + tail_block_size: usize, +} + +impl> GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + pub fn with_sizes( + impulse_response: &[f32], + _block_size: usize, + max_response_length: usize, + head_block_size: usize, + tail_block_size: usize, + ) -> Self { + if max_response_length < impulse_response.len() { + panic!( + "max_response_length must be at least the length of the initial impulse response" + ); + } + let mut padded_ir = impulse_response.to_vec(); + padded_ir.resize(max_response_length, 0.); + + let head_ir_len = std::cmp::min(max_response_length, tail_block_size); + let head_convolver = GenericFFTConvolver::::init( + &padded_ir[0..head_ir_len], + head_block_size, + max_response_length, + ); + + let tail_convolver0 = if max_response_length > tail_block_size { + let tail_ir_len = std::cmp::min(max_response_length - tail_block_size, tail_block_size); + GenericFFTConvolver::::init( + &padded_ir[tail_block_size..tail_block_size + tail_ir_len], + head_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output0 = vec![0.0; tail_block_size]; + let tail_precalculated0 = vec![0.0; tail_block_size]; + + let tail_convolver = if max_response_length > 2 * tail_block_size { + let tail_ir_len = max_response_length - 2 * tail_block_size; + GenericFFTConvolver::::init( + &padded_ir[2 * tail_block_size..2 * tail_block_size + tail_ir_len], + tail_block_size, + max_response_length, + ) + } else { + GenericFFTConvolver::::default() + }; + + let tail_output = vec![0.0; tail_block_size]; + let tail_precalculated = vec![0.0; tail_block_size]; + let tail_input = vec![0.0; tail_block_size]; + let tail_input_fill = 0; + let precalculated_pos = 0; + + GenericTwoStageFFTConvolver { + head_convolver, + tail_convolver0, + tail_output0, + tail_precalculated0, + tail_convolver, + tail_output, + tail_precalculated, + tail_input, + tail_input_fill, + precalculated_pos, + head_block_size, + tail_block_size, + } + } +} + +impl> Convolution + for GenericTwoStageFFTConvolver +where + C: Default, + F: Default, + F::Complex: Clone + Default, +{ + fn init(impulse_response: &[f32], _block_size: usize, max_response_length: usize) -> Self { + Self::with_sizes( + impulse_response, + _block_size, + max_response_length, + DEFAULT_HEAD_BLOCK_SIZE, + DEFAULT_TAIL_BLOCK_SIZE, + ) + } + + fn update(&mut self, _response: &[f32]) { + todo!() + } + + fn process(&mut self, input: &[f32], output: &mut [f32]) { + let head_block_size = self.head_block_size; + let tail_block_size = self.tail_block_size; + + // Head + self.head_convolver.process(input, output); + + // Tail + if self.tail_input.is_empty() { + return; + } + + let len = input.len(); + let mut processed = 0; + + while processed < len { + let remaining = len - processed; + let processing = std::cmp::min( + remaining, + head_block_size - (self.tail_input_fill % head_block_size), + ); + + // Sum head and tail + let sum_begin = processed; + + // Sum: 1st tail block + if !self.tail_precalculated0.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated0 + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + + // Sum: 2nd-Nth tail block + if !self.tail_precalculated.is_empty() { + C::add_to_buffer( + &mut output[sum_begin..sum_begin + processing], + &self.tail_precalculated + [self.precalculated_pos..self.precalculated_pos + processing], + ); + } + self.precalculated_pos += processing; + + // Fill input buffer for tail convolution + C::copy_and_pad( + &mut self.tail_input[self.tail_input_fill..], + &input[processed..processed + processing], + processing, + ); + self.tail_input_fill += processing; + + // Convolution: 1st tail block + if !self.tail_precalculated0.is_empty() && self.tail_input_fill % head_block_size == 0 { + assert!(self.tail_input_fill >= head_block_size); + let block_offset = self.tail_input_fill - head_block_size; + self.tail_convolver0.process( + &self.tail_input[block_offset..block_offset + head_block_size], + &mut self.tail_output0[block_offset..block_offset + head_block_size], + ); + if self.tail_input_fill == tail_block_size { + std::mem::swap(&mut self.tail_precalculated0, &mut self.tail_output0); + } + } + + // Convolution: 2nd-Nth tail block (might be done in some background thread) + if !self.tail_precalculated.is_empty() + && self.tail_input_fill == tail_block_size + && self.tail_output.len() == tail_block_size + { + std::mem::swap(&mut self.tail_precalculated, &mut self.tail_output); + self.tail_convolver + .process(&self.tail_input, &mut self.tail_output); + } + + if self.tail_input_fill == tail_block_size { + self.tail_input_fill = 0; + self.precalculated_pos = 0; + } + + processed += processing; + } + } +} diff --git a/src/tests.rs b/src/tests.rs index 160aac4..7cbb439 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -1,9 +1,23 @@ #[cfg(test)] mod tests { use crate::crossfade_convolver::CrossfadeConvolver; - use crate::fft_convolver::FFTConvolver; + use crate::fft_convolver::{FFTConvolver, TwoStageFFTConvolver}; use crate::{Convolution, Sample}; + #[test] + fn test_fft_convolver_passthrough() { + let mut response = [0.0; 1024]; + response[0] = 1.0; + let mut convolver = FFTConvolver::init(&response, 1024, response.len()); + let input = vec![1.0; 1024]; + let mut output = vec![0.0; 1024]; + convolver.process(&input, &mut output); + + for i in 0..1024 { + assert!((output[i] - 1.0).abs() < 1e-6); + } + } + fn generate_sinusoid( length: usize, frequency: f32, @@ -18,6 +32,38 @@ mod tests { signal } + #[test] + fn two_stage_convolver_head_tail_sizes_test() { + let block_size = 32; + let input_size = block_size * 2; + let fir_size = 4096; + let mut response = vec![0.0; fir_size]; + response[63] = 1.0; + let mut two_stage_convolver_1 = + TwoStageFFTConvolver::with_sizes(&response, block_size, fir_size, 64, 1024); + let mut two_stage_convolver_2 = + TwoStageFFTConvolver::with_sizes(&response, block_size, fir_size, 32, 1024); + let mut input = vec![0.0; input_size]; + input[0] = 1.0; + let mut output_a = vec![0.0; input_size]; + let mut output_b = vec![0.0; input_size]; + let first_half_input = input[..block_size].as_ref(); + let second_half_input = input[block_size..].as_ref(); + let first_half_output_a = &mut output_a[..block_size]; + let first_half_output_b = &mut output_b[..block_size]; + two_stage_convolver_1.process(first_half_input, first_half_output_a); + two_stage_convolver_2.process(first_half_input, first_half_output_b); + let second_half_output_a = &mut output_a[block_size..]; + let second_half_output_b = &mut output_b[block_size..]; + two_stage_convolver_1.process(second_half_input, second_half_output_a); + two_stage_convolver_2.process(second_half_input, second_half_output_b); + for i in 0..output_a.len() { + assert!((output_a[i] - output_b[i]).abs() < 0.000001); + } + assert!((output_a[63] - 1.0).abs() < 0.000001); + assert!((output_b[63] - 1.0).abs() < 0.000001); + } + #[test] fn fft_convolver_update_is_reset() { let block_size = 512; @@ -118,4 +164,67 @@ mod tests { } } } + + #[cfg(feature = "ipp")] + mod ipp_comparison_tests { + use crate::{fft_convolver::ipp_fft, fft_convolver::rust_fft, Convolution}; + + fn compare_implementations(impulse_response: &[f32], input: &[f32], block_size: usize) { + let max_len = impulse_response.len(); + + let mut rust_convolver = + rust_fft::FFTConvolver::init(impulse_response, block_size, max_len); + let mut ipp_convolver = + ipp_fft::FFTConvolver::init(impulse_response, block_size, max_len); + + let mut rust_output = vec![0.0; input.len()]; + let mut ipp_output = vec![0.0; input.len()]; + + rust_convolver.process(input, &mut rust_output); + ipp_convolver.process(input, &mut ipp_output); + + for i in 0..input.len() { + assert!( + (rust_output[i] - ipp_output[i]).abs() < 1e-5, + "Outputs differ at position {}: rust={}, ipp={}", + i, + rust_output[i], + ipp_output[i] + ); + } + } + + #[test] + fn test_ipp_vs_rust_impulse() { + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_decay() { + let mut response = vec![0.0; 1024]; + for i in 0..response.len() { + response[i] = 0.9f32.powi(i as i32); + } + let input = vec![1.0; 1024]; + + compare_implementations(&response, &input, 256); + } + + #[test] + fn test_ipp_vs_rust_sine() { + let mut response = vec![0.0; 1024]; + response[0] = 1.0; + + let mut input = vec![0.0; 1024]; + for i in 0..input.len() { + input[i] = (i as f32 * 0.1).sin(); + } + + compare_implementations(&response, &input, 128); + } + } }