diff --git a/benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt b/benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt index 2dab3a8b..b65c00c8 100644 --- a/benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt +++ b/benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt @@ -1,94 +1,96 @@ #------------------------------------------------------------------------------- -# Generate MLIRFIRScalar +# Generate BuddyFIRTilesVectorization #------------------------------------------------------------------------------- +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +function(build_buddy_tile_vectorization vector_size tile_size unroll_factor) + add_custom_command( + OUTPUT buddy_vec_${vector_size}_tile_${tile_size}_unroll_${unroll_factor}.o + COMMAND + cat ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/FIR.mlir | + sed -e 's/@buddy_fir_f32/@buddy_fir_vs_${vector_size}_ts_${tile_size}_uf_${unroll_factor}_f32/g' + -e 's/@buddy_fir_f64/@buddy_fir_vs_${vector_size}_ts_${tile_size}_uf_${unroll_factor}_f64/g' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -vectorize-dap="fir-vec-size=${vector_size};fir-tile-size=${tile_size};fir-unroll-factor=${unroll_factor}" + -convert-scf-to-cf + -convert-vector-to-llvm + -llvm-request-c-wrappers + -convert-arith-to-llvm + -finalize-memref-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc -O3 + -mtriple=${BUDDY_OPT_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} + -filetype=obj + -o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/buddy_vec_${vector_size}_tile_${tile_size}_unroll_${unroll_factor}.o + DEPENDS + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + ${LLVM_MLIR_BINARY_DIR}/mlir-translate + ${LLVM_MLIR_BINARY_DIR}/llc + ) + add_library(BuddyVs${vector_size}Ts${tile_size}Uf${unroll_factor} STATIC buddy_vec_${vector_size}_tile_${tile_size}_unroll_${unroll_factor}.o) + 
set_target_properties(BuddyVs${vector_size}Ts${tile_size}Uf${unroll_factor} PROPERTIES LINKER_LANGUAGE CXX) +endfunction() -add_custom_command( - OUTPUT mlir-fir.o - COMMAND - ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir - -convert-scf-to-cf - -llvm-request-c-wrappers - -convert-arith-to-llvm - -finalize-memref-to-llvm - -convert-func-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc - -mtriple=${BUDDY_OPT_TRIPLE} - -mattr=${BUDDY_OPT_ATTR} - -filetype=obj - -o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/mlir-fir.o - DEPENDS - ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - ${LLVM_MLIR_BINARY_DIR}/mlir-translate - ${LLVM_MLIR_BINARY_DIR}/llc -) - -add_library(MLIRFIRScalar STATIC mlir-fir.o) -set_target_properties(MLIRFIRScalar PROPERTIES LINKER_LANGUAGE CXX) - -#------------------------------------------------------------------------------- -# Generate MLIRFIRTiledVectorization -#------------------------------------------------------------------------------- - -add_custom_command( - OUTPUT fir-tile-vectorization.o - COMMAND - ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir - -convert-scf-to-cf - -convert-vector-to-llvm - -llvm-request-c-wrappers - -convert-arith-to-llvm - -finalize-memref-to-llvm - -convert-func-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc - -mtriple=${BUDDY_OPT_TRIPLE} - -mattr=${BUDDY_OPT_ATTR} - -filetype=obj - -o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/fir-tile-vectorization.o - DEPENDS - ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - ${LLVM_MLIR_BINARY_DIR}/mlir-translate - ${LLVM_MLIR_BINARY_DIR}/llc -) - -add_library(MLIRFIRTiledVectorization STATIC fir-tile-vectorization.o) 
-set_target_properties(MLIRFIRTiledVectorization PROPERTIES LINKER_LANGUAGE CXX) +build_buddy_tile_vectorization(8 64 1) +build_buddy_tile_vectorization(8 128 1) +build_buddy_tile_vectorization(8 256 1) +build_buddy_tile_vectorization(8 512 1) +build_buddy_tile_vectorization(8 1024 1) +build_buddy_tile_vectorization(8 2048 1) +build_buddy_tile_vectorization(8 4096 1) +build_buddy_tile_vectorization(8 8192 1) +build_buddy_tile_vectorization(16 64 1) +build_buddy_tile_vectorization(16 128 1) +build_buddy_tile_vectorization(16 216 1) +build_buddy_tile_vectorization(16 240 1) +build_buddy_tile_vectorization(16 256 1) +build_buddy_tile_vectorization(16 512 1) +build_buddy_tile_vectorization(16 512 2) +build_buddy_tile_vectorization(16 512 4) +build_buddy_tile_vectorization(16 512 8) +build_buddy_tile_vectorization(16 512 16) +build_buddy_tile_vectorization(16 1024 1) +build_buddy_tile_vectorization(16 2048 1) +build_buddy_tile_vectorization(16 4096 1) +build_buddy_tile_vectorization(16 8192 1) #------------------------------------------------------------------------------- # Generate MLIRFIRVectorization #------------------------------------------------------------------------------- -add_custom_command( - OUTPUT fir-vectorization.o - COMMAND - ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir - -convert-scf-to-cf - -convert-vector-to-llvm - -llvm-request-c-wrappers - -convert-arith-to-llvm - -finalize-memref-to-llvm - -convert-func-to-llvm - -reconcile-unrealized-casts | - ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_MLIR_BINARY_DIR}/llc - -mtriple=${BUDDY_OPT_TRIPLE} - -mattr=${BUDDY_OPT_ATTR} - -filetype=obj - -o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/fir-vectorization.o - DEPENDS - ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt - ${LLVM_MLIR_BINARY_DIR}/mlir-translate - ${LLVM_MLIR_BINARY_DIR}/llc -) +function(build_fir_vectorization type) + 
add_custom_command( + OUTPUT fir-vectorization-${type}.o + COMMAND + cat ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir | + sed 's/TYPE_PLACEHOLDER/${type}/g' | + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + -convert-scf-to-cf + -convert-vector-to-llvm + -llvm-request-c-wrappers + -convert-arith-to-llvm + -finalize-memref-to-llvm + -convert-func-to-llvm + -reconcile-unrealized-casts | + ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir | + ${LLVM_MLIR_BINARY_DIR}/llc + -mtriple=${BUDDY_OPT_TRIPLE} + -mattr=${BUDDY_OPT_ATTR} + -filetype=obj + -o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/fir-vectorization-${type}.o + DEPENDS + ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt + ${LLVM_MLIR_BINARY_DIR}/mlir-translate + ${LLVM_MLIR_BINARY_DIR}/llc + ) + add_library(MLIRFIRVectorization${type} STATIC fir-vectorization-${type}.o) + set_target_properties(MLIRFIRVectorization${type} PROPERTIES LINKER_LANGUAGE CXX) +endfunction() -add_library(MLIRFIRVectorization STATIC fir-vectorization.o) -set_target_properties(MLIRFIRVectorization PROPERTIES LINKER_LANGUAGE CXX) +build_fir_vectorization(f32) +build_fir_vectorization(f64) #------------------------------------------------------------------------------- # Generate dap-op-fir-benchmark @@ -105,10 +107,31 @@ target_link_libraries(dap-op-fir-benchmark PRIVATE # Third-party library kfr_io # MLIR hand-written benchmark - MLIRFIRScalar - MLIRFIRTiledVectorization - MLIRFIRVectorization + MLIRFIRVectorizationf32 + MLIRFIRVectorizationf64 # Buddy DAP library + BuddyVs8Ts64Uf1 + BuddyVs8Ts128Uf1 + BuddyVs8Ts256Uf1 + BuddyVs8Ts512Uf1 + BuddyVs8Ts1024Uf1 + BuddyVs8Ts2048Uf1 + BuddyVs8Ts4096Uf1 + BuddyVs8Ts8192Uf1 + BuddyVs16Ts64Uf1 + BuddyVs16Ts128Uf1 + BuddyVs16Ts216Uf1 + BuddyVs16Ts240Uf1 + BuddyVs16Ts256Uf1 + BuddyVs16Ts512Uf1 + BuddyVs16Ts512Uf2 + BuddyVs16Ts512Uf4 + BuddyVs16Ts512Uf8 + BuddyVs16Ts512Uf16 + BuddyVs16Ts1024Uf1 + BuddyVs16Ts2048Uf1 + BuddyVs16Ts4096Uf1 + 
BuddyVs16Ts8192Uf1 BuddyLibDAP # LLVM/MLIR library StaticMLIRCRunnerUtils diff --git a/benchmarks/AudioProcessing/Operations/FIROp/FIR.mlir b/benchmarks/AudioProcessing/Operations/FIROp/FIR.mlir new file mode 100644 index 00000000..758151ff --- /dev/null +++ b/benchmarks/AudioProcessing/Operations/FIROp/FIR.mlir @@ -0,0 +1,29 @@ +//===- FIR.mlir -----------------------------------------------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file provides the `dap.fir` operations with various types. +// +//===----------------------------------------------------------------------===// + +func.func @buddy_fir_f32(%in : memref, %filter : memref, %out : memref) -> () { + dap.fir %in, %filter, %out : memref, memref, memref + return +} + +func.func @buddy_fir_f64(%in : memref, %filter : memref, %out : memref) -> () { + dap.fir %in, %filter, %out : memref, memref, memref + return +} diff --git a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir index 24c331ef..9f7886fd 100644 --- a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir +++ b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir @@ -14,7 +14,8 @@ // //===----------------------------------------------------------------------===// // -// This file provides the MLIR Fir function. 
+// This file implements the scalar version of the Fir function, following the +// same algorithm as Buddy's scalar version DAP pass: `--lower-dap`. // //===----------------------------------------------------------------------===// diff --git a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir index acf339e5..ca1553b7 100644 --- a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir +++ b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir @@ -14,7 +14,9 @@ // //===----------------------------------------------------------------------===// // -// This file provides the vectorized MLIR FIR function with tiling. +// This file implements the vectorized FIR function using a tiling technique, +// following the same algorithm as Buddy's vectorize DAP pass: +// `--vectorize-dap="fir-vec-size=16 fir-tile-size=2048"` // //===----------------------------------------------------------------------===// @@ -110,7 +112,7 @@ func.func @fir_tiled_vectorization(%input : memref, %kernel : memref, vector<16xf32> %out_index = arith.addi %i, %n : index - %out_vec = vector.load %output[%out_index] : memref, vector<16xf32> // 需要计算output的偏移量 + %out_vec = vector.load %output[%out_index] : memref, vector<16xf32> %fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32> vector.store %fma_vec, %output[%out_index] : memref, vector<16xf32> } diff --git a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir index 0a65ef27..f1e1b45e 100644 --- a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir +++ b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir @@ -14,17 +14,18 @@ // //===----------------------------------------------------------------------===// // -// This file provides the vectorized MLIR FIR function (without 
tiling). +// This file implements the vectorized MLIR FIR function (without tiling), +// with a fixed vector size of 16. // //===----------------------------------------------------------------------===// -func.func @fir_vectorization(%input : memref, %kernel : memref, - %output : memref) -> () { +func.func @fir_vector_TYPE_PLACEHOLDER(%input : memref, + %kernel : memref, %output : memref) -> () { // 1. Get the total length of the workload. %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %input_size = memref.dim %input, %c0 : memref - %kernel_size = memref.dim %kernel, %c0 : memref + %input_size = memref.dim %input, %c0 : memref + %kernel_size = memref.dim %kernel, %c0 : memref // 2. Set the iteration step (vector size). %vl_step = arith.constant 16 : index @@ -40,17 +41,17 @@ func.func @fir_vectorization(%input : memref, %kernel : memref, // 4. Loop through each kernel element scf.for %n = %c0 to %kernel_size step %c1 iter_args(%upbound = %upbound_init) -> (index) { - %k_elem = memref.load %kernel[%n] : memref - %k_vec = vector.splat %k_elem : vector<16xf32> + %k_elem = memref.load %kernel[%n] : memref + %k_vec = vector.splat %k_elem : vector<16xTYPE_PLACEHOLDER> // 5. Perform the vectorization body. 
%iter_idx = scf.for %i = %c0 to %upbound step %vl_step iter_args(%iter_init = %c0) -> (index) { - %in_vec = vector.load %input[%i] : memref, vector<16xf32> + %in_vec = vector.load %input[%i] : memref, vector<16xTYPE_PLACEHOLDER> %out_index = arith.addi %i, %n : index - %out_vec = vector.load %output[%out_index] : memref, vector<16xf32> - %fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32> - vector.store %fma_vec, %output[%out_index] : memref, vector<16xf32> + %out_vec = vector.load %output[%out_index] : memref, vector<16xTYPE_PLACEHOLDER> + %fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xTYPE_PLACEHOLDER> + vector.store %fma_vec, %output[%out_index] : memref, vector<16xTYPE_PLACEHOLDER> %i_next = arith.addi %i, %vl_step : index scf.yield %i_next : index } @@ -58,12 +59,12 @@ func.func @fir_vectorization(%input : memref, %kernel : memref, // 6. Process the remainder of the elements with scalar operations. %upbound_scalar = arith.addi %upbound, %vl_step_minus_1 : index scf.for %i = %iter_idx to %upbound_scalar step %c1 { - %in_elem = memref.load %input[%i] : memref + %in_elem = memref.load %input[%i] : memref %out_index = arith.addi %i, %n : index - %out_elem = memref.load %output[%out_index] : memref - %mul_elem = arith.mulf %in_elem, %k_elem : f32 - %add_elem = arith.addf %mul_elem, %out_elem : f32 - memref.store %add_elem, %output[%out_index] : memref + %out_elem = memref.load %output[%out_index] : memref + %mul_elem = arith.mulf %in_elem, %k_elem : TYPE_PLACEHOLDER + %add_elem = arith.addf %mul_elem, %out_elem : TYPE_PLACEHOLDER + memref.store %add_elem, %output[%out_index] : memref } %upbound_next = arith.subi %upbound, %c1 : index diff --git a/benchmarks/AudioProcessing/Operations/FIROp/Main.cpp b/benchmarks/AudioProcessing/Operations/FIROp/Main.cpp index 55cb467a..4b0cc09d 100644 --- a/benchmarks/AudioProcessing/Operations/FIROp/Main.cpp +++ b/benchmarks/AudioProcessing/Operations/FIROp/Main.cpp @@ -20,6 +20,7 @@ #include "Utils.hpp" 
#include +#include using namespace std; @@ -28,72 +29,118 @@ using namespace std; // ----------------------------------------------------------------------------- #define _NUM_ITER 10 -#define _IN_OUT_SIZE 2000000 -#define _KERNEL_SIZE 127 -#define _PRINT true +#define _IN_OUT_SIZE 20000000 +#define _FILTER_SIZE 127 +#define _PRINT false // ----------------------------------------------------------------------------- // Global Variables and Functions. Please do not modify the code here. // ----------------------------------------------------------------------------- -univector firInput; -univector firOutput; -univector taps127; +univector firInput_f32, firOutput_f32; +univector firFilter_f32; +univector firInput_f64, firOutput_f64; +univector firFilter_f64; intptr_t sizeofAud[1] = {_IN_OUT_SIZE}; -intptr_t sizeofKernel[1] = {_KERNEL_SIZE}; +intptr_t sizeofKernel[1] = {_FILTER_SIZE}; -MemRef audRef(sizeofAud); -MemRef resRef(sizeofAud); -MemRef kernelRef(sizeofKernel); +MemRef in_f32(sizeofAud), filt_f32(sizeofKernel), out_f32(sizeofAud); +MemRef in_f64(sizeofAud), filt_f64(sizeofKernel), out_f64(sizeofAud); -using MLIRFunctionType = void (*)(MemRef *, MemRef *, - MemRef *); +template +using MLIRFunctionType = void (*)(MemRef *, MemRef *, + MemRef *); -using BuddyFunctionType = void (*)(MemRef *, MemRef *, - MemRef *, bool); +template +using BuddyFunctionType = void (*)(MemRef *, MemRef *, + MemRef *, bool); // Benchmarking function for MLIR based FIR method. 
-void DAP_OPS_FIR(benchmark::State &state, MLIRFunctionType func) { - MemRef resRef(sizeofAud, 0.0); - for (auto _ : state) { - func(&audRef, &kernelRef, &resRef); +template +static void DAP_OPS_FIR(benchmark::State &state, MLIRFunctionType func) { + static_assert(std::is_same::value || std::is_same::value, + "T must be either float (f32) or double (f64)."); + if constexpr (std::is_same::value) { + MemRef out_f32(sizeofAud, 0.0); + for (auto _ : state) { + func(&in_f32, &filt_f32, &out_f32); + } + benchmark::DoNotOptimize(out_f32); + } else if constexpr (std::is_same::value) { + MemRef out_f64(sizeofAud, 0.0); + for (auto _ : state) { + func(&in_f64, &filt_f64, &out_f64); + } + benchmark::DoNotOptimize(out_f64); } - benchmark::DoNotOptimize(resRef); } -void DAP_OPS_FIR(benchmark::State &state, BuddyFunctionType func, - bool isVectorization) { - MemRef resRef(sizeofAud, 0.0); - for (auto _ : state) { - func(&audRef, &kernelRef, &resRef, isVectorization); +template +static void DAP_OPS_FIR(benchmark::State &state, BuddyFunctionType func, + bool isVectorization) { + static_assert(std::is_same::value || std::is_same::value, + "T must be either float (f32) or double (f64)."); + if constexpr (std::is_same::value) { + MemRef out_f32(sizeofAud, 0.0); + for (auto _ : state) { + func(&in_f32, &filt_f32, &out_f32, isVectorization); + } + benchmark::DoNotOptimize(out_f32); + } else if constexpr (std::is_same::value) { + MemRef out_f64(sizeofAud, 0.0); + for (auto _ : state) { + func(&in_f64, &filt_f64, &out_f64, isVectorization); + } + benchmark::DoNotOptimize(out_f64); } - benchmark::DoNotOptimize(resRef); } // Benchmarking function for KFR FIR method. 
-static void KFR_FIR(benchmark::State &state) { +static void KFR_FIR_f32(benchmark::State &state) { + for (auto _ : state) { + firOutput_f32 = kfr::fir(firInput_f32, firFilter_f32); + } + benchmark::DoNotOptimize(firOutput_f32); +} + +static void KFR_FIR_f64(benchmark::State &state) { for (auto _ : state) { - firOutput = kfr::fir(firInput, taps127); + firOutput_f64 = kfr::fir(firInput_f64, firFilter_f64); } + benchmark::DoNotOptimize(firOutput_f64); } // Verifies the result of an MLIR-based function against expected output. -void Verification(const univector &outputExpected, - MLIRFunctionType MLIRFunc, const std::string &name) { +template +void Verification(const univector &outputExpected, + MLIRFunctionType MLIRFunc, const std::string &name) { + static_assert(std::is_same::value || std::is_same::value, + "T must be either float (f32) or double (f64)."); // Initialize MemRef with all zeros. - MemRef outputGenerated(sizeofAud, 0.0); - MLIRFunc(&audRef, &kernelRef, &outputGenerated); + MemRef outputGenerated(sizeofAud, 0.0); + if constexpr (std::is_same::value) { + MLIRFunc(&in_f32, &filt_f32, &outputGenerated); + } else if constexpr (std::is_same::value) { + MLIRFunc(&in_f64, &filt_f64, &outputGenerated); + } firOp::printMemRef(outputGenerated, name, /*doPrint=*/_PRINT); firOp::verify(outputExpected, outputGenerated, _IN_OUT_SIZE, name); } -void Verification(const univector &outputExpected, - BuddyFunctionType BuddyFunc, bool isVectorization, +template +void Verification(const univector &outputExpected, + BuddyFunctionType BuddyFunc, bool isVectorization, const std::string &name) { + static_assert(std::is_same::value || std::is_same::value, + "T must be either float (f32) or double (f64)."); // Initialize MemRef with all zeros. 
- MemRef outputGenerated(sizeofAud, 0.0); - BuddyFunc(&audRef, &kernelRef, &outputGenerated, isVectorization); + MemRef outputGenerated(sizeofAud, 0.0); + if constexpr (std::is_same::value) { + BuddyFunc(&in_f32, &filt_f32, &outputGenerated, isVectorization); + } else if constexpr (std::is_same::value) { + BuddyFunc(&in_f64, &filt_f64, &outputGenerated, isVectorization); + } firOp::printMemRef(outputGenerated, name, /*doPrint=*/_PRINT); firOp::verify(outputExpected, outputGenerated, _IN_OUT_SIZE, name); } @@ -102,23 +149,93 @@ void Verification(const univector &outputExpected, // Register Benchmark. // ----------------------------------------------------------------------------- -BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_scalar, _mlir_ciface_fir_scalar) +// Benchmarks with f32/float type. +// BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_scalar_f32, dap::FIR, false) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_vector_f32, _mlir_ciface_fir_vector_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); +BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_tiled_vector_f32, dap::FIR, true) + ->Unit(benchmark::kMillisecond) + ->Iterations(20); +// BENCHMARK(KFR_FIR_f32)->Unit(benchmark::kMillisecond)->Iterations(_NUM_ITER); +// // Benchmarks with f64/double type. +// BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_scalar_f64, dap::FIR, false) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_vector_f64, _mlir_ciface_fir_vector_f64) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_tiled_vector_f64, dap::FIR, +// true) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); +// BENCHMARK(KFR_FIR_f64)->Unit(benchmark::kMillisecond)->Iterations(_NUM_ITER); +// // Benchmarks for vector size and tile size. 
+// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts64_f32, _mlir_ciface_buddy_fir_vs_16_ts_64_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts128_f32, _mlir_ciface_buddy_fir_vs_16_ts_128_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(_NUM_ITER); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs8_ts256_f32, _mlir_ciface_buddy_fir_vs_8_ts_256_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts216_f32, _mlir_ciface_buddy_fir_vs_16_ts_216_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts240_f32, _mlir_ciface_buddy_fir_vs_16_ts_240_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts256_f32, _mlir_ciface_buddy_fir_vs_16_ts_256_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, vec16_tile512_uf1_f32, _mlir_ciface_buddy_fir_vs_16_ts_512_uf_1_f32) ->Unit(benchmark::kMillisecond) - ->Iterations(_NUM_ITER); -BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_scalar, dap::FIR, false) + ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, vec16_tile512_uf2_f32, _mlir_ciface_buddy_fir_vs_16_ts_512_uf_2_f32) ->Unit(benchmark::kMillisecond) - ->Iterations(_NUM_ITER); -BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_vectorize, _mlir_ciface_fir_vectorization) + ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, vec16_tile512_uf4_f32, _mlir_ciface_buddy_fir_vs_16_ts_512_uf_4_f32) ->Unit(benchmark::kMillisecond) - ->Iterations(_NUM_ITER); -BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_tiled_vectorize, - _mlir_ciface_fir_tiled_vectorization) + ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, vec16_tile512_uf8_f32, _mlir_ciface_buddy_fir_vs_16_ts_512_uf_8_f32) ->Unit(benchmark::kMillisecond) - ->Iterations(_NUM_ITER); -BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_tiled_vectorize, dap::FIR, true) + ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, 
vec16_tile512_uf16_f32, _mlir_ciface_buddy_fir_vs_16_ts_512_uf_16_f32) ->Unit(benchmark::kMillisecond) - ->Iterations(_NUM_ITER); -BENCHMARK(KFR_FIR)->Unit(benchmark::kMillisecond)->Iterations(_NUM_ITER); + ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_tiled_vector_f32, dap::FIR, true) + ->Unit(benchmark::kMillisecond) + ->Iterations(20); +BENCHMARK_CAPTURE(DAP_OPS_FIR, vec16_tile1024_uf1_f32, _mlir_ciface_buddy_fir_vs_16_ts_1024_uf_1_f32) + ->Unit(benchmark::kMillisecond) + ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts2048_f32, _mlir_ciface_buddy_fir_vs_16_ts_2048_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts4096_f32, _mlir_ciface_buddy_fir_vs_16_ts_4096_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts8192_f32, _mlir_ciface_buddy_fir_vs_16_ts_8192_uf_1_f32) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs8_ts256_f64, _mlir_ciface_buddy_fir_vs_8_ts_256_uf_1_f64) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs8_ts128_f64, _mlir_ciface_buddy_fir_vs_8_ts_128_uf_1_f64) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs8_ts512_f64, _mlir_ciface_buddy_fir_vs_8_ts_512_uf_1_f64) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs8_ts1024_f64, _mlir_ciface_buddy_fir_vs_8_ts_1024_uf_1_f64) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); +// BENCHMARK_CAPTURE(DAP_OPS_FIR, vs16_ts512_f64, _mlir_ciface_buddy_fir_vs_16_ts_512_uf_1_f64) +// ->Unit(benchmark::kMillisecond) +// ->Iterations(20); // ----------------------------------------------------------------------------- // Main Function. 
@@ -126,10 +243,12 @@ BENCHMARK(KFR_FIR)->Unit(benchmark::kMillisecond)->Iterations(_NUM_ITER); int main(int argc, char **argv) { // Initialize univectors and MemRefs. - firOp::initializeKFRFIR(firInput, taps127); - audRef = std::move(MemRef(firInput.data(), sizeofAud)); - kernelRef = std::move(MemRef(taps127.data(), sizeofKernel)); - resRef = std::move(MemRef(sizeofAud, 0.0)); + firOp::initializeKFRFIR(firInput_f32, firFilter_f32); + in_f32 = std::move(MemRef(firInput_f32.data(), sizeofAud)); + filt_f32 = std::move(MemRef(firFilter_f32.data(), sizeofKernel)); + firOp::initializeKFRFIR(firInput_f64, firFilter_f64); + in_f64 = std::move(MemRef(firInput_f64.data(), sizeofAud)); + filt_f64 = std::move(MemRef(firFilter_f64.data(), sizeofKernel)); // Run benchmark. ::benchmark::Initialize(&argc, argv); @@ -137,18 +256,23 @@ int main(int argc, char **argv) { std::cout << "\033[34m---------- Verification ----------\033[0m" << std::endl; // Obtain KFR output results as expected results in verification. - firOutput = kfr::fir(firInput, taps127); - firOp::printUnivector(firOutput, /*doPrint=*/_PRINT); + firOutput_f32 = kfr::fir(firInput_f32, firFilter_f32); + firOp::printUnivector(firOutput_f32, "KFRF32", /*doPrint=*/_PRINT); + firOutput_f64 = kfr::fir(firInput_f64, firFilter_f64); + firOp::printUnivector(firOutput_f64, "KFRF64", /*doPrint=*/_PRINT); // Verify the correctness of all methods. 
- Verification(firOutput, _mlir_ciface_fir_scalar, "MLIRScalar"); - Verification(firOutput, dap::FIR, /*isVectorization=*/false, - "BuddyScalar"); - Verification(firOutput, _mlir_ciface_fir_vectorization, "MLIRVectorize"); - Verification(firOutput, _mlir_ciface_fir_tiled_vectorization, - "MLIRTiledVectorize"); - Verification(firOutput, dap::FIR, /*isVectorization=*/true, - "BuddyTiledVectorize"); + Verification(firOutput_f32, dap::FIR, /*isVectorization=*/false, + "BuddyScalarF32"); + Verification(firOutput_f32, _mlir_ciface_fir_vector_f32, "MLIRVectorizeF32"); + Verification(firOutput_f32, dap::FIR, /*isVectorization=*/true, + "BuddyTiledVectorizeF32"); + + Verification(firOutput_f64, dap::FIR, /*isVectorization=*/false, + "BuddyScalarF64"); + Verification(firOutput_f64, _mlir_ciface_fir_vector_f64, "MLIRVectorizeF64"); + Verification(firOutput_f64, dap::FIR, /*isVectorization=*/true, + "BuddyTiledVectorizeF64"); return 0; } diff --git a/benchmarks/AudioProcessing/Operations/FIROp/Utils.hpp b/benchmarks/AudioProcessing/Operations/FIROp/Utils.hpp index fafe4449..ef9e092a 100644 --- a/benchmarks/AudioProcessing/Operations/FIROp/Utils.hpp +++ b/benchmarks/AudioProcessing/Operations/FIROp/Utils.hpp @@ -38,15 +38,72 @@ using namespace kfr; // ----------------------------------------------------------------------------- extern "C" { -void _mlir_ciface_fir_scalar(MemRef *inputMLIRFIR, - MemRef *kernelMLIRFIR, - MemRef *outputMLIRFIR); -void _mlir_ciface_fir_vectorization(MemRef *inputMLIRFIR, - MemRef *kernelMLIRFIR, - MemRef *outputMLIRFIR); -void _mlir_ciface_fir_tiled_vectorization(MemRef *inputMLIRFIR, - MemRef *kernelMLIRFIR, - MemRef *outputMLIRFIR); +void _mlir_ciface_fir_vector_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_fir_vector_f64(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_8_ts_256_uf_1_f64(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef 
*outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_8_ts_128_uf_1_f64(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_8_ts_512_uf_1_f64(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_8_ts_1024_uf_1_f64(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_512_uf_1_f64(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_64_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_128_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_216_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_240_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_256_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_8_ts_256_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_512_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_512_uf_2_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_512_uf_4_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_512_uf_8_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_512_uf_16_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_1024_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); 
+void _mlir_ciface_buddy_fir_vs_16_ts_2048_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_4096_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); +void _mlir_ciface_buddy_fir_vs_16_ts_8192_uf_1_f32(MemRef *inputMLIRFIR, + MemRef *kernelMLIRFIR, + MemRef *outputMLIRFIR); } // ----------------------------------------------------------------------------- @@ -70,12 +127,14 @@ void initializeKFRFIR(univector &input, univector &kernel) { // Print KFR univector result. template -void printUnivector(const univector &result, bool doPrint = false) { +void printUnivector(const univector &result, const std::string &name = "", + bool doPrint = false) { if (!doPrint) return; - std::ofstream file("KFRFIRResult.txt"); + std::string fileName = name + "FIRResult.txt"; + std::ofstream file(fileName); if (file.is_open()) { - file << "[ KFR FIR Result Information ]" << std::endl; + file << "[ " << name << " FIR Result Information ]" << std::endl; for (size_t i = 0; i < result.size(); ++i) { file << result[i] << std::endl; } @@ -108,7 +167,7 @@ void printMemRef(const MemRef &result, const std::string &name = "", // Verify correctness of KFR vs. MLIR results using direct error. template -void verify(const univector &A, const MemRef &B, size_t size, +void verify(const univector &A, const MemRef &B, size_t size, const std::string &name) { // Tolerance for floating point comparison const double epsilon = 1e-2;