[DAP] Update 'dap-op-fir-benchmark' to support vectorized FIR algorithms. (#160)

taiqzheng · web-flow · commit 1f9b40a2e330 · 2024-12-08T16:29:33.000+08:00
* [DAP] Update 'dap.fir' benchmark.

* [DAP] Update 'README.md' file for 'dap.fir' operation.
diff --git a/benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt b/benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #-------------------------------------------------------------------------------
-# Generate MLIRFIR
+# Generate MLIRFIRScalar
 #-------------------------------------------------------------------------------
 
 add_custom_command(
@@ -25,8 +25,70 @@ add_custom_command(
     ${LLVM_MLIR_BINARY_DIR}/llc 
 )
 
-add_library(MLIRFIR STATIC mlir-fir.o)
-set_target_properties(MLIRFIR PROPERTIES LINKER_LANGUAGE CXX)
+add_library(MLIRFIRScalar STATIC mlir-fir.o)
+set_target_properties(MLIRFIRScalar PROPERTIES LINKER_LANGUAGE CXX)
+
+#-------------------------------------------------------------------------------
+# Generate MLIRFIRTiledVectorization
+#-------------------------------------------------------------------------------
+
+add_custom_command(
+  OUTPUT fir-tile-vectorization.o  # Relative OUTPUT paths resolve against CMAKE_CURRENT_BINARY_DIR.
+  COMMAND  # Pipeline: buddy-opt lowers to the LLVM dialect, mlir-translate emits LLVM IR, llc emits the object.
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+      ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir
+      -convert-scf-to-cf
+      -convert-vector-to-llvm
+      -llvm-request-c-wrappers
+      -convert-arith-to-llvm
+      -finalize-memref-to-llvm
+      -convert-func-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc
+      -mtriple=${BUDDY_OPT_TRIPLE}
+      -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj
+      -o ${CMAKE_CURRENT_BINARY_DIR}/fir-tile-vectorization.o
+  DEPENDS ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate
+    ${LLVM_MLIR_BINARY_DIR}/llc
+)
+
+add_library(MLIRFIRTiledVectorization STATIC fir-tile-vectorization.o)
+set_target_properties(MLIRFIRTiledVectorization PROPERTIES LINKER_LANGUAGE CXX)
+
+#-------------------------------------------------------------------------------
+# Generate MLIRFIRVectorization
+#-------------------------------------------------------------------------------
+
+add_custom_command(
+  OUTPUT fir-vectorization.o  # Relative OUTPUT paths resolve against CMAKE_CURRENT_BINARY_DIR.
+  COMMAND  # Pipeline: buddy-opt lowers to the LLVM dialect, mlir-translate emits LLVM IR, llc emits the object.
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+      ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir
+      -convert-scf-to-cf
+      -convert-vector-to-llvm
+      -llvm-request-c-wrappers
+      -convert-arith-to-llvm
+      -finalize-memref-to-llvm
+      -convert-func-to-llvm
+      -reconcile-unrealized-casts |
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+    ${LLVM_MLIR_BINARY_DIR}/llc
+      -mtriple=${BUDDY_OPT_TRIPLE}
+      -mattr=${BUDDY_OPT_ATTR}
+      -filetype=obj
+      -o ${CMAKE_CURRENT_BINARY_DIR}/fir-vectorization.o
+  DEPENDS ${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir
+    ${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
+    ${LLVM_MLIR_BINARY_DIR}/mlir-translate
+    ${LLVM_MLIR_BINARY_DIR}/llc
+)
+
+add_library(MLIRFIRVectorization STATIC fir-vectorization.o)
+set_target_properties(MLIRFIRVectorization PROPERTIES LINKER_LANGUAGE CXX)
 
 #-------------------------------------------------------------------------------
 # Generate dap-op-fir-benchmark
@@ -43,7 +105,9 @@ target_link_libraries(dap-op-fir-benchmark PRIVATE
   # Third-party library
   kfr_io
   # MLIR hand-written benchmark
-  MLIRFIR
+  MLIRFIRScalar
+  MLIRFIRTiledVectorization
+  MLIRFIRVectorization
   # Buddy DAP library
   BuddyLibDAP
   # LLVM/MLIR library
diff --git a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir
@@ -18,7 +18,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-func.func @mlir_fir(%input : memref<?xf32>, %kernel : memref<?xf32>, 
+func.func @fir_scalar(%input : memref<?xf32>, %kernel : memref<?xf32>, 
                     %output : memref<?xf32>) -> () {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
diff --git a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir
@@ -0,0 +1,126 @@
+//===- MLIRFIRTiledVectorization.mlir -------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the vectorized MLIR FIR function with tiling.
+//
+//===----------------------------------------------------------------------===//
+
+// Tail processing for the tiled FIR: handles input elements from `%input_offset` onward.
+func.func @tail_processing(%input : memref<?xf32>, %kernel : memref<?xf32>, 
+                           %output : memref<?xf32>, %input_offset : index) -> () {
+  // 1. Get the total length of the workload.
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %input_size = memref.dim %input, %c0 : memref<?xf32>
+  %kernel_size = memref.dim %kernel, %c0 : memref<?xf32>
+
+  // 2. Set the iteration step (vector size).
+  %vl_step = arith.constant 16 : index
+  %vl_step_minus_1 = arith.subi %vl_step, %c1 : index
+
+  // 3. Calculate the upper bound for vectorized processing.
+  // - Subtracting `vl_step` keeps the last 16-element vector access within `input_size`.
+  // - Adding 1 ensures the final vector iteration still runs when the workload
+  //   length is divisible by the vector size.
+  %upbound_ = arith.subi %input_size, %vl_step : index
+  %upbound_init = arith.addi %upbound_, %c1 : index
+
+  // 4. Loop through each kernel element; `%upbound` shrinks by one per element.
+  scf.for %n = %c0 to %kernel_size step %c1 
+    iter_args(%upbound = %upbound_init) -> (index) {
+    %k_elem = memref.load %kernel[%n] : memref<?xf32>
+    %k_vec = vector.splat %k_elem : vector<16xf32>
+
+    // 5. Vectorized body: output[i+n .. i+n+15] += kernel[n] * input[i .. i+15].
+    %iter_idx = scf.for %i = %input_offset to %upbound step %vl_step // Starts at `input_offset` instead of `0`.
+        iter_args(%iter_init = %input_offset) -> (index) {
+      %in_vec = vector.load %input[%i] : memref<?xf32>, vector<16xf32>
+      %out_index = arith.addi %i, %n : index
+      %out_vec = vector.load %output[%out_index] : memref<?xf32>, vector<16xf32>  // Output index is offset by the kernel index `n`.
+      %fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32>
+      vector.store %fma_vec, %output[%out_index] : memref<?xf32>, vector<16xf32>
+      %i_next = arith.addi %i, %vl_step : index
+      scf.yield %i_next : index
+    }
+
+    // 6. Process the remaining elements with scalar operations, starting at `%iter_idx` (the first index the vector loop did not cover).
+    %upbound_scalar = arith.addi %upbound, %vl_step_minus_1 : index
+    scf.for %i = %iter_idx to %upbound_scalar step %c1 {
+      %in_elem = memref.load %input[%i] : memref<?xf32>
+      %out_index = arith.addi %i, %n : index
+      %out_elem = memref.load %output[%out_index] : memref<?xf32>  // Output index is offset by `n`.
+      %mul_elem = arith.mulf %in_elem, %k_elem : f32
+      %add_elem = arith.addf %mul_elem, %out_elem : f32
+      memref.store %add_elem, %output[%out_index] : memref<?xf32>  // Store back at the offset output index.
+    }
+
+    %upbound_next = arith.subi %upbound, %c1 : index
+    scf.yield %upbound_next : index
+  }
+
+  return 
+}
+
+func.func @fir_tiled_vectorization(%input : memref<?xf32>, %kernel : memref<?xf32>, 
+                                   %output : memref<?xf32>) -> () {
+  // 1. Get the total length of the workload.
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %input_size = memref.dim %input, %c0 : memref<?xf32>
+  %kernel_size = memref.dim %kernel, %c0 : memref<?xf32>
+
+  // 2. Set the iteration step (vector size) and the tile size.
+  %vl_step = arith.constant 16 : index
+  %vl_step_minus_1 = arith.subi %vl_step, %c1 : index  // NOTE(review): not referenced again in this function.
+
+  %tile_step = arith.constant 2048 : index
+
+  // 3. Calculate the upper bound for tiled, vectorized processing.
+  // Number of input elements used by the last kernel element (the shortest range): input_size - kernel_size + 1.
+  %last_kernel_element_used_input_size_ = arith.subi %input_size, %kernel_size : index
+  %last_kernel_element_used_input_size = arith.addi %last_kernel_element_used_input_size_, %c1 : index
+
+  %input_upbound_ = arith.subi %last_kernel_element_used_input_size, %tile_step : index
+  %input_upbound = arith.addi %input_upbound_, %c1 : index
+
+  // 4. Tiling: each tile is computed entirely with vector ops (tile size 2048 is a multiple of vector size 16, so no remainder).
+  // The loop result is the input offset at which tail processing starts.
+  %input_offset = scf.for %address = %c0 to %input_upbound step %tile_step 
+      iter_args(%offset = %c0) -> (index) {
+    %upbound = arith.addi %address, %tile_step : index
+
+    scf.for %n = %c0 to %kernel_size step %c1 {
+      %k_elem = memref.load %kernel[%n] : memref<?xf32>
+      %k_vec = vector.splat %k_elem : vector<16xf32>
+
+      // 5. Vectorized body: output[i+n .. i+n+15] += kernel[n] * input[i .. i+15].
+      scf.for %i = %address to %upbound step %vl_step {
+        %in_vec = vector.load %input[%i] : memref<?xf32>, vector<16xf32>
+        %out_index = arith.addi %i, %n : index
+        %out_vec = vector.load %output[%out_index] : memref<?xf32>, vector<16xf32>  // Output index is offset by the kernel index `n`.
+        %fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32>
+        vector.store %fma_vec, %output[%out_index] : memref<?xf32>, vector<16xf32>
+      }
+    }
+
+    scf.yield %upbound : index
+  }
+
+  // 6. Tail processing, beginning at `input[input_offset]`.
+  call @tail_processing(%input, %kernel, %output, %input_offset) : (memref<?xf32>, memref<?xf32>, memref<?xf32>, index) -> ()
+
+  return
+}
diff --git a/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir b/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir
@@ -0,0 +1,74 @@
+//===- MLIRFIRVectorization.mlir ------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the vectorized MLIR FIR function (without tiling).
+//
+//===----------------------------------------------------------------------===//
+
+func.func @fir_vectorization(%input : memref<?xf32>, %kernel : memref<?xf32>, 
+                             %output : memref<?xf32>) -> () {
+  // 1. Get the total length of the workload.
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %input_size = memref.dim %input, %c0 : memref<?xf32>
+  %kernel_size = memref.dim %kernel, %c0 : memref<?xf32>
+
+  // 2. Set the iteration step (vector size).
+  %vl_step = arith.constant 16 : index
+  %vl_step_minus_1 = arith.subi %vl_step, %c1 : index
+
+  // 3. Calculate the upper bound for vectorized processing.
+  // - Subtracting `vl_step` keeps the last 16-element vector access within `input_size`.
+  // - Adding 1 ensures the final vector iteration still runs when the workload
+  //   length is divisible by the vector size.
+  %upbound_ = arith.subi %input_size, %vl_step : index
+  %upbound_init = arith.addi %upbound_, %c1 : index
+
+  // 4. Loop through each kernel element; `%upbound` shrinks by one per element.
+  scf.for %n = %c0 to %kernel_size step %c1 
+    iter_args(%upbound = %upbound_init) -> (index) {
+    %k_elem = memref.load %kernel[%n] : memref<?xf32>
+    %k_vec = vector.splat %k_elem : vector<16xf32>
+
+    // 5. Vectorized body: output[i+n .. i+n+15] += kernel[n] * input[i .. i+15].
+    %iter_idx = scf.for %i = %c0 to %upbound step %vl_step 
+        iter_args(%iter_init = %c0) -> (index) {
+      %in_vec = vector.load %input[%i] : memref<?xf32>, vector<16xf32>
+      %out_index = arith.addi %i, %n : index
+      %out_vec = vector.load %output[%out_index] : memref<?xf32>, vector<16xf32>
+      %fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32>
+      vector.store %fma_vec, %output[%out_index] : memref<?xf32>, vector<16xf32>
+      %i_next = arith.addi %i, %vl_step : index
+      scf.yield %i_next : index
+    }
+
+    // 6. Process the remaining elements with scalar operations, starting at `%iter_idx` (the first index the vector loop did not cover).
+    %upbound_scalar = arith.addi %upbound, %vl_step_minus_1 : index
+    scf.for %i = %iter_idx to %upbound_scalar step %c1 {
+      %in_elem = memref.load %input[%i] : memref<?xf32>
+      %out_index = arith.addi %i, %n : index
+      %out_elem = memref.load %output[%out_index] : memref<?xf32>
+      %mul_elem = arith.mulf %in_elem, %k_elem : f32
+      %add_elem = arith.addf %mul_elem, %out_elem : f32
+      memref.store %add_elem, %output[%out_index] : memref<?xf32>
+    }
+
+    %upbound_next = arith.subi %upbound, %c1 : index  // Keeps `i + n` below `input_size` for the next kernel element.
+    scf.yield %upbound_next : index
+  }
+
+  return
+}
diff --git a/benchmarks/AudioProcessing/Operations/FIROp/Main.cpp b/benchmarks/AudioProcessing/Operations/FIROp/Main.cpp
@@ -50,6 +50,9 @@ MemRef<float, 1> kernelRef(sizeofKernel);
 using MLIRFunctionType = void (*)(MemRef<float, 1> *, MemRef<float, 1> *,
                                   MemRef<float, 1> *);
 
+using BuddyFunctionType = void (*)(MemRef<float, 1> *, MemRef<float, 1> *,
+                                   MemRef<float, 1> *, bool); // Called below as (input, kernel, output, isVectorization).
+
 // Benchmarking function for MLIR based FIR method.
 void DAP_OPS_FIR(benchmark::State &state, MLIRFunctionType func) {
   MemRef<float, 1> resRef(sizeofAud, 0.0);
@@ -59,6 +62,15 @@ void DAP_OPS_FIR(benchmark::State &state, MLIRFunctionType func) {
   benchmark::DoNotOptimize(resRef);
 }
 
+void DAP_OPS_FIR(benchmark::State &state, BuddyFunctionType func,
+                 bool isVectorization) { // Benchmarks `func` with the extra `isVectorization` flag.
+  MemRef<float, 1> resRef(sizeofAud, 0.0); // Output buffer, created once outside the timed loop.
+  for (auto _ : state) {
+    func(&audRef, &kernelRef, &resRef, isVectorization); // Same buffer is passed on every iteration.
+  }
+  benchmark::DoNotOptimize(resRef); // Marks the result as used after the loop.
+}
+
 // Benchmarking function for KFR FIR method.
 static void KFR_FIR(benchmark::State &state) {
   for (auto _ : state) {
@@ -76,14 +88,34 @@ void Verification(const univector<float, _IN_OUT_SIZE> &outputExpected,
   firOp::verify(outputExpected, outputGenerated, _IN_OUT_SIZE, name);
 }
 
+void Verification(const univector<float, _IN_OUT_SIZE> &outputExpected,
+                  BuddyFunctionType BuddyFunc, bool isVectorization,
+                  const std::string &name) {
+  // Run `BuddyFunc` into a zero-initialized MemRef, then print it and verify it against `outputExpected`.
+  MemRef<float, 1> outputGenerated(sizeofAud, 0.0);
+  BuddyFunc(&audRef, &kernelRef, &outputGenerated, isVectorization);
+  firOp::printMemRef(outputGenerated, name, /*doPrint=*/_PRINT);
+  firOp::verify(outputExpected, outputGenerated, _IN_OUT_SIZE, name);
+}
+
 // -----------------------------------------------------------------------------
 // Register Benchmark.
 // -----------------------------------------------------------------------------
 
-BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_fir, _mlir_ciface_mlir_fir)
+BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_scalar, _mlir_ciface_fir_scalar)
+    ->Unit(benchmark::kMillisecond)
+    ->Iterations(_NUM_ITER);
+BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_scalar, dap::FIR<float, 1>, false)
+    ->Unit(benchmark::kMillisecond)
+    ->Iterations(_NUM_ITER);
+BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_vectorize, _mlir_ciface_fir_vectorization)
+    ->Unit(benchmark::kMillisecond)
+    ->Iterations(_NUM_ITER);
+BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_tiled_vectorize,
+                  _mlir_ciface_fir_tiled_vectorization)
     ->Unit(benchmark::kMillisecond)
     ->Iterations(_NUM_ITER);
-BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_fir, dap::FIR<float, 1>)
+BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_tiled_vectorize, dap::FIR<float, 1>, true)
     ->Unit(benchmark::kMillisecond)
     ->Iterations(_NUM_ITER);
 BENCHMARK(KFR_FIR)->Unit(benchmark::kMillisecond)->Iterations(_NUM_ITER);
@@ -109,8 +141,14 @@ int main(int argc, char **argv) {
   firOp::printUnivector(firOutput, /*doPrint=*/_PRINT);
 
   // Verify the correctness of all methods.
-  Verification(firOutput, dap::FIR<float, 1>, "Buddy");
-  Verification(firOutput, _mlir_ciface_mlir_fir, "MLIR");
+  Verification(firOutput, _mlir_ciface_fir_scalar, "MLIRScalar");
+  Verification(firOutput, dap::FIR<float, 1>, /*isVectorization=*/false,
+               "BuddyScalar");
+  Verification(firOutput, _mlir_ciface_fir_vectorization, "MLIRVectorize");
+  Verification(firOutput, _mlir_ciface_fir_tiled_vectorization,
+               "MLIRTiledVectorize");
+  Verification(firOutput, dap::FIR<float, 1>, /*isVectorization=*/true,
+               "BuddyTiledVectorize");
 
   return 0;
 }
diff --git a/benchmarks/AudioProcessing/Operations/FIROp/Utils.hpp b/benchmarks/AudioProcessing/Operations/FIROp/Utils.hpp
diff --git a/benchmarks/AudioProcessing/README.md b/benchmarks/AudioProcessing/README.md