Skip to content

Commit 1f9b40a

Browse files
authored
[DAP] Update 'dap-op-fir-benchmark' to support vectorized FIR algorithms. (#160)
* [DAP] Update 'dap.fir' benchmark. * [DAP] Update 'README.md' file for 'dap.fir' operation.
1 parent 69f518c commit 1f9b40a

File tree

7 files changed

+323
-15
lines changed

7 files changed

+323
-15
lines changed

benchmarks/AudioProcessing/Operations/FIROp/CMakeLists.txt

+68-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#-------------------------------------------------------------------------------
2-
# Generate MLIRFIR
2+
# Generate MLIRFIRScalar
33
#-------------------------------------------------------------------------------
44

55
add_custom_command(
@@ -25,8 +25,70 @@ add_custom_command(
2525
${LLVM_MLIR_BINARY_DIR}/llc
2626
)
2727

28-
add_library(MLIRFIR STATIC mlir-fir.o)
29-
set_target_properties(MLIRFIR PROPERTIES LINKER_LANGUAGE CXX)
28+
add_library(MLIRFIRScalar STATIC mlir-fir.o)
29+
set_target_properties(MLIRFIRScalar PROPERTIES LINKER_LANGUAGE CXX)
30+
31+
#-------------------------------------------------------------------------------
32+
# Generate MLIRFIRTiledVectorization
33+
#-------------------------------------------------------------------------------
34+
35+
# Lower MLIRFIRTiledVectorization.mlir to an object file: buddy-opt runs the
# listed conversion passes, mlir-translate emits LLVM IR, and llc compiles it
# for the configured triple/attributes.
# NOTE(review): DEPENDS lists only the three tools, not the .mlir input, so
# editing the .mlir file alone presumably does not re-run this command - confirm.
add_custom_command(
36+
OUTPUT fir-tile-vectorization.o
37+
COMMAND
38+
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
39+
${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRTiledVectorization.mlir
40+
-convert-scf-to-cf
41+
-convert-vector-to-llvm
42+
-llvm-request-c-wrappers
43+
-convert-arith-to-llvm
44+
-finalize-memref-to-llvm
45+
-convert-func-to-llvm
46+
-reconcile-unrealized-casts |
47+
${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
48+
${LLVM_MLIR_BINARY_DIR}/llc
49+
-mtriple=${BUDDY_OPT_TRIPLE}
50+
-mattr=${BUDDY_OPT_ATTR}
51+
-filetype=obj
52+
-o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/fir-tile-vectorization.o
53+
DEPENDS
54+
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
55+
${LLVM_MLIR_BINARY_DIR}/mlir-translate
56+
${LLVM_MLIR_BINARY_DIR}/llc
57+
)
58+
59+
# Wrap the generated object in a static library; the linker language is set
# explicitly because the library has no compiled source files of its own.
add_library(MLIRFIRTiledVectorization STATIC fir-tile-vectorization.o)
60+
set_target_properties(MLIRFIRTiledVectorization PROPERTIES LINKER_LANGUAGE CXX)
61+
62+
#-------------------------------------------------------------------------------
63+
# Generate MLIRFIRVectorization
64+
#-------------------------------------------------------------------------------
65+
66+
# Lower MLIRFIRVectorization.mlir to an object file: buddy-opt runs the listed
# conversion passes, mlir-translate emits LLVM IR, and llc compiles it for the
# configured triple/attributes.
# NOTE(review): DEPENDS lists only the three tools, not the .mlir input, so
# editing the .mlir file alone presumably does not re-run this command - confirm.
add_custom_command(
67+
OUTPUT fir-vectorization.o
68+
COMMAND
69+
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
70+
${BUDDY_SOURCE_DIR}/benchmarks/AudioProcessing/Operations/FIROp/MLIRFIRVectorization.mlir
71+
-convert-scf-to-cf
72+
-convert-vector-to-llvm
73+
-llvm-request-c-wrappers
74+
-convert-arith-to-llvm
75+
-finalize-memref-to-llvm
76+
-convert-func-to-llvm
77+
-reconcile-unrealized-casts |
78+
${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
79+
${LLVM_MLIR_BINARY_DIR}/llc
80+
-mtriple=${BUDDY_OPT_TRIPLE}
81+
-mattr=${BUDDY_OPT_ATTR}
82+
-filetype=obj
83+
-o ${BUDDY_BINARY_DIR}/../benchmarks/AudioProcessing/Operations/FIROp/fir-vectorization.o
84+
DEPENDS
85+
${BUDDY_MLIR_BUILD_DIR}/bin/buddy-opt
86+
${LLVM_MLIR_BINARY_DIR}/mlir-translate
87+
${LLVM_MLIR_BINARY_DIR}/llc
88+
)
89+
90+
# Wrap the generated object in a static library; the linker language is set
# explicitly because the library has no compiled source files of its own.
add_library(MLIRFIRVectorization STATIC fir-vectorization.o)
91+
set_target_properties(MLIRFIRVectorization PROPERTIES LINKER_LANGUAGE CXX)
3092

3193
#-------------------------------------------------------------------------------
3294
# Generate dap-op-fir-benchmark
@@ -43,7 +105,9 @@ target_link_libraries(dap-op-fir-benchmark PRIVATE
43105
# Third-party library
44106
kfr_io
45107
# MLIR hand-written benchmark
46-
MLIRFIR
108+
MLIRFIRScalar
109+
MLIRFIRTiledVectorization
110+
MLIRFIRVectorization
47111
# Buddy DAP library
48112
BuddyLibDAP
49113
# LLVM/MLIR library

benchmarks/AudioProcessing/Operations/FIROp/MLIRFIR.mlir

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//
1919
//===----------------------------------------------------------------------===//
2020

21-
func.func @mlir_fir(%input : memref<?xf32>, %kernel : memref<?xf32>,
21+
func.func @fir_scalar(%input : memref<?xf32>, %kernel : memref<?xf32>,
2222
%output : memref<?xf32>) -> () {
2323
%c0 = arith.constant 0 : index
2424
%c1 = arith.constant 1 : index
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
//===- MLIRFIRTiledVectorization.mlir -------------------------------------===//
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
//
17+
// This file provides the vectorized MLIR FIR function with tiling.
18+
//
19+
//===----------------------------------------------------------------------===//
20+
21+
// Tail processing for the tiled FIR vectorization algorithm.
// For every kernel index `n`, accumulates `kernel[n] * input[i]` into the
// existing contents of `output[i + n]`, for input indices starting at
// `input_offset`. A 16-lane vector loop runs first; a scalar loop then handles
// the elements the vector loop could not cover.
22+
func.func @tail_processing(%input : memref<?xf32>, %kernel : memref<?xf32>,
23+
%output : memref<?xf32>, %input_offset : index) -> () {
24+
// 1. Get the total length of the workload.
25+
%c0 = arith.constant 0 : index
26+
%c1 = arith.constant 1 : index
27+
%input_size = memref.dim %input, %c0 : memref<?xf32>
28+
%kernel_size = memref.dim %kernel, %c0 : memref<?xf32>
29+
30+
// 2. Set the iteration step (vector size).
31+
%vl_step = arith.constant 16 : index
32+
%vl_step_minus_1 = arith.subi %vl_step, %c1 : index
33+
34+
// 3. Calculate the upper bound for vectorized processing.
35+
// - Subtracting `vl_step` keeps the last full vector inside the buffer.
36+
// - Adding 1 ensures the final vector iteration still runs when the workload
37+
// length is divisible by the vector size.
38+
%upbound_ = arith.subi %input_size, %vl_step : index
39+
%upbound_init = arith.addi %upbound_, %c1 : index
40+
41+
// 4. Loop through each kernel element. The vector upper bound is carried as a
// loop argument and shrinks by one per kernel element (see the end of the body).
42+
scf.for %n = %c0 to %kernel_size step %c1
43+
iter_args(%upbound = %upbound_init) -> (index) {
44+
%k_elem = memref.load %kernel[%n] : memref<?xf32>
45+
%k_vec = vector.splat %k_elem : vector<16xf32>
46+
47+
// 5. Perform the vectorization body. The loop yields the next unprocessed
// input index, which is where the scalar remainder loop starts.
48+
%iter_idx = scf.for %i = %input_offset to %upbound step %vl_step // The start point is `input_offset` instead of `0`.
49+
iter_args(%iter_init = %input_offset) -> (index) {
50+
%in_vec = vector.load %input[%i] : memref<?xf32>, vector<16xf32>
51+
%out_index = arith.addi %i, %n : index
52+
%out_vec = vector.load %output[%out_index] : memref<?xf32>, vector<16xf32> // The output index is the input index shifted by `n`.
53+
%fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32>
54+
vector.store %fma_vec, %output[%out_index] : memref<?xf32>, vector<16xf32>
55+
%i_next = arith.addi %i, %vl_step : index
56+
scf.yield %i_next : index
57+
}
58+
59+
// 6. Process the remainder of the elements with scalar operations.
// `upbound + (vl_step - 1)` equals `input_size - n`, the scalar upper bound.
60+
%upbound_scalar = arith.addi %upbound, %vl_step_minus_1 : index
61+
scf.for %i = %iter_idx to %upbound_scalar step %c1 {
62+
%in_elem = memref.load %input[%i] : memref<?xf32>
63+
%out_index = arith.addi %i, %n : index
64+
%out_elem = memref.load %output[%out_index] : memref<?xf32> // The output index is the input index shifted by `n`.
65+
%mul_elem = arith.mulf %in_elem, %k_elem : f32
66+
%add_elem = arith.addf %mul_elem, %out_elem : f32
67+
memref.store %add_elem, %output[%out_index] : memref<?xf32> // Store back at the shifted output index.
68+
}
69+
70+
// Shrink the bound by one for the next kernel element so that the largest
// `i + n` written stays the same as `n` grows.
%upbound_next = arith.subi %upbound, %c1 : index
71+
scf.yield %upbound_next : index
72+
}
73+
74+
return
75+
}
76+
77+
// Tiled, vectorized FIR: accumulates `kernel[n] * input[i]` into the existing
// contents of `output[i + n]`. The input is walked in tiles of `tile_step`
// elements; inside a tile every kernel element is applied with 16-lane vector
// operations only. Whatever the tile loop does not cover is delegated to
// `@tail_processing`, starting at the input index returned by the tile loop.
func.func @fir_tiled_vectorization(%input : memref<?xf32>, %kernel : memref<?xf32>,
78+
%output : memref<?xf32>) -> () {
79+
// 1. Get the total length of the workload.
80+
%c0 = arith.constant 0 : index
81+
%c1 = arith.constant 1 : index
82+
%input_size = memref.dim %input, %c0 : memref<?xf32>
83+
%kernel_size = memref.dim %kernel, %c0 : memref<?xf32>
84+
85+
// 2. Set the iteration step (vector size).
86+
%vl_step = arith.constant 16 : index
87+
// NOTE(review): `%vl_step_minus_1` is not used anywhere in this function.
%vl_step_minus_1 = arith.subi %vl_step, %c1 : index
88+
89+
// Tile size in input elements; a multiple of the vector size (2048 = 128 * 16).
%tile_step = arith.constant 2048 : index
90+
91+
// 3. Calculate the upper bound for vectorized processing.
92+
// `input_size - kernel_size + 1` is the number of input elements the last
// kernel element can consume while `i + n` stays below `input_size`
// (the shortest range of all kernel elements).
93+
%last_kernel_element_used_input_size_ = arith.subi %input_size, %kernel_size : index
94+
%last_kernel_element_used_input_size = arith.addi %last_kernel_element_used_input_size_, %c1 : index
95+
96+
// Subtract one tile and add 1, so a tile only starts when it fits in that range.
%input_upbound_ = arith.subi %last_kernel_element_used_input_size, %tile_step : index
97+
%input_upbound = arith.addi %input_upbound_, %c1 : index
98+
99+
// 4. Do the tiling process; each tile is computed entirely with vector
// operations (the tile size leaves no remainder).
100+
// Return the offset address for tail processing (stays 0 if no tile runs).
101+
%input_offset = scf.for %address = %c0 to %input_upbound step %tile_step
102+
iter_args(%offset = %c0) -> (index) {
103+
%upbound = arith.addi %address, %tile_step : index
104+
105+
scf.for %n = %c0 to %kernel_size step %c1 {
106+
%k_elem = memref.load %kernel[%n] : memref<?xf32>
107+
%k_vec = vector.splat %k_elem : vector<16xf32>
108+
109+
// 5. Perform the vectorization body.
110+
scf.for %i = %address to %upbound step %vl_step {
111+
%in_vec = vector.load %input[%i] : memref<?xf32>, vector<16xf32>
112+
%out_index = arith.addi %i, %n : index
113+
%out_vec = vector.load %output[%out_index] : memref<?xf32>, vector<16xf32> // The output index is the input index shifted by `n`.
114+
%fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32>
115+
vector.store %fma_vec, %output[%out_index] : memref<?xf32>, vector<16xf32>
116+
}
117+
}
118+
119+
// The end of this tile is the first input index that has not been processed.
scf.yield %upbound : index
120+
}
121+
122+
// 6. Tail processing, begin from `input[input_offset]`
123+
call @tail_processing(%input, %kernel, %output, %input_offset) : (memref<?xf32>, memref<?xf32>, memref<?xf32>, index) -> ()
124+
125+
return
126+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//===- MLIRFIRVectorization.mlir ------------------------------------------===//
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
//
15+
//===----------------------------------------------------------------------===//
16+
//
17+
// This file provides the vectorized MLIR FIR function (without tiling).
18+
//
19+
//===----------------------------------------------------------------------===//
20+
21+
// Vectorized FIR without tiling. For every kernel index `n`, accumulates
// `kernel[n] * input[i]` into the existing contents of `output[i + n]`:
// a 16-lane vector loop runs from input index 0, then a scalar loop handles
// the elements the vector loop could not cover.
func.func @fir_vectorization(%input : memref<?xf32>, %kernel : memref<?xf32>,
22+
%output : memref<?xf32>) -> () {
23+
// 1. Get the total length of the workload.
24+
%c0 = arith.constant 0 : index
25+
%c1 = arith.constant 1 : index
26+
%input_size = memref.dim %input, %c0 : memref<?xf32>
27+
%kernel_size = memref.dim %kernel, %c0 : memref<?xf32>
28+
29+
// 2. Set the iteration step (vector size).
30+
%vl_step = arith.constant 16 : index
31+
%vl_step_minus_1 = arith.subi %vl_step, %c1 : index
32+
33+
// 3. Calculate the upper bound for vectorized processing.
34+
// - Subtracting `vl_step` keeps the last full vector inside the buffer.
35+
// - Adding 1 ensures the final vector iteration still runs when the workload
36+
// length is divisible by the vector size.
37+
%upbound_ = arith.subi %input_size, %vl_step : index
38+
%upbound_init = arith.addi %upbound_, %c1 : index
39+
40+
// 4. Loop through each kernel element. The vector upper bound is carried as a
// loop argument and shrinks by one per kernel element (see the end of the body).
41+
scf.for %n = %c0 to %kernel_size step %c1
42+
iter_args(%upbound = %upbound_init) -> (index) {
43+
%k_elem = memref.load %kernel[%n] : memref<?xf32>
44+
%k_vec = vector.splat %k_elem : vector<16xf32>
45+
46+
// 5. Perform the vectorization body. The loop yields the next unprocessed
// input index, which is where the scalar remainder loop starts.
47+
%iter_idx = scf.for %i = %c0 to %upbound step %vl_step
48+
iter_args(%iter_init = %c0) -> (index) {
49+
%in_vec = vector.load %input[%i] : memref<?xf32>, vector<16xf32>
50+
// The output index is the input index shifted by the kernel index `n`.
%out_index = arith.addi %i, %n : index
51+
%out_vec = vector.load %output[%out_index] : memref<?xf32>, vector<16xf32>
52+
%fma_vec = vector.fma %k_vec, %in_vec, %out_vec : vector<16xf32>
53+
vector.store %fma_vec, %output[%out_index] : memref<?xf32>, vector<16xf32>
54+
%i_next = arith.addi %i, %vl_step : index
55+
scf.yield %i_next : index
56+
}
57+
58+
// 6. Process the remainder of the elements with scalar operations.
// `upbound + (vl_step - 1)` equals `input_size - n`, the scalar upper bound.
59+
%upbound_scalar = arith.addi %upbound, %vl_step_minus_1 : index
60+
scf.for %i = %iter_idx to %upbound_scalar step %c1 {
61+
%in_elem = memref.load %input[%i] : memref<?xf32>
62+
%out_index = arith.addi %i, %n : index
63+
%out_elem = memref.load %output[%out_index] : memref<?xf32>
64+
%mul_elem = arith.mulf %in_elem, %k_elem : f32
65+
%add_elem = arith.addf %mul_elem, %out_elem : f32
66+
memref.store %add_elem, %output[%out_index] : memref<?xf32>
67+
}
68+
69+
// Shrink the bound by one for the next kernel element so that the largest
// `i + n` written stays the same as `n` grows.
%upbound_next = arith.subi %upbound, %c1 : index
70+
scf.yield %upbound_next : index
71+
}
72+
73+
return
74+
}

benchmarks/AudioProcessing/Operations/FIROp/Main.cpp

+42-4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ MemRef<float, 1> kernelRef(sizeofKernel);
5050
using MLIRFunctionType = void (*)(MemRef<float, 1> *, MemRef<float, 1> *,
5151
MemRef<float, 1> *);
5252

53+
using BuddyFunctionType = void (*)(MemRef<float, 1> *, MemRef<float, 1> *,
54+
MemRef<float, 1> *, bool);
55+
5356
// Benchmarking function for MLIR based FIR method.
5457
void DAP_OPS_FIR(benchmark::State &state, MLIRFunctionType func) {
5558
MemRef<float, 1> resRef(sizeofAud, 0.0);
@@ -59,6 +62,15 @@ void DAP_OPS_FIR(benchmark::State &state, MLIRFunctionType func) {
5962
benchmark::DoNotOptimize(resRef);
6063
}
6164

65+
// Benchmarking function for FIR entry points that take an extra boolean flag
// (`BuddyFunctionType`). `isVectorization` is forwarded unchanged as the
// fourth argument of `func` on every benchmark iteration.
void DAP_OPS_FIR(benchmark::State &state, BuddyFunctionType func,
66+
bool isVectorization) {
67+
// The result buffer is created once and reused by every iteration.
MemRef<float, 1> resRef(sizeofAud, 0.0);
68+
for (auto _ : state) {
69+
func(&audRef, &kernelRef, &resRef, isVectorization);
70+
}
71+
// Applied once, after the timed loop has finished.
benchmark::DoNotOptimize(resRef);
72+
}
73+
6274
// Benchmarking function for KFR FIR method.
6375
static void KFR_FIR(benchmark::State &state) {
6476
for (auto _ : state) {
@@ -76,14 +88,34 @@ void Verification(const univector<float, _IN_OUT_SIZE> &outputExpected,
7688
firOp::verify(outputExpected, outputGenerated, _IN_OUT_SIZE, name);
7789
}
7890

91+
// Verification overload for FIR entry points that take an extra boolean flag
// (`BuddyFunctionType`). Runs `BuddyFunc` once on the global input and kernel
// MemRefs, optionally prints the result, and checks it against
// `outputExpected`; `name` labels the printed and verified output.
void Verification(const univector<float, _IN_OUT_SIZE> &outputExpected,
92+
BuddyFunctionType BuddyFunc, bool isVectorization,
93+
const std::string &name) {
94+
// Initialize MemRef with all zeros.
95+
MemRef<float, 1> outputGenerated(sizeofAud, 0.0);
96+
// `isVectorization` is passed through unchanged as the fourth argument.
BuddyFunc(&audRef, &kernelRef, &outputGenerated, isVectorization);
97+
firOp::printMemRef(outputGenerated, name, /*doPrint=*/_PRINT);
98+
firOp::verify(outputExpected, outputGenerated, _IN_OUT_SIZE, name);
99+
}
100+
79101
// -----------------------------------------------------------------------------
80102
// Register Benchmark.
81103
// -----------------------------------------------------------------------------
82104

83-
BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_fir, _mlir_ciface_mlir_fir)
105+
BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_scalar, _mlir_ciface_fir_scalar)
106+
->Unit(benchmark::kMillisecond)
107+
->Iterations(_NUM_ITER);
108+
BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_scalar, dap::FIR<float, 1>, false)
109+
->Unit(benchmark::kMillisecond)
110+
->Iterations(_NUM_ITER);
111+
BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_vectorize, _mlir_ciface_fir_vectorization)
112+
->Unit(benchmark::kMillisecond)
113+
->Iterations(_NUM_ITER);
114+
BENCHMARK_CAPTURE(DAP_OPS_FIR, mlir_tiled_vectorize,
115+
_mlir_ciface_fir_tiled_vectorization)
84116
->Unit(benchmark::kMillisecond)
85117
->Iterations(_NUM_ITER);
86-
BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_fir, dap::FIR<float, 1>)
118+
BENCHMARK_CAPTURE(DAP_OPS_FIR, buddy_tiled_vectorize, dap::FIR<float, 1>, true)
87119
->Unit(benchmark::kMillisecond)
88120
->Iterations(_NUM_ITER);
89121
BENCHMARK(KFR_FIR)->Unit(benchmark::kMillisecond)->Iterations(_NUM_ITER);
@@ -109,8 +141,14 @@ int main(int argc, char **argv) {
109141
firOp::printUnivector(firOutput, /*doPrint=*/_PRINT);
110142

111143
// Verify the correctness of all methods.
112-
Verification(firOutput, dap::FIR<float, 1>, "Buddy");
113-
Verification(firOutput, _mlir_ciface_mlir_fir, "MLIR");
144+
Verification(firOutput, _mlir_ciface_fir_scalar, "MLIRScalar");
145+
Verification(firOutput, dap::FIR<float, 1>, /*isVectorization=*/false,
146+
"BuddyScalar");
147+
Verification(firOutput, _mlir_ciface_fir_vectorization, "MLIRVectorize");
148+
Verification(firOutput, _mlir_ciface_fir_tiled_vectorization,
149+
"MLIRTiledVectorize");
150+
Verification(firOutput, dap::FIR<float, 1>, /*isVectorization=*/true,
151+
"BuddyTiledVectorize");
114152

115153
return 0;
116154
}

0 commit comments

Comments
 (0)