Skip to content

Add benchmark for linalg.batch_matmul_transpose_b op #170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Placeholder kernel: the CMake build pipes this file through sed, renaming
// @kernel_placeholder to one unique symbol per lowering pipeline
// (scalar O0, scalar O3, vectorized) before compiling each variant.
// Computes C[b, m, n] += sum_k A[b, m, k] * B[b, n, k]; the second operand
// is stored with its last two dimensions transposed, i.e. (batch, n, k).
func.func @kernel_placeholder(%a : memref<?x?x?xf32>, %b : memref<?x?x?xf32>, %c : memref<?x?x?xf32>) {
linalg.batch_matmul_transpose_b
ins(%a, %b: memref<?x?x?xf32>, memref<?x?x?xf32>)
outs(%c: memref<?x?x?xf32>)
return
}
145 changes: 145 additions & 0 deletions benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Benchmark driver executable; the per-pipeline kernel objects built below are
# linked into it via the static libraries declared further down.
add_executable(dl-op-batch-matmul-transpose-b-benchmark
Main.cpp
)
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark GoogleBenchmark)

# CMAKE_C_FLAGS is set when configuring cmake.
# Split the flag string into a list so clang receives separate arguments.
separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}")

# Scalar baseline at -O0: rename the placeholder kernel, lower MLIR to LLVM IR
# via mlir-opt/buddy-opt, translate to .ll, and compile to an object file.
# NOTE(review): the tosa-to-* pass pipeline looks like a no-op for this
# linalg-only kernel — confirm whether it can be dropped.
add_custom_command(OUTPUT batch_matmul_transpose_b_scalar_O0.o
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_scalar_O0/} |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_MLIR_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
-convert-linalg-to-affine-loops
-affine-loop-fusion
-affine-parallelize
-lower-affine
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-convert-vector-to-scf
-expand-strided-metadata
-convert-vector-to-llvm
-memref-expand
-arith-expand
-convert-arith-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-llvm-request-c-wrappers
-convert-arith-to-llvm
-convert-math-to-llvm
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_scalar_O0.ll
COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O0 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_scalar_O0.ll
-c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_scalar_O0.o
VERBATIM)

# Wrap the object file in a static library and link it into the benchmark.
add_library(batch_matmul_transpose_b_scalar_O0 STATIC batch_matmul_transpose_b_scalar_O0.o)
set_target_properties(batch_matmul_transpose_b_scalar_O0 PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
batch_matmul_transpose_b_scalar_O0
)

# Scalar variant at -O3: identical MLIR lowering to the O0 rule above; only
# the final clang optimization level differs.
add_custom_command(OUTPUT batch_matmul_transpose_b_scalar_O3.o
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_scalar_O3/} |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_MLIR_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
-convert-linalg-to-affine-loops
-affine-loop-fusion
-affine-parallelize
-lower-affine
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-convert-vector-to-scf
-expand-strided-metadata
-convert-vector-to-llvm
-memref-expand
-arith-expand
-convert-arith-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-llvm-request-c-wrappers
-convert-arith-to-llvm
-convert-math-to-llvm
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_scalar_O3.ll
COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_scalar_O3.ll
-c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_scalar_O3.o
VERBATIM)

# Wrap the object file in a static library and link it into the benchmark.
add_library(batch_matmul_transpose_b_scalar_O3 STATIC batch_matmul_transpose_b_scalar_O3.o)
set_target_properties(batch_matmul_transpose_b_scalar_O3 PROPERTIES LINKER_LANGUAGE CXX)
target_link_directories(dl-op-batch-matmul-transpose-b-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
batch_matmul_transpose_b_scalar_O3
)

# Vectorized variant: runs buddy-opt's dedicated
# -batchmatmul-transpose-b-vectorization pass before affine lowering, then
# compiles at -O3. Note this rule omits -affine-parallelize used by the
# scalar rules, so the comparison isolates the vectorization pass.
add_custom_command(OUTPUT batch_matmul_transpose_b_vec.o
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_vec/} |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_MLIR_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-batchmatmul-transpose-b-vectorization
-convert-linalg-to-affine-loops
-affine-loop-fusion
-lower-affine
-convert-vector-to-scf
-expand-strided-metadata
-convert-vector-to-llvm
-memref-expand
-arith-expand
-convert-arith-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-llvm-request-c-wrappers
-convert-arith-to-llvm
-convert-math-to-llvm
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_vec.ll
COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_vec.ll
-c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_vec.o
VERBATIM)

# Wrap the object file in a static library and link it into the benchmark.
add_library(batch_matmul_transpose_b_vec STATIC batch_matmul_transpose_b_vec.o)
set_target_properties(batch_matmul_transpose_b_vec PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
batch_matmul_transpose_b_vec
)

# Build the target for your new method here.
134 changes: 134 additions & 0 deletions benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//===- Main.cpp -----------------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
//
// This is the main file of the BatchMatMulTransposeBOp benchmark.
//
//===----------------------------------------------------------------------===//

#include "Utils.hpp"
#include <benchmark/benchmark.h>
#include <buddy/Core/Container.h>

// -----------------------------------------------------------------------------
// Benchmark Configuration. You can change the number here as needed.
// -----------------------------------------------------------------------------

#define _NUM_ITER 1
#define _SIZE_BATCH 4
#define _SIZE_N 40
#define _SIZE_K 256
#define _SIZE_M 256

// -----------------------------------------------------------------------------
// Global Variables and Functions. No need to change the code here.
// -----------------------------------------------------------------------------

// Operand shapes for C[b, m, n] = sum_k A[b, m, k] * B[b, n, k].
intptr_t sizesInput1[3] = {_SIZE_BATCH, _SIZE_M, _SIZE_K};
// linalg.batch_matmul_transpose_b takes its second operand with the last two
// dimensions transposed, i.e. (BATCH, N, K) — not (BATCH, K, N) as a plain
// batch_matmul would.
intptr_t sizesInput2[3] = {_SIZE_BATCH, _SIZE_N, _SIZE_K};
intptr_t sizesOutput[3] = {_SIZE_BATCH, _SIZE_M, _SIZE_N};
// Raw input buffers, allocated in main() and wrapped by the MemRefs below.
float *input1 = nullptr;
float *input2 = nullptr;
MemRef<float, 3> input1MemRef(sizesInput1);
MemRef<float, 3> input2MemRef(sizesInput2);

// Runs the provided BatchMatMulTransposeB function for benchmarking.
template <typename Func>
void DL_OPS_BATCH_MATMUL_TRANSPOSE_B(benchmark::State &state, Func func) {
MemRef<float, 3> outputMemRef(sizesOutput, 0.0);
for (auto _ : state) {
func(&input1MemRef, &input2MemRef, &outputMemRef);
}
benchmark::DoNotOptimize(outputMemRef);
}

using MLIRFunctionType = void (*)(MemRef<float, 3> *, MemRef<float, 3> *,
MemRef<float, 3> *);
// Verifies the result of an MLIR-based function against expected output.
void MLIRVerification(float *outputExpected, MLIRFunctionType MLIRFunc,
const std::string &name) {
MemRef<float, 3> outputMemRef(sizesOutput, 0);
MLIRFunc(&input1MemRef, &input2MemRef, &outputMemRef);
float *outputOptimized = outputMemRef.getData();
batch_matmul_transpose_b::verify<float>(outputExpected, outputOptimized,
_SIZE_BATCH, _SIZE_M * _SIZE_N, name);
}

// -----------------------------------------------------------------------------
// MLIR Benchmark. You can compare your new method with other methods here.
// -----------------------------------------------------------------------------

// C interfaces emitted by the MLIR lowering pipelines (see CMakeLists.txt).
// Each takes A, B and accumulates the batch-matmul(transpose-b) result into C.
extern "C" {
void _mlir_ciface_batch_matmul_transpose_b_scalar_O0(MemRef<float, 3> *A,
                                                     MemRef<float, 3> *B,
                                                     MemRef<float, 3> *C);
void _mlir_ciface_batch_matmul_transpose_b_scalar_O3(MemRef<float, 3> *A,
                                                     MemRef<float, 3> *B,
                                                     MemRef<float, 3> *C);
void _mlir_ciface_batch_matmul_transpose_b_vec(MemRef<float, 3> *A,
                                               MemRef<float, 3> *B,
                                               MemRef<float, 3> *C);
/// [Step 1] Add function of your new method.
}
// Register one benchmark per kernel variant; each runs _NUM_ITER iterations
// and reports milliseconds.
BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Scalar_O0,
                  _mlir_ciface_batch_matmul_transpose_b_scalar_O0)
    ->Unit(benchmark::kMillisecond)
    ->Iterations(_NUM_ITER);
BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Scalar_O3,
                  _mlir_ciface_batch_matmul_transpose_b_scalar_O3)
    ->Unit(benchmark::kMillisecond)
    ->Iterations(_NUM_ITER);
BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Vec,
                  _mlir_ciface_batch_matmul_transpose_b_vec)
    ->Unit(benchmark::kMillisecond)
    ->Iterations(_NUM_ITER);

/// [Step 2] Call GoogleBenchmark function to run your new method.

// -----------------------------------------------------------------------------
// Main Function. You can verify the correctness of your new method here.
// -----------------------------------------------------------------------------

// Entry point: allocates inputs, runs the registered benchmarks, then
// verifies every optimized kernel against the scalar -O0 baseline.
int main(int argc, char **argv) {
  // Initialize input data. Allocation sizes must match the memref shapes:
  // A is (BATCH, M, K) and B holds BATCH * N * K elements for the transposed
  // second operand. The original code allocated A with _SIZE_N (40) rows per
  // batch while sizesInput1 declares _SIZE_M (256), so the kernels read past
  // the end of the buffer.
  input1 = batch_matmul_transpose_b::allocArray<float>(_SIZE_BATCH * _SIZE_M,
                                                       _SIZE_K);
  input2 = batch_matmul_transpose_b::allocArray<float>(_SIZE_BATCH * _SIZE_N,
                                                       _SIZE_K);
  input1MemRef = MemRef<float, 3>(input1, sizesInput1);
  input2MemRef = MemRef<float, 3>(input2, sizesInput2);

  // Run benchmark.
  ::benchmark::Initialize(&argc, argv);
  ::benchmark::RunSpecifiedBenchmarks();

  std::cout << "\033[34m---------- Verification ----------\033[0m" << std::endl;
  // Attain scalar output results as expected output results in verification.
  MemRef<float, 3> outputMemrefScalar(sizesOutput, 0);
  _mlir_ciface_batch_matmul_transpose_b_scalar_O0(&input1MemRef, &input2MemRef,
                                                  &outputMemrefScalar);
  float *outputExpected = outputMemrefScalar.getData();

  MLIRVerification(outputExpected,
                   _mlir_ciface_batch_matmul_transpose_b_scalar_O3,
                   "Scalar_O3");
  MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_transpose_b_vec,
                   "Vec");
  /// [Step 3] Add your new method for verification.

  delete[] input1;
  delete[] input2;
  return 0;
}
84 changes: 84 additions & 0 deletions benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//===- Utils.hpp ----------------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
//
// This file implements BatchMatMulTransposeBOp helper functions.
//
//===----------------------------------------------------------------------===//

#ifndef BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
#define BATCHMATMUL_TRANSPOSE_B_UTILS_HPP

#include <benchmark/benchmark.h>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>

// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------

namespace batch_matmul_transpose_b {

// Allocates a 1D array with dimensions `rows * cols` and fills it with random
// values between 0 and 99.
// Allocates a `rows * cols` array filled with pseudo-random values in
// [0, 100). The caller owns the returned buffer and must delete[] it.
template <typename DATA_TYPE> DATA_TYPE *allocArray(int rows, int cols) {
  // Seed the RNG exactly once per process. The original code reseeded with
  // std::time(0) on every call, so two allocations within the same second
  // (e.g. input1 and input2 in main) produced identical contents.
  static const bool seeded = [] {
    std::srand(static_cast<unsigned int>(std::time(nullptr)));
    return true;
  }();
  (void)seeded;
  // Allocate memory for the array.
  DATA_TYPE *array = new DATA_TYPE[static_cast<std::size_t>(rows) * cols];
  // Fill the array with random numbers between 0 and 99.
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      array[i * cols + j] = static_cast<DATA_TYPE>(std::rand() % 100);
    }
  }
  return array;
}

// Compares two `batch * size` result buffers element-wise and prints a
// colored PASS/FAIL line tagged with `name`; on failure it also prints the
// first mismatching element.
//
// `epsilon` is the absolute comparison tolerance; the default preserves the
// previous hard-coded value. NOTE(review): 1e-6 is very tight for float
// matmul reductions over K ~ 256 with values up to 99 — consider a relative
// tolerance if -O3/vectorized kernels legitimately reassociate the sum.
template <typename DATA_TYPE>
void verify(DATA_TYPE *A, DATA_TYPE *B, int batch, int size,
            const std::string &name, double epsilon = 1e-6) {
  const std::string PASS = "\033[32mPASS\033[0m";
  const std::string FAIL = "\033[31mFAIL\033[0m";

  std::cout << name << " ";
  if (!A || !B) {
    std::cout << FAIL << " (Null pointer detected)" << std::endl;
    return;
  }

  bool isPass = true;
  // Stop at the first mismatch. Previously only the inner loop broke, so a
  // failing comparison printed one FAIL block per failing batch.
  for (int i = 0; i < batch && isPass; i++) {
    for (int j = 0; j < size; j++) {
      int k = i * size + j;
      if (std::fabs(A[k] - B[k]) > epsilon) {
        std::cout << FAIL << std::endl;
        std::cout << "Batch=" << i << " Index=" << j << ":\tA[k]=" << A[k]
                  << " B[k]=" << B[k] << std::endl;
        isPass = false;
        break;
      }
    }
  }
  if (isPass) {
    std::cout << PASS << std::endl;
  }
}
} // namespace batch_matmul_transpose_b

#endif // BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
1 change: 1 addition & 0 deletions benchmarks/DeepLearning/Ops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_subdirectory(SoftmaxExpSumDivOp)
add_subdirectory(Conv2DNhwcFhwcOp)
add_subdirectory(TransposeOp)
add_subdirectory(MatMulTransposeBOp)
# Batched variant of the MatMulTransposeB benchmark.
add_subdirectory(BatchMatMulTransposeBOp)

if (CROSS_COMPILE_RVV)
add_subdirectory(Conv2DNhwcFhwcInt32Op)
Expand Down