Skip to content

Add benchmark for linalg.batch_matmul_transpose_b op #170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Placeholder kernel: the CMake build pipes this file through sed, renaming
// @kernel_placeholder to one unique symbol per lowering pipeline
// (scalar O0, scalar O3, vectorized) before compiling each variant.
// Computes C[b, m, n] += sum_k A[b, m, k] * B[b, n, k]; the second operand
// is stored with its last two dimensions transposed, i.e. (batch, n, k).
func.func @kernel_placeholder(%a : memref<?x?x?xf32>, %b : memref<?x?x?xf32>, %c : memref<?x?x?xf32>) {
linalg.batch_matmul_transpose_b
ins(%a, %b: memref<?x?x?xf32>, memref<?x?x?xf32>)
outs(%c: memref<?x?x?xf32>)
return
}
145 changes: 145 additions & 0 deletions benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Benchmark driver executable; the per-pipeline kernel objects built below are
# linked into it via the static libraries declared further down.
add_executable(dl-op-batch-matmul-transpose-b-benchmark
Main.cpp
)
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark GoogleBenchmark)

# CMAKE_C_FLAGS is set when configuring cmake.
# Split the flag string into a list so clang receives separate arguments.
separate_arguments(CLANG_FLAGS_LIST UNIX_COMMAND "${CMAKE_C_FLAGS}")

# Scalar baseline at -O0: rename the placeholder kernel, lower MLIR to LLVM IR
# via mlir-opt/buddy-opt, translate to .ll, and compile to an object file.
# NOTE(review): the tosa-to-* pass pipeline looks like a no-op for this
# linalg-only kernel — confirm whether it can be dropped.
add_custom_command(OUTPUT batch_matmul_transpose_b_scalar_O0.o
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_scalar_O0/} |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_MLIR_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
-convert-linalg-to-affine-loops
-affine-loop-fusion
-affine-parallelize
-lower-affine
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-convert-vector-to-scf
-expand-strided-metadata
-convert-vector-to-llvm
-memref-expand
-arith-expand
-convert-arith-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-llvm-request-c-wrappers
-convert-arith-to-llvm
-convert-math-to-llvm
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_scalar_O0.ll
COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O0 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_scalar_O0.ll
-c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_scalar_O0.o
VERBATIM)

# Wrap the object file in a static library and link it into the benchmark.
add_library(batch_matmul_transpose_b_scalar_O0 STATIC batch_matmul_transpose_b_scalar_O0.o)
set_target_properties(batch_matmul_transpose_b_scalar_O0 PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
batch_matmul_transpose_b_scalar_O0
)

# Scalar variant at -O3: identical MLIR lowering to the O0 rule above; only
# the final clang optimization level differs.
add_custom_command(OUTPUT batch_matmul_transpose_b_scalar_O3.o
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_scalar_O3/} |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_MLIR_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
-convert-linalg-to-affine-loops
-affine-loop-fusion
-affine-parallelize
-lower-affine
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-convert-vector-to-scf
-expand-strided-metadata
-convert-vector-to-llvm
-memref-expand
-arith-expand
-convert-arith-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-llvm-request-c-wrappers
-convert-arith-to-llvm
-convert-math-to-llvm
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_scalar_O3.ll
COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_scalar_O3.ll
-c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_scalar_O3.o
VERBATIM)

# Wrap the object file in a static library and link it into the benchmark.
add_library(batch_matmul_transpose_b_scalar_O3 STATIC batch_matmul_transpose_b_scalar_O3.o)
set_target_properties(batch_matmul_transpose_b_scalar_O3 PROPERTIES LINKER_LANGUAGE CXX)
target_link_directories(dl-op-batch-matmul-transpose-b-benchmark PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
batch_matmul_transpose_b_scalar_O3
)

# Vectorized variant: runs buddy-opt's dedicated
# -batchmatmul-transpose-b-vectorization pass before affine lowering, then
# compiles at -O3. Note this rule omits -affine-parallelize used by the
# scalar rules, so the comparison isolates the vectorization pass.
add_custom_command(OUTPUT batch_matmul_transpose_b_vec.o
COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/BatchMatMulTransposeB.mlir |
sed -e {s/@kernel_placeholder/@batch_matmul_transpose_b_vec/} |
${LLVM_MLIR_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_MLIR_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
-func-bufferize
-arith-bufferize
-tensor-bufferize
-buffer-deallocation
-finalizing-bufferize
-batchmatmul-transpose-b-vectorization
-convert-linalg-to-affine-loops
-affine-loop-fusion
-lower-affine
-convert-vector-to-scf
-expand-strided-metadata
-convert-vector-to-llvm
-memref-expand
-arith-expand
-convert-arith-to-llvm
-finalize-memref-to-llvm
-convert-scf-to-cf
-llvm-request-c-wrappers
-convert-arith-to-llvm
-convert-math-to-llvm
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir -o batch_matmul_transpose_b_vec.ll
COMMAND ${LLVM_MLIR_BINARY_DIR}/clang -O3 ${CLANG_FLAGS_LIST} batch_matmul_transpose_b_vec.ll
-c -save-temps -o ${CMAKE_CURRENT_BINARY_DIR}/batch_matmul_transpose_b_vec.o
VERBATIM)

# Wrap the object file in a static library and link it into the benchmark.
add_library(batch_matmul_transpose_b_vec STATIC batch_matmul_transpose_b_vec.o)
set_target_properties(batch_matmul_transpose_b_vec PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(dl-op-batch-matmul-transpose-b-benchmark
batch_matmul_transpose_b_vec
)

# Build the target for your new method here.
134 changes: 134 additions & 0 deletions benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//===- Main.cpp -----------------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
//
// This is the main file of the BatchMatMulTransposeBOp benchmark.
//
//===----------------------------------------------------------------------===//

#include "Utils.hpp"
#include <benchmark/benchmark.h>
#include <buddy/Core/Container.h>

// -----------------------------------------------------------------------------
// Benchmark Configuration. You can change the number here as needed.
// -----------------------------------------------------------------------------

#define _NUM_ITER 1
#define _SIZE_BATCH 4
#define _SIZE_N 40
#define _SIZE_K 256
#define _SIZE_M 256

// -----------------------------------------------------------------------------
// Global Variables and Functions. No need to change the code here.
// -----------------------------------------------------------------------------

// Operand shapes for C[b, m, n] = sum_k A[b, m, k] * B[b, n, k].
intptr_t sizesInput1[3] = {_SIZE_BATCH, _SIZE_M, _SIZE_K};
// linalg.batch_matmul_transpose_b takes its second operand with the last two
// dimensions transposed, i.e. (BATCH, N, K) — not (BATCH, K, N) as a plain
// batch_matmul would.
intptr_t sizesInput2[3] = {_SIZE_BATCH, _SIZE_N, _SIZE_K};
intptr_t sizesOutput[3] = {_SIZE_BATCH, _SIZE_M, _SIZE_N};
// Raw input buffers, allocated in main() and wrapped by the MemRefs below.
float *input1 = nullptr;
float *input2 = nullptr;
MemRef<float, 3> input1MemRef(sizesInput1);
MemRef<float, 3> input2MemRef(sizesInput2);

// Runs the provided BatchMatMulTransposeB function for benchmarking.
template <typename Func>
void DL_OPS_BATCH_MATMUL_TRANSPOSE_B(benchmark::State &state, Func func) {
MemRef<float, 3> outputMemRef(sizesOutput, 0.0);
for (auto _ : state) {
func(&input1MemRef, &input2MemRef, &outputMemRef);
}
benchmark::DoNotOptimize(outputMemRef);
}

using MLIRFunctionType = void (*)(MemRef<float, 3> *, MemRef<float, 3> *,
MemRef<float, 3> *);
// Verifies the result of an MLIR-based function against expected output.
void MLIRVerification(float *outputExpected, MLIRFunctionType MLIRFunc,
const std::string &name) {
MemRef<float, 3> outputMemRef(sizesOutput, 0);
MLIRFunc(&input1MemRef, &input2MemRef, &outputMemRef);
float *outputOptimized = outputMemRef.getData();
batch_matmul_transpose_b::verify<float>(outputExpected, outputOptimized,
_SIZE_BATCH, _SIZE_M * _SIZE_N, name);
}

// -----------------------------------------------------------------------------
// MLIR Benchmark. You can compare your new method with other methods here.
// -----------------------------------------------------------------------------

// C interfaces emitted by the MLIR lowering pipelines (see CMakeLists.txt).
// Each takes A, B and accumulates the batch-matmul(transpose-b) result into C.
extern "C" {
void _mlir_ciface_batch_matmul_transpose_b_scalar_O0(MemRef<float, 3> *A,
                                                     MemRef<float, 3> *B,
                                                     MemRef<float, 3> *C);
void _mlir_ciface_batch_matmul_transpose_b_scalar_O3(MemRef<float, 3> *A,
                                                     MemRef<float, 3> *B,
                                                     MemRef<float, 3> *C);
void _mlir_ciface_batch_matmul_transpose_b_vec(MemRef<float, 3> *A,
                                               MemRef<float, 3> *B,
                                               MemRef<float, 3> *C);
/// [Step 1] Add function of your new method.
}
// Register one benchmark per kernel variant; each runs _NUM_ITER iterations
// and reports milliseconds.
BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Scalar_O0,
                  _mlir_ciface_batch_matmul_transpose_b_scalar_O0)
    ->Unit(benchmark::kMillisecond)
    ->Iterations(_NUM_ITER);
BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Scalar_O3,
                  _mlir_ciface_batch_matmul_transpose_b_scalar_O3)
    ->Unit(benchmark::kMillisecond)
    ->Iterations(_NUM_ITER);
BENCHMARK_CAPTURE(DL_OPS_BATCH_MATMUL_TRANSPOSE_B, Vec,
                  _mlir_ciface_batch_matmul_transpose_b_vec)
    ->Unit(benchmark::kMillisecond)
    ->Iterations(_NUM_ITER);

/// [Step 2] Call GoogleBenchmark function to run your new method.

// -----------------------------------------------------------------------------
// Main Function. You can verify the correctness of your new method here.
// -----------------------------------------------------------------------------

// Entry point: allocates inputs, runs the registered benchmarks, then
// verifies every optimized kernel against the scalar -O0 baseline.
int main(int argc, char **argv) {
  // Initialize input data. Allocation sizes must match the memref shapes:
  // A is (BATCH, M, K) and B holds BATCH * N * K elements for the transposed
  // second operand. The original code allocated A with _SIZE_N (40) rows per
  // batch while sizesInput1 declares _SIZE_M (256), so the kernels read past
  // the end of the buffer.
  input1 = batch_matmul_transpose_b::allocArray<float>(_SIZE_BATCH * _SIZE_M,
                                                       _SIZE_K);
  input2 = batch_matmul_transpose_b::allocArray<float>(_SIZE_BATCH * _SIZE_N,
                                                       _SIZE_K);
  input1MemRef = MemRef<float, 3>(input1, sizesInput1);
  input2MemRef = MemRef<float, 3>(input2, sizesInput2);

  // Run benchmark.
  ::benchmark::Initialize(&argc, argv);
  ::benchmark::RunSpecifiedBenchmarks();

  std::cout << "\033[34m---------- Verification ----------\033[0m" << std::endl;
  // Attain scalar output results as expected output results in verification.
  MemRef<float, 3> outputMemrefScalar(sizesOutput, 0);
  _mlir_ciface_batch_matmul_transpose_b_scalar_O0(&input1MemRef, &input2MemRef,
                                                  &outputMemrefScalar);
  float *outputExpected = outputMemrefScalar.getData();

  MLIRVerification(outputExpected,
                   _mlir_ciface_batch_matmul_transpose_b_scalar_O3,
                   "Scalar_O3");
  MLIRVerification(outputExpected, _mlir_ciface_batch_matmul_transpose_b_vec,
                   "Vec");
  /// [Step 3] Add your new method for verification.

  delete[] input1;
  delete[] input2;
  return 0;
}
84 changes: 84 additions & 0 deletions benchmarks/DeepLearning/Ops/BatchMatMulTransposeBOp/Utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//===- Utils.hpp ----------------------------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
//
// This file implements BatchMatMulTransposeBOp helper functions.
//
//===----------------------------------------------------------------------===//

#ifndef BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
#define BATCHMATMUL_TRANSPOSE_B_UTILS_HPP

#include <benchmark/benchmark.h>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>

// -----------------------------------------------------------------------------
// Helper Functions
// -----------------------------------------------------------------------------

namespace batch_matmul_transpose_b {

// Allocates a 1D array with dimensions `rows * cols` and fills it with random
// values between 0 and 99.
// Allocates a `rows * cols` array filled with pseudo-random values in
// [0, 100). The caller owns the returned buffer and must delete[] it.
template <typename DATA_TYPE> DATA_TYPE *allocArray(int rows, int cols) {
  // Seed the RNG exactly once per process. The original code reseeded with
  // std::time(0) on every call, so two allocations within the same second
  // (e.g. input1 and input2 in main) produced identical contents.
  static const bool seeded = [] {
    std::srand(static_cast<unsigned int>(std::time(nullptr)));
    return true;
  }();
  (void)seeded;
  // Allocate memory for the array.
  DATA_TYPE *array = new DATA_TYPE[static_cast<std::size_t>(rows) * cols];
  // Fill the array with random numbers between 0 and 99.
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      array[i * cols + j] = static_cast<DATA_TYPE>(std::rand() % 100);
    }
  }
  return array;
}

// Compares two `batch * size` result buffers element-wise and prints a
// colored PASS/FAIL line tagged with `name`; on failure it also prints the
// first mismatching element.
//
// `epsilon` is the absolute comparison tolerance; the default preserves the
// previous hard-coded value. NOTE(review): 1e-6 is very tight for float
// matmul reductions over K ~ 256 with values up to 99 — consider a relative
// tolerance if -O3/vectorized kernels legitimately reassociate the sum.
template <typename DATA_TYPE>
void verify(DATA_TYPE *A, DATA_TYPE *B, int batch, int size,
            const std::string &name, double epsilon = 1e-6) {
  const std::string PASS = "\033[32mPASS\033[0m";
  const std::string FAIL = "\033[31mFAIL\033[0m";

  std::cout << name << " ";
  if (!A || !B) {
    std::cout << FAIL << " (Null pointer detected)" << std::endl;
    return;
  }

  bool isPass = true;
  // Stop at the first mismatch. Previously only the inner loop broke, so a
  // failing comparison printed one FAIL block per failing batch.
  for (int i = 0; i < batch && isPass; i++) {
    for (int j = 0; j < size; j++) {
      int k = i * size + j;
      if (std::fabs(A[k] - B[k]) > epsilon) {
        std::cout << FAIL << std::endl;
        std::cout << "Batch=" << i << " Index=" << j << ":\tA[k]=" << A[k]
                  << " B[k]=" << B[k] << std::endl;
        isPass = false;
        break;
      }
    }
  }
  if (isPass) {
    std::cout << PASS << std::endl;
  }
}
} // namespace batch_matmul_transpose_b

#endif // BATCHMATMUL_TRANSPOSE_B_UTILS_HPP
1 change: 1 addition & 0 deletions benchmarks/DeepLearning/Ops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_subdirectory(SoftmaxExpSumDivOp)
add_subdirectory(Conv2DNhwcFhwcOp)
add_subdirectory(TransposeOp)
add_subdirectory(MatMulTransposeBOp)
# Batched variant of the MatMulTransposeB benchmark.
add_subdirectory(BatchMatMulTransposeBOp)

if (CROSS_COMPILE_RVV)
add_subdirectory(Conv2DNhwcFhwcInt32Op)
Expand Down