From 9def6515c6f7b5a03350b548502c713a31cb7570 Mon Sep 17 00:00:00 2001
From: Q Liu <52538137+LIUQyou@users.noreply.github.com>
Date: Sun, 30 Mar 2025 20:29:42 +0200
Subject: [PATCH] update benchmark and errors

---
 test/test_script_audioprocessing.sh           |  47 ++
 test/test_script_deeplearning.sh              | 221 ++++++
 test/test_script_geminiprocessing.sh          |  97 +++
 test/test_script_imageprocessing.sh           |  58 ++
 test/test_script_vectorizationprocessing.sh   |  38 +
 .../build_results_crosscompile_summary.log    |  23 +
 .../deeplearning/build_results_summary.log    |  29 +
 .../deeplearning/dl-layer-ffn-benchmark.log   |  18 +
 .../dl-layer-rmsnorm-benchmark.log            |  18 +
 .../dl-layer-selfattention-benchmark.log      |  18 +
 .../deeplearning/dl-model-lenet-benchmark.log |  19 +
 .../dl-model-mobilenetv3-benchmark.log        |  19 +
 .../dl-model-resnet18-benchmark.log           |  18 +
 .../dl-model-tinyllama-benchmark.log          |  19 +
 .../dl-model-whisper-benchmark.log            |  19 +
 .../dl-op-linalg-arithaddf-benchmark.log      |  19 +
 .../dl-op-linalg-arithdivf-benchmark.log      |  19 +
 .../dl-op-linalg-arithmulf-benchmark.log      |  19 +
 .../dl-op-linalg-arithnegf-benchmark.log      |  19 +
 .../dl-op-linalg-arithsubf-benchmark.log      |  19 +
 .../dl-op-linalg-batch-matmul-benchmark.log   |  25 +
 ...l-op-linalg-conv2d-nchw-fchw-benchmark.log |  19 +
 ...l-op-linalg-conv2d-nhwc-fhwc-benchmark.log |  21 +
 ...l-op-linalg-conv2d-nhwc-hwcf-benchmark.log |  19 +
 ...g-depthwise-conv-2d-nhwc-hwc-benchmark.log |  19 +
 .../dl-op-linalg-mathexp-benchmark.log        |  19 +
 .../dl-op-linalg-mathfpow-benchmark.log       |  19 +
 .../dl-op-linalg-mathrsqrt-benchmark.log      |  19 +
 .../dl-op-linalg-matmul-benchmark.log         |  22 +
 ...l-op-linalg-pooling-nhwc-sum-benchmark.log |  19 +
 .../dl-op-linalg-reduceaddf-benchmark.log     |  10 +
 .../dl-op-linalg-reducemaxf-benchmark.log     |  10 +
 ...p-linalg-softmax-exp-sum-div-benchmark.log |  19 +
 .../dl-op-matmul-transpose-b-benchmark.log    |  21 +
 .../dl-op-tosa-transpose-benchmark.log        |  17 +
 .../deeplearning/run_results_summary.log      |  29 +
 test_result/geminiprocessing/build.log        | 655 ++++++++++++++++++
 .../geminiprocessing/cmake_configure.log      |  37 +
 38 files changed, 1755 insertions(+)
 create mode 100755 test/test_script_audioprocessing.sh
 create mode 100755 test/test_script_deeplearning.sh
 create mode 100755 test/test_script_geminiprocessing.sh
 create mode 100755 test/test_script_imageprocessing.sh
 create mode 100755 test/test_script_vectorizationprocessing.sh
 create mode 100644 test_result/deeplearning/build_results_crosscompile_summary.log
 create mode 100644 test_result/deeplearning/build_results_summary.log
 create mode 100644 test_result/deeplearning/dl-layer-ffn-benchmark.log
 create mode 100644 test_result/deeplearning/dl-layer-rmsnorm-benchmark.log
 create mode 100644 test_result/deeplearning/dl-layer-selfattention-benchmark.log
 create mode 100644 test_result/deeplearning/dl-model-lenet-benchmark.log
 create mode 100644 test_result/deeplearning/dl-model-mobilenetv3-benchmark.log
 create mode 100644 test_result/deeplearning/dl-model-resnet18-benchmark.log
 create mode 100644 test_result/deeplearning/dl-model-tinyllama-benchmark.log
 create mode 100644 test_result/deeplearning/dl-model-whisper-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-arithaddf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-arithdivf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-arithmulf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-arithnegf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-arithsubf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-batch-matmul-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-conv2d-nchw-fchw-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-conv2d-nhwc-fhwc-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-conv2d-nhwc-hwcf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-mathexp-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-mathfpow-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-mathrsqrt-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-matmul-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-pooling-nhwc-sum-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-reduceaddf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-reducemaxf-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-linalg-softmax-exp-sum-div-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-matmul-transpose-b-benchmark.log
 create mode 100644 test_result/deeplearning/dl-op-tosa-transpose-benchmark.log
 create mode 100644 test_result/deeplearning/run_results_summary.log
 create mode 100644 test_result/geminiprocessing/build.log
 create mode 100644 test_result/geminiprocessing/cmake_configure.log

diff --git a/test/test_script_audioprocessing.sh b/test/test_script_audioprocessing.sh
new file mode 100755
index 00000000..e16ae655
--- /dev/null
+++ b/test/test_script_audioprocessing.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Build and run the audio-processing IIR benchmark, then build and run the audio-plot tool.
+export BUDDY_MLIR_BUILD_DIR=/home/buddy-complier-workspace/buddy-mlir/build
+export LLVM_MLIR_BUILD_DIR=/home/buddy-complier-workspace/buddy-mlir/llvm/build
+cd /home/buddy-complier-workspace/buddy-benchmark || exit 1
+{ mkdir -p build && cd build; } || exit 1
+cmake -G Ninja .. \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DAUDIO_PROCESSING_BENCHMARKS=ON \
+    -DCMAKE_CXX_COMPILER="${LLVM_MLIR_BUILD_DIR}/bin/clang++" \
+    -DKFR_DIR=/home/buddy-complier-workspace/buddy-benchmark/thirdparty/kfr \
+    -DBUDDY_MLIR_BUILD_DIR="${BUDDY_MLIR_BUILD_DIR}"
+ninja dap-op-iir-benchmark
+cd bin || exit 1
+./dap-op-iir-benchmark
+
+# Return to build/: the configure (`..`) and `ninja` below operate on the current directory, which must not be build/bin.
+cd .. || exit 1
+cmake -G Ninja .. \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DAUDIO_PROCESSING_BENCHMARKS=ON \
+    -DCMAKE_CXX_COMPILER="${LLVM_MLIR_BUILD_DIR}/bin/clang++" \
+    -DKFR_DIR=/home/buddy-complier-workspace/buddy-benchmark/thirdparty/kfr \
+    -DBUDDY_MLIR_BUILD_DIR="${BUDDY_MLIR_BUILD_DIR}" \
+    -DPYTHON_BINARY_DIR="$(dirname "$(command -v python3)")"
+
+ninja audio-plot
+cd bin || exit 1
+./audio-plot ../../benchmarks/AudioProcessing/Audios/NASA_Mars.wav ResultKFRIir.wav
+# "
+# root@4f445bb41579:/home/buddy-complier-workspace/buddy-benchmark/build/bin# ./audio-plot ../../benchmarks/AudioProcessing/Audios/NASA_Mars.wav ResultKFRIir.wav
+# Plotting now...
+# Traceback (most recent call last):
+#   File "/home/buddy-complier-workspace/buddy-benchmark/utils/plots/python/plot.py", line 71, in <module>
+#     compare_wave(args.file1, args.file2, part=args.part,
+#   File "/home/buddy-complier-workspace/buddy-benchmark/utils/plots/python/plotools/compare.py", line 120, in compare_wave
+#     after, time2 = get_time_domain(file2)
+#   File "/home/buddy-complier-workspace/buddy-benchmark/utils/plots/python/plotools/compare.py", line 60, in get_time_domain
+#     info, samples = get_info_and_samples(file)
+#   File "/home/buddy-complier-workspace/buddy-benchmark/utils/plots/python/plotools/compare.py", line 38, in get_info_and_samples
+#     with wave.open(file, 'rb') as audio:
+#   File "/usr/lib/python3.10/wave.py", line 509, in open
+#     return Wave_read(f)
+#   File "/usr/lib/python3.10/wave.py", line 159, in __init__
+#     f = builtins.open(f, 'rb')
+# FileNotFoundError: [Errno 2] No such file or directory: 'ResultKFRIir.wav'
+# "
\ No newline at end of file
diff --git a/test/test_script_deeplearning.sh b/test/test_script_deeplearning.sh
new file mode 100755
index 00000000..f7c4d72e
--- /dev/null
+++ b/test/test_script_deeplearning.sh
@@ -0,0 +1,221 @@
+#!/usr/bin/env bash
+
+################################################################################
+# 0. Script Setup
+################################################################################
+# We disable "exit on error" so that if one benchmark fails to build or run,
+# we can continue with the rest.
+set +e
+
+################################################################################
+# 1. (Optional) Activate Python/Conda Environment
+################################################################################
+# Uncomment or adjust if you use Anaconda/Miniconda:
+# conda activate <YOUR-CONDA-ENV-NAME>
+
+
+################################################################################
+# 2. List the Benchmark Targets to Build and Run
+################################################################################
+BENCHMARK_TARGETS=(
+  # ------------------
+  # Model-Level
+  # ------------------
+  "dl-model-tinyllama-benchmark"
+  "dl-model-mobilenetv3-benchmark"
+  "dl-model-lenet-benchmark"
+  "dl-model-bert-benchmark"
+  "dl-model-whisper-benchmark"
+  "dl-model-resnet18-benchmark"
+
+  # ------------------
+  # Layer-Level
+  # ------------------
+  "dl-layer-ffn-benchmark"
+  "dl-layer-selfattention-benchmark"
+  "dl-layer-rmsnorm-benchmark"
+
+  # ------------------
+  # Operation-Level
+  # ------------------
+  "dl-op-linalg-matmul-benchmark"
+  "dl-op-linalg-conv2d-nchw-fchw-benchmark"
+  "dl-op-linalg-conv2d-nhwc-hwcf-benchmark"
+  "dl-op-linalg-conv2d-nhwc-fhwc-benchmark"
+  "dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark"
+  "dl-op-linalg-pooling-nhwc-sum-benchmark"
+  "dl-op-linalg-batch-matmul-benchmark"
+  "dl-op-linalg-arithaddf-benchmark"
+  "dl-op-linalg-arithdivf-benchmark"
+  "dl-op-linalg-arithmulf-benchmark"
+  "dl-op-linalg-arithnegf-benchmark"
+  "dl-op-linalg-arithsubf-benchmark"
+  "dl-op-linalg-mathfpow-benchmark"
+  "dl-op-linalg-mathrsqrt-benchmark"
+  "dl-op-linalg-mathexp-benchmark"
+  "dl-op-linalg-reduceaddf-benchmark"
+  "dl-op-linalg-reducemaxf-benchmark"
+  "dl-op-linalg-softmax-exp-sum-div-benchmark"
+  "dl-op-tosa-transpose-benchmark"
+  "dl-op-matmul-transpose-b-benchmark"
+)
+
+
+################################################################################
+# 3. Set Environment Variables for Buddy MLIR/LLVM
+################################################################################
+# Adjust these paths according to your local setup:
+BUDDY_MLIR_DIR="/home/buddy-complier-workspace/buddy-mlir"  # The root directory of buddy-mlir
+LLVM_BUILD_DIR="$BUDDY_MLIR_DIR/llvm/build"                 # The build dir for LLVM
+BUDDY_BUILD_DIR="$BUDDY_MLIR_DIR/build"                     # The build dir for buddy-mlir
+
+# Export environment variables:
+export BUDDY_MLIR_BUILD_DIR="$BUDDY_BUILD_DIR"
+export LLVM_MLIR_BUILD_DIR="$LLVM_BUILD_DIR"
+export PYTHONPATH="${LLVM_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_BUILD_DIR}/python_packages:${PYTHONPATH}"
+export BENCHMARK_PATH="${BUDDY_MLIR_DIR}/../buddy-benchmark"
+echo "[Info] BUDDY_MLIR_BUILD_DIR = ${BUDDY_MLIR_BUILD_DIR}"
+echo "[Info] LLVM_MLIR_BUILD_DIR  = ${LLVM_MLIR_BUILD_DIR}"
+echo "[Info] PYTHONPATH           = ${PYTHONPATH}"
+
+################################################################################
+# 3. Prepare Build Folder and Run CMake
+################################################################################
+cd "${BUDDY_MLIR_DIR}/../buddy-benchmark" || exit 1
+mkdir -p build
+cd build || exit 1
+
+echo "[Info] Running CMake configuration..."
+cmake -G Ninja .. \
+  -DDEEP_LEARNING_BENCHMARKS=ON \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+  -DBUDDY_MLIR_BUILD_DIR="${BUDDY_MLIR_BUILD_DIR}" \
+  -DCMAKE_CXX_COMPILER="${LLVM_MLIR_BUILD_DIR}/bin/clang++" \
+  -DCMAKE_C_COMPILER="${LLVM_MLIR_BUILD_DIR}/bin/clang" \
+  -DCMAKE_CXX_FLAGS="-march=native" \
+  -DCMAKE_C_FLAGS="-march=native"
+
+
+################################################################################
+# 4. Create the Result Directory and Build Each Benchmark (Continue Even If One Fails)
+################################################################################
+
+# mkdir -p also creates the parent test_result directory.
+mkdir -p "${BENCHMARK_PATH}/test_result/deeplearning"
+BUILD_LOG="${BENCHMARK_PATH}/test_result/deeplearning/build_results_summary.log"
+: > "${BUILD_LOG}"  # Clear/create the file
+# Build the targets one at a time so that a single failure is recorded without stopping the rest.
+echo "[Info] Building all benchmarks with Ninja..."
+for target in "${BENCHMARK_TARGETS[@]}"; do
+  echo "==> ninja ${target}"
+  if ninja "${target}"; then
+    echo "[Success] Build of '${target}'" | tee -a "${BUILD_LOG}"
+  else
+    echo "[Failed]  Build of '${target}'" | tee -a "${BUILD_LOG}"
+  fi
+done
+
+################################################################################
+# 5. Run Each Benchmark & Redirect Output (Continue Even If One Fails)
+################################################################################
+cd bin || exit 1
+RUN_LOG="${BENCHMARK_PATH}/test_result/deeplearning/run_results_summary.log"
+: > "${RUN_LOG}"  # Truncate (or create) the run summary
+
+echo "[Info] Running all benchmarks in ./bin..."
+for bench in "${BENCHMARK_TARGETS[@]}"; do
+  # No executable for this target in ./bin: record it and move on.
+  if [[ ! -f "${bench}" ]]; then
+    echo "[Missing] Executable not found for '${bench}'" | tee -a "${RUN_LOG}"
+    continue
+  fi
+  echo "==> Running ${bench}"
+  if "./${bench}" > "${BENCHMARK_PATH}/test_result/deeplearning/${bench}.log" 2>&1; then
+    echo "[Success] Run of '${bench}'" | tee -a "${RUN_LOG}"
+    echo "    Output saved to test_result/deeplearning/${bench}.log"
+  else
+    echo "[Failed]  Run of '${bench}'" | tee -a "${RUN_LOG}"
+    echo "    Output saved to test_result/deeplearning/${bench}.log (May contain error info)"
+  fi
+done
+
+
+################################################################################
+# 6. Set Environment Variables for Buddy MLIR/LLVM for cross-compile
+################################################################################
+# Adjust these paths according to your local setup:
+BUDDY_MLIR_DIR="/home/buddy-complier-workspace/buddy-mlir"  # The root directory of buddy-mlir
+LLVM_BUILD_DIR="$BUDDY_MLIR_DIR/llvm/build"                 # The build dir for LLVM
+BUDDY_BUILD_DIR="$BUDDY_MLIR_DIR/build"                     # The build dir for buddy-mlir
+
+# Export environment variables (the cross-compile configure below reads these):
+export BUDDY_MLIR_BUILD_DIR="$BUDDY_BUILD_DIR"
+export LLVM_MLIR_BUILD_DIR="$LLVM_BUILD_DIR"
+export PYTHONPATH="${LLVM_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_BUILD_DIR}/python_packages:${PYTHONPATH}"  # prepends the same two entries a second time (the native section already exported them)
+export BUDDY_MLIR_BUILD_CROSS_DIR=${BUDDY_MLIR_BUILD_DIR}/../build  # NOTE(review): resolves to the native buddy-mlir build dir itself - confirm a separate cross build dir is not intended
+export RISCV_GNU_TOOLCHAIN=${BUDDY_MLIR_BUILD_DIR}/../thirdparty/riscv-gnu-toolchain
+export RISCV_OMP_SHARED=${LLVM_MLIR_BUILD_DIR}/../build/lib/libomp.so  # NOTE(review): resolves into the native LLVM build dir - confirm this libomp.so was built for RISC-V
+export BENCHMARK_PATH="${BUDDY_MLIR_DIR}/../buddy-benchmark"
+
+echo "[Info] BUDDY_MLIR_BUILD_DIR = ${BUDDY_MLIR_BUILD_DIR}"
+echo "[Info] LLVM_MLIR_BUILD_DIR  = ${LLVM_MLIR_BUILD_DIR}"
+echo "[Info] PYTHONPATH           = ${PYTHONPATH}"
+
+################################################################################
+# 7. Prepare Build Folder and Run CMake
+################################################################################
+cd "${BUDDY_MLIR_DIR}/../buddy-benchmark" || exit 1
+mkdir -p build
+cd build || exit 1
+
+echo "[Info] Running CMake configuration..."
+cmake -G Ninja .. \
+    -DDEEP_LEARNING_BENCHMARKS=ON \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DCROSS_COMPILE_RVV=ON \
+    -DCMAKE_SYSTEM_NAME=Linux \
+    -DCMAKE_SYSTEM_PROCESSOR=riscv \
+    -DCMAKE_C_COMPILER=${LLVM_MLIR_BUILD_DIR}/bin/clang \
+    -DRISCV_GNU_TOOLCHAIN=${RISCV_GNU_TOOLCHAIN} \
+    -DCMAKE_CXX_COMPILER=${LLVM_MLIR_BUILD_DIR}/bin/clang++ \
+    -DCMAKE_C_FLAGS="-march=rv64gcv --target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN}/sysroot --gcc-toolchain=${RISCV_GNU_TOOLCHAIN} -fPIC" \
+    -DCMAKE_CXX_FLAGS="-march=rv64gcv --target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN}/sysroot --gcc-toolchain=${RISCV_GNU_TOOLCHAIN} -fPIC" \
+    -DRISCV_OMP_SHARED=${RISCV_OMP_SHARED} \
+    -DBUDDY_MLIR_BUILD_DIR=${BUDDY_MLIR_BUILD_DIR} \
+    -DBUDDY_MLIR_BUILD_CROSS_DIR=${BUDDY_MLIR_BUILD_CROSS_DIR} \
+    -DBUDDY_MLIR_CROSS_LIB_DIR=${BUDDY_MLIR_BUILD_CROSS_DIR}/lib
+
+################################################################################
+# 8. Build Each Benchmark for cross-compile (Continue Even If One Fails)
+################################################################################
+
+mkdir -p "${BENCHMARK_PATH}/test_result/deeplearning"  # the directory that holds BUILD_LOG
+BUILD_LOG="${BENCHMARK_PATH}/test_result/deeplearning/build_results_crosscompile_summary.log"
+: > "${BUILD_LOG}"  # Clear/create the file
+# Build the targets one at a time so that a single failure is recorded without stopping the rest.
+echo "[Info] Building all benchmarks with Ninja..."
+for target in "${BENCHMARK_TARGETS[@]}"; do
+  echo "==> ninja ${target}"
+  if ninja "${target}"; then
+    echo "[Success] Build of '${target}'" | tee -a "${BUILD_LOG}"
+  else
+    echo "[Failed]  Build of '${target}'" | tee -a "${BUILD_LOG}"
+  fi
+done
+
+
+echo
+echo "[Info] All build/run steps completed (script did not stop on failures)."
+echo "[Info] Build summary: ${BUILD_LOG}"
+echo "[Info] Run summary:   ${RUN_LOG}"
+
+# NOTE(review): disabled. This looks like the buddy-mlir configure command pasted here by mistake; at this point `..` is buddy-benchmark - confirm before re-enabling.
+# cmake -G Ninja .. \
+#     -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \
+#     -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \
+#     -DLLVM_ENABLE_ASSERTIONS=ON \
+#     -DCMAKE_BUILD_TYPE=RELEASE \
+#     -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
+#     -DPython3_EXECUTABLE=$(which python3)
\ No newline at end of file
diff --git a/test/test_script_geminiprocessing.sh b/test/test_script_geminiprocessing.sh
new file mode 100755
index 00000000..b151cb5b
--- /dev/null
+++ b/test/test_script_geminiprocessing.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+export BUDDY_MLIR_BUILD_DIR=/home/buddy-complier-workspace/buddy-mlir/build
+export LLVM_MLIR_BUILD_DIR=/home/buddy-complier-workspace/buddy-mlir/llvm/build
+export CHIPYARD_DIR=/home/buddy-complier-workspace/chipyard
+export BUDDY_BENCHMARK_DIR=/home/buddy-complier-workspace/buddy-benchmark
+
+cd "${CHIPYARD_DIR}"
+git config --global --add safe.directory /home/buddy-complier-workspace/chipyard
+git checkout 1.8.1
+
+# Initialize and update the 'generators/gemmini' submodule and any submodules inside it.
+git config --global --add safe.directory /home/buddy-complier-workspace/chipyard/generators/gemmini
+git submodule update --init --recursive generators/gemmini
+
+#############################################
+# 1. Initialize Conda for the current shell
+#############################################
+# `conda init` only edits the shell startup file for future shells; load the
+# shell hook so that `conda activate` works inside this non-interactive script.
+eval "$(conda shell.bash hook)"
+
+#############################################
+# 2. Check if 'chipyard' environment exists
+#############################################
+# `conda env list` prints the environment name at the start of each line.
+if conda env list | grep -qE '^chipyard[[:space:]]'; then
+    echo "[INFO] Found existing 'chipyard' environment. Activating it."
+else
+    echo "[INFO] 'chipyard' environment not found. Creating it..."
+    # Example creation command - adjust packages as needed.
+    conda create -y -n chipyard python=3.10 cmake ninja
+fi
+conda activate chipyard
+
+#############################################
+# 3. Source build-setup and env.sh
+#############################################
+# If your script uses conda-lock or has pinned requirements,
+# you might need to call build-setup.sh so it *creates* the
+# .conda-env environment. But be sure it doesn’t conflict
+# with your newly created 'chipyard' environment.
+source build-setup.sh esp-tools
+source env.sh
+
+#############################################
+# 4. Proceed with your build
+#############################################
+cd "${BUDDY_BENCHMARK_DIR}" || exit 1  # never run the rm below from the wrong directory
+# Remove any existing build directory and create a fresh one.
+rm -rf build
+{ mkdir -p build && cd build; } || exit 1
+
+RESULT_DIR="${BUDDY_BENCHMARK_DIR}/test_result/geminiprocessing"
+mkdir -p "${RESULT_DIR}"
+
+export C_PATH=$(which riscv64-unknown-linux-gnu-gcc)
+export CXX_PATH=$(which riscv64-unknown-linux-gnu-g++)
+export CLinker_PATH=$(which riscv64-unknown-linux-gnu-ld)
+
+# Print Address here
+echo "[Info] C_COMPILER_PATH = ${C_PATH}"
+echo "[Info] CXX_COMPILER_PATH = ${CXX_PATH}"
+echo "[Info] C_LINKER_PATH = ${CLinker_PATH}"
+echo "[Info] BUDDY_MLIR_BUILD_DIR = ${BUDDY_MLIR_BUILD_DIR}"
+echo "[Info] LLVM_MLIR_BUILD_DIR  = ${LLVM_MLIR_BUILD_DIR}"
+echo "[Info] CHIPYARD_DIR = ${CHIPYARD_DIR}"
+echo "[Info] BUDDY_BENCHMARK_DIR = ${BUDDY_BENCHMARK_DIR}"
+echo "[Info] RESULT_DIR = ${RESULT_DIR}"
+
+echo "[Info] Running CMake configuration..."
+cmake -G Ninja .. \
+  -DCMAKE_C_COMPILER=${C_PATH} \
+  -DCMAKE_CXX_COMPILER=${CXX_PATH} \
+  -DCMAKE_LINKER=${CLinker_PATH} \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DBUDDY_MLIR_BUILD_DIR=${BUDDY_MLIR_BUILD_DIR} \
+  -DGEMMINI_INCLUDE_DIR=${CHIPYARD_DIR}/generators/gemmini/software/gemmini-rocc-tests/include/ \
+  -DGEMMINI_BENCHMARKS=ON \
+  2>&1 | tee "${RESULT_DIR}/cmake_configure.log"
+
+ninja 2>&1 | tee "${RESULT_DIR}/build.log"
+
+# ```[1/21] Creating directories for 'project_googlebenchmark'
+# [2/21] Building C object benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o
+# FAILED: benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o 
+# riscv64-unknown-linux-gnu-gcc  -I/home/buddy-complier-workspace/buddy-mlir/build/cmake/../../frontend/Interfaces -I/home/buddy-complier-workspace/buddy-mlir/build/cmake/../../thirdparty/include -I/home/buddy-complier-workspace/buddy-benchmark/benchmarks -I/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include -I/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/.. -I/home/xychen/buddy-mlir/frontend/Interfaces -O3 -DNDEBUG -MD -MT benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o -MF benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o.d -o benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o -c /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c
+# /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c: In function '_exo_matmul_4':
+# /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:28:47: error: macro "gemmini_extended_config_ex" requires 7 arguments, but only 6 given
+#    28 |   gemmini_extended_config_ex(WS, 0, 0, 1, 0, 0);
+#       |                                               ^
+# In file included from /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:23:```
+
+# cd bin
+# ./vectorization-matrix-benchmark 2>&1 | tee "${RESULT_DIR}/run.log"
+
+echo "[Info] CMake, build, and run logs are stored in ${RESULT_DIR}"
diff --git a/test/test_script_imageprocessing.sh b/test/test_script_imageprocessing.sh
new file mode 100755
index 00000000..ab89c912
--- /dev/null
+++ b/test/test_script_imageprocessing.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# NEW: Create results directory and update log file path
+RESULT_DIR="${PWD}/test_result/imageprocessing"
+mkdir -p "$RESULT_DIR"
+LOG="${RESULT_DIR}/image-processing-result.log"
+echo "Benchmark results - $(date)" > "$LOG"
+
+# Report whether /proc/cpuinfo mentions a CPU feature.
+# $1: feature name; lower-cased, then matched case-insensitively by grep.
+# Returns 0 when grep finds a match, 1 otherwise.
+supports() {
+    local needle
+    needle=$(echo "$1" | tr '[:upper:]' '[:lower:]')
+    grep -qi "$needle" /proc/cpuinfo && return 0
+    return 1
+}
+
+features=("SSE" "AVX2" "AVX512" "NEON")
+images=("../benchmarks/ImageProcessing/Images/YuTu.png")
+kernels=("prewittKernelAlign" "sobel3x3KernelAlign" "sobel5x5KernelAlign" "sobel7x7KernelAlign" "sobel9x9KernelAlign" "laplacianKernelAlign" "logKernelAlign")
+kernelmorphs=("random3x3KernelAlignInt")
+boundaries=("CONSTANT_PADDING" "REPLICATE_PADDING")
+
+# For each feature accepted by supports(): configure and build in build_<feature>, then run every image/kernel/morph/boundary combination.
+for feature in "${features[@]}"; do
+   echo "Testing $feature support" | tee -a "$LOG"
+   if supports "$feature"; then
+       echo "$feature is supported." | tee -a "$LOG"
+       { mkdir -p "build_${feature}" && cd "build_${feature}"; } || { echo "Cannot enter build_${feature}; skipping $feature." | tee -a "$LOG"; continue; }
+       cmake -G Ninja .. \
+           -DCMAKE_BUILD_TYPE=RELEASE -DIMAGE_PROCESSING_BENCHMARKS=ON \
+           -DOpenCV_DIR="$PWD/../thirdparty/opencv/build/" \
+           -DEIGEN_DIR="$PWD/../thirdparty/eigen/" \
+           -DBUDDY_OPT_ATTR="$(echo "$feature" | tr '[:upper:]' '[:lower:]')" \
+           -DBUDDY_MLIR_BUILD_DIR=/home/buddy-complier-workspace/buddy-mlir/build
+       ninja image-processing-benchmark
+       echo "Running image-processing-benchmark for $feature" | tee -a "$LOG"
+       for img in "${images[@]}"; do
+         for kern in "${kernels[@]}"; do
+           for morph in "${kernelmorphs[@]}"; do
+             for boundary in "${boundaries[@]}"; do
+               echo "Running: $img $kern $morph $boundary" | tee -a "$LOG"
+               ./bin/image-processing-benchmark "$img" "$kern" "$morph" "$boundary" 2>&1 | grep -v "Saved PNG file." >> "$LOG"
+             done
+           done
+         done
+       done
+       cd ..  # back to the starting directory for the next feature
+   else
+       echo "CPU does not support $feature." | tee -a "$LOG"
+   fi
+done
+
+# Remove every per-feature build tree; rm -rf ignores the ones that were never created.
+# "${features[@]/#/build_}" prefixes each array element with "build_".
+build_dirs=("${features[@]/#/build_}")
+rm -rf "${build_dirs[@]}"
\ No newline at end of file
diff --git a/test/test_script_vectorizationprocessing.sh b/test/test_script_vectorizationprocessing.sh
new file mode 100755
index 00000000..97899922
--- /dev/null
+++ b/test/test_script_vectorizationprocessing.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+################################################################################
+# 1. Script Setup
+################################################################################
+set -eo pipefail  # pipefail: without it `cmd | tee` returns tee's status, so -e never sees a cmake/ninja/benchmark failure
+BUDDY_MLIR_BUILD_DIR="/home/buddy-complier-workspace/buddy-mlir/build"
+LLVM_MLIR_BUILD_DIR="/home/buddy-complier-workspace/buddy-mlir/llvm/build"
+
+echo "[Info] BUDDY_MLIR_BUILD_DIR = ${BUDDY_MLIR_BUILD_DIR}"
+echo "[Info] LLVM_MLIR_BUILD_DIR  = ${LLVM_MLIR_BUILD_DIR}"
+
+RESULT_DIR="${PWD}/test_result/vectorization"
+mkdir -p "${RESULT_DIR}"
+LOG_FILE="${RESULT_DIR}/vectorization_result.log"
+echo "Vectorization Benchmark - $(date)" > "${LOG_FILE}"
+
+################################################################################
+# 2. Build Benchmark
+################################################################################
+mkdir -p build && cd build
+echo "[Info] Running CMake configuration..." | tee -a "${LOG_FILE}"
+cmake -G Ninja .. \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DVECTORIZATION_BENCHMARKS=ON \
+  -DBUDDY_MLIR_BUILD_DIR="${BUDDY_MLIR_BUILD_DIR}" 2>&1 | tee -a "${LOG_FILE}"
+
+echo "[Info] Building vectorization-matrix-benchmark..." | tee -a "${LOG_FILE}"
+ninja vectorization-matrix-benchmark 2>&1 | tee -a "${LOG_FILE}"
+
+################################################################################
+# 3. Run Benchmark
+################################################################################
+cd bin
+echo "[Info] Running vectorization-matrix-benchmark..." | tee -a "${LOG_FILE}"
+./vectorization-matrix-benchmark 2>&1 | tee -a "${LOG_FILE}"
+
+echo "[Info] Benchmark completed. Log saved to ${LOG_FILE}"
\ No newline at end of file
diff --git a/test_result/deeplearning/build_results_crosscompile_summary.log b/test_result/deeplearning/build_results_crosscompile_summary.log
new file mode 100644
index 00000000..df608647
--- /dev/null
+++ b/test_result/deeplearning/build_results_crosscompile_summary.log
@@ -0,0 +1,23 @@
+[Failed]  Build of 'dl-model-tinyllama-benchmark'
+[Failed]  Build of 'dl-model-mobilenetv3-benchmark'
+[Failed]  Build of 'dl-model-lenet-benchmark'
+[Failed]  Build of 'dl-model-bert-benchmark'
+[Failed]  Build of 'dl-model-whisper-benchmark'
+[Failed]  Build of 'dl-model-resnet18-benchmark'
+[Failed]  Build of 'dl-layer-ffn-benchmark'
+[Failed]  Build of 'dl-layer-selfattention-benchmark'
+[Failed]  Build of 'dl-layer-rmsnorm-benchmark'
+[Failed]  Build of 'dl-op-linalg-matmul-benchmark'
+[Failed]  Build of 'dl-op-linalg-conv2d-nchw-fchw-benchmark'
+[Failed]  Build of 'dl-op-linalg-conv2d-nhwc-hwcf-benchmark'
+[Failed]  Build of 'dl-op-linalg-conv2d-nhwc-fhwc-benchmark'
+[Failed]  Build of 'dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark'
+[Failed]  Build of 'dl-op-linalg-pooling-nhwc-sum-benchmark'
+[Failed]  Build of 'dl-op-linalg-batch-matmul-benchmark'
+[Failed]  Build of 'dl-op-linalg-arithaddf-benchmark'
+[Failed]  Build of 'dl-op-linalg-arithdivf-benchmark'
+[Failed]  Build of 'dl-op-linalg-arithmulf-benchmark'
+[Failed]  Build of 'dl-op-linalg-arithnegf-benchmark'
+[Failed]  Build of 'dl-op-linalg-arithsubf-benchmark'
+[Failed]  Build of 'dl-op-linalg-mathfpow-benchmark'
+[Failed]  Build of 'dl-op-linalg-mathrsqrt-benchmark'
diff --git a/test_result/deeplearning/build_results_summary.log b/test_result/deeplearning/build_results_summary.log
new file mode 100644
index 00000000..de1252ba
--- /dev/null
+++ b/test_result/deeplearning/build_results_summary.log
@@ -0,0 +1,29 @@
+[Success] Build of 'dl-model-tinyllama-benchmark'
+[Success] Build of 'dl-model-mobilenetv3-benchmark'
+[Success] Build of 'dl-model-lenet-benchmark'
+[Failed]  Build of 'dl-model-bert-benchmark'
+[Success] Build of 'dl-model-whisper-benchmark'
+[Success] Build of 'dl-model-resnet18-benchmark'
+[Success] Build of 'dl-layer-ffn-benchmark'
+[Success] Build of 'dl-layer-selfattention-benchmark'
+[Success] Build of 'dl-layer-rmsnorm-benchmark'
+[Success] Build of 'dl-op-linalg-matmul-benchmark'
+[Success] Build of 'dl-op-linalg-conv2d-nchw-fchw-benchmark'
+[Success] Build of 'dl-op-linalg-conv2d-nhwc-hwcf-benchmark'
+[Success] Build of 'dl-op-linalg-conv2d-nhwc-fhwc-benchmark'
+[Success] Build of 'dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark'
+[Success] Build of 'dl-op-linalg-pooling-nhwc-sum-benchmark'
+[Success] Build of 'dl-op-linalg-batch-matmul-benchmark'
+[Success] Build of 'dl-op-linalg-arithaddf-benchmark'
+[Success] Build of 'dl-op-linalg-arithdivf-benchmark'
+[Success] Build of 'dl-op-linalg-arithmulf-benchmark'
+[Success] Build of 'dl-op-linalg-arithnegf-benchmark'
+[Success] Build of 'dl-op-linalg-arithsubf-benchmark'
+[Success] Build of 'dl-op-linalg-mathfpow-benchmark'
+[Success] Build of 'dl-op-linalg-mathrsqrt-benchmark'
+[Success] Build of 'dl-op-linalg-mathexp-benchmark'
+[Success] Build of 'dl-op-linalg-reduceaddf-benchmark'
+[Success] Build of 'dl-op-linalg-reducemaxf-benchmark'
+[Success] Build of 'dl-op-linalg-softmax-exp-sum-div-benchmark'
+[Success] Build of 'dl-op-tosa-transpose-benchmark'
+[Success] Build of 'dl-op-matmul-transpose-b-benchmark'
diff --git a/test_result/deeplearning/dl-layer-ffn-benchmark.log b/test_result/deeplearning/dl-layer-ffn-benchmark.log
new file mode 100644
index 00000000..677ea1bb
--- /dev/null
+++ b/test_result/deeplearning/dl-layer-ffn-benchmark.log
@@ -0,0 +1,18 @@
+2025-03-30T12:12:58+00:00
+Running ./dl-layer-ffn-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.39, 6.06
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------------
+Benchmark                                Time             CPU   Iterations
+--------------------------------------------------------------------------
+DL_LAYER_FFN/Scalar                  0.065 ms        0.065 ms        10641
+DL_LAYER_FFN/Auto_Vectorization      0.027 ms        0.027 ms        26024
+-----------------------------------------------------------
+Correctness Verification: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-layer-rmsnorm-benchmark.log b/test_result/deeplearning/dl-layer-rmsnorm-benchmark.log
new file mode 100644
index 00000000..2ce19761
--- /dev/null
+++ b/test_result/deeplearning/dl-layer-rmsnorm-benchmark.log
@@ -0,0 +1,18 @@
+2025-03-30T12:13:02+00:00
+Running ./dl-layer-rmsnorm-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.39, 6.06
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+------------------------------------------------------------------------------
+Benchmark                                    Time             CPU   Iterations
+------------------------------------------------------------------------------
+DL_LAYER_RMSNORM/Scalar                  0.002 ms        0.002 ms       356344
+DL_LAYER_RMSNORM/Auto_Vectorization      0.001 ms        0.001 ms       764783
+-----------------------------------------------------------
+Correctness Verification: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-layer-selfattention-benchmark.log b/test_result/deeplearning/dl-layer-selfattention-benchmark.log
new file mode 100644
index 00000000..17623060
--- /dev/null
+++ b/test_result/deeplearning/dl-layer-selfattention-benchmark.log
@@ -0,0 +1,18 @@
+2025-03-30T12:13:00+00:00
+Running ./dl-layer-selfattention-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.39, 6.06
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------------------
+Benchmark                                      Time             CPU   Iterations
+--------------------------------------------------------------------------------
+DL_LAYER_ATTENTION/Scalar                   4.68 ms         4.68 ms          150
+DL_LAYER_ATTENTION/Auto_Vectorization       1.57 ms         1.57 ms          455
+-----------------------------------------------------------
+Correctness Verification: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-model-lenet-benchmark.log b/test_result/deeplearning/dl-model-lenet-benchmark.log
new file mode 100644
index 00000000..a1ce7074
--- /dev/null
+++ b/test_result/deeplearning/dl-model-lenet-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:09:01+00:00
+Running ./dl-model-lenet-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.04, 1.86, 7.56
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-----------------------------------------------------------------------------
+Benchmark                                   Time             CPU   Iterations
+-----------------------------------------------------------------------------
+DL_MODEL_LENET/Auto_Vectorization       0.164 ms        0.164 ms         4368
+DL_MODEL_LENET/Buddy_Vectorization      0.154 ms        0.154 ms         5094
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-model-mobilenetv3-benchmark.log b/test_result/deeplearning/dl-model-mobilenetv3-benchmark.log
new file mode 100644
index 00000000..8bed1b85
--- /dev/null
+++ b/test_result/deeplearning/dl-model-mobilenetv3-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:08:59+00:00
+Running ./dl-model-mobilenetv3-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.04, 1.86, 7.56
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-----------------------------------------------------------------------------------
+Benchmark                                         Time             CPU   Iterations
+-----------------------------------------------------------------------------------
+BM_MobileNet_V3/BM_MobileNet_V3_scalar         36.7 ms         36.7 ms           18
+BM_MobileNet_V3/BM_MobileNet_V3_conv_opt       32.6 ms         32.6 ms           22
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-model-resnet18-benchmark.log b/test_result/deeplearning/dl-model-resnet18-benchmark.log
new file mode 100644
index 00000000..e95722a5
--- /dev/null
+++ b/test_result/deeplearning/dl-model-resnet18-benchmark.log
@@ -0,0 +1,18 @@
+2025-03-30T12:12:55+00:00
+Running ./dl-model-resnet18-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.39, 6.08
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------------------
+Benchmark                                      Time             CPU   Iterations
+--------------------------------------------------------------------------------
+DL_MODEL_Resnet18/Auto_Vectorization         723 ms          722 ms            1
+DL_MODEL_Resnet18/Buddy_Vectorization        726 ms          718 ms            1
+-----------------------------------------------------------
+Correctness Verification: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-model-tinyllama-benchmark.log b/test_result/deeplearning/dl-model-tinyllama-benchmark.log
new file mode 100644
index 00000000..e07df494
--- /dev/null
+++ b/test_result/deeplearning/dl-model-tinyllama-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:03:25+00:00
+Running ./dl-model-tinyllama-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 2.75, 3.61, 10.42
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+----------------------------------------------------------------------------
+Benchmark                                  Time             CPU   Iterations
+----------------------------------------------------------------------------
+DL_MODEL_TINYLLAMA/scalar             158531 ms       158516 ms            1
+DL_MODEL_TINYLLAMA/matmul_opt           9744 ms         9735 ms            1
+DL_MODEL_TINYLLAMA/matmul_opt_omp       7716 ms         7038 ms            1
+[34m---------- Verification ----------[0m
+matmul_opt [32mPASS[0m
+matmul_opt_omp [32mPASS[0m
diff --git a/test_result/deeplearning/dl-model-whisper-benchmark.log b/test_result/deeplearning/dl-model-whisper-benchmark.log
new file mode 100644
index 00000000..e4534bbc
--- /dev/null
+++ b/test_result/deeplearning/dl-model-whisper-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:09:03+00:00
+Running ./dl-model-whisper-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.04, 1.84, 7.52
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-------------------------------------------------------------------------------
+Benchmark                                     Time             CPU   Iterations
+-------------------------------------------------------------------------------
+DL_MODEL_Whisper/Auto_Vectorization       78390 ms        78388 ms            1
+DL_MODEL_Whisper/Buddy_Vectorization      36641 ms        36637 ms            1
+-----------------------------------------------------------
+Correctness Verification for Output1: [32mPASS[0m
+Correctness Verification for Output2: [31mFAIL[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-arithaddf-benchmark.log b/test_result/deeplearning/dl-op-linalg-arithaddf-benchmark.log
new file mode 100644
index 00000000..d89cd1e1
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-arithaddf-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:31+00:00
+Running ./dl-op-linalg-arithaddf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.35, 5.89
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------
+Benchmark                          Time             CPU   Iterations
+--------------------------------------------------------------------
+BM_ADDF_SCALAR                 0.030 ms        0.030 ms        23440
+BM_ADDF_AutoVectorization      0.004 ms        0.004 ms       175032
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-arithdivf-benchmark.log b/test_result/deeplearning/dl-op-linalg-arithdivf-benchmark.log
new file mode 100644
index 00000000..02d6e568
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-arithdivf-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:33+00:00
+Running ./dl-op-linalg-arithdivf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.34, 5.87
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------
+Benchmark                          Time             CPU   Iterations
+--------------------------------------------------------------------
+BM_DIVF_SCALAR                 0.029 ms        0.029 ms        23951
+BM_DIVF_AutoVectorization      0.009 ms        0.009 ms        73837
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-arithmulf-benchmark.log b/test_result/deeplearning/dl-op-linalg-arithmulf-benchmark.log
new file mode 100644
index 00000000..4fa4ffde
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-arithmulf-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:35+00:00
+Running ./dl-op-linalg-arithmulf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.34, 5.87
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------
+Benchmark                          Time             CPU   Iterations
+--------------------------------------------------------------------
+BM_MULF_SCALAR                 0.029 ms        0.029 ms        23549
+BM_MULF_AutoVectorization      0.004 ms        0.004 ms       174752
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-arithnegf-benchmark.log b/test_result/deeplearning/dl-op-linalg-arithnegf-benchmark.log
new file mode 100644
index 00000000..e6387a2a
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-arithnegf-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:37+00:00
+Running ./dl-op-linalg-arithnegf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.34, 5.87
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------
+Benchmark                          Time             CPU   Iterations
+--------------------------------------------------------------------
+BM_NEGF_SCALAR                 0.022 ms        0.022 ms        30658
+BM_NEGF_AutoVectorization      0.003 ms        0.003 ms       245490
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-arithsubf-benchmark.log b/test_result/deeplearning/dl-op-linalg-arithsubf-benchmark.log
new file mode 100644
index 00000000..3a9efa27
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-arithsubf-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:39+00:00
+Running ./dl-op-linalg-arithsubf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.34, 5.84
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------
+Benchmark                          Time             CPU   Iterations
+--------------------------------------------------------------------
+BM_SUBF_SCALAR                 0.029 ms        0.029 ms        23697
+BM_SUBF_AutoVectorization      0.004 ms        0.004 ms       147910
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-batch-matmul-benchmark.log b/test_result/deeplearning/dl-op-linalg-batch-matmul-benchmark.log
new file mode 100644
index 00000000..d187e4d9
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-batch-matmul-benchmark.log
@@ -0,0 +1,25 @@
+2025-03-30T12:13:21+00:00
+Running ./dl-op-linalg-batch-matmul-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.36, 5.95
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+---------------------------------------------------------------------------------------------
+Benchmark                                                   Time             CPU   Iterations
+---------------------------------------------------------------------------------------------
+DL_OPS_BATCH_MATMUL/Scalar/iterations:1                  3525 ms         3517 ms            1
+DL_OPS_BATCH_MATMUL/AutoVectorization/iterations:1        976 ms          976 ms            1
+DL_OPS_BATCH_MATMUL/Vectorization/iterations:1            189 ms          189 ms            1
+DL_OPS_BATCH_MATMUL/Tile/iterations:1                     109 ms          109 ms            1
+DL_OPS_BATCH_MATMUL/SCF/iterations:1                      117 ms          117 ms            1
+DL_OPS_BATCH_MATMUL/BROADCAST/iterations:1                353 ms          353 ms            1
+DL_OPS_BATCH_MATMUL/BROADCAST_OMP/iterations:1           75.4 ms         38.2 ms            1
+[34m---------- Verification ----------[0m
+Tile [32mPASS[0m
+SCF [32mPASS[0m
+BROADCAST [32mPASS[0m
+BROADCAST_OMP [32mPASS[0m
diff --git a/test_result/deeplearning/dl-op-linalg-conv2d-nchw-fchw-benchmark.log b/test_result/deeplearning/dl-op-linalg-conv2d-nchw-fchw-benchmark.log
new file mode 100644
index 00000000..4e58a246
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-conv2d-nchw-fchw-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:15+00:00
+Running ./dl-op-linalg-conv2d-nchw-fchw-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.37, 5.97
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-------------------------------------------------------------------
+Benchmark                         Time             CPU   Iterations
+-------------------------------------------------------------------
+BM_Conv2DNchwFchw_SCALAR        283 ms          283 ms            2
+BM_Conv2DNchwFchw_Im2col       10.2 ms         10.2 ms           68
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-conv2d-nhwc-fhwc-benchmark.log b/test_result/deeplearning/dl-op-linalg-conv2d-nhwc-fhwc-benchmark.log
new file mode 100644
index 00000000..08115149
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-conv2d-nhwc-fhwc-benchmark.log
@@ -0,0 +1,21 @@
+2025-03-30T12:13:18+00:00
+Running ./dl-op-linalg-conv2d-nhwc-fhwc-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.36, 5.95
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+---------------------------------------------------------------------------------------------------
+Benchmark                                                         Time             CPU   Iterations
+---------------------------------------------------------------------------------------------------
+DL_OPS_CONV_2D_NHWC_FHWC/scalar/iterations:5                   73.5 ms         73.5 ms            5
+DL_OPS_CONV_2D_NHWC_FHWC/auto_vectorization/iterations:5       9.35 ms         9.35 ms            5
+DL_OPS_CONV_2D_NHWC_FHWC/vectorization/iterations:5            1.74 ms         1.74 ms            5
+DL_OPS_CONV_2D_NHWC_FHWC/vec_tile/iterations:5                 1.73 ms         1.73 ms            5
+[34m---------- Verification ----------[0m
+auto_vectorization [32mPASS[0m
+vectorization [32mPASS[0m
+vec_tile [32mPASS[0m
diff --git a/test_result/deeplearning/dl-op-linalg-conv2d-nhwc-hwcf-benchmark.log b/test_result/deeplearning/dl-op-linalg-conv2d-nhwc-hwcf-benchmark.log
new file mode 100644
index 00000000..f09a4101
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-conv2d-nhwc-hwcf-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:17+00:00
+Running ./dl-op-linalg-conv2d-nhwc-hwcf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.37, 5.97
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+---------------------------------------------------------------------------------
+Benchmark                                       Time             CPU   Iterations
+---------------------------------------------------------------------------------
+BM_CONV_2D_NHWC_HWCF_SCALAR                  32.4 ms         32.4 ms           22
+BM_CONV_2D_NHWC_HWCF_AutoVectorization       5.83 ms         5.83 ms          120
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark.log b/test_result/deeplearning/dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark.log
new file mode 100644
index 00000000..c761a6b6
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:19+00:00
+Running ./dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.36, 5.95
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+------------------------------------------------------------------------------------------------------------
+Benchmark                                                                  Time             CPU   Iterations
+------------------------------------------------------------------------------------------------------------
+DL_OPS_DEPTHWISE_CONV_2D_NHWC_HWC/scalar/iterations:5                   6.25 ms         6.25 ms            5
+DL_OPS_DEPTHWISE_CONV_2D_NHWC_HWC/auto_vectorization/iterations:5       1.71 ms         1.71 ms            5
+DL_OPS_DEPTHWISE_CONV_2D_NHWC_HWC/vectorization/iterations:5           0.128 ms        0.128 ms            5
+[34m---------- Verification ----------[0m
+auto_vectorization [32mPASS[0m
+vectorization [32mPASS[0m
diff --git a/test_result/deeplearning/dl-op-linalg-mathexp-benchmark.log b/test_result/deeplearning/dl-op-linalg-mathexp-benchmark.log
new file mode 100644
index 00000000..c3ecd554
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-mathexp-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:45+00:00
+Running ./dl-op-linalg-mathexp-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.33, 5.81
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-------------------------------------------------------------------
+Benchmark                         Time             CPU   Iterations
+-------------------------------------------------------------------
+BM_EXP_SCALAR                 0.046 ms        0.046 ms        15309
+BM_EXP_AutoVectorization      0.032 ms        0.032 ms        21998
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-mathfpow-benchmark.log b/test_result/deeplearning/dl-op-linalg-mathfpow-benchmark.log
new file mode 100644
index 00000000..018b3377
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-mathfpow-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:42+00:00
+Running ./dl-op-linalg-mathfpow-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.34, 5.84
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------
+Benchmark                          Time             CPU   Iterations
+--------------------------------------------------------------------
+BM_FPOW_SCALAR                 0.084 ms        0.084 ms         8347
+BM_FPOW_AutoVectorization      0.057 ms        0.057 ms        12328
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-mathrsqrt-benchmark.log b/test_result/deeplearning/dl-op-linalg-mathrsqrt-benchmark.log
new file mode 100644
index 00000000..bf045f07
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-mathrsqrt-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:43+00:00
+Running ./dl-op-linalg-mathrsqrt-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.33, 5.81
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+---------------------------------------------------------------------
+Benchmark                           Time             CPU   Iterations
+---------------------------------------------------------------------
+BM_RSQRT_SCALAR                 0.073 ms        0.073 ms         9497
+BM_RSQRT_AutoVectorization      0.004 ms        0.004 ms       161025
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-matmul-benchmark.log b/test_result/deeplearning/dl-op-linalg-matmul-benchmark.log
new file mode 100644
index 00000000..412446c6
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-matmul-benchmark.log
@@ -0,0 +1,22 @@
+2025-03-30T12:13:04+00:00
+Running ./dl-op-linalg-matmul-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.38, 6.03
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-------------------------------------------------------------------------------
+Benchmark                                     Time             CPU   Iterations
+-------------------------------------------------------------------------------
+DL_OPS_MATMUL/scalar_O0/iterations:1       3716 ms         3716 ms            1
+DL_OPS_MATMUL/scalar_O3/iterations:1       3312 ms         3312 ms            1
+DL_OPS_MATMUL/tile/iterations:1             117 ms          117 ms            1
+DL_OPS_MATMUL/vec/iterations:1              140 ms          140 ms            1
+DL_OPS_MATMUL/vec_omp/iterations:1         20.5 ms         18.8 ms            1
+[34m---------- Verification ----------[0m
+tile [32mPASS[0m
+vec [32mPASS[0m
+vec_omp [32mPASS[0m
diff --git a/test_result/deeplearning/dl-op-linalg-pooling-nhwc-sum-benchmark.log b/test_result/deeplearning/dl-op-linalg-pooling-nhwc-sum-benchmark.log
new file mode 100644
index 00000000..f9296017
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-pooling-nhwc-sum-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:19+00:00
+Running ./dl-op-linalg-pooling-nhwc-sum-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.36, 5.95
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------------------
+Benchmark                                      Time             CPU   Iterations
+--------------------------------------------------------------------------------
+BM_POOLING_NHWC_SUM_SCALAR                 0.233 ms        0.233 ms         3007
+BM_POOLING_NHWC_SUM_AutoVectorization      0.042 ms        0.042 ms        16752
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-linalg-reduceaddf-benchmark.log b/test_result/deeplearning/dl-op-linalg-reduceaddf-benchmark.log
new file mode 100644
index 00000000..1e8bcc7f
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-reduceaddf-benchmark.log
@@ -0,0 +1,10 @@
+2025-03-30T12:13:47+00:00
+Running ./dl-op-linalg-reduceaddf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.00, 1.33, 5.81
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
diff --git a/test_result/deeplearning/dl-op-linalg-reducemaxf-benchmark.log b/test_result/deeplearning/dl-op-linalg-reducemaxf-benchmark.log
new file mode 100644
index 00000000..7ed900ff
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-reducemaxf-benchmark.log
@@ -0,0 +1,10 @@
+2025-03-30T12:13:48+00:00
+Running ./dl-op-linalg-reducemaxf-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.08, 1.34, 5.79
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
diff --git a/test_result/deeplearning/dl-op-linalg-softmax-exp-sum-div-benchmark.log b/test_result/deeplearning/dl-op-linalg-softmax-exp-sum-div-benchmark.log
new file mode 100644
index 00000000..37b85c1d
--- /dev/null
+++ b/test_result/deeplearning/dl-op-linalg-softmax-exp-sum-div-benchmark.log
@@ -0,0 +1,19 @@
+2025-03-30T12:13:48+00:00
+Running ./dl-op-linalg-softmax-exp-sum-div-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.08, 1.34, 5.79
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+--------------------------------------------------------------------------------
+Benchmark                                      Time             CPU   Iterations
+--------------------------------------------------------------------------------
+BM_SOFTMAXEXPSUMDIV_SCALAR                 0.006 ms        0.006 ms       124261
+BM_SOFTMAXEXPSUMDIV_AutoVectorization      0.004 ms        0.004 ms       182159
+-----------------------------------------------------------
+Correctness Verification:
+Transform case: [32mPASS[0m
+-----------------------------------------------------------
diff --git a/test_result/deeplearning/dl-op-matmul-transpose-b-benchmark.log b/test_result/deeplearning/dl-op-matmul-transpose-b-benchmark.log
new file mode 100644
index 00000000..ac6c4e30
--- /dev/null
+++ b/test_result/deeplearning/dl-op-matmul-transpose-b-benchmark.log
@@ -0,0 +1,21 @@
+2025-03-30T12:13:50+00:00
+Running ./dl-op-matmul-transpose-b-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.08, 1.34, 5.79
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-----------------------------------------------------------------------------------------------
+Benchmark                                                     Time             CPU   Iterations
+-----------------------------------------------------------------------------------------------
+DL_OPS_MATMUL_TRANSPOSE_B/scalar_O0/iterations:5           1046 ms         1044 ms            5
+DL_OPS_MATMUL_TRANSPOSE_B/scalar_O3/iterations:5            277 ms          277 ms            5
+DL_OPS_MATMUL_TRANSPOSE_B/scalar_O3_omp/iterations:5       32.4 ms         21.2 ms            5
+DL_OPS_MATMUL_TRANSPOSE_B/vec/iterations:5                 84.6 ms         84.6 ms            5
+[34m---------- Verification ----------[0m
+scalar_O3 [32mPASS[0m
+scalar_O3_omp [32mPASS[0m
+vec [32mPASS[0m
diff --git a/test_result/deeplearning/dl-op-tosa-transpose-benchmark.log b/test_result/deeplearning/dl-op-tosa-transpose-benchmark.log
new file mode 100644
index 00000000..aec2390a
--- /dev/null
+++ b/test_result/deeplearning/dl-op-tosa-transpose-benchmark.log
@@ -0,0 +1,17 @@
+2025-03-30T12:13:50+00:00
+Running ./dl-op-tosa-transpose-benchmark
+Run on (24 X 5100 MHz CPU s)
+CPU Caches:
+  L1 Data 48 KiB (x12)
+  L1 Instruction 32 KiB (x12)
+  L2 Unified 1280 KiB (x12)
+  L3 Unified 30720 KiB (x1)
+Load Average: 1.08, 1.34, 5.79
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+-------------------------------------------------------------------------------------
+Benchmark                                           Time             CPU   Iterations
+-------------------------------------------------------------------------------------
+DL_OPS_TRANSPOSE_2D/scalar_O0/iterations:5       25.4 ms         20.6 ms            5
+DL_OPS_TRANSPOSE_2D/scalar_O3/iterations:5       19.2 ms         14.2 ms            5
+[34m---------- Verification ----------[0m
+scalar_O3 [32mPASS[0m
diff --git a/test_result/deeplearning/run_results_summary.log b/test_result/deeplearning/run_results_summary.log
new file mode 100644
index 00000000..ce1a088d
--- /dev/null
+++ b/test_result/deeplearning/run_results_summary.log
@@ -0,0 +1,29 @@
+[Success] Run of 'dl-model-tinyllama-benchmark'
+[Success] Run of 'dl-model-mobilenetv3-benchmark'
+[Success] Run of 'dl-model-lenet-benchmark'
+[Missing] Executable not found for 'dl-model-bert-benchmark'
+[Success] Run of 'dl-model-whisper-benchmark'
+[Success] Run of 'dl-model-resnet18-benchmark'
+[Success] Run of 'dl-layer-ffn-benchmark'
+[Success] Run of 'dl-layer-selfattention-benchmark'
+[Success] Run of 'dl-layer-rmsnorm-benchmark'
+[Success] Run of 'dl-op-linalg-matmul-benchmark'
+[Success] Run of 'dl-op-linalg-conv2d-nchw-fchw-benchmark'
+[Success] Run of 'dl-op-linalg-conv2d-nhwc-hwcf-benchmark'
+[Success] Run of 'dl-op-linalg-conv2d-nhwc-fhwc-benchmark'
+[Success] Run of 'dl-op-linalg-depthwise-conv-2d-nhwc-hwc-benchmark'
+[Success] Run of 'dl-op-linalg-pooling-nhwc-sum-benchmark'
+[Success] Run of 'dl-op-linalg-batch-matmul-benchmark'
+[Success] Run of 'dl-op-linalg-arithaddf-benchmark'
+[Success] Run of 'dl-op-linalg-arithdivf-benchmark'
+[Success] Run of 'dl-op-linalg-arithmulf-benchmark'
+[Success] Run of 'dl-op-linalg-arithnegf-benchmark'
+[Success] Run of 'dl-op-linalg-arithsubf-benchmark'
+[Success] Run of 'dl-op-linalg-mathfpow-benchmark'
+[Success] Run of 'dl-op-linalg-mathrsqrt-benchmark'
+[Success] Run of 'dl-op-linalg-mathexp-benchmark'
+[Failed]  Run of 'dl-op-linalg-reduceaddf-benchmark'
+[Failed]  Run of 'dl-op-linalg-reducemaxf-benchmark'
+[Success] Run of 'dl-op-linalg-softmax-exp-sum-div-benchmark'
+[Success] Run of 'dl-op-tosa-transpose-benchmark'
+[Success] Run of 'dl-op-matmul-transpose-b-benchmark'
diff --git a/test_result/geminiprocessing/build.log b/test_result/geminiprocessing/build.log
new file mode 100644
index 00000000..8473f261
--- /dev/null
+++ b/test_result/geminiprocessing/build.log
@@ -0,0 +1,655 @@
+[1/21] Creating directories for 'project_googlebenchmark'
+[2/21] Building C object benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o
+FAILED: benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o 
+riscv64-unknown-linux-gnu-gcc  -I/home/buddy-complier-workspace/buddy-mlir/build/cmake/../../frontend/Interfaces -I/home/buddy-complier-workspace/buddy-mlir/build/cmake/../../thirdparty/include -I/home/buddy-complier-workspace/buddy-benchmark/benchmarks -I/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include -I/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/.. -I/home/xychen/buddy-mlir/frontend/Interfaces -O3 -DNDEBUG -MD -MT benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o -MF benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o.d -o benchmarks/Gemmini/Ops/MatMulOp/CMakeFiles/ExoMatMul.dir/ExoMatmul.c.o -c /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c: In function '_exo_matmul_4':
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:28:47: error: macro "gemmini_extended_config_ex" requires 7 arguments, but only 6 given
+   28 |   gemmini_extended_config_ex(WS, 0, 0, 1, 0, 0);
+      |                                               ^
+In file included from /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:23:
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:251: note: macro "gemmini_extended_config_ex" defined here
+  251 | #define gemmini_extended_config_ex(dataflow, sys_act, sys_shift, relu6_shift, A_stride, A_transpose, B_transpose) \
+      | 
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:28:3: error: 'gemmini_extended_config_ex' undeclared (first use in this function)
+   28 |   gemmini_extended_config_ex(WS, 0, 0, 1, 0, 0);
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:28:3: note: each undeclared identifier is reported only once for each function it appears in
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:35:18: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
+   35 |   int32_t *res = (int32_t*) ((uint32_t)gemm_acc_malloc (16 * 16 * 4 * 4 * sizeof(int32_t)));
+      |                  ^
+In file included from /home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:20,
+                 from /home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:23:
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:66:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   66 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:66:9: note: in expansion of macro 'gemmini_extended_preload'
+   66 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:66:119: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   66 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                       ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:66:9: note: in expansion of macro 'gemmini_extended_preload'
+   66 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:67:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   67 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:67:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   67 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:68:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   68 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:68:9: note: in expansion of macro 'gemmini_extended_preload'
+   68 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:68:125: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   68 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                             ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:68:9: note: in expansion of macro 'gemmini_extended_preload'
+   68 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:69:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   69 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:69:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   69 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:70:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   70 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:70:9: note: in expansion of macro 'gemmini_extended_preload'
+   70 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:70:133: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   70 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                     ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:70:9: note: in expansion of macro 'gemmini_extended_preload'
+   70 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:71:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   71 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:71:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   71 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:72:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   72 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:72:9: note: in expansion of macro 'gemmini_extended_preload'
+   72 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:72:133: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   72 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                     ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:72:9: note: in expansion of macro 'gemmini_extended_preload'
+   72 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:73:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   73 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:73:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   73 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:74:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   74 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:74:9: note: in expansion of macro 'gemmini_extended_preload'
+   74 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:74:126: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   74 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                              ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:74:9: note: in expansion of macro 'gemmini_extended_preload'
+   74 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:75:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   75 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:75:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   75 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:76:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   76 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:76:9: note: in expansion of macro 'gemmini_extended_preload'
+   76 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:76:132: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   76 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                    ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:76:9: note: in expansion of macro 'gemmini_extended_preload'
+   76 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:77:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   77 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:77:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   77 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:78:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   78 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:78:9: note: in expansion of macro 'gemmini_extended_preload'
+   78 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:78:140: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   78 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:78:9: note: in expansion of macro 'gemmini_extended_preload'
+   78 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:79:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   79 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:79:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   79 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:80:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   80 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:80:9: note: in expansion of macro 'gemmini_extended_preload'
+   80 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:80:140: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   80 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:80:9: note: in expansion of macro 'gemmini_extended_preload'
+   80 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + 1024 + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:81:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   81 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:81:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   81 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + 256)/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:82:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   82 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:82:9: note: in expansion of macro 'gemmini_extended_preload'
+   82 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:82:134: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   82 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                      ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:82:9: note: in expansion of macro 'gemmini_extended_preload'
+   82 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:83:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   83 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:83:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   83 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:84:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   84 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:84:9: note: in expansion of macro 'gemmini_extended_preload'
+   84 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:84:140: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   84 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:84:9: note: in expansion of macro 'gemmini_extended_preload'
+   84 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:85:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   85 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:85:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   85 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:86:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   86 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:86:9: note: in expansion of macro 'gemmini_extended_preload'
+   86 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:86:148: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   86 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                                    ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:86:9: note: in expansion of macro 'gemmini_extended_preload'
+   86 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:87:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   87 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:87:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   87 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:88:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   88 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:88:9: note: in expansion of macro 'gemmini_extended_preload'
+   88 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:88:148: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   88 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                                    ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:88:9: note: in expansion of macro 'gemmini_extended_preload'
+   88 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (2) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:89:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   89 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:89:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   89 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (2) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:90:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   90 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:90:9: note: in expansion of macro 'gemmini_extended_preload'
+   90 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:90:134: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   90 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                      ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:90:9: note: in expansion of macro 'gemmini_extended_preload'
+   90 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:91:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   91 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:91:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   91 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:92:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   92 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:92:9: note: in expansion of macro 'gemmini_extended_preload'
+   92 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:92:140: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   92 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:92:9: note: in expansion of macro 'gemmini_extended_preload'
+   92 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + 256)/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:93:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   93 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:93:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   93 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:94:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   94 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:94:9: note: in expansion of macro 'gemmini_extended_preload'
+   94 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:94:148: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   94 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                                    ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:94:9: note: in expansion of macro 'gemmini_extended_preload'
+   94 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (2) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:95:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   95 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:95:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   95 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:96:34: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   96 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                  ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:96:9: note: in expansion of macro 'gemmini_extended_preload'
+   96 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:96:148: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   96 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |                                                                                                                                                    ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:232:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  232 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:96:9: note: in expansion of macro 'gemmini_extended_preload'
+   96 |         gemmini_extended_preload((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)b)) + ((j) * (4096) + (3) * (1024) + (3) * (256))/16))), (uint32_t)(&*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16))) | 0x40000000, (16), (16), (16), (16));
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:97:44: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   97 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |                                            ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:15: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |               ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:219:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  219 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:97:9: note: in expansion of macro 'gemmini_extended_compute_preloaded'
+   97 |         gemmini_extended_compute_preloaded((uint32_t)(&*(int8_t*)((uint64_t)( ((uint32_t)((uint64_t)a)) + ((i) * (1024) + (3) * (256))/16))), ~((uint32_t)0), (16), (16), 16, 16);
+      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:98:89: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   98 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16)), (16), (16) );
+      |                                                                                         ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:212:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  212 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:98:9: note: in expansion of macro 'gemmini_extended_mvout'
+   98 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024))/16)), (16), (16) );
+      |         ^~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:99:94: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+   99 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 16 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16)), (16), (16) );
+      |                                                                                              ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:212:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  212 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:99:9: note: in expansion of macro 'gemmini_extended_mvout'
+   99 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 16 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + 256)/16)), (16), (16) );
+      |         ^~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:100:94: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+  100 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 32 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16)), (16), (16) );
+      |                                                                                              ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:212:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  212 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:100:9: note: in expansion of macro 'gemmini_extended_mvout'
+  100 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 32 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (2) * (256))/16)), (16), (16) );
+      |         ^~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:101:94: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+  101 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 48 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16)), (16), (16) );
+      |                                                                                              ^
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/../rocc-software/src/xcustom.h:152:25: note: in definition of macro 'ROCC_INSTRUCTION_0_R_R'
+  152 |         : "r"(rs1), "r"(rs2));                                                       \
+      |                         ^~~
+/home/buddy-complier-workspace/chipyard/generators/gemmini/software/gemmini-rocc-tests/include/gemmini.h:212:3: note: in expansion of macro 'ROCC_INSTRUCTION_RS1_RS2'
+  212 |   ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT)
+      |   ^~~~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:101:9: note: in expansion of macro 'gemmini_extended_mvout'
+  101 |         gemmini_extended_mvout( ((uint64_t) &C[(16 * i + 3136 * io) * (256) + 48 + 64 * j]), (uint32_t) &*(int32_t*)((uint64_t)( ((uint32_t)((uint64_t)res)) + ((j) * (1024) + (3) * (256))/16)), (16), (16) );
+      |         ^~~~~~~~~~~~~~~~~~~~~~
+/home/buddy-complier-workspace/buddy-benchmark/benchmarks/Gemmini/Ops/MatMulOp/ExoMatmul.c:105:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
+  105 |   gemm_acc_free((uint32_t)(res));
+      |                 ^
+[3/21] Generating buddy_matmul.o
+[4/21] Building CXX object benchmarks/Gemmini/ResNet-101/CMakeFiles/CRunnerUtils.dir/CRunnerUtils.cpp.o
+[5/21] Performing download step (git clone) for 'project_googlebenchmark'
+Cloning into 'project_googlebenchmark'...
+HEAD is now at f91b6b4 bump version to 1.6 in preparation for release
+[6/21] Generating resnet-101.o
+ninja: build stopped: subcommand failed.
diff --git a/test_result/geminiprocessing/cmake_configure.log b/test_result/geminiprocessing/cmake_configure.log
new file mode 100644
index 00000000..a3a42f37
--- /dev/null
+++ b/test_result/geminiprocessing/cmake_configure.log
@@ -0,0 +1,37 @@
+-- The CXX compiler identification is GNU 9.2.0
+-- The C compiler identification is GNU 9.2.0
+-- Detecting CXX compiler ABI info
+-- Detecting CXX compiler ABI info - done
+-- Check for working CXX compiler: /home/buddy-complier-workspace/chipyard/.conda-env/esp-tools/bin/riscv64-unknown-linux-gnu-g++ - skipped
+-- Detecting CXX compile features
+-- Detecting CXX compile features - done
+-- Detecting C compiler ABI info
+-- Detecting C compiler ABI info - done
+-- Check for working C compiler: /home/buddy-complier-workspace/chipyard/.conda-env/esp-tools/bin/riscv64-unknown-linux-gnu-gcc - skipped
+-- Detecting C compile features
+-- Detecting C compile features - done
+-- Configuring Target Architecture: avx512f
+-- Configuring Target Triple: x86_64-unknown-linux-gnu
+-- Configuring benchmarks: google
+-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
+-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
+-- Looking for pthread_create in pthreads
+-- Looking for pthread_create in pthreads - not found
+-- Looking for pthread_create in pthread
+-- Looking for pthread_create in pthread - found
+-- Found Threads: TRUE  
+-- Performing Test HAVE_SSE
+-- Performing Test HAVE_SSE - Failed
+-- 	SSE support - no
+-- Performing Test HAVE_AVX2
+-- Performing Test HAVE_AVX2 - Failed
+-- 	AVX2 support - no
+-- Performing Test HAVE_AVX512
+-- Performing Test HAVE_AVX512 - Failed
+-- 	AVX512 support - no
+-- Performing Test HAVE_NEON
+-- Performing Test HAVE_NEON - Failed
+-- 	Arm Neon support - no
+-- Configuring done
+-- Generating done
+-- Build files have been written to: /home/buddy-complier-workspace/buddy-benchmark/build