Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions build_tools/riscv/mips_matmul_test.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// mips_matmul_test.mlir
//
// End-to-end test inputs for the MIPS matmul kernel pipeline.
//
// Each function exercises torch.aten.mm, which is intercepted by
// ConvertTorchToMIPSPass and rewritten as mips.matmul. The op is then
// eliminated during One-Shot Bufferize: MIPSBufferizableOpInterface
// decomposes the 2-D memrefs and emits a direct func.call to the
// hand-tuned C kernel:
//
// torch.aten.mm
// → mips.matmul (ConvertTorchToMIPSPass)
// → flow.dispatch(...) (IREE dispatch formation)
// → func.call @my_matmul_kernel (MIPSBufferizableOpInterface)
// → ELF inside .vmfb (iree-compile LLVMCPU backend)
//
// Usage:
// bash build_tools/riscv/rvv_qemu_workflow_static.sh -- static (.o baked into vmfb)
// bash build_tools/riscv/rvv_qemu_workflow_dynamic.sh -- dynamic (.so plugin at runtime)

module {
  // ── Test 1: 4×4 identity × data → passthrough ────────────────────────────
  // With %lhs = identity(4×4) the product equals %rhs exactly; this is the
  // minimal correctness smoke-test for the kernel.
  //
  // lhs = I(4×4), rhs = [[1..4],[5..8],[9..12],[13..16]] → result = rhs
  func.func @matmul_4x4(%lhs : !torch.vtensor<[4,4],f32>,
                        %rhs : !torch.vtensor<[4,4],f32>)
      -> !torch.vtensor<[4,4],f32> {
    %prod = torch.aten.mm %lhs, %rhs
        : !torch.vtensor<[4,4],f32>, !torch.vtensor<[4,4],f32>
        -> !torch.vtensor<[4,4],f32>
    return %prod : !torch.vtensor<[4,4],f32>
  }

  // ── Test 2: 2×3 × 3×2 → 2×2 (non-square, reduced K dimension) ──────────
  // Exercises the M≠N≠K path through the kernel (inner-loop trip count is
  // smaller than the vector length).
  //
  // lhs = [[1,2,3],[4,5,6]], rhs = [[1,0],[0,1],[1,0]]
  // Expected: [[1+0+3, 0+2+0],[4+0+6, 0+5+0]] = [[4,2],[10,5]]
  func.func @matmul_2x3x2(%lhs : !torch.vtensor<[2,3],f32>,
                          %rhs : !torch.vtensor<[3,2],f32>)
      -> !torch.vtensor<[2,2],f32> {
    %prod = torch.aten.mm %lhs, %rhs
        : !torch.vtensor<[2,3],f32>, !torch.vtensor<[3,2],f32>
        -> !torch.vtensor<[2,2],f32>
    return %prod : !torch.vtensor<[2,2],f32>
  }

  // ── Test 3: 8×8 × 8×8 → 8×8 (exercises multi-vector-register tiling) ───
  // With vlen=512 and LMUL=m4, N=8 fits in a single VL group; this stresses
  // the vectorized inner loop and accumulation across K=8 steps.
  //
  // lhs = lower-triangular ones (row i has ones in columns 0..i):
  //   row 0: [1,0,0,0,0,0,0,0]
  //   row 1: [1,1,0,0,0,0,0,0]
  //   ...
  //   row 7: [1,1,1,1,1,1,1,1]
  // rhs = identity(8×8).
  // Expected: lhs · I = lhs — the same lower-triangular ones matrix.
  func.func @matmul_8x8(%lhs : !torch.vtensor<[8,8],f32>,
                        %rhs : !torch.vtensor<[8,8],f32>)
      -> !torch.vtensor<[8,8],f32> {
    %prod = torch.aten.mm %lhs, %rhs
        : !torch.vtensor<[8,8],f32>, !torch.vtensor<[8,8],f32>
        -> !torch.vtensor<[8,8],f32>
    return %prod : !torch.vtensor<[8,8],f32>
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// Expected outputs (iree-run-module)
// ─────────────────────────────────────────────────────────────────────────────
//
// matmul_4x4 A=identity(4x4), B=[1..16 row-major]:
// result[0]: 4x4xf32=[1 2 3 4][5 6 7 8][9 10 11 12][13 14 15 16]
//
// matmul_2x3x2 A=[[1,2,3],[4,5,6]], B=[[1,0],[0,1],[1,0]]:
// result[0]: 2x2xf32=[4 2][10 5]
//
// matmul_8x8 A=lower-triangular-ones(8x8), B=identity(8x8):
// result[0]: 8x8xf32=
// [1 0 0 0 0 0 0 0]
// [1 1 0 0 0 0 0 0]
// [1 1 1 0 0 0 0 0]
// [1 1 1 1 0 0 0 0]
// [1 1 1 1 1 0 0 0]
// [1 1 1 1 1 1 0 0]
// [1 1 1 1 1 1 1 0]
// [1 1 1 1 1 1 1 1]
//
// iree-run-module invocation for matmul_8x8:
// --function=matmul_8x8
// "--input=8x8xf32=1,0,0,0,0,0,0,0, 1,1,0,0,0,0,0,0, 1,1,1,0,0,0,0,0, 1,1,1,1,0,0,0,0, 1,1,1,1,1,0,0,0, 1,1,1,1,1,1,0,0, 1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1"
// "--input=8x8xf32=1,0,0,0,0,0,0,0, 0,1,0,0,0,0,0,0, 0,0,1,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,1"
206 changes: 206 additions & 0 deletions build_tools/riscv/rvv_qemu_workflow_dynamic.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/usr/bin/env bash
# rvv_qemu_workflow_dynamic.sh
#
# End-to-end MIPS matmul pipeline — DYNAMIC plugin loading.
#
# The RVV kernel is compiled into a shared library (.so) that is loaded at
# runtime via --executable_plugin. No custom linker wrapper is needed at
# iree-compile time.
#
# Pipeline:
# mips_matmul_test.mlir
# ─[iree-opt torch-to-iree{use-mips-matmul=true}]─► flow.mlir
# ─[clang --target=riscv64 -shared]──────────────► librvv_matmul.so
# ─[iree-compile --iree-llvmcpu-link-embedded=false]► matmul.vmfb
# ─[qemu-riscv64 iree-run-module --executable_plugin]► result
#
# Usage:
# bash rvv_qemu_workflow_dynamic.sh # RISC-V QEMU, vlen=512
# bash rvv_qemu_workflow_dynamic.sh --host # x86 host (scalar fallback)
# bash rvv_qemu_workflow_dynamic.sh --vlen 256 # QEMU with vlen=256

set -euo pipefail

# ─────────────────────────────────────────────────────────────────────────────
# Configuration — all paths are derived from the repo root and $HOME layout.
# ─────────────────────────────────────────────────────────────────────────────
IREE_SRC="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
WORK_DIR="${HOME}/MLIR_Work/mips"
HOST_BUILD="${WORK_DIR}/iree-build"
HOST_INSTALL="${HOST_BUILD}/install"
RISCV_BUILD="${WORK_DIR}/iree-build-riscv"
OUT_DIR="${WORK_DIR}/out/dynamic"

IREE_OPT="${HOST_INSTALL}/bin/iree-opt"
IREE_COMPILE="${HOST_INSTALL}/bin/iree-compile"
HOST_RUN="${HOST_INSTALL}/bin/iree-run-module"
RISCV_RUN="${RISCV_BUILD}/install/bin/iree-run-module"
QEMU="${HOME}/local/bin/qemu-riscv64"
SYSROOT="${HOME}/riscv/toolchain/clang/linux/RISCV/sysroot"

CLANG="${HOME}/miniforge3/bin/clang"
LLD="${HOME}/miniforge3/bin/ld.lld"
CLANG_INC="${HOME}/miniforge3/lib/clang/18/include"

KERNEL_SRC="${IREE_SRC}/runtime/src/iree/builtins/mips/matmul_kernel.c"
PLUGIN_SRC="${IREE_SRC}/runtime/src/iree/builtins/mips/matmul_plugin.c"
TEST_MLIR="${IREE_SRC}/build_tools/riscv/mips_matmul_test.mlir"

# Rocky 8's libstdc++ is too old; conda has GLIBCXX 3.4.29+.
export LD_LIBRARY_PATH="${HOME}/miniforge3/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# ─────────────────────────────────────────────────────────────────────────────
# Argument parsing
#   --host       build/run for the x86 host (scalar fallback) instead of QEMU
#   --vlen BITS  RVV vector length passed to QEMU's -cpu string (default 512)
# ─────────────────────────────────────────────────────────────────────────────
HOST_MODE=0
VLEN=512
while [[ $# -gt 0 ]]; do
  case "$1" in
    --host)
      HOST_MODE=1
      ;;
    --vlen)
      # Reject a trailing `--vlen` with no value: under `set -u` a bare
      # `shift; VLEN="$1"` would die with an opaque "unbound variable" error.
      [[ $# -ge 2 ]] || { echo "error: --vlen requires a value" >&2; exit 1; }
      shift
      VLEN="$1"
      ;;
    *)
      echo "Unknown arg: $1" >&2
      exit 1
      ;;
  esac
  shift
done

# Fail early on a non-numeric vector length rather than deep inside QEMU.
[[ "${VLEN}" =~ ^[0-9]+$ ]] || { echo "error: --vlen must be an integer, got '${VLEN}'" >&2; exit 1; }

mkdir -p "${OUT_DIR}"

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
# Print a blank line and a banner announcing the named workflow step.
section() {
  printf '\n══[ %s ]══════════════════════════════════════════════════\n' "$*"
}

# Print an indented success marker for an artifact or step.
ok() {
  printf ' [ok] %s\n' "$*"
}

# Run the RISC-V iree-run-module under qemu-riscv64.
#   $1   - RVV vector length in bits (vlen for the -cpu string)
#   $@   - remaining args forwarded to iree-run-module
run_qemu() {
  local vbits="$1"
  shift
  "${QEMU}" -cpu "rv64,v=true,vlen=${vbits},elen=64,vext_spec=v1.0" \
    -L "${SYSROOT}" "${RISCV_RUN}" "$@"
}

# ─────────────────────────────────────────────────────────────────────────────
# Step 1: torch → IREE flow IR
# ─────────────────────────────────────────────────────────────────────────────
section "Step 1: torch → IREE flow IR"

# Run the torch-to-iree pipeline with the MIPS matmul interception enabled
# (torch.aten.mm → mips.matmul, per the test file's header).
opt_args=(
  --pass-pipeline="builtin.module(torch-to-iree{use-mips-matmul=true})"
  "${TEST_MLIR}"
  -o "${OUT_DIR}/flow.mlir"
)
"${IREE_OPT}" "${opt_args[@]}"
ok "${OUT_DIR}/flow.mlir"

# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Cross-compile kernel + plugin → shared library
#
# matmul_kernel.c — compute logic (no IREE headers)
# matmul_plugin.c — IREE HAL executable plugin interface
# Both compiled together into a single -fPIC -shared .so.
# ─────────────────────────────────────────────────────────────────────────────
section "Step 2: Compile matmul_kernel.c + matmul_plugin.c → .so"

PLUGIN_SO="${OUT_DIR}/librvv_matmul.so"

# Flags shared by the host and cross builds: optimized position-independent
# shared object with the IREE runtime headers on the include path.
cc_common=(-O2 -fPIC -shared -I "${IREE_SRC}/runtime/src")

if [[ "${HOST_MODE}" == "1" ]]; then
  "${CLANG}" --target=x86_64-linux-gnu \
    "${cc_common[@]}" \
    "${KERNEL_SRC}" "${PLUGIN_SRC}" -o "${PLUGIN_SO}"
  ok "x86 scalar plugin: ${PLUGIN_SO}"
else
  # Cross build: freestanding (-nostdinc -nostdlib) against clang's own
  # headers, linked with lld.
  "${CLANG}" --target=riscv64-linux-gnu -march=rv64gcv -mabi=lp64d \
    "${cc_common[@]}" -nostdinc -nostdlib \
    -isystem "${CLANG_INC}" \
    -fuse-ld="${LLD}" \
    "${KERNEL_SRC}" "${PLUGIN_SRC}" -o "${PLUGIN_SO}"
  ok "RISC-V RVV plugin: ${PLUGIN_SO} ($(file -b "${PLUGIN_SO}" | cut -d, -f1))"
fi

# ─────────────────────────────────────────────────────────────────────────────
# Step 3: iree-compile → .vmfb (kernel resolved at runtime via plugin)
#
# --iree-llvmcpu-link-embedded=false — host-ABI shared object, not embedded ELF
# No --iree-mips-static-embedding — my_matmul_kernel is a HAL import entry
# ─────────────────────────────────────────────────────────────────────────────
section "Step 3: iree-compile → .vmfb (dynamic)"

if [[ "${HOST_MODE}" == "1" ]]; then
  RISCV_FLAGS=()
else
  RISCV_FLAGS=(
    "--iree-llvmcpu-target-triple=riscv64-linux-gnu"
    "--iree-llvmcpu-target-abi=lp64d"
    "--iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c,+zvl512b,+v"
    "--riscv-v-fixed-length-vector-lmul-max=8"
  )
fi

VMFB="${OUT_DIR}/matmul_dynamic.vmfb"
# ${RISCV_FLAGS[@]+...} guards the empty-array case (--host): on bash < 4.4,
# expanding an empty array under `set -u` aborts with "unbound variable".
"${IREE_COMPILE}" \
  --iree-hal-target-backends=llvm-cpu \
  --iree-llvmcpu-link-embedded=false \
  ${RISCV_FLAGS[@]+"${RISCV_FLAGS[@]}"} \
  "${OUT_DIR}/flow.mlir" -o "${VMFB}"
ok "${VMFB} ($(du -sh "${VMFB}" | cut -f1))"

# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Verify kernel appears as an import (unresolved symbol) in the vmfb
# ─────────────────────────────────────────────────────────────────────────────
section "Step 4: Verify dynamic import in vmfb"

# Locate the first embedded ELF image inside the vmfb (byte offset of the
# \x7fELF magic); empty if none is present.
ELF_OFFSET=$(grep -boa $'\x7fELF' "${VMFB}" 2>/dev/null | head -1 | cut -d: -f1 || true)
if [[ -n "${ELF_OFFSET}" ]]; then
  # tail -c +N is 1-based, hence the +1. This replaces `dd bs=1 skip=N`,
  # which copies one byte per read/write syscall pair and is very slow.
  tail -c +"$((ELF_OFFSET + 1))" "${VMFB}" > "${OUT_DIR}/dispatch.elf"
  # Check that the kernel symbol appears in the dispatch ELF as an import:
  # in the dynamic flow the kernel body lives in the .so, not the vmfb.
  python3 - "${OUT_DIR}/dispatch.elf" << 'PYEOF'
import sys
data = open(sys.argv[1], 'rb').read()
idx = data.find(b'my_matmul_kernel')
if idx != -1:
    print(f" [ok] 'my_matmul_kernel' at offset {idx} (import table entry — kernel lives in .so)")
else:
    print(" [warn] 'my_matmul_kernel' not found in dispatch ELF")
PYEOF
else
  echo " [warn] No ELF found in vmfb"
fi

# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Run (kernel loaded from .so via --executable_plugin)
# ─────────────────────────────────────────────────────────────────────────────
section "Step 5: Run (--executable_plugin=${PLUGIN_SO})"

# Shared iree-run-module invocation: matmul_4x4 with A = identity(4×4)
# and B = 1..16 row-major, so the expected result is B unchanged.
MATMUL_ARGS=(
  --module="${VMFB}"
  --executable_plugin="${PLUGIN_SO}"
  --function="matmul_4x4"
  "--input=4x4xf32=1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1"
  "--input=4x4xf32=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"
)

if [[ "${HOST_MODE}" != "1" ]]; then
  echo " Running under QEMU vlen=${VLEN}..."
  run_qemu "${VLEN}" "${MATMUL_ARGS[@]}"

  echo ""
  echo " VLEN sweep:"
  for sweep_vlen in 128 256 512; do
    printf " vlen=%-4s " "${sweep_vlen}:"
    run_qemu "${sweep_vlen}" "${MATMUL_ARGS[@]}" 2>&1 | grep "4x4xf32" || echo "(no output)"
  done
  echo " Note: vlen=128 may produce zeros — vmfb compiled with +zvl512b"
else
  echo " Running on x86 host (scalar fallback)..."
  "${HOST_RUN}" "${MATMUL_ARGS[@]}"
fi

echo ""
echo " Expected: 4x4xf32=[1 2 3 4][5 6 7 8][9 10 11 12][13 14 15 16]"

# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
# Closing banner listing the artifacts this workflow produced.
cat <<SUMMARY

════════════════════════════════════════════════════════════
 DONE — Dynamic plugin verified.
 Artifacts in ${OUT_DIR}/
 librvv_matmul.so — plugin loaded at runtime
 matmul_dynamic.vmfb — vmfb with HAL import (needs plugin)
════════════════════════════════════════════════════════════
SUMMARY
Loading