Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions build_tools/riscv/mips_matmul_test.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// mips_matmul_test.mlir
//
// End-to-end test inputs for the MIPS matmul kernel pipeline.
//
// Each function exercises torch.aten.mm, which is intercepted by
// ConvertTorchToMIPSPass and rewritten as mips.matmul. The op is then
// eliminated during One-Shot Bufferize: MIPSBufferizableOpInterface
// decomposes the 2-D memrefs and emits a direct func.call to the
// hand-tuned C kernel:
//
// torch.aten.mm
// → mips.matmul (ConvertTorchToMIPSPass)
// → flow.dispatch(...) (IREE dispatch formation)
// → func.call @my_matmul_kernel (MIPSBufferizableOpInterface)
// → ELF inside .vmfb (iree-compile LLVMCPU backend)
//
// Usage:
// bash build_tools/riscv/rvv_qemu_workflow_static.sh -- static (.o baked into vmfb)
// bash build_tools/riscv/rvv_qemu_workflow_dynamic.sh -- dynamic (.so plugin at runtime)

module {
  // ── Test 1: 4×4 identity × data → passthrough ────────────────────────────
  // With %lhs = identity(4×4) the product equals %rhs exactly; this is the
  // minimal correctness smoke-test for the kernel.
  //
  // lhs = I(4×4), rhs = [[1..4],[5..8],[9..12],[13..16]] → result = rhs
  func.func @matmul_4x4(%lhs : !torch.vtensor<[4,4],f32>,
                        %rhs : !torch.vtensor<[4,4],f32>)
      -> !torch.vtensor<[4,4],f32> {
    %prod = torch.aten.mm %lhs, %rhs
        : !torch.vtensor<[4,4],f32>, !torch.vtensor<[4,4],f32>
        -> !torch.vtensor<[4,4],f32>
    return %prod : !torch.vtensor<[4,4],f32>
  }

  // ── Test 2: 2×3 × 3×2 → 2×2 (non-square, reduced K dimension) ──────────
  // Exercises the M≠N≠K path through the kernel (inner-loop trip count is
  // smaller than the vector length).
  //
  // lhs = [[1,2,3],[4,5,6]], rhs = [[1,0],[0,1],[1,0]]
  // Expected: [[1+0+3, 0+2+0],[4+0+6, 0+5+0]] = [[4,2],[10,5]]
  func.func @matmul_2x3x2(%lhs : !torch.vtensor<[2,3],f32>,
                          %rhs : !torch.vtensor<[3,2],f32>)
      -> !torch.vtensor<[2,2],f32> {
    %prod = torch.aten.mm %lhs, %rhs
        : !torch.vtensor<[2,3],f32>, !torch.vtensor<[3,2],f32>
        -> !torch.vtensor<[2,2],f32>
    return %prod : !torch.vtensor<[2,2],f32>
  }

  // ── Test 3: 8×8 × 8×8 → 8×8 (exercises multi-vector-register tiling) ───
  // With vlen=512 and LMUL=m4, N=8 fits in a single VL group; this stresses
  // the vectorized inner loop and accumulation across K=8 steps.
  //
  // lhs = lower-triangular ones (row i has ones in columns 0..i):
  //   row 0: [1,0,0,0,0,0,0,0]
  //   row 1: [1,1,0,0,0,0,0,0]
  //   ...
  //   row 7: [1,1,1,1,1,1,1,1]
  // rhs = identity(8×8).
  // Expected: lhs · I = lhs — the same lower-triangular ones matrix.
  func.func @matmul_8x8(%lhs : !torch.vtensor<[8,8],f32>,
                        %rhs : !torch.vtensor<[8,8],f32>)
      -> !torch.vtensor<[8,8],f32> {
    %prod = torch.aten.mm %lhs, %rhs
        : !torch.vtensor<[8,8],f32>, !torch.vtensor<[8,8],f32>
        -> !torch.vtensor<[8,8],f32>
    return %prod : !torch.vtensor<[8,8],f32>
  }
}

// ─────────────────────────────────────────────────────────────────────────────
// Expected outputs (iree-run-module)
// ─────────────────────────────────────────────────────────────────────────────
//
// matmul_4x4 A=identity(4x4), B=[1..16 row-major]:
// result[0]: 4x4xf32=[1 2 3 4][5 6 7 8][9 10 11 12][13 14 15 16]
//
// matmul_2x3x2 A=[[1,2,3],[4,5,6]], B=[[1,0],[0,1],[1,0]]:
// result[0]: 2x2xf32=[4 2][10 5]
//
// matmul_8x8 A=lower-triangular-ones(8x8), B=identity(8x8):
// result[0]: 8x8xf32=
// [1 0 0 0 0 0 0 0]
// [1 1 0 0 0 0 0 0]
// [1 1 1 0 0 0 0 0]
// [1 1 1 1 0 0 0 0]
// [1 1 1 1 1 0 0 0]
// [1 1 1 1 1 1 0 0]
// [1 1 1 1 1 1 1 0]
// [1 1 1 1 1 1 1 1]
//
// iree-run-module invocation for matmul_8x8:
// --function=matmul_8x8
// "--input=8x8xf32=1,0,0,0,0,0,0,0, 1,1,0,0,0,0,0,0, 1,1,1,0,0,0,0,0, 1,1,1,1,0,0,0,0, 1,1,1,1,1,0,0,0, 1,1,1,1,1,1,0,0, 1,1,1,1,1,1,1,0, 1,1,1,1,1,1,1,1"
// "--input=8x8xf32=1,0,0,0,0,0,0,0, 0,1,0,0,0,0,0,0, 0,0,1,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,1"
206 changes: 206 additions & 0 deletions build_tools/riscv/rvv_qemu_workflow_dynamic.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/usr/bin/env bash
# rvv_qemu_workflow_dynamic.sh
#
# End-to-end MIPS matmul pipeline — DYNAMIC plugin loading.
#
# The RVV kernel is compiled into a shared library (.so) that is loaded at
# runtime via --executable_plugin. No custom linker wrapper is needed at
# iree-compile time.
#
# Pipeline:
# mips_matmul_test.mlir
# ─[iree-opt torch-to-iree{use-mips-matmul=true}]─► flow.mlir
# ─[clang --target=riscv64 -shared]──────────────► librvv_matmul.so
# ─[iree-compile --iree-llvmcpu-link-embedded=false]► matmul.vmfb
# ─[qemu-riscv64 iree-run-module --executable_plugin]► result
#
# Usage:
# bash rvv_qemu_workflow_dynamic.sh # RISC-V QEMU, vlen=512
# bash rvv_qemu_workflow_dynamic.sh --host # x86 host (scalar fallback)
# bash rvv_qemu_workflow_dynamic.sh --vlen 256 # QEMU with vlen=256

set -euo pipefail

# ─────────────────────────────────────────────────────────────────────────────
# Configuration — all paths are derived from the repo root and $HOME layout.
# ─────────────────────────────────────────────────────────────────────────────
IREE_SRC="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
WORK_DIR="${HOME}/MLIR_Work/mips"
HOST_BUILD="${WORK_DIR}/iree-build"
HOST_INSTALL="${HOST_BUILD}/install"
RISCV_BUILD="${WORK_DIR}/iree-build-riscv"
OUT_DIR="${WORK_DIR}/out/dynamic"

IREE_OPT="${HOST_INSTALL}/bin/iree-opt"
IREE_COMPILE="${HOST_INSTALL}/bin/iree-compile"
HOST_RUN="${HOST_INSTALL}/bin/iree-run-module"
RISCV_RUN="${RISCV_BUILD}/install/bin/iree-run-module"
QEMU="${HOME}/local/bin/qemu-riscv64"
SYSROOT="${HOME}/riscv/toolchain/clang/linux/RISCV/sysroot"

CLANG="${HOME}/miniforge3/bin/clang"
LLD="${HOME}/miniforge3/bin/ld.lld"
CLANG_INC="${HOME}/miniforge3/lib/clang/18/include"

KERNEL_SRC="${IREE_SRC}/runtime/src/iree/builtins/mips/matmul_kernel.c"
PLUGIN_SRC="${IREE_SRC}/runtime/src/iree/builtins/mips/matmul_plugin.c"
TEST_MLIR="${IREE_SRC}/build_tools/riscv/mips_matmul_test.mlir"

# Rocky 8's libstdc++ is too old; conda has GLIBCXX 3.4.29+.
export LD_LIBRARY_PATH="${HOME}/miniforge3/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# ─────────────────────────────────────────────────────────────────────────────
# Argument parsing
#   --host       build/run for the x86 host (scalar fallback) instead of QEMU
#   --vlen BITS  RVV vector length passed to QEMU's -cpu string (default 512)
# ─────────────────────────────────────────────────────────────────────────────
HOST_MODE=0
VLEN=512
while [[ $# -gt 0 ]]; do
  case "$1" in
    --host)
      HOST_MODE=1
      ;;
    --vlen)
      # Reject a trailing `--vlen` with no value: under `set -u` a bare
      # `shift; VLEN="$1"` would die with an opaque "unbound variable" error.
      [[ $# -ge 2 ]] || { echo "error: --vlen requires a value" >&2; exit 1; }
      shift
      VLEN="$1"
      ;;
    *)
      echo "Unknown arg: $1" >&2
      exit 1
      ;;
  esac
  shift
done

# Fail early on a non-numeric vector length rather than deep inside QEMU.
[[ "${VLEN}" =~ ^[0-9]+$ ]] || { echo "error: --vlen must be an integer, got '${VLEN}'" >&2; exit 1; }

mkdir -p "${OUT_DIR}"

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
# Print a blank line and a banner announcing the named workflow step.
section() {
  printf '\n══[ %s ]══════════════════════════════════════════════════\n' "$*"
}

# Print an indented success marker for an artifact or step.
ok() {
  printf ' [ok] %s\n' "$*"
}

# Run the RISC-V iree-run-module under qemu-riscv64.
#   $1   - RVV vector length in bits (vlen for the -cpu string)
#   $@   - remaining args forwarded to iree-run-module
run_qemu() {
  local vbits="$1"
  shift
  "${QEMU}" -cpu "rv64,v=true,vlen=${vbits},elen=64,vext_spec=v1.0" \
    -L "${SYSROOT}" "${RISCV_RUN}" "$@"
}

# ─────────────────────────────────────────────────────────────────────────────
# Step 1: torch → IREE flow IR
# ─────────────────────────────────────────────────────────────────────────────
section "Step 1: torch → IREE flow IR"

# Run the torch-to-iree pipeline with the MIPS matmul interception enabled
# (torch.aten.mm → mips.matmul, per the test file's header).
opt_args=(
  --pass-pipeline="builtin.module(torch-to-iree{use-mips-matmul=true})"
  "${TEST_MLIR}"
  -o "${OUT_DIR}/flow.mlir"
)
"${IREE_OPT}" "${opt_args[@]}"
ok "${OUT_DIR}/flow.mlir"

# ─────────────────────────────────────────────────────────────────────────────
# Step 2: Cross-compile kernel + plugin → shared library
#
# matmul_kernel.c — compute logic (no IREE headers)
# matmul_plugin.c — IREE HAL executable plugin interface
# Both compiled together into a single -fPIC -shared .so.
# ─────────────────────────────────────────────────────────────────────────────
section "Step 2: Compile matmul_kernel.c + matmul_plugin.c → .so"

PLUGIN_SO="${OUT_DIR}/librvv_matmul.so"

# Flags shared by the host and cross builds: optimized position-independent
# shared object with the IREE runtime headers on the include path.
cc_common=(-O2 -fPIC -shared -I "${IREE_SRC}/runtime/src")

if [[ "${HOST_MODE}" == "1" ]]; then
  "${CLANG}" --target=x86_64-linux-gnu \
    "${cc_common[@]}" \
    "${KERNEL_SRC}" "${PLUGIN_SRC}" -o "${PLUGIN_SO}"
  ok "x86 scalar plugin: ${PLUGIN_SO}"
else
  # Cross build: freestanding (-nostdinc -nostdlib) against clang's own
  # headers, linked with lld.
  "${CLANG}" --target=riscv64-linux-gnu -march=rv64gcv -mabi=lp64d \
    "${cc_common[@]}" -nostdinc -nostdlib \
    -isystem "${CLANG_INC}" \
    -fuse-ld="${LLD}" \
    "${KERNEL_SRC}" "${PLUGIN_SRC}" -o "${PLUGIN_SO}"
  ok "RISC-V RVV plugin: ${PLUGIN_SO} ($(file -b "${PLUGIN_SO}" | cut -d, -f1))"
fi

# ─────────────────────────────────────────────────────────────────────────────
# Step 3: iree-compile → .vmfb (kernel resolved at runtime via plugin)
#
# --iree-llvmcpu-link-embedded=false — host-ABI shared object, not embedded ELF
# No --iree-mips-static-embedding — my_matmul_kernel is a HAL import entry
# ─────────────────────────────────────────────────────────────────────────────
section "Step 3: iree-compile → .vmfb (dynamic)"

if [[ "${HOST_MODE}" == "1" ]]; then
  RISCV_FLAGS=()
else
  RISCV_FLAGS=(
    "--iree-llvmcpu-target-triple=riscv64-linux-gnu"
    "--iree-llvmcpu-target-abi=lp64d"
    "--iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c,+zvl512b,+v"
    "--riscv-v-fixed-length-vector-lmul-max=8"
  )
fi

VMFB="${OUT_DIR}/matmul_dynamic.vmfb"
# ${RISCV_FLAGS[@]+...} guards the empty-array case (--host): on bash < 4.4,
# expanding an empty array under `set -u` aborts with "unbound variable".
"${IREE_COMPILE}" \
  --iree-hal-target-backends=llvm-cpu \
  --iree-llvmcpu-link-embedded=false \
  ${RISCV_FLAGS[@]+"${RISCV_FLAGS[@]}"} \
  "${OUT_DIR}/flow.mlir" -o "${VMFB}"
ok "${VMFB} ($(du -sh "${VMFB}" | cut -f1))"

# ─────────────────────────────────────────────────────────────────────────────
# Step 4: Verify kernel appears as an import (unresolved symbol) in the vmfb
# ─────────────────────────────────────────────────────────────────────────────
section "Step 4: Verify dynamic import in vmfb"

# Locate the first embedded ELF image inside the vmfb (byte offset of the
# \x7fELF magic); empty if none is present.
ELF_OFFSET=$(grep -boa $'\x7fELF' "${VMFB}" 2>/dev/null | head -1 | cut -d: -f1 || true)
if [[ -n "${ELF_OFFSET}" ]]; then
  # tail -c +N is 1-based, hence the +1. This replaces `dd bs=1 skip=N`,
  # which copies one byte per read/write syscall pair and is very slow.
  tail -c +"$((ELF_OFFSET + 1))" "${VMFB}" > "${OUT_DIR}/dispatch.elf"
  # Check that the kernel symbol appears in the dispatch ELF as an import:
  # in the dynamic flow the kernel body lives in the .so, not the vmfb.
  python3 - "${OUT_DIR}/dispatch.elf" << 'PYEOF'
import sys
data = open(sys.argv[1], 'rb').read()
idx = data.find(b'my_matmul_kernel')
if idx != -1:
    print(f" [ok] 'my_matmul_kernel' at offset {idx} (import table entry — kernel lives in .so)")
else:
    print(" [warn] 'my_matmul_kernel' not found in dispatch ELF")
PYEOF
else
  echo " [warn] No ELF found in vmfb"
fi

# ─────────────────────────────────────────────────────────────────────────────
# Step 5: Run (kernel loaded from .so via --executable_plugin)
# ─────────────────────────────────────────────────────────────────────────────
section "Step 5: Run (--executable_plugin=${PLUGIN_SO})"

# Shared iree-run-module invocation: matmul_4x4 with A = identity(4×4)
# and B = 1..16 row-major, so the expected result is B unchanged.
MATMUL_ARGS=(
  --module="${VMFB}"
  --executable_plugin="${PLUGIN_SO}"
  --function="matmul_4x4"
  "--input=4x4xf32=1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1"
  "--input=4x4xf32=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16"
)

if [[ "${HOST_MODE}" != "1" ]]; then
  echo " Running under QEMU vlen=${VLEN}..."
  run_qemu "${VLEN}" "${MATMUL_ARGS[@]}"

  echo ""
  echo " VLEN sweep:"
  for sweep_vlen in 128 256 512; do
    printf " vlen=%-4s " "${sweep_vlen}:"
    run_qemu "${sweep_vlen}" "${MATMUL_ARGS[@]}" 2>&1 | grep "4x4xf32" || echo "(no output)"
  done
  echo " Note: vlen=128 may produce zeros — vmfb compiled with +zvl512b"
else
  echo " Running on x86 host (scalar fallback)..."
  "${HOST_RUN}" "${MATMUL_ARGS[@]}"
fi

echo ""
echo " Expected: 4x4xf32=[1 2 3 4][5 6 7 8][9 10 11 12][13 14 15 16]"

# ─────────────────────────────────────────────────────────────────────────────
# Summary
# ─────────────────────────────────────────────────────────────────────────────
# Closing banner listing the artifacts this workflow produced.
cat <<SUMMARY

════════════════════════════════════════════════════════════
 DONE — Dynamic plugin verified.
 Artifacts in ${OUT_DIR}/
 librvv_matmul.so — plugin loaded at runtime
 matmul_dynamic.vmfb — vmfb with HAL import (needs plugin)
════════════════════════════════════════════════════════════
SUMMARY
Loading