Merge branch 'main' into zhewen_channels
jtuyls authored Feb 15, 2025
2 parents e11585c + 13870fd commit a014cec
Showing 20 changed files with 561 additions and 272 deletions.
42 changes: 40 additions & 2 deletions README.md
@@ -37,9 +37,11 @@ git \
The above avoids cloning the entire history of each submodule, and skips a few currently unused
submodules that are nested in IREE.

### Dependencies
## Dependencies

#### For Linux
### For Linux

#### Driver

Build and install `xdna-driver` at commit `929e8ab`:

@@ -53,6 +55,42 @@ git submodule update --init --recursive

Follow the instructions to build and install the driver module: [xdna-driver](https://github.com/amd/xdna-driver/tree/929e8ab459cab5915631849b9f1ef9a4982d1c11).

#### LLVM-AIE (Peano)

You will need at least Peano/llvm-aie installed on your system to run e2e examples, as it is needed to compile AIE core code. For best performance (but slower compilation times), you will also need Chess.

To install llvm-aie in the current working directory:

```bash
bash <path-to-iree-amd-aie>/build_tools/download_peano.sh
```

Now, you should see a directory named `llvm-aie` in your current working directory.

After building IREE, you can then run e2e tests by passing `--peano_dir=<path-to-llvm-aie>` to the tests; see [Testing](#testing).
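
For example, a hypothetical invocation might look like the following. This is only a sketch: the actual entry point and arguments are documented under [Testing](#testing), and the runner path here is an assumption for illustration.

```bash
# Hypothetical sketch: <output-dir> and <iree-install-dir> are placeholders,
# and the runner path is assumed rather than taken from this README.
python3 build_tools/ci/cpu_comparison/run.py <output-dir> <iree-install-dir> \
    --peano_dir=$PWD/llvm-aie
```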

#### Chess

For best performance and to run all tests, you can install Chess in the following way:

1. Install Vitis™ AIE Essentials from [Ryzen AI Software 1.3 Early Access](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab).
```bash
tar -xzvf ryzen_ai_1.3.1-ea-lnx64-20250116.tgz
cd ryzen_ai_1.3.1-ea-lnx64-20250116
mkdir vitis_aie_essentials
mv vitis_aie_essentials*.whl vitis_aie_essentials
cd vitis_aie_essentials
unzip vitis_aie_essentials*.whl
```
2. Set up an AI Engine license.
1. Get a local license for AI Engine tools from [https://www.xilinx.com/getlicense](https://www.xilinx.com/getlicense).
2. Copy your license file (Xilinx.lic) to your preferred location, e.g. `/opt/Xilinx.lic`.

After building IREE, you can then run e2e tests by passing `--vitis_dir=<path-to-vitis-aie-essentials>` to the tests; see [Testing](#testing). Note, however, that you need to export the path to the AI Engine license for compilation to succeed:
```bash
export XILINXD_LICENSE_FILE=<path-to-Xilinx.lic>
```
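
Putting both together, a hypothetical Chess-enabled test run could look like this. The same caveats as the Peano sketch above apply; only `--vitis_dir`, `XILINXD_LICENSE_FILE`, and the example license location come from this README.

```bash
# The license must be visible to the compiler before running the tests.
export XILINXD_LICENSE_FILE=/opt/Xilinx.lic  # or wherever you copied Xilinx.lic
python3 build_tools/ci/cpu_comparison/run.py <output-dir> <iree-install-dir> \
    --vitis_dir=<path-to-vitis-aie-essentials>
```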

## Building (along with IREE)

### Just show me the CMake
@@ -0,0 +1,17 @@
// input ${M}x${K}x32x64x${TYPE1}
// input ${N}x${K}x64x32x${TYPE1}

func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64x${TYPE1}>, %arg1: tensor<${N}x${K}x64x32x${TYPE1}>) -> tensor<${N}x${M}x32x32x${TYPE2}> {
  %cst = arith.constant ${ZERO} : ${TYPE2}
  %0 = tensor.empty() : tensor<${N}x${M}x32x32x${TYPE2}>
  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N}x${M}x32x32x${TYPE2}>) -> tensor<${N}x${M}x32x32x${TYPE2}>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64x${TYPE1}>, tensor<${N}x${K}x64x32x${TYPE1}>) outs(%1 : tensor<${N}x${M}x32x32x${TYPE2}>) {
  ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}):
    %12 = ${EXT} %in : ${TYPE1} to ${TYPE2}
    %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2}
    %14 = ${MUL} %12, %13 : ${TYPE2}
    %15 = ${ADD} %out, %14 : ${TYPE2}
    linalg.yield %15 : ${TYPE2}
  } -> tensor<${N}x${M}x32x32x${TYPE2}>
  return %2 : tensor<${N}x${M}x32x32x${TYPE2}>
}
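
The indexing maps above can be read as C[n, m, i, j] += A[m, k, i, l] * B[n, k, l, j]. Here is a minimal NumPy sketch of the same contraction (not part of this commit; the outer dimensions are illustrative, with `i8`/`i32` standing in for `${TYPE1}`/`${TYPE2}`):

```python
import numpy as np

# Maps: A[d0,d2,d3,d5], B[d1,d2,d5,d4] -> C[d1,d0,d3,d4], i.e.
# C[n, m, i, j] = sum over k, l of A[m, k, i, l] * B[n, k, l, j].
M, N, K = 2, 3, 4  # illustrative outer dimensions
A = np.random.randint(-128, 128, size=(M, K, 32, 64)).astype(np.int8)
B = np.random.randint(-128, 128, size=(N, K, 64, 32)).astype(np.int8)
C = np.einsum("mkil,nklj->nmij", A.astype(np.int32), B.astype(np.int32))
assert C.shape == (N, M, 32, 32)
```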
14 changes: 12 additions & 2 deletions build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -1,6 +1,12 @@
import sys
import re
import os


def get_higher_order_element_type(element_type):
if element_type[0] in ["i", "f"]:
assert element_type[1:].isdigit(), f"support for {element_type} is missing"
bit_width = int(element_type[1:])
return f"{element_type[0]}{bit_width*2}"
assert False, f"support for {element_type} is missing"


def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
@@ -14,11 +20,15 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
replace["K"] = k
replace["TYPE1"] = lhs_rhs_type
replace["TYPE2"] = acc_type
# Only used for Matmul+Trunc via scaling.
replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)

replace["B"] = b # This is only used for batch matmul
acc_is_int = acc_type[0] == "i"
replace["ZERO"] = 0 if acc_is_int else 0.0
replace["ADD"] = "arith.addi" if acc_is_int else "arith.addf"
replace["MUL"] = "arith.muli" if acc_is_int else "arith.mulf"
replace["EXT"] = "arith.extsi" if acc_is_int else "arith.extf"

key_map = map(lambda s: "${" + s + "}", replace.keys())
key_map_escaped = map(re.escape, key_map)
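The behavior of the new `get_higher_order_element_type` helper follows directly from its definition: the bit width doubles and the `i`/`f` prefix is kept. A quick sanity check (not part of the commit):

```python
assert get_higher_order_element_type("i8") == "i16"
assert get_higher_order_element_type("i32") == "i64"
assert get_higher_order_element_type("f32") == "f64"
```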
@@ -0,0 +1,33 @@
// input ${M}x${K}x${TYPE1}
// input ${K}x${N}x${TYPE1}

// Matmul + Trunci variant with scaling.
// In an actual quantized model, simply truncating from a higher bitwidth to a
// lower bitwidth won't work; the result needs to be scaled.
// Since the output of the matmul here is an integer and cannot be multiplied
// by a floating-point scale factor, we represent the scale factor with an
// integer multiplier and a shift instead.
func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}>
{
%cst = arith.constant ${ZERO} : ${TYPE2}
%cst_mul = arith.constant 10 : ${TYPE_MUL_RESULT}
%cst_shift = arith.constant 7 : ${TYPE_MUL_RESULT}
%0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
%i8out = tensor.empty() : tensor<${M}x${N}x${TYPE1}>
%1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
%3 = linalg.generic {indexing_maps = [
affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0, d1)>
],
iterator_types = ["parallel", "parallel"]
} ins(%2 : tensor<${M}x${N}x${TYPE2}>) outs(%i8out : tensor<${M}x${N}x${TYPE1}>) {
^bb0(%in: ${TYPE2}, %out: ${TYPE1}):
%4 = arith.extsi %in : ${TYPE2} to ${TYPE_MUL_RESULT}
%5 = arith.muli %4, %cst_mul : ${TYPE_MUL_RESULT}
%6 = arith.shrsi %5, %cst_shift : ${TYPE_MUL_RESULT}
%7 = arith.trunci %6 : ${TYPE_MUL_RESULT} to ${TYPE1}
linalg.yield %7 : ${TYPE1}
} -> tensor<${M}x${N}x${TYPE1}>
return %3: tensor<${M}x${N}x${TYPE1}>
}
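As an aside (not part of this commit): the multiplier/shift pair above encodes the floating-point scale factor `mul / 2**shift` in pure integer arithmetic. A small Python sketch of the same pipeline:

```python
mul, shift = 10, 7             # %cst_mul and %cst_shift above
print(mul / 2**shift)          # 0.078125, the effective scale factor

acc = 1000                     # an example accumulator value (%in)
scaled = (acc * mul) >> shift  # muli followed by shrsi
print(scaled)                  # 78, which trunci then narrows to ${TYPE1}
```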