Merge branch 'main' into zhewen_channels
jtuyls authored Feb 15, 2025
2 parents e11585c + 13870fd commit a014cec
Showing 20 changed files with 561 additions and 272 deletions.
42 changes: 40 additions & 2 deletions README.md
@@ -37,9 +37,11 @@ git \
The above avoids cloning the entire history of each submodule, and skips a few currently unused
submodules that are nested in IREE.

### Dependencies
## Dependencies

#### For Linux
### For Linux

#### Driver

Build and install `xdna-driver` at commit `929e8ab`:

@@ -53,6 +55,42 @@ git submodule update --init --recursive

Follow the instructions to build and install the driver module: [xdna-driver](https://github.com/amd/xdna-driver/tree/929e8ab459cab5915631849b9f1ef9a4982d1c11).

#### LLVM-AIE (Peano)

You will need at least Peano/llvm-aie installed on your system to run e2e examples, as it is needed to compile AIE core code. For best performance (but slower compilation times), you will also need Chess.

To install llvm-aie in the current working directory:

```bash
bash <path-to-iree-amd-aie>/build_tools/download_peano.sh
```

Now, you should see a directory named `llvm-aie` in your current working directory.

After building IREE, you can then run e2e tests by passing `--peano_dir=<path-to-llvm-aie>` to the tests; see [Testing](#testing).
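
For example, a hypothetical invocation might look like the following. This is only a sketch: the actual entry point and arguments are documented under [Testing](#testing), and the runner path here is an assumption for illustration.

```bash
# Hypothetical sketch: <output-dir> and <iree-install-dir> are placeholders,
# and the runner path is assumed rather than taken from this README.
python3 build_tools/ci/cpu_comparison/run.py <output-dir> <iree-install-dir> \
    --peano_dir=$PWD/llvm-aie
```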

#### Chess

For best performance and to run all tests, you can install Chess in the following way:

1. Install Vitis™ AIE Essentials from [Ryzen AI Software 1.3 Early Access](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab).
```bash
tar -xzvf ryzen_ai_1.3.1-ea-lnx64-20250116.tgz
cd ryzen_ai_1.3.1-ea-lnx64-20250116
mkdir vitis_aie_essentials
mv vitis_aie_essentials*.whl vitis_aie_essentials
cd vitis_aie_essentials
unzip vitis_aie_essentials*.whl
```
2. Set up an AI Engine license.
1. Get a local license for AI Engine tools from [https://www.xilinx.com/getlicense](https://www.xilinx.com/getlicense).
2. Copy your license file (Xilinx.lic) to your preferred location, e.g. `/opt/Xilinx.lic`.

After building IREE, you can then run e2e tests by passing `--vitis_dir=<path-to-vitis-aie-essentials>` to the tests; see [Testing](#testing). Note, however, that you need to export the path to the AI Engine license for compilation to succeed:
```bash
export XILINXD_LICENSE_FILE=<path-to-Xilinx.lic>
```
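
Putting both together, a hypothetical Chess-enabled test run could look like this. The same caveats as the Peano sketch above apply; only `--vitis_dir`, `XILINXD_LICENSE_FILE`, and the example license location come from this README.

```bash
# The license must be visible to the compiler before running the tests.
export XILINXD_LICENSE_FILE=/opt/Xilinx.lic  # or wherever you copied Xilinx.lic
python3 build_tools/ci/cpu_comparison/run.py <output-dir> <iree-install-dir> \
    --vitis_dir=<path-to-vitis-aie-essentials>
```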

## Building (along with IREE)

### Just show me the CMake
@@ -0,0 +1,17 @@
// input ${M}x${K}x32x64x${TYPE1}
// input ${N}x${K}x64x32x${TYPE1}

func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64x${TYPE1}>, %arg1: tensor<${N}x${K}x64x32x${TYPE1}>) -> tensor<${N}x${M}x32x32x${TYPE2}> {
  %cst = arith.constant ${ZERO} : ${TYPE2}
  %0 = tensor.empty() : tensor<${N}x${M}x32x32x${TYPE2}>
  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N}x${M}x32x32x${TYPE2}>) -> tensor<${N}x${M}x32x32x${TYPE2}>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64x${TYPE1}>, tensor<${N}x${K}x64x32x${TYPE1}>) outs(%1 : tensor<${N}x${M}x32x32x${TYPE2}>) {
  ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}):
    %12 = ${EXT} %in : ${TYPE1} to ${TYPE2}
    %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2}
    %14 = ${MUL} %12, %13 : ${TYPE2}
    %15 = ${ADD} %out, %14 : ${TYPE2}
    linalg.yield %15 : ${TYPE2}
  } -> tensor<${N}x${M}x32x32x${TYPE2}>
  return %2 : tensor<${N}x${M}x32x32x${TYPE2}>
}
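
The indexing maps above can be read as C[n, m, i, j] += A[m, k, i, l] * B[n, k, l, j]. Here is a minimal NumPy sketch of the same contraction (not part of this commit; the outer dimensions are illustrative, with `i8`/`i32` standing in for `${TYPE1}`/`${TYPE2}`):

```python
import numpy as np

# Maps: A[d0,d2,d3,d5], B[d1,d2,d5,d4] -> C[d1,d0,d3,d4], i.e.
# C[n, m, i, j] = sum over k, l of A[m, k, i, l] * B[n, k, l, j].
M, N, K = 2, 3, 4  # illustrative outer dimensions
A = np.random.randint(-128, 128, size=(M, K, 32, 64)).astype(np.int8)
B = np.random.randint(-128, 128, size=(N, K, 64, 32)).astype(np.int8)
C = np.einsum("mkil,nklj->nmij", A.astype(np.int32), B.astype(np.int32))
assert C.shape == (N, M, 32, 32)
```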
14 changes: 12 additions & 2 deletions build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -1,6 +1,12 @@
import sys
import re
import os


def get_higher_order_element_type(element_type):
if element_type[0] in ["i", "f"]:
assert element_type[1:].isdigit(), f"support for {element_type} is missing"
bit_width = int(element_type[1:])
return f"{element_type[0]}{bit_width*2}"
assert False, f"support for {element_type} is missing"


def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
@@ -14,11 +20,15 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
replace["K"] = k
replace["TYPE1"] = lhs_rhs_type
replace["TYPE2"] = acc_type
# Only used for Matmul+Trunc via scaling.
replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)

replace["B"] = b # This is only used for batch matmul
acc_is_int = acc_type[0] == "i"
replace["ZERO"] = 0 if acc_is_int else 0.0
replace["ADD"] = "arith.addi" if acc_is_int else "arith.addf"
replace["MUL"] = "arith.muli" if acc_is_int else "arith.mulf"
replace["EXT"] = "arith.extsi" if acc_is_int else "arith.extf"

key_map = map(lambda s: "${" + s + "}", replace.keys())
key_map_escaped = map(re.escape, key_map)
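The behavior of the new `get_higher_order_element_type` helper follows directly from its definition: the bit width doubles and the `i`/`f` prefix is kept. A quick sanity check (not part of the commit):

```python
assert get_higher_order_element_type("i8") == "i16"
assert get_higher_order_element_type("i32") == "i64"
assert get_higher_order_element_type("f32") == "f64"
```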
@@ -0,0 +1,33 @@
// input ${M}x${K}x${TYPE1}
// input ${K}x${N}x${TYPE1}

// Matmul + Trunci variant with scaling.
// In an actual quantized model, simply truncating from a higher bitwidth to a
// lower bitwidth won't work; the result needs to be scaled.
// Since the output of the matmul here is an integer and cannot be multiplied
// by a floating-point scale factor, we represent the scale factor with an
// integer multiplier and a shift instead.
func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}>
{
%cst = arith.constant ${ZERO} : ${TYPE2}
%cst_mul = arith.constant 10 : ${TYPE_MUL_RESULT}
%cst_shift = arith.constant 7 : ${TYPE_MUL_RESULT}
%0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
%i8out = tensor.empty() : tensor<${M}x${N}x${TYPE1}>
%1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
%3 = linalg.generic {indexing_maps = [
affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0, d1)>
],
iterator_types = ["parallel", "parallel"]
} ins(%2 : tensor<${M}x${N}x${TYPE2}>) outs(%i8out : tensor<${M}x${N}x${TYPE1}>) {
^bb0(%in: ${TYPE2}, %out: ${TYPE1}):
%4 = arith.extsi %in : ${TYPE2} to ${TYPE_MUL_RESULT}
%5 = arith.muli %4, %cst_mul : ${TYPE_MUL_RESULT}
%6 = arith.shrsi %5, %cst_shift : ${TYPE_MUL_RESULT}
%7 = arith.trunci %6 : ${TYPE_MUL_RESULT} to ${TYPE1}
linalg.yield %7 : ${TYPE1}
} -> tensor<${M}x${N}x${TYPE1}>
return %3: tensor<${M}x${N}x${TYPE1}>
}
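As an aside (not part of this commit): the multiplier/shift pair above encodes the floating-point scale factor `mul / 2**shift` in pure integer arithmetic. A small Python sketch of the same pipeline:

```python
mul, shift = 10, 7             # %cst_mul and %cst_shift above
print(mul / 2**shift)          # 0.078125, the effective scale factor

acc = 1000                     # an example accumulator value (%in)
scaled = (acc * mul) >> shift  # muli followed by shrsi
print(scaled)                  # 78, which trunci then narrows to ${TYPE1}
```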