[e2e CI] Add e2e CI tests for Matmul+Trunci with scaling (nod-ai#1099)
-- This commit adds e2e CI tests for Matmul+Trunci with scaling.
-- In an actual quantized model, truncating from a higher bitwidth to a
   lower-precision bitwidth won't work on its own; we need to scale.
-- Since the output of the Matmul here is an integer and cannot be
   multiplied by a floating-point scale factor, we represent the scale
   factor with an integer multiplier and a right shift instead.
-- E.g.: a float scale factor of ~0.333 becomes "multiply by 357913941,
   then shift right by 30" (357913941 / 2^30 ≈ 0.333).
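
To make the fixed-point trick concrete, here is a minimal Python sketch
(the helper names are hypothetical, not part of this commit):

def to_fixed_point(scale, shift=30):
    # Approximate `scale` as multiplier / 2**shift.
    multiplier = round(scale * (1 << shift))
    return multiplier, shift

def apply_scale(acc, multiplier, shift):
    # Integer-only rescale: (acc * multiplier) >> shift.
    return (acc * multiplier) >> shift

multiplier, shift = to_fixed_point(1.0 / 3.0)  # -> (357913941, 30)
print(apply_scale(768, multiplier, shift))     # -> 255, i.e. ~768 * 0.333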

Signed-off-by: Abhishek Varma <[email protected]>
Abhishek-Varma authored Feb 14, 2025
1 parent e052258 commit 683406d
Showing 3 changed files with 119 additions and 0 deletions.
10 changes: 10 additions & 0 deletions build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -3,6 +3,14 @@
import os


def get_higher_order_element_type(element_type):
    # Given an element type like "i8" or "f32", return the type with twice
    # the bit width ("i16", "f64"), used to hold multiplication results.
    if element_type[0] in ["i", "f"]:
        assert element_type[1:].isdigit(), f"support for {element_type} is missing"
        bit_width = int(element_type[1:])
        return f"{element_type[0]}{bit_width * 2}"
    assert False, f"support for {element_type} is missing"
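
A quick illustration of the helper's behavior (an editorial example, not
part of the diff itself):

get_higher_order_element_type("i8")   # -> "i16"
get_higher_order_element_type("i32")  # -> "i64"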


def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
"""
Generate mlir file (output_fn) from the template file (input_fn).
@@ -14,6 +22,8 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
replace["K"] = k
replace["TYPE1"] = lhs_rhs_type
replace["TYPE2"] = acc_type
# Only used for Matmul+Trunc via scaling.
replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)

replace["B"] = b # This is only used for batch matmul
acc_is_int = acc_type[0] == "i"
33 changes: 33 additions & 0 deletions build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
@@ -0,0 +1,33 @@
// input ${M}x${K}x${TYPE1}
// input ${K}x${N}x${TYPE1}

// Matmul + Trunci variant with scaling.
// In an actual quantized model, truncating from a higher bitwidth to a lower-precision
// bitwidth won't work on its own; we need to scale.
// Since the output of the Matmul here is an integer and cannot be multiplied by a
// floating-point scale factor, we represent the scale factor with an integer multiplier
// and a shift instead.
func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}>
{
  %cst = arith.constant ${ZERO} : ${TYPE2}
  %cst_mul = arith.constant 10 : ${TYPE_MUL_RESULT}
  %cst_shift = arith.constant 7 : ${TYPE_MUL_RESULT}
  %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
  %i8out = tensor.empty() : tensor<${M}x${N}x${TYPE1}>
  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
  %2 = linalg.matmul ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
                     outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
  %3 = linalg.generic {indexing_maps = [
          affine_map<(d0, d1) -> (d0, d1)>,
          affine_map<(d0, d1) -> (d0, d1)>
        ],
        iterator_types = ["parallel", "parallel"]
      } ins(%2 : tensor<${M}x${N}x${TYPE2}>) outs(%i8out : tensor<${M}x${N}x${TYPE1}>) {
  ^bb0(%in: ${TYPE2}, %out: ${TYPE1}):
    %4 = arith.extsi %in : ${TYPE2} to ${TYPE_MUL_RESULT}
    %5 = arith.muli %4, %cst_mul : ${TYPE_MUL_RESULT}
    %6 = arith.shrsi %5, %cst_shift : ${TYPE_MUL_RESULT}
    %7 = arith.trunci %6 : ${TYPE_MUL_RESULT} to ${TYPE1}
    linalg.yield %7 : ${TYPE1}
  } -> tensor<${M}x${N}x${TYPE1}>
  return %3: tensor<${M}x${N}x${TYPE1}>
}
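
As a sanity check of the constants this template hard-codes (multiply by 10,
shift right by 7) against the CI tests registered below: with the LHS filled
with 2s, the RHS filled with 3s, and K = 128, every accumulator entry is
2 * 3 * 128 = 768, and (768 * 10) >> 7 = 60, matching the expected output of
60s. The same computation in NumPy (a sketch, not part of the diff):

import numpy as np

lhs = 2 * np.ones([256, 128], dtype=np.int8)
rhs = 3 * np.ones([128, 256], dtype=np.int8)
acc = lhs.astype(np.int32) @ rhs.astype(np.int32)  # linalg.matmul: every entry is 768
scaled = (acc.astype(np.int64) * 10) >> 7          # extsi + muli + shrsi
out = scaled.astype(np.int8)                       # trunci: every entry is 60
assert (out == 60).all()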
76 changes: 76 additions & 0 deletions build_tools/ci/cpu_comparison/run.py
@@ -692,6 +692,7 @@ def __init__(
        rhs,
        expected_out,
        test_params=None,
        use_scaling=False,
    ):
        super().__init__(
            name=f"matmul_trunci_{M}_{N}_{K}_{input_type}_{acc_type}",
@@ -712,10 +713,13 @@ def __init__(
        self.lhs = lhs
        self.rhs = rhs
        self.expected_out = expected_out
        self.use_scaling = use_scaling

    def _execute(self, config):
        matmul_template_dir = config.file_dir / "matmul_template"
        template_name = matmul_template_dir / "matmul_trunci_MxK_KxN.mlir"
        if self.use_scaling:
            template_name = matmul_template_dir / "matmul_trunci_scaling_MxK_KxN.mlir"
        self.generate(config, template_name)
        filename = self.get_filename(config)
        input_args = generate_inputs(
@@ -1589,6 +1593,78 @@ def __init__(self):
                ),
            )
        )

        # Tests Matmul + Trunci with Scaling.
        # Phoenix : Ukernel + Peano.
        self.register(
            MatmulTrunci(
                256,
                256,
                128,
                "i8",
                "i32",
                2 * np.ones([256, 128], dtype=np.int8),
                3 * np.ones([128, 256], dtype=np.int8),
                60 * np.ones([256, 256], dtype=np.int8),
                test_params=TestParams(
                    name_suffix="scaling",
                    tile_pipeline="pack-peel-4-level-tiling",
                    run_on_target=["npu1_4col"],
                    aie_compilation_flags=[
                        "--iree-amdaie-num-rows=4",
                        "--iree-amdaie-num-cols=4",
                    ],
                    use_ukernel=True,
                ),
                use_scaling=True,
            )
        )
        # Phoenix : Vectorization + Peano.
        self.register(
            MatmulTrunci(
                256,
                256,
                128,
                "i8",
                "i32",
                2 * np.ones([256, 128], dtype=np.int8),
                3 * np.ones([128, 256], dtype=np.int8),
                60 * np.ones([256, 256], dtype=np.int8),
                test_params=TestParams(
                    tile_pipeline="pack-peel-4-level-tiling",
                    run_on_target=["npu1_4col"],
                    aie_compilation_flags=[
                        "--iree-amdaie-num-rows=4",
                        "--iree-amdaie-num-cols=4",
                    ],
                ),
                use_scaling=True,
            )
        )
        # Strix : Ukernel + Chess.
        self.register(
            MatmulTrunci(
                256,
                256,
                128,
                "i8",
                "i32",
                2 * np.ones([256, 128], dtype=np.int8),
                3 * np.ones([128, 256], dtype=np.int8),
                60 * np.ones([256, 256], dtype=np.int8),
                test_params=TestParams(
                    tile_pipeline="pack-peel-4-level-tiling",
                    run_on_target=["npu4"],
                    aie_compilation_flags=[
                        "--iree-amdaie-num-rows=4",
                        "--iree-amdaie-num-cols=8",
                    ],
                    use_chess=True,
                    use_ukernel=True,
                ),
                use_scaling=True,
            )
        )
        # Matmul with truncf test(s):
        for tile_pipeline in ["pack-peel", "pack-peel-4-level-tiling"]:
            self.register(
