From 683406d7dabbe5d5e798c6995e0aefc539cc2063 Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Fri, 14 Feb 2025 12:39:18 +0530
Subject: [PATCH] [e2e CI] Add e2e CI tests for Matmul+Trunci with scaling
 (#1099)

-- This commit adds e2e CI tests for Matmul+Trunci with scaling.
-- In an actual quantized model, simply truncating from a higher bitwidth to a
   lower-precision bitwidth does not work; the accumulator must first be
   rescaled.
-- Since the output of the Matmul here is an integer, it cannot be multiplied
   by a floating-point scale factor; we therefore represent the scale factor
   with an integer multiplier and a right shift instead.
-- E.g.: a float scale factor of 0.333... (1/3) becomes a multiply by
   357913941 followed by a right shift by 30.

Signed-off-by: Abhishek Varma
---
 .../matmul_template/matmul_generator.py       | 10 +++
 .../matmul_trunci_scaling_MxK_KxN.mlir        | 33 ++++++++
 build_tools/ci/cpu_comparison/run.py          | 76 +++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir

diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
index cf6013214..adc42cfb9 100644
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -3,6 +3,14 @@
 import os
 
 
+def get_higher_order_element_type(element_type):
+    # Double the bit width of an integer/float element type, e.g. "i32" -> "i64".
+    if element_type[0] in ["i", "f"]:
+        assert element_type[1:].isdigit(), f"support for {element_type} is missing"
+        bit_width = int(element_type[1:])
+        return f"{element_type[0]}{bit_width*2}"
+    assert False, f"support for {element_type} is missing"
+
+
 def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
     """
     Generate mlir file (output_fn) from the template file (input_fn).
@@ -14,6 +22,8 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
     replace["K"] = k
     replace["TYPE1"] = lhs_rhs_type
     replace["TYPE2"] = acc_type
+    # Only used for Matmul+Trunci via scaling.
+    replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)
     replace["B"] = b  # This is only used for batch matmul
     acc_is_int = acc_type[0] == "i"
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
new file mode 100644
index 000000000..5ed4a849b
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
@@ -0,0 +1,33 @@
+// input ${M}x${K}x${TYPE1}
+// input ${K}x${N}x${TYPE1}
+
+// Matmul + Trunci variant with scaling.
+// In an actual quantized model, simply truncating from a higher bitwidth to a
+// lower-precision bitwidth does not work; the accumulator must first be rescaled.
+// Since the output of the Matmul here is an integer, it cannot be multiplied by
+// a floating-point scale factor; we represent the scale factor with an integer
+// multiplier and a right shift instead.
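+//
+// Here the scale factor is multiplier / 2^shift: with multiplier 10 and shift 7
+// (the constants below), the effective scale is 10 / 128 = 0.078125.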
+func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}>
+{
+  %cst = arith.constant ${ZERO} : ${TYPE2}
+  %cst_mul = arith.constant 10 : ${TYPE_MUL_RESULT}
+  %cst_shift = arith.constant 7 : ${TYPE_MUL_RESULT}
+  %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
+  %i8out = tensor.empty() : tensor<${M}x${N}x${TYPE1}>
+  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
+  %2 = linalg.matmul ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
+                     outs(%1 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
+  %3 = linalg.generic {indexing_maps = [
+                         affine_map<(d0, d1) -> (d0, d1)>,
+                         affine_map<(d0, d1) -> (d0, d1)>
+                       ],
+                       iterator_types = ["parallel", "parallel"]
+                      } ins(%2 : tensor<${M}x${N}x${TYPE2}>) outs(%i8out : tensor<${M}x${N}x${TYPE1}>) {
+    ^bb0(%in: ${TYPE2}, %out: ${TYPE1}):
+      %4 = arith.extsi %in : ${TYPE2} to ${TYPE_MUL_RESULT}
+      %5 = arith.muli %4, %cst_mul : ${TYPE_MUL_RESULT}
+      %6 = arith.shrsi %5, %cst_shift : ${TYPE_MUL_RESULT}
+      %7 = arith.trunci %6 : ${TYPE_MUL_RESULT} to ${TYPE1}
+      linalg.yield %7 : ${TYPE1}
+  } -> tensor<${M}x${N}x${TYPE1}>
+  return %3 : tensor<${M}x${N}x${TYPE1}>
+}
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 842515f84..3b8c585ef 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -692,6 +692,7 @@ def __init__(
         rhs,
         expected_out,
         test_params=None,
+        use_scaling=False,
     ):
         super().__init__(
             name=f"matmul_trunci_{M}_{N}_{K}_{input_type}_{acc_type}",
@@ -712,10 +713,13 @@
         self.lhs = lhs
         self.rhs = rhs
         self.expected_out = expected_out
+        self.use_scaling = use_scaling
 
     def _execute(self, config):
         matmul_template_dir = config.file_dir / "matmul_template"
         template_name = matmul_template_dir / "matmul_trunci_MxK_KxN.mlir"
+        if self.use_scaling:
+            template_name = matmul_template_dir / "matmul_trunci_scaling_MxK_KxN.mlir"
         self.generate(config, template_name)
         filename = self.get_filename(config)
         input_args = generate_inputs(
@@ -1589,6 +1593,78 @@ def __init__(self):
                 ),
             )
         )
+
+        # Tests Matmul + Trunci with Scaling.
+        # Phoenix : Ukernel + Peano.
+        self.register(
+            MatmulTrunci(
+                256,
+                256,
+                128,
+                "i8",
+                "i32",
+                2 * np.ones([256, 128], dtype=np.int8),
+                3 * np.ones([128, 256], dtype=np.int8),
+                60 * np.ones([256, 256], dtype=np.int8),
+                test_params=TestParams(
+                    name_suffix="scaling",
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu1_4col"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=4",
+                    ],
+                    use_ukernel=True,
+                ),
+                use_scaling=True,
+            )
+        )
+        # Phoenix : Vectorization + Peano.
+        self.register(
+            MatmulTrunci(
+                256,
+                256,
+                128,
+                "i8",
+                "i32",
+                2 * np.ones([256, 128], dtype=np.int8),
+                3 * np.ones([128, 256], dtype=np.int8),
+                60 * np.ones([256, 256], dtype=np.int8),
+                test_params=TestParams(
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu1_4col"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=4",
+                    ],
+                ),
+                use_scaling=True,
+            )
+        )
+        # Strix : Ukernel + Chess.
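+        # Note: in each scaling test registered here, the lhs elements are 2,
+        # the rhs elements are 3 and K = 128, so every i32 accumulator value is
+        # 2 * 3 * 128 = 768; the scaling generic then computes
+        # (768 * 10) >> 7 = 60, which is the expected i8 output.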
+        self.register(
+            MatmulTrunci(
+                256,
+                256,
+                128,
+                "i8",
+                "i32",
+                2 * np.ones([256, 128], dtype=np.int8),
+                3 * np.ones([128, 256], dtype=np.int8),
+                60 * np.ones([256, 256], dtype=np.int8),
+                test_params=TestParams(
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu4"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=8",
+                    ],
+                    use_chess=True,
+                    use_ukernel=True,
+                ),
+                use_scaling=True,
+            )
+        )
         # Matmul with truncf test(s):
         for tile_pipeline in ["pack-peel", "pack-peel-4-level-tiling"]:
             self.register(
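
As a footnote to the patch: the float-to-fixed-point conversion described in
the commit message can be sketched in a few lines of Python. This is a minimal
illustration, not part of the diff; the helper name quantize_scale is
hypothetical:

    def quantize_scale(scale, shift=30):
        # Approximate a float scale as an integer multiplier plus a right
        # shift, so that (x * multiplier) >> shift ~= x * scale in integer
        # arithmetic.
        multiplier = round(scale * (1 << shift))
        return multiplier, shift

    # The commit-message example: a scale of ~0.333 (1/3) becomes a multiply
    # by 357913941 and a right shift by 30.
    assert quantize_scale(1 / 3) == (357913941, 30)

    # The constants in the new template (multiplier 10, shift 7) encode a
    # scale of 10 / 128 = 0.078125. For the test data, each accumulator value
    # is 2 * 3 * 128 = 768, and (768 * 10) >> 7 = 60, the expected output.
    assert (768 * 10) >> 7 == 60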