From d2ccc8907e9b867259854d233c70980a26251ac0 Mon Sep 17 00:00:00 2001
From: Vivian Zhang
Date: Mon, 17 Feb 2025 10:31:08 -0800
Subject: [PATCH] [CI] Modify matmul4d to use total size as input args (#1112)

Before this PR, the input args M/N/K were the outer dim sizes for matmul4d ops.
For easier comparison with standard matmul, this PR now uses M/N/K as the total
matmul input sizes and adds the inner dim sizes as separate parameters.
---
 .../matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir     | 17 ++++++
 .../matmul4d_MxKxM0xK0_NxKxK0xN0.mlir         | 17 ------
 .../matmul_template/matmul_generator.py       | 20 ++++++-
 build_tools/ci/cpu_comparison/run.py          | 58 ++++++++++++-------
 4 files changed, 71 insertions(+), 41 deletions(-)
 create mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir
 delete mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir

diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir
new file mode 100644
index 000000000..232e309ff
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir
@@ -0,0 +1,17 @@
+// input ${M1}x${K1}x${M0}x${K0}x${TYPE1}
+// input ${N1}x${K1}x${K0}x${N0}x${TYPE1}
+
+func.func @matmul4d(%arg0: tensor<${M1}x${K1}x${M0}x${K0}x${TYPE1}>, %arg1: tensor<${N1}x${K1}x${K0}x${N0}x${TYPE1}>) -> tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}> {
+  %cst = arith.constant ${ZERO} : ${TYPE2}
+  %0 = tensor.empty() : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>) -> tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M1}x${K1}x${M0}x${K0}x${TYPE1}>, tensor<${N1}x${K1}x${K0}x${N0}x${TYPE1}>) outs(%1 : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>) {
+  ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}):
+    %12 = ${EXT} %in : ${TYPE1} to ${TYPE2}
+    %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2}
+    %14 = ${MUL} %12, %13 : ${TYPE2}
+    %15 = ${ADD} %out, %14 : ${TYPE2}
+    linalg.yield %15 : ${TYPE2}
+  } -> tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+  return %2 : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+}
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir
deleted file mode 100644
index 76ef7dd63..000000000
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// input ${M}x${K}x32x64x${TYPE1}
-// input ${N}x${K}x64x32x${TYPE1}
-
-func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64x${TYPE1}>, %arg1: tensor<${N}x${K}x64x32x${TYPE1}>) -> tensor<${N}x${M}x32x32x${TYPE2}> {
-  %cst = arith.constant ${ZERO} : ${TYPE2}
-  %0 = tensor.empty() : tensor<${N}x${M}x32x32x${TYPE2}>
-  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N}x${M}x32x32x${TYPE2}>) -> tensor<${N}x${M}x32x32x${TYPE2}>
-  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64x${TYPE1}>, tensor<${N}x${K}x64x32x${TYPE1}>) outs(%1 : tensor<${N}x${M}x32x32x${TYPE2}>) {
-  ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}):
-    %12 = ${EXT} %in : ${TYPE1} to ${TYPE2}
-    %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2}
-    %14 = ${MUL} %12, %13 : ${TYPE2}
-    %15 = ${ADD} %out, %14 : ${TYPE2}
-    linalg.yield %15 : ${TYPE2}
-  } -> tensor<${N}x${M}x32x32x${TYPE2}>
-  return %2 : tensor<${N}x${M}x32x32x${TYPE2}>
-}
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
index 7fdd36b6d..f02e17023 100644
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -9,7 +9,9 @@ def get_higher_order_element_type(element_type):
     assert False, f"support for {element_type} is missing"


-def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
+def generate_matmul_test(
+    output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0, m0=0, n0=0, k0=0
+):
     """
     Generate mlir file (output_fn) from the template file (input_fn).
     """
@@ -23,13 +25,27 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
     # Only used for Matmul+Trunc via scaling.
     replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)

-    replace["B"] = b  # This is only used for batch matmul
     acc_is_int = acc_type[0] == "i"
     replace["ZERO"] = 0 if acc_is_int else 0.0
     replace["ADD"] = "arith.addi" if acc_is_int else "arith.addf"
     replace["MUL"] = "arith.muli" if acc_is_int else "arith.mulf"
     replace["EXT"] = "arith.extsi" if acc_is_int else "arith.extf"

+    # This is only used for batch matmul.
+    replace["B"] = b
+
+    # m0, n0, k0 are only used for matmul4d as inner dim sizes.
+    replace["M0"] = m0
+    replace["N0"] = n0
+    replace["K0"] = k0
+    # matmul4d outer dim sizes can be calculated by `total_size/inner_dim_size`.
+    if m0 != 0:
+        replace["M1"] = int(m / m0)
+    if n0 != 0:
+        replace["N1"] = int(n / n0)
+    if k0 != 0:
+        replace["K1"] = int(k / k0)
+
     key_map = map(lambda s: "${" + s + "}", replace.keys())
     key_map_escaped = map(re.escape, key_map)
     regex = re.compile("|".join(key_map_escaped))
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index a98bc2685..cb42a3949 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -457,14 +457,17 @@ def _execute(self, config):

 class Matmul4d(BaseMatmul):
     """
-    A test of linalg.generic with 4d inputs and output implementing form:
-    C += matmul4d(A,B) where A:MxKxM0xK0, B:NxKxK0xN0, C:NxMxM0xN0
-
-    Note that the outer dims for this operation are transposed to make sure
-    successful compilation through LogicalObjectFifo pipeline.
-    For comparison purpose, the input values of inner dims M0/N0/K0 are
-    fixed as 32/32/64 currently.
-    TODO(vivian): Generalize the class and the template.
+    A test of linalg.generic with 4d inputs and output, following the form:
+    C += matmul4d(A,B) where A:M1xK1xM0xK0, B:N1xK1xK0xN0, C:N1xM1xM0xN0
+
+    -- M0/N0/K0 are inner dim sizes, currently fixed at 32/32/64 for comparison purposes.
+
+    -- M1/N1/K1 are outer dim sizes.
+       Note that the outer dims for this operation are transposed to ensure
+       successful compilation through the LogicalObjectFifo pipeline.
+
+    -- The input parameters M/N/K are the total sizes, each equal to the product
+       of the corresponding outer and inner dim sizes.
     """

     def __init__(
@@ -474,6 +477,9 @@ def __init__(
         self,
         M,
         N,
         K,
         input_type,
         acc_type,
+        M0=32,
+        N0=32,
+        K0=64,
         additional_labels=None,
         n_kernel_runs=1,
         test_params=None,
@@ -489,6 +495,9 @@ def __init__(
             function_name="matmul4d",
             n_kernel_runs=n_kernel_runs,
         )
+        self.M0 = M0
+        self.N0 = N0
+        self.K0 = K0
         self.labels.append("Matmul4d")
         if additional_labels:
             self.labels += additional_labels
@@ -500,8 +509,19 @@ def __init__(

     def _execute(self, config):
         matmul_template_dir = config.file_dir / "matmul_template"
-        template_name = matmul_template_dir / "matmul4d_MxKxM0xK0_NxKxK0xN0.mlir"
-        self.generate(config, template_name)
+        template_name = matmul_template_dir / "matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir"
+        generate_matmul_test(
+            self.get_filename(config),
+            template_name,
+            m=self.M,
+            n=self.N,
+            k=self.K,
+            lhs_rhs_type=self.input_type,
+            acc_type=self.acc_type,
+            m0=self.M0,
+            n0=self.N0,
+            k0=self.K0,
+        )
         if self.run_benchmark:
             return self.benchmark(config)
@@ -2149,13 +2169,10 @@ def __init__(self):
                 "transpose_b": False,
                 "tile_pipeline": "pack-peel-4-level-tiling",
             },
-            # matmul4d test where the input M/N/K are outer dim values.
-            # The total input values correspond to a standard matmul
-            # from the above test are M:512, N:4096, K:512.
             {
-                "M": 16,
-                "N": 128,
-                "K": 8,
+                "M": 512,
+                "N": 4096,
+                "K": 512,
                 "use_ukernel": True,
                 "peano_opt_level": 3,
                 "outline": "balanced",
@@ -2227,13 +2244,10 @@ def __init__(self):
                 "tile_pipeline": "pack-peel-4-level-tiling",
                 "run_on_target": "npu4",
             },
-            # matmul4d test where the input M/N/K are outer dim values.
-            # The total input values correspond to a standard matmul
-            # from the above test are M:512, N:4096, K:512.
             {
-                "M": 16,
-                "N": 128,
-                "K": 8,
+                "M": 512,
+                "N": 4096,
+                "K": 512,
                 "in_dtype": "i8",
                 "out_dtype": "i32",
                 "use_ukernel": True,
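Note (not part of the patch): with the default inner dims M0/N0/K0 = 32/32/64, the
updated CI entries with total sizes M=512, N=4096, K=512 reproduce the old outer-dim
arguments (16, 128, 8) that the deleted comments refer to. A minimal standalone Python
sketch of that derivation, mirroring the int(m / m0) division added in
matmul_generator.py (the helper name is illustrative only):

# Standalone sketch, not part of the patch: derive matmul4d outer dim sizes
# from total sizes, mirroring the division added in matmul_generator.py.
def derive_outer_dims(m, n, k, m0=32, n0=32, k0=64):
    # Total sizes must be divisible by the inner dim sizes.
    assert m % m0 == 0 and n % n0 == 0 and k % k0 == 0
    return m // m0, n // n0, k // k0

# The new npu4 entries (M=512, N=4096, K=512) map back to the old args.
print(derive_outer_dims(512, 4096, 512))  # -> (16, 128, 8)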
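For reference, the generate_matmul_test call added to Matmul4d._execute can also be
driven directly from the template generator. A minimal sketch, assuming it is run from
build_tools/ci/cpu_comparison/matmul_template so the module and template are both local;
the output path is hypothetical, and the template name is the one added by this patch:

# Sketch of invoking the updated generator directly (assumptions noted above).
from matmul_generator import generate_matmul_test

generate_matmul_test(
    "/tmp/matmul4d_512x4096x512.mlir",  # output_fn (hypothetical path)
    "matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir",  # input_fn (template from this patch)
    m=512,
    n=4096,
    k=512,  # total sizes
    lhs_rhs_type="i8",
    acc_type="i32",
    m0=32,
    n0=32,
    k0=64,  # inner dim sizes
)
# Inside the generated MLIR, M1/N1/K1 resolve to 16, 128, and 8 respectively.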