From d2ccc8907e9b867259854d233c70980a26251ac0 Mon Sep 17 00:00:00 2001
From: Vivian Zhang
Date: Mon, 17 Feb 2025 10:31:08 -0800
Subject: [PATCH] [CI] Modify matmul4d to use total size as input args (#1112)

Before this PR, the input args M/N/K were the outer dim sizes for matmul4d ops.
For easier comparison with standard matmul, this PR now uses M/N/K as the total
matmul input sizes and adds the inner dim sizes as separate parameters.
---
 .../matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir     | 17 ++++++
 .../matmul4d_MxKxM0xK0_NxKxK0xN0.mlir         | 17 ------
 .../matmul_template/matmul_generator.py       | 20 ++++++-
 build_tools/ci/cpu_comparison/run.py          | 58 ++++++++++++-------
 4 files changed, 71 insertions(+), 41 deletions(-)
 create mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir
 delete mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir

diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir
new file mode 100644
index 000000000..232e309ff
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir
@@ -0,0 +1,17 @@
+// input ${M1}x${K1}x${M0}x${K0}x${TYPE1}
+// input ${N1}x${K1}x${K0}x${N0}x${TYPE1}
+
+func.func @matmul4d(%arg0: tensor<${M1}x${K1}x${M0}x${K0}x${TYPE1}>, %arg1: tensor<${N1}x${K1}x${K0}x${N0}x${TYPE1}>) -> tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}> {
+  %cst = arith.constant ${ZERO} : ${TYPE2}
+  %0 = tensor.empty() : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>) -> tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M1}x${K1}x${M0}x${K0}x${TYPE1}>, tensor<${N1}x${K1}x${K0}x${N0}x${TYPE1}>) outs(%1 : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>) {
+  ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}):
+    %12 = ${EXT} %in : ${TYPE1} to ${TYPE2}
+    %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2}
+    %14 = ${MUL} %12, %13 : ${TYPE2}
+    %15 = ${ADD} %out, %14 : ${TYPE2}
+    linalg.yield %15 : ${TYPE2}
+  } -> tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+  return %2 : tensor<${N1}x${M1}x${M0}x${N0}x${TYPE2}>
+}
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir
deleted file mode 100644
index 76ef7dd63..000000000
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// input ${M}x${K}x32x64x${TYPE1}
-// input ${N}x${K}x64x32x${TYPE1}
-
-func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64x${TYPE1}>, %arg1: tensor<${N}x${K}x64x32x${TYPE1}>) -> tensor<${N}x${M}x32x32x${TYPE2}> {
-  %cst = arith.constant ${ZERO} : ${TYPE2}
-  %0 = tensor.empty() : tensor<${N}x${M}x32x32x${TYPE2}>
-  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${N}x${M}x32x32x${TYPE2}>) -> tensor<${N}x${M}x32x32x${TYPE2}>
-  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64x${TYPE1}>, tensor<${N}x${K}x64x32x${TYPE1}>) outs(%1 : tensor<${N}x${M}x32x32x${TYPE2}>) {
-  ^bb0(%in: ${TYPE1}, %in_1: ${TYPE1}, %out: ${TYPE2}):
-    %12 = ${EXT} %in : ${TYPE1} to ${TYPE2}
-    %13 = ${EXT} %in_1 : ${TYPE1} to ${TYPE2}
-    %14 = ${MUL} %12, %13 : ${TYPE2}
-    %15 = ${ADD} %out, %14 : ${TYPE2}
-    linalg.yield %15 : ${TYPE2}
-  } -> tensor<${N}x${M}x32x32x${TYPE2}>
-  return %2 : tensor<${N}x${M}x32x32x${TYPE2}>
-}
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
index 7fdd36b6d..f02e17023 100644
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -9,7 +9,9 @@ def get_higher_order_element_type(element_type):
     assert False, f"support for {element_type} is missing"


-def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
+def generate_matmul_test(
+    output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0, m0=0, n0=0, k0=0
+):
     """
     Generate mlir file (output_fn) from the template file (input_fn).
     """
@@ -23,13 +25,27 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
     # Only used for Matmul+Trunc via scaling.
     replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)

-    replace["B"] = b  # This is only used for batch matmul
     acc_is_int = acc_type[0] == "i"
     replace["ZERO"] = 0 if acc_is_int else 0.0
     replace["ADD"] = "arith.addi" if acc_is_int else "arith.addf"
     replace["MUL"] = "arith.muli" if acc_is_int else "arith.mulf"
     replace["EXT"] = "arith.extsi" if acc_is_int else "arith.extf"

+    # This is only used for batch matmul.
+    replace["B"] = b
+
+    # m0, n0, k0 are only used for matmul4d as inner dim sizes.
+    replace["M0"] = m0
+    replace["N0"] = n0
+    replace["K0"] = k0
+    # matmul4d outer dim sizes can be calculated by `total_size/inner_dim_size`.
+    if m0 != 0:
+        replace["M1"] = int(m / m0)
+    if n0 != 0:
+        replace["N1"] = int(n / n0)
+    if k0 != 0:
+        replace["K1"] = int(k / k0)
+
     key_map = map(lambda s: "${" + s + "}", replace.keys())
     key_map_escaped = map(re.escape, key_map)
     regex = re.compile("|".join(key_map_escaped))
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index a98bc2685..cb42a3949 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -457,14 +457,17 @@ def _execute(self, config):

 class Matmul4d(BaseMatmul):
     """
-    A test of linalg.generic with 4d inputs and output implementing form:
-    C += matmul4d(A,B) where A:MxKxM0xK0, B:NxKxK0xN0, C:NxMxM0xN0
-
-    Note that the outer dims for this operation are transposed to make sure
-    successful compilation through LogicalObjectFifo pipeline.
-    For comparison purpose, the input values of inner dims M0/N0/K0 are
-    fixed as 32/32/64 currently.
-    TODO(vivian): Generalize the class and the template.
+    A test of linalg.generic with 4d inputs and output, following the form:
+    C += matmul4d(A,B) where A:M1xK1xM0xK0, B:N1xK1xK0xN0, C:N1xM1xM0xN0
+
+    -- M0/N0/K0 are inner dim sizes, currently fixed at 32/32/64 for comparison purposes.
+
+    -- M1/N1/K1 are outer dim sizes.
+       Note that the outer dims for this operation are transposed to ensure
+       successful compilation through the LogicalObjectFifo pipeline.
+
+    -- The input parameters M/N/K are the total sizes, each equal to the product
+       of the corresponding outer and inner dim sizes.
     """

     def __init__(
@@ -474,6 +477,9 @@ def __init__(
         self,
         M,
         N,
         K,
         input_type,
         acc_type,
+        M0=32,
+        N0=32,
+        K0=64,
         additional_labels=None,
         n_kernel_runs=1,
         test_params=None,
@@ -489,6 +495,9 @@ def __init__(
             function_name="matmul4d",
             n_kernel_runs=n_kernel_runs,
         )
+        self.M0 = M0
+        self.N0 = N0
+        self.K0 = K0
         self.labels.append("Matmul4d")
         if additional_labels:
             self.labels += additional_labels
@@ -500,8 +509,19 @@ def __init__(

     def _execute(self, config):
         matmul_template_dir = config.file_dir / "matmul_template"
-        template_name = matmul_template_dir / "matmul4d_MxKxM0xK0_NxKxK0xN0.mlir"
-        self.generate(config, template_name)
+        template_name = matmul_template_dir / "matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir"
+        generate_matmul_test(
+            self.get_filename(config),
+            template_name,
+            m=self.M,
+            n=self.N,
+            k=self.K,
+            lhs_rhs_type=self.input_type,
+            acc_type=self.acc_type,
+            m0=self.M0,
+            n0=self.N0,
+            k0=self.K0,
+        )
         if self.run_benchmark:
             return self.benchmark(config)
@@ -2149,13 +2169,10 @@ def __init__(self):
                 "transpose_b": False,
                 "tile_pipeline": "pack-peel-4-level-tiling",
             },
-            # matmul4d test where the input M/N/K are outer dim values.
-            # The total input values correspond to a standard matmul
-            # from the above test are M:512, N:4096, K:512.
             {
-                "M": 16,
-                "N": 128,
-                "K": 8,
+                "M": 512,
+                "N": 4096,
+                "K": 512,
                 "use_ukernel": True,
                 "peano_opt_level": 3,
                 "outline": "balanced",
@@ -2227,13 +2244,10 @@ def __init__(self):
                 "tile_pipeline": "pack-peel-4-level-tiling",
                 "run_on_target": "npu4",
             },
-            # matmul4d test where the input M/N/K are outer dim values.
-            # The total input values correspond to a standard matmul
-            # from the above test are M:512, N:4096, K:512.
             {
-                "M": 16,
-                "N": 128,
-                "K": 8,
+                "M": 512,
+                "N": 4096,
+                "K": 512,
                 "in_dtype": "i8",
                 "out_dtype": "i32",
                 "use_ukernel": True,
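Note (not part of the patch): with the default inner dims M0/N0/K0 = 32/32/64, the
updated CI entries with total sizes M=512, N=4096, K=512 reproduce the old outer-dim
arguments (16, 128, 8) that the deleted comments refer to. A minimal standalone Python
sketch of that derivation, mirroring the int(m / m0) division added in
matmul_generator.py (the helper name is illustrative only):

# Standalone sketch, not part of the patch: derive matmul4d outer dim sizes
# from total sizes, mirroring the division added in matmul_generator.py.
def derive_outer_dims(m, n, k, m0=32, n0=32, k0=64):
    # Total sizes must be divisible by the inner dim sizes.
    assert m % m0 == 0 and n % n0 == 0 and k % k0 == 0
    return m // m0, n // n0, k // k0

# The new npu4 entries (M=512, N=4096, K=512) map back to the old args.
print(derive_outer_dims(512, 4096, 512))  # -> (16, 128, 8)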
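For reference, the generate_matmul_test call added to Matmul4d._execute can also be
driven directly from the template generator. A minimal sketch, assuming it is run from
build_tools/ci/cpu_comparison/matmul_template so the module and template are both local;
the output path is hypothetical, and the template name is the one added by this patch:

# Sketch of invoking the updated generator directly (assumptions noted above).
from matmul_generator import generate_matmul_test

generate_matmul_test(
    "/tmp/matmul4d_512x4096x512.mlir",  # output_fn (hypothetical path)
    "matmul4d_M1xK1xM0xK0_N1xK1xK0xN0.mlir",  # input_fn (template from this patch)
    m=512,
    n=4096,
    k=512,  # total sizes
    lhs_rhs_type="i8",
    acc_type="i32",
    m0=32,
    n0=32,
    k0=64,  # inner dim sizes
)
# Inside the generated MLIR, M1/N1/K1 resolve to 16, 128, and 8 respectively.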