From 683406d7dabbe5d5e798c6995e0aefc539cc2063 Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Fri, 14 Feb 2025 12:39:18 +0530
Subject: [PATCH] [e2e CI] Add e2e CI tests for Matmul+Trunci with scaling
 (#1099)

-- This commit adds e2e CI tests for Matmul+Trunci with scaling.
-- In an actual quantized model, simply truncating from a higher bitwidth to a
   lower-precision bitwidth does not work; the accumulator must first be
   rescaled.
-- Since the output of the Matmul here is an integer, it cannot be multiplied
   by a floating-point scale factor; we therefore represent the scale factor
   with an integer multiplier and a right shift instead.
-- E.g.: a float scale factor of 0.333... (1/3) becomes a multiply by
   357913941 followed by a right shift by 30.

Signed-off-by: Abhishek Varma
---
 .../matmul_template/matmul_generator.py       | 10 +++
 .../matmul_trunci_scaling_MxK_KxN.mlir        | 33 ++++++++
 build_tools/ci/cpu_comparison/run.py          | 76 +++++++++++++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir

diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
index cf6013214..adc42cfb9 100644
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -3,6 +3,14 @@
 import os
 
 
+def get_higher_order_element_type(element_type):
+    # Double the bit width of an integer/float element type, e.g. "i32" -> "i64".
+    if element_type[0] in ["i", "f"]:
+        assert element_type[1:].isdigit(), f"support for {element_type} is missing"
+        bit_width = int(element_type[1:])
+        return f"{element_type[0]}{bit_width*2}"
+    assert False, f"support for {element_type} is missing"
+
+
 def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0):
     """
     Generate mlir file (output_fn) from the template file (input_fn).
@@ -14,6 +22,8 @@ def generate_matmul_test(output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b
     replace["K"] = k
     replace["TYPE1"] = lhs_rhs_type
     replace["TYPE2"] = acc_type
+    # Only used for Matmul+Trunci via scaling.
+    replace["TYPE_MUL_RESULT"] = get_higher_order_element_type(acc_type)
     replace["B"] = b  # This is only used for batch matmul
     acc_is_int = acc_type[0] == "i"
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
new file mode 100644
index 000000000..5ed4a849b
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
@@ -0,0 +1,33 @@
+// input ${M}x${K}x${TYPE1}
+// input ${K}x${N}x${TYPE1}
+
+// Matmul + Trunci variant with scaling.
+// In an actual quantized model, simply truncating from a higher bitwidth to a
+// lower-precision bitwidth does not work; the accumulator must first be rescaled.
+// Since the output of the Matmul here is an integer, it cannot be multiplied by
+// a floating-point scale factor; we represent the scale factor with an integer
+// multiplier and a right shift instead.
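+//
+// Here the scale factor is multiplier / 2^shift: with multiplier 10 and shift 7
+// (the constants below), the effective scale is 10 / 128 = 0.078125.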
+func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}>
+{
+  %cst = arith.constant ${ZERO} : ${TYPE2}
+  %cst_mul = arith.constant 10 : ${TYPE_MUL_RESULT}
+  %cst_shift = arith.constant 7 : ${TYPE_MUL_RESULT}
+  %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
+  %i8out = tensor.empty() : tensor<${M}x${N}x${TYPE1}>
+  %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
+  %2 = linalg.matmul ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
+                     outs(%1 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
+  %3 = linalg.generic {indexing_maps = [
+                         affine_map<(d0, d1) -> (d0, d1)>,
+                         affine_map<(d0, d1) -> (d0, d1)>
+                       ],
+                       iterator_types = ["parallel", "parallel"]
+                      } ins(%2 : tensor<${M}x${N}x${TYPE2}>) outs(%i8out : tensor<${M}x${N}x${TYPE1}>) {
+    ^bb0(%in: ${TYPE2}, %out: ${TYPE1}):
+      %4 = arith.extsi %in : ${TYPE2} to ${TYPE_MUL_RESULT}
+      %5 = arith.muli %4, %cst_mul : ${TYPE_MUL_RESULT}
+      %6 = arith.shrsi %5, %cst_shift : ${TYPE_MUL_RESULT}
+      %7 = arith.trunci %6 : ${TYPE_MUL_RESULT} to ${TYPE1}
+      linalg.yield %7 : ${TYPE1}
+  } -> tensor<${M}x${N}x${TYPE1}>
+  return %3 : tensor<${M}x${N}x${TYPE1}>
+}
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 842515f84..3b8c585ef 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -692,6 +692,7 @@ def __init__(
         rhs,
         expected_out,
         test_params=None,
+        use_scaling=False,
     ):
         super().__init__(
             name=f"matmul_trunci_{M}_{N}_{K}_{input_type}_{acc_type}",
@@ -712,10 +713,13 @@
         self.lhs = lhs
         self.rhs = rhs
         self.expected_out = expected_out
+        self.use_scaling = use_scaling
 
     def _execute(self, config):
         matmul_template_dir = config.file_dir / "matmul_template"
         template_name = matmul_template_dir / "matmul_trunci_MxK_KxN.mlir"
+        if self.use_scaling:
+            template_name = matmul_template_dir / "matmul_trunci_scaling_MxK_KxN.mlir"
         self.generate(config, template_name)
         filename = self.get_filename(config)
         input_args = generate_inputs(
@@ -1589,6 +1593,78 @@ def __init__(self):
                 ),
             )
         )
+
+        # Tests Matmul + Trunci with Scaling.
+        # Phoenix : Ukernel + Peano.
+        self.register(
+            MatmulTrunci(
+                256,
+                256,
+                128,
+                "i8",
+                "i32",
+                2 * np.ones([256, 128], dtype=np.int8),
+                3 * np.ones([128, 256], dtype=np.int8),
+                60 * np.ones([256, 256], dtype=np.int8),
+                test_params=TestParams(
+                    name_suffix="scaling",
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu1_4col"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=4",
+                    ],
+                    use_ukernel=True,
+                ),
+                use_scaling=True,
+            )
+        )
+        # Phoenix : Vectorization + Peano.
+        self.register(
+            MatmulTrunci(
+                256,
+                256,
+                128,
+                "i8",
+                "i32",
+                2 * np.ones([256, 128], dtype=np.int8),
+                3 * np.ones([128, 256], dtype=np.int8),
+                60 * np.ones([256, 256], dtype=np.int8),
+                test_params=TestParams(
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu1_4col"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=4",
+                    ],
+                ),
+                use_scaling=True,
+            )
+        )
+        # Strix : Ukernel + Chess.
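+        # Note: in each scaling test registered here, the lhs elements are 2,
+        # the rhs elements are 3 and K = 128, so every i32 accumulator value is
+        # 2 * 3 * 128 = 768; the scaling generic then computes
+        # (768 * 10) >> 7 = 60, which is the expected i8 output.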
+        self.register(
+            MatmulTrunci(
+                256,
+                256,
+                128,
+                "i8",
+                "i32",
+                2 * np.ones([256, 128], dtype=np.int8),
+                3 * np.ones([128, 256], dtype=np.int8),
+                60 * np.ones([256, 256], dtype=np.int8),
+                test_params=TestParams(
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu4"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=8",
+                    ],
+                    use_chess=True,
+                    use_ukernel=True,
+                ),
+                use_scaling=True,
+            )
+        )
         # Matmul with truncf test(s):
         for tile_pipeline in ["pack-peel", "pack-peel-4-level-tiling"]:
             self.register(
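
As a footnote to the patch: the float-to-fixed-point conversion described in
the commit message can be sketched in a few lines of Python. This is a minimal
illustration, not part of the diff; the helper name quantize_scale is
hypothetical:

    def quantize_scale(scale, shift=30):
        # Approximate a float scale as an integer multiplier plus a right
        # shift, so that (x * multiplier) >> shift ~= x * scale in integer
        # arithmetic.
        multiplier = round(scale * (1 << shift))
        return multiplier, shift

    # The commit-message example: a scale of ~0.333 (1/3) becomes a multiply
    # by 357913941 and a right shift by 30.
    assert quantize_scale(1 / 3) == (357913941, 30)

    # The constants in the new template (multiplier 10, shift 7) encode a
    # scale of 10 / 128 = 0.078125. For the test data, each accumulator value
    # is 2 * 3 * 128 = 768, and (768 * 10) >> 7 = 60, the expected output.
    assert (768 * 10) >> 7 == 60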