diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 1259f032e..6346024d1 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1439,15 +1439,103 @@ def __init__(self): ) ) + performance_tests = [ + { + "M": 512, + "N": 512, + "K": 4096, + "use_ukernel": False, + "peano_opt_level": 2, + "outline": False, + }, + { + "M": 512, + "N": 512, + "K": 4096, + "use_ukernel": False, + "peano_opt_level": 2, + "outline": True, + }, + { + "M": 512, + "N": 512, + "K": 4096, + "use_ukernel": False, + "peano_opt_level": 3, + "outline": False, + }, + { + "M": 512, + "N": 512, + "K": 4096, + "use_ukernel": False, + "peano_opt_level": 3, + "outline": True, + }, + { + "M": 512, + "N": 512, + "K": 4096, + "use_ukernel": True, + "peano_opt_level": 3, + "outline": True, + }, + { + "M": 512, + "N": 4096, + "K": 512, + "use_ukernel": False, + "peano_opt_level": 3, + "outline": True, + }, + { + "M": 512, + "N": 4096, + "K": 512, + "use_ukernel": True, + "peano_opt_level": 3, + "outline": True, + }, + { + "M": 4096, + "N": 512, + "K": 512, + "use_ukernel": False, + "peano_opt_level": 3, + "outline": True, + }, + { + "M": 4096, + "N": 512, + "K": 512, + "use_ukernel": True, + "peano_opt_level": 3, + "outline": True, + }, + ] + # Some bf16 Performance tests: - for M, N, K, use_ukernel in [ - (512, 512, 4096, False), - (512, 512, 4096, True), - (512, 4096, 512, False), - (512, 4096, 512, True), - (4096, 512, 512, False), - (4096, 512, 512, True), - ]: + for test in performance_tests: + M = test["M"] + N = test["N"] + K = test["K"] + use_ukernel = test["use_ukernel"] + peano_opt_level = test["peano_opt_level"] + outline = test["outline"] + + outlining_string = "--iree-amdaie-enable-function-outlining=" + str( + int(outline) + ) + peano_opt_level_string = f'"-O{peano_opt_level}"' + aie_compilation_flags = [ + outlining_string, + f"--iree-amd-aie-additional-peano-opt-flags={peano_opt_level_string}", + ] + + name_suffix = "O" + str(peano_opt_level) + if outline: + name_suffix += "_outline" + self.register( Matmul( M, @@ -1457,8 +1545,12 @@ def __init__(self): "f32", use_ukernel=use_ukernel, n_repeats=2, + aie_compilation_flags=aie_compilation_flags, + name_suffix=name_suffix, + additional_labels=["PerformanceCorrectness"], ) ) + self.register( MatmulBenchmark( M, @@ -1470,6 +1562,8 @@ def __init__(self): use_ukernel=use_ukernel, n_repeats=5, n_kernel_runs=100, + aie_compilation_flags=aie_compilation_flags, + name_suffix=name_suffix, ) ) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h index 2bbc679a6..7ec51fad6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h @@ -56,7 +56,7 @@ struct AMDAIEOptions { bool enableVectorizationPasses{true}; bool enableCoalescingLoops{false}; bool enableCollapsingUnitDims{false}; - bool enableFunctionOutlining{false}; + bool enableFunctionOutlining{true}; bool insertLoopAroundCoreBlock{false}; bool matmulElementwiseFusion{false}; AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col}; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index b44d977c8..45967a2a8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -92,7 +92,7 @@ FailureOr> makePeanoOptArgs( // Extend the max limit of the search depth in BasicAA "-basic-aa-max-lookup-search-depth=10", // - "-O2", + "-O3", // "--inline-threshold=10", // missing from libc diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_linalg_function_outlining.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_linalg_function_outlining.mlir index 277eef0e5..d95869a75 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_linalg_function_outlining.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_linalg_function_outlining.mlir @@ -1,14 +1,13 @@ // This test demonstrates enabling / disabling function outlining in the default -// pipeline (note that below the pipeline is not specified explicitly with -// the flag --iree-amdaie-tile-pipeline). We check 3 paths: +// pipeline. We check 3 paths: // // 1) Explicitly disabling linalg function outlining with -// --iree-amdaie-enable-function-outlining=0 +// --iree-amdaie-enable-function-outlining=0 // // 2) Explicitly enabling linalg function outlining with -// --iree-amdaie-enable-function-outlining=1 +// --iree-amdaie-enable-function-outlining=1 // -// 3) Not specifying the flag at all, which should use the default value (0). +// 3) Not specifying the flag at all, which should use the default value (1). // 1) Explicitly disabled: @@ -36,4 +35,4 @@ func.func @matmul(%lhs: tensor<64x64xbf16>, // CHECK-DISABLED-NOT: func.call // CHECK-ENABLED: func.call -// CHECK-DEFAULT-NOT: func.call +// CHECK-DEFAULT: func.call