Merge branch 'main' into matmul4d_ci

nod-ai · Feb 17, 2025 · f12a94e
2 parents 2fad3f9 + 5755830
commit f12a94e
Show file tree

Hide file tree

Showing 15 changed files with 717 additions and 433 deletions.
diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml
@@ -66,7 +66,7 @@ jobs:
               submodule update --init --recursive --depth 1 --single-branch -j 10
 
       - name: Setup Cpp
-        uses: aminya/setup-cpp@12e62a1b8da8f1e66acc75305a2621234ef49dd0 # v0.46.0
+        uses: aminya/setup-cpp@abe2d67f8c619c5f4b9e40358430e33df461d5b8 # v0.46.2
         with:
           compiler: llvm
           vcvarsall: true

diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_MxK_KxN.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_MxK_KxN.mlir
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
@@ -171,7 +171,7 @@ def run(self, config):
         # does not).
         if self.use_chess and not config.vitis_dir:
             return False
-        if self.use_chess_for_ukernel and not config.vitis_dir:
+        if self.use_ukernel and self.use_chess_for_ukernel and not config.vitis_dir:
             return False
 
         # If use_chess=0, and config has not provided a valid
@@ -751,9 +751,9 @@ def _execute(self, config):
         return True
 
 
-class MatmulTrunci(BaseMatmul):
+class MatmulScaleTrunci(BaseMatmul):
     """
-    A test of the form matmul(A,B) + trunci(C) where A:MxK, B:KxN and C:MxN
+    A test of the form matmul(A,B) + scale(C) + trunci(C) where A:MxK, B:KxN and C:MxN
     """
 
     def __init__(
@@ -767,18 +767,17 @@ def __init__(
         rhs,
         expected_out,
         test_params=None,
-        use_scaling=False,
     ):
         super().__init__(
-            name=f"matmul_trunci_{M}_{N}_{K}_{input_type}_{acc_type}",
+            name=f"matmul_scale_trunci_{M}_{N}_{K}_{input_type}_{acc_type}",
             test_params=test_params,
             M=M,
             N=N,
             K=K,
             input_type=input_type,
             acc_type=acc_type,
         )
-        self.labels.append("MatmulTrunci")
+        self.labels.append("MatmulScaleTrunci")
 
         # Assertions on shapes: Check that lhs is MxK, rhs is KxN, and expected_out is MxN
         assert lhs.shape == (M, K)
@@ -788,13 +787,10 @@ def __init__(
         self.lhs = lhs
         self.rhs = rhs
         self.expected_out = expected_out
-        self.use_scaling = use_scaling
 
     def _execute(self, config):
         matmul_template_dir = config.file_dir / "matmul_template"
-        template_name = matmul_template_dir / "matmul_trunci_MxK_KxN.mlir"
-        if self.use_scaling:
-            template_name = matmul_template_dir / "matmul_trunci_scaling_MxK_KxN.mlir"
+        template_name = matmul_template_dir / "matmul_trunci_scaling_MxK_KxN.mlir"
         self.generate(config, template_name)
         filename = self.get_filename(config)
         input_args = generate_inputs(
@@ -1601,78 +1597,10 @@ def __init__(self):
         self.existing_names = []
         self.tests = []
 
-        # Tests Matmul + Trunci.
-        # Phoenix : Ukernel + Peano.
-        self.register(
-            MatmulTrunci(
-                256,
-                128,
-                32,
-                "i8",
-                "i32",
-                1 * np.ones([256, 32], dtype=np.int8),
-                1 * np.ones([32, 128], dtype=np.int8),
-                32 * np.ones([256, 128], dtype=np.int8),
-                test_params=TestParams(
-                    tile_pipeline="pack-peel-4-level-tiling",
-                    run_on_target=["npu1_4col"],
-                    aie_compilation_flags=[
-                        "--iree-amdaie-num-rows=4",
-                        "--iree-amdaie-num-cols=4",
-                    ],
-                    use_ukernel=True,
-                ),
-            )
-        )
-        # Phoenix : Vectorization + Peano.
-        self.register(
-            MatmulTrunci(
-                256,
-                128,
-                32,
-                "i8",
-                "i32",
-                1 * np.ones([256, 32], dtype=np.int8),
-                1 * np.ones([32, 128], dtype=np.int8),
-                32 * np.ones([256, 128], dtype=np.int8),
-                test_params=TestParams(
-                    tile_pipeline="pack-peel-4-level-tiling",
-                    run_on_target=["npu1_4col"],
-                    aie_compilation_flags=[
-                        "--iree-amdaie-num-rows=4",
-                        "--iree-amdaie-num-cols=4",
-                    ],
-                ),
-            )
-        )
-        # Strix : Ukernel + Chess.
-        self.register(
-            MatmulTrunci(
-                256,
-                128,
-                32,
-                "i8",
-                "i32",
-                1 * np.ones([256, 32], dtype=np.int8),
-                1 * np.ones([32, 128], dtype=np.int8),
-                32 * np.ones([256, 128], dtype=np.int8),
-                test_params=TestParams(
-                    tile_pipeline="pack-peel-4-level-tiling",
-                    run_on_target=["npu4"],
-                    aie_compilation_flags=[
-                        "--iree-amdaie-num-rows=4",
-                        "--iree-amdaie-num-cols=8",
-                    ],
-                    use_chess=True,
-                    use_ukernel=True,
-                ),
-            )
-        )
-
         # Tests Matmul + Trunci with Scaling.
         # Phoenix : Ukernel + Peano.
         self.register(
-            MatmulTrunci(
+            MatmulScaleTrunci(
                 256,
                 256,
                 128,
@@ -1691,12 +1619,11 @@ def __init__(self):
                     ],
                     use_ukernel=True,
                 ),
-                use_scaling=True,
             )
         )
         # Phoenix : Vectorization + Peano.
         self.register(
-            MatmulTrunci(
+            MatmulScaleTrunci(
                 256,
                 256,
                 128,
@@ -1713,12 +1640,11 @@ def __init__(self):
                         "--iree-amdaie-num-cols=4",
                     ],
                 ),
-                use_scaling=True,
             )
         )
-        # Strix : Ukernel + Chess.
+        # Strix : Ukernel + Peano.
         self.register(
-            MatmulTrunci(
+            MatmulScaleTrunci(
                 256,
                 256,
                 128,
@@ -1734,10 +1660,10 @@ def __init__(self):
                         "--iree-amdaie-num-rows=4",
                         "--iree-amdaie-num-cols=8",
                     ],
-                    use_chess=True,
+                    use_chess=False,
                     use_ukernel=True,
+                    use_chess_for_ukernel=False,
                 ),
-                use_scaling=True,
             )
         )
         # Matmul with truncf test(s):
@@ -1963,7 +1889,8 @@ def __init__(self):
                 "f32",
                 test_params=TestParams(
                     use_ukernel=True,
-                    use_chess=True,
+                    use_chess=False,
+                    use_chess_for_ukernel=False,
                     run_on_target=["npu4"],
                 ),
             )
@@ -1978,11 +1905,12 @@ def __init__(self):
                 test_params=TestParams(
                     name_suffix="npu4_4x8",
                     use_ukernel=True,
+                    use_chess=False,
+                    use_chess_for_ukernel=False,
                     aie_compilation_flags=[
                         "--iree-amdaie-num-rows=4",
                         "--iree-amdaie-num-cols=8",
                     ],
-                    use_chess=True,
                     run_on_target=["npu4"],
                 ),
             )
@@ -2024,7 +1952,8 @@ def __init__(self):
                         "--iree-amdaie-num-rows=4",
                         "--iree-amdaie-num-cols=8",
                     ],
-                    use_chess=True,
+                    use_chess=False,
+                    use_chess_for_ukernel=False,
                 ),
             )
         )
@@ -2044,7 +1973,8 @@ def __init__(self):
                         "--iree-amdaie-num-rows=4",
                         "--iree-amdaie-num-cols=8",
                     ],
-                    use_chess=True,
+                    use_chess=False,
+                    use_chess_for_ukernel=False,
                 ),
             )
         )

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -586,6 +586,23 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [
   let hasCanonicalizer = 1;
 }
 
+def AMDAIE_NpuDmaPlaceHolderOp : AMDAIE_Op<"npu.dma_placeholder"> {
+  let summary = "Represents a placeholder for a DMA operation.";
+  let description = [{
+    This operation acts as a placeholder user for `amdaie.connection` operations to prevent
+    them from being dead-code eliminated. This is used for control flow connections that are
+    inserted before control packets are generated because they need to be taken into account
+    together with data connections for routing. This operation does not have any side effects
+    on control code size.
+  }];
+
+  let arguments = (
+    ins Index:$connection
+  );
+
+  let assemblyFormat = [{ `(` $connection `)` attr-dict }];
+}
+
 def AMDAIE_NpuHalfDmaCpyNdOp
   : AMDAIE_Op<"npu.half_dma_cpy_nd", [AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
   let summary = "The NPU uController's DMA operation, operating on a single port";