From fd4db474d18be9b31e83e427fa30fbaf43083e9e Mon Sep 17 00:00:00 2001 From: Zhewen Yu Date: Sun, 16 Feb 2025 11:46:07 +0000 Subject: [PATCH] [ControlOverlay] Introduce `dma_placeholder` to preserve `connection` ops from Dead-Code Elimination (#1111) Following the discussion in #1063, `connection` ops generated for the control overlay do not initially have DMA users, as these are only added later when the content of control packets is determined. However, since `connection` ops are marked as pure in the IR, they can be wrongly eliminated by CSE and Canonicalization if they have no users at that stage. To keep them alive, this patch inserts an `amdaie.npu.dma_placeholder` op in the control code for each control overlay `connection`, which acts as a placeholder user of that connection. --- .../AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td | 17 +++++++++++++++++ .../AMDAIEGenerateControlOverlay.cpp | 18 +++++++++++++++--- .../Transforms/AMDAIELowerToAIE.cpp | 19 ++++++++++++++----- .../iree-amd-aie/Transforms/Passes.cpp | 2 ++ .../test/generate_control_overlay.mlir | 17 ++++++++--------- 5 files changed, 56 insertions(+), 17 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index a2ef70926..e9a6345c1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -586,6 +586,23 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ let hasCanonicalizer = 1; } +def AMDAIE_NpuDmaPlaceHolderOp : AMDAIE_Op<"npu.dma_placeholder"> { + let summary = "Represents a placeholder for a DMA operation."; + let description = [{ + This operation acts as a placeholder user for `amdaie.connection` operations to prevent + them from being dead-code eliminated. This is used for control flow connections that are + inserted before control packets are generated because they need to be taken into account + together with data connections for routing. This operation does not have any side effects + on control code size. 
+ }]; + + let arguments = ( + ins Index:$connection + ); + + let assemblyFormat = [{ `(` $connection `)` attr-dict }]; +} + def AMDAIE_NpuHalfDmaCpyNdOp : AMDAIE_Op<"npu.half_dma_cpy_nd", [AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { let summary = "The NPU uController's DMA operation, operating on a single port"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp index 37cc1e4f3..10ef5c879 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp @@ -67,9 +67,10 @@ LogicalResult generateControlOverlay(AMDAIE::WorkgroupOp workgroupOp, if (deviceModel.isShimNOCTile(col, row)) columnToShimTile[col] = tileOp; }); + AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); + rewriter.setInsertionPoint(controlCodeOp); // If the column is occupied, but the shim tile op is not present, then create // one. 
- rewriter.setInsertionPoint(workgroupOp.getControlCode()); for (uint32_t col : occupiedCols) { if (!columnToShimTile.count(col)) { auto colIndex = rewriter.create( @@ -104,6 +105,8 @@ LogicalResult generateControlOverlay(AMDAIE::WorkgroupOp workgroupOp, shimTileOp.emitOpError() << "no producer DMA channel available"; return WalkResult::interrupt(); } + + rewriter.setInsertionPoint(controlCodeOp); auto sourceChannelOp = rewriter.create( rewriter.getUnknownLoc(), shimTileOp, maybeChannel.value(), StrmSwPortType::DMA, AMDAIE::DMAChannelDir::MM2S); @@ -123,13 +126,17 @@ LogicalResult generateControlOverlay(AMDAIE::WorkgroupOp workgroupOp, rewriter.getUnknownLoc(), LogicalObjectFifoType::get(elementType), ValueRange(tileOp)); - rewriter.create( + auto connectionOp = rewriter.create( rewriter.getUnknownLoc(), targetPlaceholder, ValueRange{targetChannelOp}, sourcePlaceholder, ValueRange{sourceChannelOp}, ConnectionTypeAttr::get(rewriter.getContext(), ConnectionType::Packet), /*flow=*/nullptr); + + rewriter.setInsertionPoint(controlCodeOp.getBody()->getTerminator()); + rewriter.create(rewriter.getUnknownLoc(), + connectionOp.getResult()); return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); @@ -139,6 +146,7 @@ LogicalResult generateControlOverlay(AMDAIE::WorkgroupOp workgroupOp, // for sending Task Completion Tokens (TCTs). 
if (routeShimCtrlToTct) { for (auto [_, shimTileOp] : columnToShimTile) { + rewriter.setInsertionPoint(controlCodeOp); auto sourceChannelOp = rewriter.create( rewriter.getUnknownLoc(), shimTileOp, 0, StrmSwPortType::CTRL, AMDAIE::DMAChannelDir::MM2S); @@ -160,13 +168,17 @@ LogicalResult generateControlOverlay(AMDAIE::WorkgroupOp workgroupOp, rewriter.getUnknownLoc(), LogicalObjectFifoType::get(elementType), ValueRange(shimTileOp)); - rewriter.create( + auto connectionOp = rewriter.create( rewriter.getUnknownLoc(), targetPlaceholder, ValueRange{targetChannelOp}, sourcePlaceholder, ValueRange{sourceChannelOp}, ConnectionTypeAttr::get(rewriter.getContext(), ConnectionType::Circuit), /*flow=*/nullptr); + + rewriter.setInsertionPoint(controlCodeOp.getBody()->getTerminator()); + rewriter.create(rewriter.getUnknownLoc(), + connectionOp.getResult()); } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 2f68fef2d..5b2729045 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -565,9 +565,9 @@ LogicalResult AIEDeviceBuilder::bufferToAIE(AMDAIE::BufferOp bufferOp, return success(); } -/// Convert the `amdaie.connection` operation into `aie.flow` ops and DMA -/// operations. Depending on the location of the source/target of the -/// connection, different DMA ops are created: +/// Convert the `amdaie.connection` operation into DMA operations. Depending on +/// the location of the source/target of the connection, different DMA ops are +/// created: /// 1. Source/target on a Shim tile: iterate through producer/consumer channels /// and create corresponding `aie.shim_dma_allocation` ops. /// 2. 
Source/target on MemTile: iterate through producer/consumer channels, @@ -601,8 +601,17 @@ LogicalResult AIEDeviceBuilder::connectionToAIE( } std::optional maybeFlowOp = connectionOp.getFlowOp(); - std::optional packetId = - maybeFlowOp ? maybeFlowOp->getPacketId() : std::nullopt; + if (!maybeFlowOp) return connectionOp.emitOpError() << "has no flow op"; + + FailureOr isCtrlFlow = maybeFlowOp->isControlFlow(); + if (failed(isCtrlFlow)) { + return connectionOp.emitOpError() + << "could not determine if flow is control"; + } + // No DMA op needed for control flow. + if (isCtrlFlow.value()) return success(); + + std::optional packetId = maybeFlowOp->getPacketId(); FailureOr maybeNpuDmaUserOp = connectionOp.getNpuCircularDmaCpyNdUser(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 685e76c52..1b51c60bb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -839,6 +839,8 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIEDmaCSEPass()); passManager.addPass(createAMDAIEGenerateControlOverlayPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEAssignChannelsPass()); passManager.addPass(createCSEPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir index 17d207d68..185fbf9f8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-generate-control-overlay{route-shim-to-tct=true 
route-shim-to-tile-ctrl=true}))" --split-input-file --verify-diagnostics %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-generate-control-overlay{route-shim-to-tct=true route-shim-to-tile-ctrl=true}, canonicalize, cse))" --split-input-file --verify-diagnostics %s | FileCheck %s // Device attribute is required for route-shim-to-tile-ctrl. module { @@ -65,21 +65,20 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_0]], 1, port_type = DMA, direction = MM2S) // CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = CTRL, direction = S2MM) // CHECK: %[[CONNECT_1:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} {%[[CHANNEL_2]]}) {connection_type = #amdaie} -// CHECK: %[[CHANNEL_4:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) // CHECK: %[[CHANNEL_5:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = CTRL, direction = S2MM) -// CHECK: %[[CONNECT_2:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}) {connection_type = #amdaie} -// CHECK: %[[CHANNEL_6:.*]] = amdaie.channel(%[[TILE_0_0]], 1, port_type = DMA, direction = MM2S) +// CHECK: %[[CONNECT_2:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_0]]}) {connection_type = #amdaie} // CHECK: %[[CHANNEL_7:.*]] = amdaie.channel(%[[TILE_0_3]], 0, port_type = CTRL, direction = S2MM) -// CHECK: %[[CONNECT_3:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_7]]}, %{{.+}} {%[[CHANNEL_6]]}) {connection_type = #amdaie} -// CHECK: %[[CHANNEL_8:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CONNECT_3:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_7]]}, %{{.+}} {%[[CHANNEL_2]]}) {connection_type = #amdaie} // CHECK: %[[CHANNEL_9:.*]] = amdaie.channel(%[[TILE_0_4]], 0, port_type = CTRL, direction = S2MM) -// CHECK: %[[CONNECT_4:.*]] = amdaie.connection(%{{.+}} 
{%[[CHANNEL_9]]}, %{{.+}} {%[[CHANNEL_8]]}) {connection_type = #amdaie} -// CHECK: %[[CHANNEL_10:.*]] = amdaie.channel(%[[TILE_0_0]], 1, port_type = DMA, direction = MM2S) +// CHECK: %[[CONNECT_4:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_9]]}, %{{.+}} {%[[CHANNEL_0]]}) {connection_type = #amdaie} // CHECK: %[[CHANNEL_11:.*]] = amdaie.channel(%[[TILE_0_5]], 0, port_type = CTRL, direction = S2MM) -// CHECK: %[[CONNECT_5:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_11]]}, %{{.+}} {%[[CHANNEL_10]]}) {connection_type = #amdaie} +// CHECK: %[[CONNECT_5:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_11]]}, %{{.+}} {%[[CHANNEL_2]]}) {connection_type = #amdaie} // CHECK: %[[CHANNEL_12:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = CTRL, direction = MM2S) // CHECK: %[[CHANNEL_13:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = SOUTH, direction = S2MM) // CHECK: %[[CONNECT_6:.*]] = amdaie.connection(%{{.+}} {%[[CHANNEL_13]]}, %{{.+}} {%[[CHANNEL_12]]}) {connection_type = #amdaie} +// CHECK: amdaie.controlcode { +// CHECK-COUNT-6:amdaie.npu.dma_placeholder +// CHECK: amdaie.end #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @column_control_overlay() {