diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index 309e610fb..a6a65ac19 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -7,11 +7,9 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" -#include "llvm/ADT/TypeSwitch.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/Transform/IR/TransformOps.h" -#include "mlir/IR/DialectImplementation.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/OpDefinition.h" #define GET_OP_CLASSES @@ -52,7 +50,6 @@ LogicalResult ControlCodeOp::verify() { // AMDAIE_CoreOp //===----------------------------------------------------------------------===// - void CoreOp::build(OpBuilder &b, OperationState &result, AMDAIE::TileOp tileOp, ValueRange inputDmas, ValueRange outputDmas) { build(b, result, b.getIndexType(), tileOp, inputDmas, outputDmas, nullptr); @@ -93,6 +90,70 @@ TileOp CoreOp::getTileOp() { // AMDAIE_DmaCpyNdBaseOp //===----------------------------------------------------------------------===// +namespace { +// Simplified from upstream MLIR's foldDynamicIndexList: +LogicalResult foldMixed(SmallVectorImpl &ofrs) { + bool valuesChanged = false; + for (OpFoldResult &ofr : ofrs) { + if (ofr.is()) continue; + Attribute attr; + if (matchPattern(ofr.get(), m_Constant(&attr))) { + ofr = attr; + valuesChanged = true; + } + } + return success(valuesChanged); +} + +template +// Based on upstream MLIR's +// OpWithOffsetSizesAndStridesConstantArgumentFolder +class DoublyStridedFolder final : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpType op, + PatternRewriter &rewriter) const override { + SmallVector tgtMixedOffsets(op.getTargetMixedOffsets()); + SmallVector tgtMixedSizes(op.getTargetMixedSizes()); + SmallVector tgtMixedStrides(op.getTargetMixedStrides()); + SmallVector srcMixedOffsets(op.getSourceMixedOffsets()); + SmallVector srcMixedSizes(op.getSourceMixedSizes()); + SmallVector srcMixedStrides(op.getSourceMixedStrides()); + + // No constant operands were folded, just return; + if (failed(foldMixed(tgtMixedOffsets)) && + failed(foldMixed(tgtMixedSizes)) && + failed(foldMixed(tgtMixedStrides)) && + failed(foldMixed(srcMixedOffsets)) && + failed(foldMixed(srcMixedSizes)) && failed(foldMixed(srcMixedStrides))) + return failure(); + + ReplacementBuilder::replace(op, rewriter, tgtMixedOffsets, tgtMixedSizes, + tgtMixedStrides, srcMixedOffsets, srcMixedSizes, + srcMixedStrides); + + return success(); + } +}; + +template +struct DmaCpyNdBaseOpReplacementBuilder { + static void replace(T dmaOp, PatternRewriter &rewriter, + ArrayRef tgtMixedOffsets, + ArrayRef tgtMixedSizes, + ArrayRef tgtMixedStrides, + ArrayRef srcMixedOffsets, + ArrayRef srcMixedSizes, + ArrayRef srcMixedStrides) { + rewriter.replaceOpWithNewOp(dmaOp, dmaOp.getTarget(), tgtMixedOffsets, + tgtMixedSizes, tgtMixedStrides, + dmaOp.getSource(), srcMixedOffsets, + srcMixedSizes, srcMixedStrides); + } +}; +} // namespace + // Build a DmaCpyNdOp with mixed static and dynamic entries. void DmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value target, ArrayRef targetOffsets, @@ -220,6 +281,12 @@ LogicalObjectFifoFromMemrefOp DmaCpyNdOp::getTargetObjectFifo() { return dyn_cast(getTarget().getDefiningOp()); }; +void DmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add>>(context); +} + // Build a CircularDmaCpyNdOp with mixed static and dynamic entries. void CircularDmaCpyNdOp::build( OpBuilder &b, OperationState &result, Value target, @@ -347,6 +414,13 @@ LogicalObjectFifoFromMemrefOp CircularDmaCpyNdOp::getTargetObjectFifo() { return dyn_cast(getTarget().getDefiningOp()); }; +void CircularDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add>>(context); +} + //===----------------------------------------------------------------------===// // AMDAIE_LogicalObjectFifoAccessOp //===----------------------------------------------------------------------===// @@ -386,8 +460,7 @@ void LogicalObjectFifoFromMemrefOp::build( for (auto [column, row] : tileLocations) { auto getCol = b.create(b.getUnknownLoc(), column); auto getRow = b.create(b.getUnknownLoc(), row); - auto tileOp = - b.create(b.getUnknownLoc(), getCol, getRow); + auto tileOp = b.create(b.getUnknownLoc(), getCol, getRow); tiles.push_back(tileOp.getResult()); } // For deterministic order. @@ -449,8 +522,8 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_NpuDmaCpyNdOp //===----------------------------------------------------------------------===// -// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and -// source BD IDs. +// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target +// and source BD IDs. void NpuDmaCpyNdOp::build(OpBuilder &b, OperationState &result, Value dma, ArrayRef targetOffsets, ArrayRef targetSizes, @@ -577,6 +650,30 @@ bool NpuDmaCpyNdOp::hasDmaWaitOpUser() { [](auto userOp) { return isa(userOp); }); } +namespace { +struct NpuDmaCpyNdOpReplacementBuilder { + static void replace(NpuDmaCpyNdOp dmaOp, PatternRewriter &rewriter, + ArrayRef tgtMixedOffsets, + ArrayRef tgtMixedSizes, + ArrayRef tgtMixedStrides, + ArrayRef srcMixedOffsets, + ArrayRef srcMixedSizes, + ArrayRef srcMixedStrides) { + rewriter.replaceOpWithNewOp( + dmaOp, dmaOp.getDma(), tgtMixedOffsets, tgtMixedSizes, tgtMixedStrides, + srcMixedOffsets, srcMixedSizes, srcMixedStrides, dmaOp.getTargetBdId(), + dmaOp.getSourceBdId()); + } +}; +} // namespace + +void NpuDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results + .add>( + context); +} + //===----------------------------------------------------------------------===// // AMDAIE_TileOp //===----------------------------------------------------------------------===// @@ -645,5 +742,4 @@ LogicalResult WorkgroupOp::verify() { } return success(); } - } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 6bb48171a..763d3a165 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -377,7 +377,11 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", ::llvm::SmallVector<::mlir::OpFoldResult>& newSourceOffsets, ::llvm::SmallVector<::mlir::OpFoldResult>& newSourceSizes, ::llvm::SmallVector<::mlir::OpFoldResult>& newSourceStrides); + }]; + + let hasCanonicalizer = 1; + } def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> { @@ -784,6 +788,7 @@ class AMDAIE_DmaCpyNdBaseOp traits = []> : ::llvm::SmallVector<::mlir::OpFoldResult>& newSourceSizes, ::llvm::SmallVector<::mlir::OpFoldResult>& newSourceStrides); }]; + } def AMDAIE_DmaCpyNdOp: AMDAIE_DmaCpyNdBaseOp<"dma_cpy_nd", []> { @@ -809,6 +814,7 @@ def AMDAIE_DmaCpyNdOp: AMDAIE_DmaCpyNdBaseOp<"dma_cpy_nd", []> { }]; let hasVerifier = 0; + let hasCanonicalizer = 1; } def AMDAIE_CircularDmaCpyNdOp: AMDAIE_DmaCpyNdBaseOp<"circular_dma_cpy_nd", [Pure]> { @@ -833,6 +839,7 @@ def AMDAIE_CircularDmaCpyNdOp: AMDAIE_DmaCpyNdBaseOp<"circular_dma_cpy_nd", [Pur }]; let hasVerifier = 0; + let hasCanonicalizer = 1; } def AMDAIE_ReferenceToOp: AMDAIE_Op<"reference_to", [SameOperandsAndResultType]> { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir index 951f9f6d8..99b103a57 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir @@ -6,11 +6,7 @@ // CHECK-LABEL: func.func @circular_dma_cpy_nd_source_target // CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> // CHECK-SAME: %[[ARG1:.+]]: !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index -// CHECK: amdaie.circular_dma_cpy_nd(%[[ARG0]][%[[C0]]] [%[[C128]]] [%[[C1]]], %[[ARG1]][%[[C0]]] [%[[C64]]] [%[[C1]]]) +// CHECK: amdaie.circular_dma_cpy_nd(%[[ARG0]][0] [128] [1], %[[ARG1]][0] [64] [1]) // FOLD-SINGLE-DIMS-LABEL: func.func @circular_dma_cpy_nd_source_target // FOLD-SINGLE-DIMS-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> @@ -25,11 +21,7 @@ func.func @circular_dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -40,20 +32,8 @@ func.func @circular_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo< // ----- // CHECK-LABEL: func.func @circular_dma_cpy_nd_linear -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index -// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[%[[C0]], %[[C0]]] [%[[C16]], %[[C8]]] [%[[C16]], %[[C1]]], %{{.+}}[%[[C0]], %[[C0]], %[[C0]]] [%[[C64]], %[[C16]], %[[C128]]] [%[[C128]], %[[C16]], %[[C1]]]) -// FOLD-SINGLE-DIMS-DAG: %[[C0:.+]] = arith.constant 0 : index -// FOLD-SINGLE-DIMS-DAG: %[[C1:.+]] = arith.constant 1 : index -// FOLD-SINGLE-DIMS-DAG: %[[C8:.+]] = arith.constant 8 : index -// FOLD-SINGLE-DIMS-DAG: %[[C16:.+]] = arith.constant 16 : index -// FOLD-SINGLE-DIMS-DAG: %[[C64:.+]] = arith.constant 64 : index -// FOLD-SINGLE-DIMS-DAG: %[[C128:.+]] = arith.constant 128 : index -// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[%[[C0]], %[[C0]]] [%[[C16]], %[[C8]]] [%[[C16]], %[[C1]]], %{{.+}}[%[[C0]], %[[C0]], %[[C0]]] [%[[C64]], %[[C16]], %[[C128]]] [%[[C128]], %[[C16]], %[[C1]]]) +// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[0, 0] [16, 8] [16, 1], %{{.+}}[0, 0, 0] [64, 16, 128] [128, 16, 1]) +// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[0, 0] [16, 8] [16, 1], %{{.+}}[0, 0, 0] [64, 16, 128] [128, 16, 1]) func.func @circular_dma_cpy_nd_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %c16 = arith.constant 16 : index %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 2, 8, 8] [256, 128, %c16, 1], %arg1[0, 0, 0, 0] [64, 16, 8, %c16] [128, %c16, %c16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -75,19 +55,8 @@ func.func @circular_dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) "iree.keep"(%0) : (index) -> () @@ -97,10 +66,7 @@ func.func @circular_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 2, 2, 4, 1, 8] [128, 64, 32, 8, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 2, 1, 4, 8, 1] [64, 32, 32, 8, 1, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -122,14 +88,8 @@ func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo< // ----- // CHECK-LABEL: func.func @circular_dma_cpy_nd_partial_non_zero_offset -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index -// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[%[[C1]]] [%[[C128]]] [%[[C1]]], %{{.+}}[%[[C1]]] [%[[C64]]] [%[[C1]]]) -// FOLD-SINGLE-DIMS-DAG: %[[C1:.+]] = arith.constant 1 : index -// FOLD-SINGLE-DIMS-DAG: %[[C64:.+]] = arith.constant 64 : index -// FOLD-SINGLE-DIMS-DAG: %[[C128:.+]] = arith.constant 128 : index -// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[%[[C1]]] [%[[C128]]] [%[[C1]]], %{{.+}}[%[[C1]]] [%[[C64]]] [%[[C1]]]) +// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[1] [128] [1], %{{.+}}[1] [64] [1]) +// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[1] [128] [1], %{{.+}}[1] [64] [1]) func.func @circular_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) "iree.keep"(%0) : (index) -> () @@ -143,11 +103,7 @@ func.func @circular_dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobj // CHECK-LABEL: func.func @dma_cpy_nd_source_target // CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> // CHECK-SAME: %[[ARG1:.+]]: !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index -// CHECK: amdaie.dma_cpy_nd(%[[ARG0]][%[[C0]]] [%[[C128]]] [%[[C1]]], %[[ARG1]][%[[C0]]] [%[[C64]]] [%[[C1]]]) +// CHECK: amdaie.dma_cpy_nd(%[[ARG0]][0] [128] [1], %[[ARG1]][0] [64] [1]) // FOLD-SINGLE-DIMS-LABEL: func.func @dma_cpy_nd_source_target // FOLD-SINGLE-DIMS-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> @@ -162,11 +118,7 @@ func.func @dma_cpy_nd_source_target(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -177,20 +129,8 @@ func.func @dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %c16 = arith.constant 16 : index %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 2, 8, 8] [256, 128, %c16, 1], %arg1[0, 0, 0, 0] [64, 16, 8, %c16] [128, %c16, %c16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -212,19 +152,8 @@ func.func @dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) "iree.keep"(%0) : (index) -> () @@ -234,10 +163,7 @@ func.func @dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [2, 2, 1, 1, 4, 8] [64, 32, 32, 32, 8, 1], %arg1[0, 0, 0, 0, 0, 0] [2, 1, 2, 1, 4, 8] [64, 64, 32, 32, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -259,14 +185,8 @@ func.func @dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) "iree.keep"(%0) : (index) -> () @@ -280,12 +200,8 @@ func.func @dma_cpy_nd_partial_non_zero_offset(%arg0: !amdaie.logicalobjectfifo> // CHECK-SAME: %[[ARG1:.+]]: !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd(%[[ARG0]][] [] [], %[[ARG1]][] [] []) -// CHECK: amdaie.npu.dma_cpy_nd %[[DMA0]]([%[[C0]]] [%[[C128]]] [%[[C1]]], [%[[C0]]] [%[[C64]]] [%[[C1]]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[DMA0]]([0] [128] [1], [0] [64] [1]) // FOLD-SINGLE-DIMS-LABEL: func.func @npu_dma_cpy_nd_source // FOLD-SINGLE-DIMS-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> @@ -301,11 +217,7 @@ func.func @npu_dma_cpy_nd_source(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -316,20 +228,8 @@ func.func @npu_dma_cpy_nd_linear_implicit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %c16 = arith.constant 16 : index %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -351,19 +251,8 @@ func.func @npu_dma_cpy_nd_no_linear(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 64, 32, 8, 1], [0, 0, 0, 0, 0, 0] [1, 1, 2, 2, 4, 8] [128, 128, 8, 64, 16, 1]) @@ -373,11 +262,7 @@ func.func @npu_dma_cpy_nd_unit(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -399,14 +284,8 @@ func.func @npu_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.npu.dma_cpy_nd %0([0, 0, 0, 1] [1, 1, 8, 16] [128, 128, 16, 1], [0, 0, 0, 1] [1, 4, 2, 8] [64, 16, 8, 1]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir index 702ba9a45..3b70fa5c1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/combine_strided_ops.mlir @@ -211,14 +211,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_source_same_dims -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C16]], %[[C8]], %[[C16]]] [%[[C32]], %[[C32]], %[[C8]], %[[C1]]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [2, 16, 8, 16] [32, 32, 8, 1]) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -238,14 +232,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_source_values -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C16]], %[[C8]], %[[C16]]] [%[[C32]], %[[C32]], %[[C8]], %[[C1]]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [2, 16, 8, 16] [32, 32, 8, 1]) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -270,15 +258,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_source_diff_dims -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C3]], %[[C16]], %[[C8]], %[[C16]]] [%[[C64]], %[[C32]], %[[C8]], %[[C1]]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [3, 16, 8, 16] [64, 32, 8, 1]) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -298,16 +279,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_source_induction_var -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: scf.for %[[ARG2:.+]] = %[[C1]] to %[[C6]] step %[[C2]] -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[ARG2]], %[[C0]]] [%[[C2]], %[[C16]], %[[C8]], %[[C16]]] [%[[C32]], %[[C32]], %[[C8]], %[[C1]]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, %[[ARG2]], 0] [2, 16, 8, 16] [32, 32, 8, 1]) // CHECK-NOT: amdaie.npu.dma_cpy_nd // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], MM2S) // CHECK-NOT: amdaie.npu.dma_wait @@ -336,14 +313,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_target_same_dims -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C32]]] [%[[C2]], %[[C16]], %[[C8]], %[[C16]]] [%[[C32]], %[[C32]], %[[C8]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 32] [2, 16, 8, 16] [32, 32, 8, 1], [] [] []) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -363,15 +334,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_target_diff_dims -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C32]]] [%[[C3]], %[[C16]], %[[C8]], %[[C16]]] [%[[C64]], %[[C32]], %[[C8]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 32] [3, 16, 8, 16] [64, 32, 8, 1], [] [] []) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -391,14 +355,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @combine_target_values -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C32]]] [%[[C2]], %[[C16]], %[[C8]], %[[C16]]] [%[[C32]], %[[C32]], %[[C8]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 32] [2, 16, 8, 16] [32, 32, 8, 1], [] [] []) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -426,14 +384,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @combine_target_induction_var // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[ARG2]], %[[C0]], %[[C32]]] [%[[C2]], %[[C16]], %[[C8]], %[[C16]]] [%[[C32]], %[[C32]], %[[C8]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, %[[ARG2]], 0, 32] [2, 16, 8, 16] [32, 32, 8, 1], [] [] []) // CHECK-NOT: amdaie.npu.dma_cpy_nd // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) // CHECK-NOT: amdaie.npu.dma_cpy_nd @@ -516,13 +470,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @three_dma_ops_same_dims -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]]] [%[[C3]], %[[C16]]] [%[[C32]], %[[C1]]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [3, 16] [32, 1]) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -543,13 +492,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @three_dma_ops_diff_dims -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]]] [%[[C4]], %[[C16]]] [%[[C32]], %[[C1]]]) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [4, 16] [32, 1]) // CHECK-NOT: amdaie.npu.dma_cpy_nd #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir index ef26f1c7b..94403ad1d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir @@ -7,16 +7,13 @@ // Ensure no modification in case of an operand within the same scope. // CHECK-LABEL: @operand_in_same_scope -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([%[[C0]], %[[C1]]] [%[[C3]], %[[C16]]] [%[[C2]], %[[C1]]], [] [] []) +// CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([0, 1] [3, 16] [2, 1], [] [] []) // CHECK: scf.for %[[ARG2:.+]] = %[[C1]] to %[[C6]] step %[[C2]] // CHECK: %[[BD_ID:.+]] = amdaie.bd_id // CHECK: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([%[[ARG2]]] [16] [1] bd_id = %[[BD_ID]], [] [] []) @@ -436,19 +433,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @exceed_max_size_source // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C63:.+]] = arith.constant 63 : index // CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C1023:.+]] = arith.constant 1023 : index // CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C63]], %[[C1]], %[[C8]], %[[C16]]] [%[[C0]], %[[C64]], %[[C16]], %[[C1]]]) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [63, 1, 8, 16] [0, 64, 16, 1]) // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C64]] step %[[C1]] // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1, 8, 16] [128, 16, 1]) // CHECK: } -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]]] [%[[C1023]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C1]]]) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [1023, 8, 16] [0, 16, 1]) // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C1024]] step %[[C1]] // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0] [8, 16] [16, 1]) // CHECK: } @@ -483,6 +476,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} } } + // ----- // Don't subsume if inter size (dim 0 in a four dimensional size array) or intra size @@ -490,19 +484,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @exceed_max_size_target // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C63:.+]] = arith.constant 63 : index // CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C1023:.+]] = arith.constant 1023 : index // CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C63]], %[[C1]], %[[C8]], %[[C16]]] [%[[C0]], %[[C64]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [63, 1, 8, 16] [0, 64, 16, 1], [] [] []) // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C64]] step %[[C1]] // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1, 8, 16] [128, 16, 1], [] [] []) // CHECK: } -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]]] [%[[C1023]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [1023, 8, 16] [0, 16, 1], [] [] []) // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C1024]] step %[[C1]] // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0] [8, 16] [16, 1], [] [] []) // CHECK: } @@ -545,19 +535,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @exceed_max_stride_source // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C1048576:.+]] = arith.constant 1048576 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C32]], %[[C1]], %[[C8]], %[[C16]]] [%[[C1048576]], %[[C64]], %[[C16]], %[[C1]]]) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [32, 1, 8, 16] [1048576, 64, 16, 1]) // CHECK: scf.for %[[ARG5:.+]] = %[[C0]] to %[[C32]] step %[[C1]] // CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG5]]) // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, %[[APPLY1]]] [1, 8, 16] [64, 16, 1]) // CHECK: } -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]]] [%[[C32]], %[[C8]], %[[C16]]] [%[[C1048576]], %[[C16]], %[[C1]]]) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0] [32, 8, 16] [1048576, 16, 1]) // CHECK: scf.for %[[ARG7:.+]] = %[[C0]] to %[[C32]] step %[[C1]] // CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG7]]) // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, %[[APPLY1]]] [8, 16] [16, 1]) @@ -604,19 +590,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-LABEL: @exceed_max_stride_target // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C1048576:.+]] = arith.constant 1048576 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C32]], %[[C1]], %[[C8]], %[[C16]]] [%[[C1048576]], %[[C64]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [32, 1, 8, 16] [1048576, 64, 16, 1], [] [] []) // CHECK: scf.for %[[ARG5:.+]] = %[[C0]] to %[[C32]] step %[[C1]] // CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG5]]) // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, %[[APPLY1]]] [1, 8, 16] [64, 16, 1], [] [] []) // CHECK: } -// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]]] [%[[C32]], %[[C8]], %[[C16]]] [%[[C1048576]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [32, 8, 16] [1048576, 16, 1], [] [] []) // CHECK: scf.for %[[ARG7:.+]] = %[[C0]] to %[[C32]] step %[[C1]] // CHECK: %[[APPLY1:.+]] = affine.apply #[[$MAP]](%[[ARG7]]) // CHECK: %{{.+}} = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, %[[APPLY1]]] [8, 16] [16, 1], [] [] []) @@ -663,16 +645,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Subsume loop iteration into strided op without dependency. // CHECK-LABEL: @for_without_loop_dependency -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C0]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [6, 1, 8, 16] [0, 128, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -700,15 +676,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Subsume loop iteration into strided op without dependency. // CHECK-LABEL: @forall_without_loop_dependency -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C2]], %[[C8]], %[[C16]]] [%[[C0]], %[[C0]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [2, 2, 8, 16] [0, 0, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -733,17 +704,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Subsume loop iteration into strided op without dependency. // CHECK-LABEL: @nested_without_loop_dependency -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C3]], %[[C6]], %[[C16]]] [%[[C0]], %[[C0]], %[[C0]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [2, 3, 6, 16] [0, 0, 0, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -775,14 +740,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // an induction variable. // CHECK-LABEL: @dynamic_non_induction_var_offset // CHECK-SAME: %{{.+}}: !amdaie.logicalobjectfifo>, %{{.+}}: !amdaie.logicalobjectfifo>, %[[ARG:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[ARG]]] [%[[C6]], %[[C16]]] [%[[C0]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, %[[ARG]]] [6, 16] [0, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -812,18 +773,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Check that loop subsumption happens in case of an identity affine expression. // CHECK-LABEL: @valid_affine_expr -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK-DAG: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) -// CHECK-DAG: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C16]], %[[C1]]], [] [] []) -// CHECK-DAG: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([%[[C0]], %[[C16]]] [%[[C6]], %[[C16]]] [%[[C16]], %[[C1]]], [] [] []) +// CHECK-DAG: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_0]]([0, 0] [6, 16] [1, 1], [] [] []) +// CHECK-DAG: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_1]]([0, 0] [6, 16] [16, 1], [] [] []) +// CHECK-DAG: amdaie.npu.dma_cpy_nd %[[CIRC_DMA_2]]([0, 16] [6, 16] [16, 1], [] [] []) #map0 = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> (d0 * 16)> #map2 = affine_map<(d0) -> (d0 * 16 + 16)> @@ -856,16 +813,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @for_dependency_on_target -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [6, 1, 8, 16] [16, 128, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -892,16 +843,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @forall_dependency_on_target -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [2, 6, 8, 16] [0, 16, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (16 * d0)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -925,16 +870,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @for_dependency_on_source -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C1]], %[[C8]], %[[C16]]] [%[[C16]], %[[C128]], %[[C16]], %[[C1]]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [6, 1, 8, 16] [16, 128, 16, 1]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -961,16 +900,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @forall_dependency_on_source -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C0]], %[[C16]], %[[C16]], %[[C1]]]) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([] [] [], [0, 0, 0, 0] [2, 6, 8, 16] [0, 16, 16, 1]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -995,16 +928,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Check with multiple `affine.apply` usages in a `amdaie.npu.dma_cpy_nd` operation. // CHECK-LABEL: @multiple_for_dependencies -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C256:.+]] = arith.constant 256 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C6]], %[[C6]], %[[C8]], %[[C16]]] [%[[C256]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [6, 6, 8, 16] [256, 16, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -1031,17 +958,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @multiple_forall_dependencies -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C8]], %[[C16]]] [%[[C16]], %[[C512]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [2, 6, 8, 16] [16, 512, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0) -> (d0 * 32)> @@ -1070,15 +990,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @for_with_affine_non_normalized -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C16]]] [%[[C3]], %[[C16]]] [%[[C32]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 16] [3, 16] [32, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> @@ -1105,19 +1020,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @forall_with_affine_non_normalized -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK-DAG: %[[C48:.+]] = arith.constant 48 : index -// CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C32]], %[[C32]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C48]], %[[C1024]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 32, 32] [5, 4, 8, 16] [48, 1024, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0) -> (d0 * 32)> @@ -1150,18 +1056,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @nested_dependencies -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C6]], %[[C3]], %[[C8]]] [%[[C0]], %[[C32]], %[[C0]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [2, 6, 3, 8] [0, 32, 0, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0) -> (d0 * 32)> @@ -1197,14 +1096,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @for_with_induction_var_normalized -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0] [6, 16] [1, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1229,15 +1124,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @for_with_induction_var_non_normalized -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.for -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C1]]] [%[[C3]], %[[C16]]] [%[[C2]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 1] [3, 16] [2, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1262,15 +1152,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @forall_with_induction_var_normalized -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C17:.+]] = arith.constant 17 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [%[[C17]], %[[C8]], %[[C8]], %[[C16]]] [%[[C1]], %[[C16]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0, 0] [17, 8, 8, 16] [1, 16, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1292,19 +1177,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- // CHECK-LABEL: @forall_with_induction_var_non_normalized -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode // CHECK-NOT: scf.forall -// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C2]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C3]], %[[C32]], %[[C16]], %[[C1]]], [] [] []) +// CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 1, 2] [5, 4, 8, 16] [3, 32, 16, 1], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1330,14 +1206,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} //===----------------------------------------------------------------------===// // OUTER-ZERO-STRIDE-LABEL: @for_outer_zero_stride_sanity_check -// OUTER-ZERO-STRIDE-DAG: %[[C0:.+]] = arith.constant 0 : index -// OUTER-ZERO-STRIDE-DAG: %[[C1:.+]] = arith.constant 1 : index -// OUTER-ZERO-STRIDE-DAG: %[[C6:.+]] = arith.constant 6 : index -// OUTER-ZERO-STRIDE-DAG: %[[C16:.+]] = arith.constant 16 : index // OUTER-ZERO-STRIDE: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // OUTER-ZERO-STRIDE: amdaie.controlcode // OUTER-ZERO-STRIDE-NOT: scf.for -// OUTER-ZERO-STRIDE: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]]] [%[[C6]], %[[C16]]] [%[[C1]], %[[C1]]], [] [] []) +// OUTER-ZERO-STRIDE: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0] [6, 16] [1, 1], [] [] []) // OUTER-ZERO-STRIDE: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1363,19 +1235,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Sanity check to ensure that loop subsumption still happens. // OUTER-ZERO-STRIDE-LABEL: @forall_outer_zero_stride_sanity_check -// OUTER-ZERO-STRIDE-DAG: %[[C0:.+]] = arith.constant 0 : index -// OUTER-ZERO-STRIDE-DAG: %[[C1:.+]] = arith.constant 1 : index -// OUTER-ZERO-STRIDE-DAG: %[[C2:.+]] = arith.constant 2 : index -// OUTER-ZERO-STRIDE-DAG: %[[C3:.+]] = arith.constant 3 : index -// OUTER-ZERO-STRIDE-DAG: %[[C4:.+]] = arith.constant 4 : index -// OUTER-ZERO-STRIDE-DAG: %[[C5:.+]] = arith.constant 5 : index -// OUTER-ZERO-STRIDE-DAG: %[[C8:.+]] = arith.constant 8 : index -// OUTER-ZERO-STRIDE-DAG: %[[C16:.+]] = arith.constant 16 : index -// OUTER-ZERO-STRIDE-DAG: %[[C32:.+]] = arith.constant 32 : index // OUTER-ZERO-STRIDE: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // OUTER-ZERO-STRIDE: amdaie.controlcode // OUTER-ZERO-STRIDE-NOT: scf.forall -// OUTER-ZERO-STRIDE: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C1]], %[[C2]]] [%[[C5]], %[[C4]], %[[C8]], %[[C16]]] [%[[C3]], %[[C32]], %[[C16]], %[[C1]]], [] [] []) +// OUTER-ZERO-STRIDE: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 1, 2] [5, 4, 8, 16] [3, 32, 16, 1], [] [] []) // OUTER-ZERO-STRIDE: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1399,9 +1262,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Ensure no modification in case of an out zero stride. // OUTER-ZERO-STRIDE: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 16)> // OUTER-ZERO-STRIDE-LABEL: @for_outer_zero_stride -// OUTER-ZERO-STRIDE-DAG: %[[C0:.+]] = arith.constant 0 : index -// OUTER-ZERO-STRIDE-DAG: %[[C1:.+]] = arith.constant 1 : index -// OUTER-ZERO-STRIDE-DAG: %[[C6:.+]] = arith.constant 6 : index +// OUTER-ZERO-STRIDE: %[[C0:.+]] = arith.constant 0 : index +// OUTER-ZERO-STRIDE: %[[C1:.+]] = arith.constant 1 : index +// OUTER-ZERO-STRIDE: %[[C6:.+]] = arith.constant 6 : index // OUTER-ZERO-STRIDE: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // OUTER-ZERO-STRIDE: amdaie.controlcode // OUTER-ZERO-STRIDE: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C6]] step %[[C1]] @@ -1494,15 +1357,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Ensure subsumption in case of a unit iteration. // OUTER-ZERO-STRIDE-LABEL: @forall_outer_zero_stride_with_unit_iteration -// OUTER-ZERO-STRIDE-DAG: %[[C0:.+]] = arith.constant 0 : index -// OUTER-ZERO-STRIDE-DAG: %[[C1:.+]] = arith.constant 1 : index -// OUTER-ZERO-STRIDE-DAG: %[[C2:.+]] = arith.constant 2 : index -// OUTER-ZERO-STRIDE-DAG: %[[C8:.+]] = arith.constant 8 : index -// OUTER-ZERO-STRIDE-DAG: %[[C16:.+]] = arith.constant 16 : index // OUTER-ZERO-STRIDE: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // OUTER-ZERO-STRIDE: amdaie.controlcode // OUTER-ZERO-STRIDE-NOT: scf.forall -// OUTER-ZERO-STRIDE: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([%[[C0]], %[[C0]], %[[C0]]] [%[[C2]], %[[C8]], %[[C16]]] [%[[C16]], %[[C16]], %[[C1]]], [] [] []) +// OUTER-ZERO-STRIDE: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd %[[CIRC_DMA]]([0, 0, 0] [2, 8, 16] [16, 16, 1], [] [] []) // OUTER-ZERO-STRIDE: amdaie.npu.dma_wait(%[[NPU_DMA]], S2MM) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>