diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 6b9c985b9..168af5cc8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -264,8 +264,8 @@ LogicalResult collectSplittingDims( fetchTotalUniqueLogicalObjFifoUsers( objFifo.getCopyLikeConsumers()); if (failed(maybeNumUniqueConsumers)) { - objFifo.emitOpError() - << "could not retrieve total unique L2<->L1 pairs"; + objFifo.emitOpError() << "could not retrieve the total number of " + "unique consumer objFifos"; } int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; @@ -334,8 +334,8 @@ LogicalResult collectSplittingDims( fetchTotalUniqueLogicalObjFifoUsers( objFifo.getCopyLikeProducers()); if (failed(maybeNumUniqueProducers)) { - objFifo.emitOpError() - << "could not retrieve total unique L2<->L1 pairs"; + objFifo.emitOpError() << "could not retrieve the total number of " + "unique producer objFifos"; } int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols); int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index daaf5e81c..16b8332df 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -499,11 +499,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // ----- // A concise test for LHS/RHS/OUT of a Matmul on 4x8 for pack-peel-4-level-tiling. 
Although -// we have 8 columns, L2 LHS buffers needs to be split into only 4, L2 RHS into 8 and L2 OUT into 8. -// But to keep the test case concise it demonstrates a similar splitting strategy for 1 row and 2 columns. -// Therefore L2 LHS will be split into 1 and L2 RHS will be split into 2. This needs to happen because -// later in the compilation stack when the tiles are being assigned, we will -// Refer: https://github.com/nod-ai/iree-amd-aie/pull/1031#discussion_r1920237380 +// we have 8 columns, L2 LHS buffers need to be split into only 4, L2 RHS into 8 and L2 OUT into 8, +// i.e. the splitting depends on the number of unique producers/consumers of the respective ObjectFifos +// being split. +// To keep the test case concise, it demonstrates a similar splitting strategy when the actual +// compute is taking place in 1 row and 2 columns. // // CHECK-LABEL: @pack_peel_4_level_4x8_Strix // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index @@ -517,8 +517,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : -// CHECK: amdaie.core -// CHECK: amdaie.core // CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : // CHECK: } // CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : @@ -526,7 +524,7 @@ module attributes {hal.executable.target = 
#executable_target_amdaie_pdi_fb} { #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}> #translation = #iree_codegen.translation_info module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { - func.func @pack_peel_4_level_4x8_Strix() attributes {translation_info = #translation} { + func.func @pack_peel_4_level_4x8_Strix(%lhs: memref<512x512xi32>, %rhs: memref<512x4096xi32>, %out: memref<512x4096xi32>) attributes {translation_info = #translation} { %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index @@ -536,21 +534,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32> %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32> %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32> - %alloc_5 = memref.alloc() : memref<512x512xi32> - %alloc_6 = memref.alloc() : memref<512x4096xi32> - %alloc_7 = memref.alloc() : memref<512x4096xi32> - %tile_0_1 = amdaie.tile(%c0, %c1) - %tile_1_1 = amdaie.tile(%c1, %c1) - %tile_2_1 = amdaie.tile(%c2, %c1) - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_1_0 = amdaie.tile(%c1, %c0) - %tile_2_0 = amdaie.tile(%c2, %c0) - %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_2_1} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %lof_0_0 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_0_0} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> - %lof_1_0 = amdaie.logicalobjectfifo.from_memref %alloc_6, {%tile_1_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> - 
%lof_2_0 = amdaie.logicalobjectfifo.from_memref %alloc_7, {%tile_2_0} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_0_0 = amdaie.logicalobjectfifo.from_memref %lhs, {} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> + %lof_1_0 = amdaie.logicalobjectfifo.from_memref %rhs, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + %lof_2_0 = amdaie.logicalobjectfifo.from_memref %out, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> scf.forall (%arg0, %arg1) in (2, 8) { %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) @@ -559,20 +548,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2) %tile_1_2 = amdaie.tile(%c1, %c2) %tile_0_2 = amdaie.tile(%c0, %c2) - %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_1_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_2, %tile_1_2} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> 
!amdaie.logicalobjectfifo> + %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.core(%tile_0_2, in : [%5, %3], out : []) { - amdaie.end - } - %7 = amdaie.core(%tile_1_2, in : [%5, %4], out : []) { - amdaie.end - } - %8 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) }