Assign correct tiles to reusable L1 buffer

nod-ai · Abhishek-Varma · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 11, 2024
commit 4f0caea3e468c57389ad841762d92e4d0c211253
@@ -834,18 +834,18 @@ LogicalResult combineLogicalObjectFifos(
   // will make an attempt to combine the logical objectFifos as per the
   // following algorithm :-
   //  a. Combine i-th and i+1-th L3->L2 DmaCpyNd ops.
-  //  b. Since step a  would create a new L2 buffer (with combined shape), we
-  //  will
-  //     need to update the corresponding two L2->L1 Dma ops by indeed creating
-  //     new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the
-  //     same L1 buffers as well.
-  //  c. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
-  //  Dma
-  //     ops and do the following :-
+  //  b. Form reusable L1 buffer by assigning the cumulative tiles of the
+  //     intended core ops.
+  //  c. Since step a creates a new L2 buffer (with combined shape), we
+  //     need to replace the corresponding two L2->L1 Dma ops with newly
+  //     created ones. NOTE: Both of these new L2->L1 Dma ops will be
+  //     reusing the same L1 buffers as well.
+  //  d. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
+  //     Dma ops and do the following :-
   //      1. For i-th CoreOp insert an AccessOp from the same L1 buffer towards
-  //      the end.
+  //         the end.
   //      2. For i+1-th CoreOp insert an AccessOp from the same L1 buffer right
-  //      before the corresponding AccessOp within the same CoreOp.
+  //         before the corresponding AccessOp within the same CoreOp.
   for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
     // Step 1. Combine the picked L3->L2 DmaCpyNd pair.
     FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
@@ -855,14 +855,56 @@ LogicalResult combineLogicalObjectFifos(
     LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
         maybeNewL2ObjectFifo.value();
 
-    // Step 2. We now have need to create two L2->L1 ops since the size has
+    // Step 2. Form the reusable L1 buffer by assigning the cumulative tiles of
+    // the intended core ops.
+    LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
+        l2ToL1DmaOps[i].getTargetObjectFifo();
+    SmallVector<Value> tiles;
+    auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
+      OpBuilder::InsertionGuard guard(rewriter);
+      TileOp tileOp = coreOp.getTileOp();
+      std::optional<int64_t> column = getConstantIntValue(tileOp.getCol());
+      std::optional<int64_t> row = getConstantIntValue(tileOp.getRow());
+      if (!column || !row) {
+        return coreOp.emitOpError() << "has non-constant tile location";
+      }
+      rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
+      auto colIndex = rewriter.create<arith::ConstantIndexOp>(
+          rewriter.getUnknownLoc(), *column);
+      auto rowIndex = rewriter.create<arith::ConstantIndexOp>(
+          rewriter.getUnknownLoc(), *row);
+      tileOp =
+          rewriter.create<TileOp>(rewriter.getUnknownLoc(), colIndex, rowIndex);
+      tiles.push_back(tileOp.getResult());
+      return success();
+    };
+    std::optional<CoreOp> maybeFirstCoreOp = fetchUniqueCoreOp(l2ToL1DmaOps[i]);
+    if (!maybeFirstCoreOp) return failure();
+    CoreOp firstCoreOp = maybeFirstCoreOp.value();
+    std::optional<CoreOp> maybeSecondCoreOp =
+        fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]);
+    if (!maybeSecondCoreOp) return failure();
+    CoreOp secondCoreOp = maybeSecondCoreOp.value();
+    if (failed(addNewTileFrom(firstCoreOp)) ||
+        failed(addNewTileFrom(secondCoreOp))) {
+      return failure();
+    }
+    llvm::sort(tiles.begin(), tiles.end(),
+               AMDAIE::TileOp::tileValueColumnAndRowComparator);
+    rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
+    reuseL1LogicalObjectFifoOp =
+        rewriter.replaceOpWithNewOp<LogicalObjectFifoFromMemrefOp>(
+            reuseL1LogicalObjectFifoOp,
+            cast<LogicalObjectFifoType>(
+                reuseL1LogicalObjectFifoOp.getOutput().getType()),
+            reuseL1LogicalObjectFifoOp.getMemref(), tiles);
+
+    // Step 3. We now need to create two L2->L1 ops since the size has
     // changed. But for this we first need to find the new offset for L2 as
     // source.
     // TODO: For now I'm hardcoding the offsets but later it'd just depend
     // on combining/non-combining dimensions.
     // Offset = 0,0
-    LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
-        l2ToL1DmaOps[i].getTargetObjectFifo();
     SmallVector<OpFoldResult> newL2AsSourceOffsets =
         l2ToL1DmaOps[i].getSourceMixedOffsets();
     DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse(
@@ -872,31 +914,24 @@ LogicalResult combineLogicalObjectFifos(
     // the first L2->L1 Dma.
     newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
     newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
-    DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse(
-        rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp,
-        newL2ObjectFifo, newL2AsSourceOffsets);
+    createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1],
+                         reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
+                         newL2AsSourceOffsets);
 
-    // Step 3. PICK the CoreOps associated with the 1:1 L2->L1.
+    // Step 4. Pick the CoreOps associated with the 1:1 L2->L1.
     // For the first Core op we'll insert Read at the end. It doesn't matter
     // for now so we're gonna insert it right before amdaie.end op.
-    std::optional<CoreOp> maybeFirstCoreOp =
-        fetchUniqueCoreOp(newFirstL2ToL1DmaOp);
-    if (!maybeFirstCoreOp) return failure();
-    CoreOp firstCoreOp = maybeFirstCoreOp.value();
-    firstCoreOp.walk([&](AMDAIE::EndOp endOp) {
-      OpBuilder::InsertionGuard guard(rewriter);
-      // Hardcoding to `AMDAIE::MemoryAccess::Read`.
-      rewriter.setInsertionPoint(endOp);
-      rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
-          rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
-          AMDAIE::MemoryAccess::Read);
+    firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
+      if (accessOp.getInput() == newFirstL2ToL1DmaOp.getTargetObjectFifo()) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPointAfter(accessOp);
+        rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
+            rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
+            accessOp.getAccessType());
+      }
     });
     // For the second Core op we'll insert `Read` right before the first read
     // from the corresponding L1 logicalobjectFifo.
-    std::optional<CoreOp> maybeSecondCoreOp =
-        fetchUniqueCoreOp(newSecondL2ToL1DmaOp);
-    if (!maybeSecondCoreOp) return failure();
-    CoreOp secondCoreOp = maybeSecondCoreOp.value();
     secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
       if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) {
         OpBuilder::InsertionGuard guard(rewriter);

@@ -32,16 +32,16 @@
 //       CHECK:       %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                         %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
 //  CHECK-SAME:                                         %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
-//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
+//       CHECK:       %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                          %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:         %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
+//       CHECK:         amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
 //       CHECK:         linalg.generic
 //  CHECK-SAME:             %[[FIRST_READ]]
-//       CHECK:         amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
 //       CHECK:         amdaie.end
 //       CHECK:       }
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
@@ -55,16 +55,16 @@
 //  CHECK-SAME:             %[[SECOND_READ]]
 //       CHECK:         amdaie.end
 //       CHECK:       }
-//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
+//       CHECK:       %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]}
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                          %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] 
 //  CHECK-SAME:                                          %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
 //       CHECK:       amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
 //       CHECK:         linalg.generic
 //       CHECK:         %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
+//       CHECK:         amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
 //       CHECK:         linalg.generic
 //  CHECK-SAME:             %[[FIRST_READ]]
-//       CHECK:         amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
 //       CHECK:         amdaie.end
 //       CHECK:       }
 //       CHECK:       %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(