Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ObjectFifo] Add a pass to combine logical objFifos for connection reuse #760

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Assign correct tiles to reusable L1 buffer
Abhishek-Varma committed Sep 11, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 4f0caea3e468c57389ad841762d92e4d0c211253
Original file line number Diff line number Diff line change
@@ -834,18 +834,18 @@ LogicalResult combineLogicalObjectFifos(
// will make an attempt to combine the logical objectFifos as per the
// following algorithm :-
// a. Combine i-th and i+1-th L3->L2 DmaCpyNd ops.
// b. Since step a would create a new L2 buffer (with combined shape), we
// will
// need to update the corresponding two L2->L1 Dma ops by indeed creating
// new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the
// same L1 buffers as well.
// c. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
// Dma
// ops and do the following :-
// b. Form reusable L1 buffer by assigning the cumulative tiles of the
// intended core ops.
// c. Since step a would create a new L2 buffer (with combined shape), we
// will need to update the corresponding two L2->L1 Dma ops by creating
// new ones. NOTE: Both of these new L2->L1 Dma ops will be reusing the
// same L1 buffers as well.
// d. Now pick the unique core ops corresponding to i-th and i+1-th L2->L1
// Dma ops and do the following :-
// 1. For i-th CoreOp insert an AccessOp from the same L1 buffer towards
// the end.
// the end.
// 2. For i+1-th CoreOp insert an AccessOp from the same L1 buffer right
// before the corresponding AccessOp within the same CoreOp.
// before the corresponding AccessOp within the same CoreOp.
for (unsigned i = 0, n = l3ToL2DmaOps.size(); i < n; i += 2) {
// Step 1. Combine the picked L3->L2 DmaCpyNd pair.
FailureOr<LogicalObjectFifoFromMemrefOp> maybeNewL2ObjectFifo =
@@ -855,14 +855,56 @@ LogicalResult combineLogicalObjectFifos(
LogicalObjectFifoFromMemrefOp newL2ObjectFifo =
maybeNewL2ObjectFifo.value();

// Step 2. We now have need to create two L2->L1 ops since the size has
// Step 2. Form the reusable L1 buffer by assigning the cumulative tiles of
// the intended core ops.
LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
l2ToL1DmaOps[i].getTargetObjectFifo();
SmallVector<Value> tiles;
// Helper: creates a fresh TileOp mirroring the tile of `coreOp` and appends
// its result to `tiles`.
// Emits an op error on `coreOp` and returns it as a failed LogicalResult when
// the tile's column or row is not a constant; returns success() otherwise.
auto addNewTileFrom = [&](CoreOp coreOp) -> LogicalResult {
// Scope the insertion-point change below to this lambda invocation.
OpBuilder::InsertionGuard guard(rewriter);
TileOp tileOp = coreOp.getTileOp();
std::optional<int64_t> column = getConstantIntValue(tileOp.getCol());
std::optional<int64_t> row = getConstantIntValue(tileOp.getRow());
if (!column || !row) {
return coreOp.emitOpError() << "has non-constant tile location";
}
// Build the new constants and tile at the position of the reusable L1
// logical objectFifo op, so they are defined before it.
rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
auto colIndex = rewriter.create<arith::ConstantIndexOp>(
rewriter.getUnknownLoc(), *column);
auto rowIndex = rewriter.create<arith::ConstantIndexOp>(
rewriter.getUnknownLoc(), *row);
// Rebind the local `tileOp` to the newly created tile; the core's original
// tile op is left untouched.
tileOp =
rewriter.create<TileOp>(rewriter.getUnknownLoc(), colIndex, rowIndex);
tiles.push_back(tileOp.getResult());
return success();
};
std::optional<CoreOp> maybeFirstCoreOp = fetchUniqueCoreOp(l2ToL1DmaOps[i]);
if (!maybeFirstCoreOp) return failure();
CoreOp firstCoreOp = maybeFirstCoreOp.value();
std::optional<CoreOp> maybeSecondCoreOp =
fetchUniqueCoreOp(l2ToL1DmaOps[i + 1]);
if (!maybeSecondCoreOp) return failure();
CoreOp secondCoreOp = maybeSecondCoreOp.value();
if (failed(addNewTileFrom(firstCoreOp)) ||
failed(addNewTileFrom(secondCoreOp))) {
return failure();
}
llvm::sort(tiles.begin(), tiles.end(),
AMDAIE::TileOp::tileValueColumnAndRowComparator);
rewriter.setInsertionPoint(reuseL1LogicalObjectFifoOp);
reuseL1LogicalObjectFifoOp =
rewriter.replaceOpWithNewOp<LogicalObjectFifoFromMemrefOp>(
reuseL1LogicalObjectFifoOp,
cast<LogicalObjectFifoType>(
reuseL1LogicalObjectFifoOp.getOutput().getType()),
reuseL1LogicalObjectFifoOp.getMemref(), tiles);

// Step 3. We now need to create two L2->L1 ops since the size has
// changed. But for this we first need to find the new offset for L2 as
// source.
// TODO: For now I'm hardcoding the offsets but later it'd just depend
// on combining/non-combining dimensions.
// Offset = 0,0
LogicalObjectFifoFromMemrefOp reuseL1LogicalObjectFifoOp =
l2ToL1DmaOps[i].getTargetObjectFifo();
SmallVector<OpFoldResult> newL2AsSourceOffsets =
l2ToL1DmaOps[i].getSourceMixedOffsets();
DmaCpyNdOp newFirstL2ToL1DmaOp = createL2ToL1ForReuse(
@@ -872,31 +914,24 @@ LogicalResult combineLogicalObjectFifos(
// the first L2->L1 Dma.
newL2AsSourceOffsets = l2ToL1DmaOps[i + 1].getSourceMixedOffsets();
newL2AsSourceOffsets[1] = rewriter.getIndexAttr(1);
DmaCpyNdOp newSecondL2ToL1DmaOp = createL2ToL1ForReuse(
rewriter, l2ToL1DmaOps[i + 1], reuseL1LogicalObjectFifoOp,
newL2ObjectFifo, newL2AsSourceOffsets);
createL2ToL1ForReuse(rewriter, l2ToL1DmaOps[i + 1],
reuseL1LogicalObjectFifoOp, newL2ObjectFifo,
newL2AsSourceOffsets);

// Step 3. PICK the CoreOps associated with the 1:1 L2->L1.
// Step 4. Pick the CoreOps associated with the 1:1 L2->L1.
// For the first Core op we'll insert Read at the end. It doesn't matter
// for now so we're gonna insert it right before amdaie.end op.
std::optional<CoreOp> maybeFirstCoreOp =
fetchUniqueCoreOp(newFirstL2ToL1DmaOp);
if (!maybeFirstCoreOp) return failure();
CoreOp firstCoreOp = maybeFirstCoreOp.value();
firstCoreOp.walk([&](AMDAIE::EndOp endOp) {
OpBuilder::InsertionGuard guard(rewriter);
// Hardcoding to `AMDAIE::MemoryAccess::Read`.
rewriter.setInsertionPoint(endOp);
rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
AMDAIE::MemoryAccess::Read);
firstCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
if (accessOp.getInput() == newFirstL2ToL1DmaOp.getTargetObjectFifo()) {
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfter(accessOp);
rewriter.create<AMDAIE::LogicalObjectFifoAccessOp>(
rewriter.getUnknownLoc(), reuseL1LogicalObjectFifoOp.getOutput(),
accessOp.getAccessType());
}
});
// For the second Core op we'll insert `Read` right before the first read
// from the corresponding L1 logicalobjectFifo.
std::optional<CoreOp> maybeSecondCoreOp =
fetchUniqueCoreOp(newSecondL2ToL1DmaOp);
if (!maybeSecondCoreOp) return failure();
CoreOp secondCoreOp = maybeSecondCoreOp.value();
secondCoreOp.walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) {
if (accessOp.getInput() == l2ToL1DmaOps[i + 1].getTargetObjectFifo()) {
OpBuilder::InsertionGuard guard(rewriter);
Original file line number Diff line number Diff line change
@@ -32,16 +32,16 @@
// CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd(
// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 2, 32, 32] [2048, 1024, 32, 1]
// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32]], %[[IV1_0]]] [1, 2, 32, 32] [4096, 32, 128, 1]
// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]}
// CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]], %[[TILE_0]]}
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(
// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
// CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out :
// CHECK: linalg.generic
// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
// CHECK: linalg.generic
// CHECK-SAME: %[[FIRST_READ]]
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_0]], Read)
// CHECK: amdaie.end
// CHECK: }
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(
@@ -55,16 +55,16 @@
// CHECK-SAME: %[[SECOND_READ]]
// CHECK: amdaie.end
// CHECK: }
// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]}
// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]], %[[TILE_2]]}
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd(
// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1]
// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]
// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out :
// CHECK: linalg.generic
// CHECK: %[[FIRST_READ:.*]] = amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
// CHECK: linalg.generic
// CHECK-SAME: %[[FIRST_READ]]
// CHECK: amdaie.logicalobjectfifo.access(%[[L1_OBJECTFIFO_1]], Read)
// CHECK: amdaie.end
// CHECK: }
// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd(