Skip to content

Commit

Permalink
Merge branch 'main' into zhewen_ctrl_overlay
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-Zhewen committed Jan 8, 2025
2 parents 35ef38f + f76c245 commit 379700f
Show file tree
Hide file tree
Showing 18 changed files with 324 additions and 106 deletions.
62 changes: 48 additions & 14 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -1835,6 +1835,21 @@ def __init__(self):
"transpose_a": True,
"transpose_b": False,
},
# Test where the compute is omitted. This should help triangulate
# how much performance gain can be obtained from a better on-core
# matmul versus from better data movement.
{
"M": 4096,
"N": 512,
"K": 512,
"use_ukernel": False,
"peano_opt_level": 3,
"outline": True,
"outline_to_empty_function": True,
"transpose_a": False,
"transpose_b": False,
"skip_numerics": True,
},
]

# Some bf16 Performance tests:
Expand All @@ -1851,15 +1866,29 @@ def __init__(self):
outlining_string = "--iree-amdaie-enable-function-outlining=" + str(
int(outline)
)

peano_opt_level_string = f'"-O{peano_opt_level}"'
aie_compilation_flags = [
outlining_string,
f"--iree-amd-aie-additional-peano-opt-flags={peano_opt_level_string}",
]

outline_to_empty_function = False
empty_key = "outline_to_empty_function"
if empty_key in test and test[empty_key] == True:
outline_to_empty_function = True

if outline_to_empty_function:
aie_compilation_flags.append(
"--iree-amdaie-replace-outlined-functions-with-empty"
)

name_suffix = "O" + str(peano_opt_level)
if outline:
name_suffix += "_outline"
if outline_to_empty_function:
name_suffix += "_outline_empty"
else:
name_suffix += "_outline"

if (transpose_a, transpose_b) == (False, False):
NumericTestClass = Matmul
Expand All @@ -1873,20 +1902,25 @@ def __init__(self):
else:
raise ValueError("Transposing both LHS and RHS is not supported.")

self.register(
NumericTestClass(
M,
N,
K,
"bf16",
"f32",
use_ukernel=use_ukernel,
n_repeats=2,
aie_compilation_flags=aie_compilation_flags,
name_suffix=name_suffix,
additional_labels=["PerformanceCorrectness"],
# Skip registering the numeric test when "skip_numerics" is set. This
# should only be the case for benchmark tests which we expect to not
# pass numerically.
if "skip_numerics" in test and test["skip_numerics"]:
pass
else:
self.register(
NumericTestClass(
M,
N,
K,
"bf16",
"f32",
use_ukernel=use_ukernel,
n_repeats=2,
aie_compilation_flags=aie_compilation_flags,
name_suffix=name_suffix,
additional_labels=["PerformanceCorrectness"],
)
)
)

self.register(
BenchmarkTestClass(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,15 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {

/// Populates `passManager` with the AMD-AIE transform pass pipeline by
/// forwarding the settings stored in `options` to
/// `buildAMDAIETransformPassPipeline`. The executable target attribute
/// parameter is unnamed and is not used in this body.
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr,
OpPassManager &passManager) override {

// All arguments are passed positionally, and several adjacent ones are plain
// flags taken from `options`, so the order here must match the callee's
// parameter list exactly.
buildAMDAIETransformPassPipeline(
passManager, options.AMDAIETargetDevice, options.AMDAIENumRows,
options.AMDAIENumCols, options.useTilePipeline,
options.useLowerToAIEPipeline, options.matmulElementwiseFusion,
options.enableVectorizationPasses, options.pathToUkernels,
options.enablePacketFlow, options.enableCoalescingLoops,
options.enableCollapsingUnitDims, options.enableFunctionOutlining,
options.replaceOutlinedFunctionsWithEmpty,
options.insertLoopAroundCoreBlock);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct AMDAIEOptions {
bool enableCoalescingLoops{false};
bool enableCollapsingUnitDims{false};
bool enableFunctionOutlining{true};
bool replaceOutlinedFunctionsWithEmpty{false};
bool insertLoopAroundCoreBlock{false};
bool matmulElementwiseFusion{false};
AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col};
Expand Down Expand Up @@ -195,6 +196,13 @@ struct AMDAIEOptions {
llvm::cl::desc("Flag to enable/disable linalg-function-outlining pass."
"It is intended for development purposes only."));

binder.opt<bool>(
"iree-amdaie-replace-outlined-functions-with-empty",
replaceOutlinedFunctionsWithEmpty, llvm::cl::cat(category),
llvm::cl::desc(
"Flag to enable/disable replacing outlined functions with "
"empty functions. For development purposes only."));

binder.opt<bool>(
"iree-amdaie-enable-infinite-loop-around-core-block",
insertLoopAroundCoreBlock, llvm::cl::cat(category),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,14 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) {
assert(tileToGeneratorMap.contains(tile) &&
"no channel generator found for tile");
std::optional<uint8_t> maybeChannel =
tileToGeneratorMap[tile].getAndAssignProducerDMAChannel(isPacketFlow);
tileToGeneratorMap[tile].getProducerDMAChannel();
if (!maybeChannel) {
return connectionOp.emitOpError()
<< "no producer DMA channel available";
}
// Only assign the channel if it is for circuit flow.
if (!isPacketFlow)
tileToGeneratorMap[tile].assignProducerDMAChannel(maybeChannel.value());
auto channelOp = rewriter.create<AMDAIE::ChannelOp>(
rewriter.getUnknownLoc(), tile, maybeChannel.value(),
StrmSwPortType::DMA, AMDAIE::DMAChannelDir::MM2S);
Expand All @@ -85,11 +88,14 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) {
assert(tileToGeneratorMap.contains(tile) &&
"no channel generator found for tile");
std::optional<uint8_t> maybeChannel =
tileToGeneratorMap[tile].getAndAssignConsumerDMAChannel(isPacketFlow);
tileToGeneratorMap[tile].getConsumerDMAChannel();
if (!maybeChannel) {
return connectionOp.emitOpError()
<< "no consumer DMA channel available";
}
// Only assign the channel if it is for circuit flow.
if (!isPacketFlow)
tileToGeneratorMap[tile].assignConsumerDMAChannel(maybeChannel.value());
auto channelOp = rewriter.create<AMDAIE::ChannelOp>(
rewriter.getUnknownLoc(), tile, maybeChannel.value(),
StrmSwPortType::DMA, AMDAIE::DMAChannelDir::S2MM);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ LogicalResult generateColumnControlOverlay(AMDAIE::WorkgroupOp workgroupOp,
// shared across multiple packet flows as needed.
std::optional<uint8_t> maybeChannel =
shimTileToGeneratorMap[shimTileOp.getResult()]
.getAndAssignProducerDMAChannel(/*isPacketFlow*/ true);
.getProducerDMAChannel();
if (!maybeChannel) {
shimTileOp.emitOpError() << "no producer DMA channel available";
return WalkResult::interrupt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ class AMDAIELinalgFunctionOutliningPass
AMDAIELinalgFunctionOutliningPass> {
public:
/// Default constructor.
AMDAIELinalgFunctionOutliningPass() = default;
/// Copy constructor with an empty body and no initializer list: nothing is
/// copied from the source pass here.
/// NOTE(review): presumably option values are carried over by the pass
/// infrastructure when a pass is cloned -- confirm.
AMDAIELinalgFunctionOutliningPass(const AMDAIELinalgFunctionOutliningPass &) {
}
/// Constructs the pass from `opts`, which are forwarded unchanged to the
/// `AMDAIELinalgFunctionOutliningBase` constructor.
AMDAIELinalgFunctionOutliningPass(
const AMDAIELinalgFunctionOutliningOptions &opts)
: AMDAIELinalgFunctionOutliningBase(opts) {}

/// Inserts the linalg dialect into `registry`.
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<linalg::LinalgDialect>();
}
Expand Down Expand Up @@ -169,6 +175,7 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
if (failed(maybeFunc)) return WalkResult::interrupt();
func::FuncOp func = maybeFunc.value();


rewriter.setInsertionPoint(computeOp);
rewriter.create<func::CallOp>(computeOp.getLoc(), func,
computeOp->getOperands());
Expand All @@ -182,11 +189,28 @@ void AMDAIELinalgFunctionOutliningPass::runOnOperation() {
op->dropAllUses();
rewriter.eraseOp(op);
}

// If the option is set to true, make the body of all outlined functions
// empty, so that only the return remains. This option to 'do no compute'
// is useful for benchmarking purposes.
if (emptyFunctions) {
for (auto &nameAndFuncOp : computeOpToOutlinedFuncMap) {
Region &region = nameAndFuncOp.second.getBody();
Block &block = region.front();
uint64_t nOperations = block.getOperations().size();
assert(nOperations > 0 && "expected terminator");
for (uint64_t i = 0; i < nOperations - 1; ++i) {
Operation *frontOp = &block.front();
rewriter.eraseOp(frontOp);
}
}
}
}

} // namespace

std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass() {
return std::make_unique<AMDAIELinalgFunctionOutliningPass>();
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass(
AMDAIELinalgFunctionOutliningOptions options) {
return std::make_unique<AMDAIELinalgFunctionOutliningPass>(options);
}
} // namespace mlir::iree_compiler::AMDAIE
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,19 @@ static LogicalResult setRootConfigForPackPeelPipeline(
outerPermVec.push_back(2);
}
SmallVector<SmallVector<int64_t>> outerPerm = {outerPermVec, outerPermVec};
if (isObjectFifo) {
// Add outer permutation for unpack. NOTE: This currently fails for some
// tests in the AIR pipeline.
transposePackIndices.push_back(2);
unpackEmpty.push_back(true);
innerPerm.push_back({0, 1});
if (isa<linalg::BatchMatmulOp>(linalgOp)) {
outerPerm.push_back({0, 2, 1});
} else {
outerPerm.push_back({1, 0});
}
}

auto packingConfigLevel0Attr = getPackingConfigPackingLevelAttr(
context, packedSizesL0, transposePackIndices, unpackEmpty, innerPerm,
outerPerm);
Expand Down
23 changes: 17 additions & 6 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ void buildAMDAIETransformPassPipeline(
bool enableVectorizationPasses, const std::string &pathToUkernels,
bool enablePacketFlow, bool enableCoalescingLoops,
bool enableCollapsingUnitDims, bool enableFunctionOutlining,
bool insertLoopAroundCoreBlock) {
bool replaceOutlinedFunctionsWithEmpty, bool insertLoopAroundCoreBlock) {
OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
{
FunctionLikeNest funcPassManager(modulePassManager);
Expand Down Expand Up @@ -529,7 +529,7 @@ void buildAMDAIETransformPassPipeline(
modulePassManager, enablePacketFlow, useTilePipeline,
enableVectorizationPasses, enableCoalescingLoops,
enableCollapsingUnitDims, enableFunctionOutlining,
insertLoopAroundCoreBlock, numCols);
replaceOutlinedFunctionsWithEmpty, insertLoopAroundCoreBlock, numCols);
} else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) {
addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline,
matmulElementwiseFusion,
Expand All @@ -553,8 +553,8 @@ void addAMDAIEObjectFifoLoweringPasses(
OpPassManager &passManager, bool enablePacketFlow,
TilePassPipeline useTilePipeline, bool enableVectorizationPasses,
bool enableCoalescingLoops, bool enableCollapsingUnitDims,
bool enableFunctionOutlining, bool insertLoopAroundCoreBlock,
uint32_t numCols) {
bool enableFunctionOutlining, bool replaceOutlinedFunctionsWithEmpty,
bool insertLoopAroundCoreBlock, uint32_t numCols) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());

Expand All @@ -579,8 +579,19 @@ void addAMDAIEObjectFifoLoweringPasses(

passManager.addPass(createAMDAIENormalizeLoopBoundsPass());
passManager.addPass(createAMDAIEInsertCoresPass());
if (enableFunctionOutlining)
passManager.addPass(createAMDAIELinalgFunctionOutliningPass());

if (enableFunctionOutlining) {
// Set up the outlining pass options; `emptyFunctions` is enabled only when
// `replaceOutlinedFunctionsWithEmpty` is set.
AMDAIELinalgFunctionOutliningOptions options;
if (replaceOutlinedFunctionsWithEmpty) {
options.emptyFunctions = true;
}
passManager.addPass(createAMDAIELinalgFunctionOutliningPass(options));
} else {
assert(!replaceOutlinedFunctionsWithEmpty &&
"`replaceOutlinedFunctionsWithEmpty` is only valid when "
"`enableFunctionOutlining` is true.");
}

{
// Vectorization passes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ void addAMDAIEObjectFifoLoweringPasses(
OpPassManager &passManager, bool enablePacketFlow,
TilePassPipeline useTilePipeline, bool enableVectorizationPasses,
bool enableCoalescingLoops, bool enableCollapsingUnitDims,
bool enableFunctionOutlining, bool insertLoopAroundCoreBlock,
uint32_t numCols);
bool enableFunctionOutlining, bool replaceOutlinedFunctionsWithEmpty,
bool insertLoopAroundCoreBlock, uint32_t numCols);

/// Add passes to lower from MLIR-AIR through AIE. This is
/// currently the default passes used for lowering after IREE's tiling.
Expand All @@ -43,7 +43,7 @@ void buildAMDAIETransformPassPipeline(
bool enableVectorizationPasses, const std::string &pathToUkernels,
bool enablePacketFlow, bool enableCoalescingLoops,
bool enableCollapsingUnitDims, bool enableFunctionOutlining,
bool insertLoopAroundCoreBlock);
bool replaceOutlinedFunctionsWithEmpty, bool insertLoopAroundCoreBlock);

/// Populates passes needed to lower the IR via a Pack-Peel based approach.
void addPackPeelBasedPassPipeline(OpPassManager &oassManager,
Expand Down Expand Up @@ -185,7 +185,8 @@ std::unique_ptr<Pass> createAMDAIEDmaToCircularDmaPass();
std::unique_ptr<Pass> createAMDAIEFlattenLogicalObjectFifoPass();

/// Create a pass for function outlining.
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass();
std::unique_ptr<Pass> createAMDAIELinalgFunctionOutliningPass(
AMDAIELinalgFunctionOutliningOptions = {});

/// Create a pass to fuse the consumer op into the innermost last scf loop.
std::unique_ptr<Pass> createAMDAIEFuseConsumerIntoLoopPass(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,14 @@ def AMDAIELinalgFunctionOutlining :
repeated codes.
}];
let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinalgFunctionOutliningPass()";
let options = [
Option<"emptyFunctions", "empty-functions", "bool", /*default=*/"false",
"A developer only option that results in incorrect numerics. "
"Replace all outlined functions with a function that does nothing, "
"i.e. it just returns. Useful for measuring the performance of data "
"movement to/from the device -- by doing zero compute, all time is spent "
"moving data to/from the AIE cores.">
];
}

def AMDAIEFoldDmaWaits :
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ iree_lit_test_suite(
"fold_dma_waits.mlir"
"flatten_logical_objectfifo.mlir"
"linalg_function_outlining.mlir"
"linalg_function_outlining_to_empty.mlir"
"fuse_consumer_into_loop.mlir"
"fuse_fill_into_forall.mlir"
"fuse_pack_into_loop.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: iree-opt --split-input-file --iree-amdaie-linalg-function-outlining --verify-diagnostics --split-input-file %s | FileCheck %s

// Test demonstrating multiple Matmul using different SSAs.
// Test demonstrating multiple matmuls using different SSAs.

// CHECK-LABEL: func.func private @generic_matmul_0_outlined
// CHECK-SAME: (%[[LHS:.*]]: memref<4x8xbf16>,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-linalg-function-outlining{empty-functions=true})" --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=EMPTY
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-amdaie-linalg-function-outlining{empty-functions=false})" --verify-diagnostics --split-input-file %s | FileCheck %s --check-prefix=NOT_EMPTY

func.func @reduction(%A: memref<4xbf16>, %B: memref<bf16>) {
%c2 = arith.constant 2 : index
%tile = amdaie.tile(%c2, %c2)
%1 = amdaie.core(%tile, in : [], out : []) {
linalg.generic {
indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>],
iterator_types = ["reduction"]
} ins(%A: memref<4xbf16>) outs(%B : memref<bf16>) {
^bb0(%in: bf16, %out: bf16):
linalg.yield %in : bf16
}
amdaie.end
}
return
}

// The (default) case where empty-functions is false, outlining works as usual.
// NOT_EMPTY: func.func private
// NOT_EMPTY: linalg.generic
// NOT_EMPTY: return

// When empty-functions=true, the outlined function shouldn't contain compute.
// EMPTY: func.func private
// EMPTY-NOT: linalg.generic
// EMPTY: return
Loading

0 comments on commit 379700f

Please sign in to comment.