Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[WIP] use peano for kernels
Browse files Browse the repository at this point in the history
fixes #637
makslevental committed Aug 31, 2024

Verified

This commit was signed with the committer’s verified signature.
scala-steward Scala Steward
1 parent 37bb7f1 commit 75148af
Showing 6 changed files with 217 additions and 91 deletions.
26 changes: 15 additions & 11 deletions build_tools/ci/run_matmul_test.sh
Original file line number Diff line number Diff line change
@@ -182,7 +182,7 @@ function run_matmul_test() {

local amd_aie_install_path="${IREE_INSTALL_DIR}"

local vitis_path="${VITIS}"
local vitis_path=""

local use_chess="false"

@@ -540,16 +540,15 @@ run_matmul_test \
# MLIR-AIR Matmul tests
###################################################################

if [ -d "$VITIS" ]; then
run_matmul_test \
--name_prefix "ukern" \
--lower_to_aie_pipeline "air" \
--tile_pipeline "pad-pack" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "256" --k "256" --n "256" \
--use_ukernel "1"
fi
run_matmul_test \
--name_prefix "ukern" \
--lower_to_aie_pipeline "air" \
--tile_pipeline "pad-pack" \
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--m "256" --k "256" --n "256" \
--vitis_path "${VITIS}" \
--use_ukernel "1"

# Example of a run with a group of 2+ matmuls. Currently this test is passed
# the flag '--num_repeat_runs 0' as there is currently an issue with the runtime if
@@ -720,6 +719,7 @@ if [ -d "$VITIS" ]; then
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--num_repeat_runs "2" \
--vitis_path "${VITIS}" \
--use_ukernel "1"

run_matmul_test_on_shapes ${bf16_ukernel_shapes_medium[@]} \
@@ -729,6 +729,7 @@ if [ -d "$VITIS" ]; then
--lhs_rhs_type "bf16" \
--acc_type "f32" \
--num_repeat_runs "2" \
--vitis_path "${VITIS}" \
--use_ukernel "1"
fi

@@ -746,6 +747,7 @@ if [ -d "$VITIS" ]; then
--n "32" \
--k "32" \
--use_chess "1" \
--vitis_path "${VITIS}" \
--num_repeat_runs "10"

run_matmul_test \
@@ -757,6 +759,7 @@ if [ -d "$VITIS" ]; then
--k "64" \
--use_chess "1" \
--num_repeat_runs "10" \
--vitis_path "${VITIS}" \
--use_ukernel "1"

run_matmul_test \
@@ -769,6 +772,7 @@ if [ -d "$VITIS" ]; then
--n "32" \
--k "32" \
--use_chess "1" \
--vitis_path "${VITIS}" \
--num_repeat_runs "10"

fi
2 changes: 1 addition & 1 deletion build_tools/download_peano.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash

RELEASE=19.0.0.2024082221+90abe71b
RELEASE=19.0.0.2024083101+42158757
pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly
unzip llvm_aie*whl
Original file line number Diff line number Diff line change
@@ -10,10 +10,11 @@ iree_cc_library(
NAME
AIETargets
SRCS
"AMDAIETargetBCF.cpp"
"AMDAIETargetCDODirect.cpp"
"AMDAIETargetLdScript.cpp"
"XCLBinGen.cpp"
PeanoDriver.cpp
AMDAIETargetBCF.cpp
AMDAIETargetCDODirect.cpp
AMDAIETargetLdScript.cpp
XCLBinGen.cpp
DEPS
iree-amd-aie::aie_runtime::iree_aie_runtime_static
iree::target::amd-aie::Transforms
@@ -28,9 +29,9 @@ iree_cc_library(
NAME
Target
HDRS
"AIETarget.h"
AIETarget.h
SRCS
"AIETarget.cpp"
AIETarget.cpp
DEPS
::AIETargets
iree-amd-aie::schemas::xrt_executable_def_c_fbs
106 changes: 106 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/PeanoDriver.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "PeanoDriver.h"

#include <filesystem>
#include <string>
#include <vector>

#include "llvm/Support/Error.h"

using Path = std::filesystem::path;

/// Appends the two-element flag pair `-internal-externc-isystem <Path>` to
/// the end of `CC1Args`.
///
/// \param CC1Args argument list that is extended in place.
/// \param Path    include directory emitted as the flag's value.
void addExternCSystemInclude(std::vector<std::string> &CC1Args,
                             const std::string &Path) {
  CC1Args.emplace_back("-internal-externc-isystem");
  CC1Args.emplace_back(Path);
}

/// Appends the two-element flag pair `-internal-isystem <Path>` to the end of
/// `CC1Args`.
///
/// \param CC1Args argument list that is extended in place.
/// \param Path    include directory emitted as the flag's value.
void addSystemInclude(std::vector<std::string> &CC1Args,
                      const std::string &Path) {
  CC1Args.emplace_back("-internal-isystem");
  CC1Args.emplace_back(Path);
}

void AddClangSystemIncludeArgs(std::vector<std::string> &CC1Args,
const Path &peanoDir, const std::string &target,
bool novitisheaders, bool nostdlibinc) {
// Always include our instrinsics, for compatibility with existing toolchain.
if (!novitisheaders) {
std::string path;
if (target.rfind("aie2", 0) == 0) {
path = peanoDir / "lib" / "clang" / "19" / "include" / "aiev2intrin.h";
} else {
llvm::report_fatal_error(("unsupported target: " + target).c_str());
}
CC1Args.push_back("-include");
CC1Args.push_back(path);
}

CC1Args.push_back("-D__AIENGINE__");
if (target.rfind("aie2", 0) == 0) CC1Args.push_back("-D__AIEARCH__=20");

// Don't pull in system headers from /usr/include or /usr/local/include.
// All of the basic headers that we need come from the compiler.
CC1Args.push_back("-nostdsysteminc");

if (nostdlibinc) return;
addExternCSystemInclude(CC1Args, peanoDir / ".." / "include" / target);
}

void addLibCxxIncludePaths(std::vector<std::string> &CC1Args,
const Path &peanoDir, const std::string &target,
bool nostdinc, bool nostdlibinc, bool nostdincxx) {
if (nostdinc || nostdlibinc || nostdincxx) return;
addSystemInclude(CC1Args, peanoDir / "include" / target / "c++" / " v1");
// Second add the generic one.
addSystemInclude(CC1Args, peanoDir / "include" / "c++" / " v1");
}

/// Appends the peano-specific LLVM optimizer options to `CC1Args`.
///
/// The options are appended bare (without a preceding `-mllvm`), in the order
/// listed in the table below.
///
/// \param CC1Args argument list that is extended in place.
void addOptTargetOptions(std::vector<std::string> &CC1Args) {
  static constexpr const char *kPeanoOptFlags[] = {
      // For now, we disable the auto-vectorizers by default, as the backend
      // cannot handle many vector types. For experimentation the vectorizers
      // can still be enabled explicitly by the user.
      "-vectorize-loops=false",
      "-vectorize-slp=false",
      // An if-then-else cascade requires at least 5 delay slots for
      // evaluating the condition and 5 delay slots for one of the branches,
      // thus speculating 10 instructions should be fine.
      "--two-entry-phi-node-folding-threshold=10",
      // Make sure to perform most optimizations before mandatory inlinings,
      // otherwise noalias attributes can get lost and hurt AA results.
      "-mandatory-inlining-before-opt=false",
      // Perform complete AA analysis on phi nodes.
      "-basic-aa-full-phi-analysis=true",
      // Extend the max limit of the search depth in BasicAA.
      "-basic-aa-max-lookup-search-depth=10",
  };

  for (const char *flag : kPeanoOptFlags) {
    CC1Args.emplace_back(flag);
  }
}

/// Appends the clang driver options used for peano compilation to `CC1Args`.
///
/// This emits `--target=<target>`, `-fno-use-init-array` and
/// `-fno-threadsafe-statics`, followed by every option produced by
/// addOptTargetOptions, each one preceded by `-mllvm`.
///
/// \param CC1Args argument list that is extended in place.
/// \param target  target triple placed after `--target=`.
void addClangTargetOptions(std::vector<std::string> &CC1Args,
                           const std::string &target) {
  CC1Args.push_back("--target=" + target);
  CC1Args.emplace_back("-fno-use-init-array");
  // Pass -fno-threadsafe-statics to prevent dependence on lock acquire/release
  // handling for static local variables.
  CC1Args.emplace_back("-fno-threadsafe-statics");

  // Collect the optimizer options first so that each of them can be forwarded
  // behind its own "-mllvm".
  std::vector<std::string> optFlags;
  addOptTargetOptions(optFlags);

  // Every optimizer option contributes two entries.
  CC1Args.reserve(CC1Args.size() + 2 * optFlags.size());
  for (const std::string &flag : optFlags) {
    CC1Args.push_back("-mllvm");
    CC1Args.push_back(flag);
  }
}

/// Returns the highest DWARF version that should be emitted.
///
/// Newer DWARF versions are avoided because the simulator doesn't understand
/// them.
unsigned getMaxDwarfVersion() {
  constexpr unsigned kMaxDwarfVersion = 4;
  return kMaxDwarfVersion;
}
28 changes: 28 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/PeanoDriver.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Include guard: the declarations below carry default arguments, so including
// this header twice in one translation unit would otherwise be a hard error.
#ifndef IREE_AMD_AIE_TARGET_PEANODRIVER_H_
#define IREE_AMD_AIE_TARGET_PEANODRIVER_H_

#include <filesystem>
#include <string>
#include <vector>

#include "llvm/Support/Error.h"

/// Appends the preprocessor / include-path arguments for an AIE target:
/// optionally `-include <peanoDir>/lib/clang/19/include/aiev2intrin.h`
/// (skipped when `novitisheaders` is true), `-D__AIENGINE__`,
/// `-D__AIEARCH__=20` for targets starting with "aie2", `-nostdsysteminc`,
/// and, unless `nostdlibinc` is true, an `-internal-externc-isystem` entry
/// for `<peanoDir>/../include/<target>`.
///
/// Reports a fatal error when the intrinsics header is requested for a
/// target that does not start with "aie2".
void AddClangSystemIncludeArgs(std::vector<std::string> &CC1Args,
                               const std::filesystem::path &peanoDir,
                               const std::string &target,
                               bool novitisheaders = false,
                               bool nostdlibinc = false);

/// Appends `-internal-isystem` entries for the target-specific and the
/// generic libc++ header directories below `<peanoDir>/include`. Does nothing
/// when any of `nostdinc`, `nostdlibinc` or `nostdincxx` is true.
void addLibCxxIncludePaths(std::vector<std::string> &CC1Args,
                           const std::filesystem::path &peanoDir,
                           const std::string &target, bool nostdinc = false,
                           bool nostdlibinc = false, bool nostdincxx = false);

/// Appends the peano-specific LLVM optimizer options (bare, i.e. without a
/// preceding `-mllvm`) to `CC1Args`.
void addOptTargetOptions(std::vector<std::string> &CC1Args);

/// Appends `--target=<target>`, `-fno-use-init-array`,
/// `-fno-threadsafe-statics` and every option from addOptTargetOptions, each
/// preceded by `-mllvm`, to `CC1Args`.
void addClangTargetOptions(std::vector<std::string> &CC1Args,
                           const std::string &target);

/// Returns the highest DWARF version that should be emitted (currently 4).
unsigned getMaxDwarfVersion();

#endif  // IREE_AMD_AIE_TARGET_PEANODRIVER_H_
133 changes: 60 additions & 73 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp
Original file line number Diff line number Diff line change
@@ -11,11 +11,13 @@
#include <random>
#include <regex>
#include <sstream>

// ReSharper disable once CppUnusedIncludeDirective
#include <fstream>
#include <unordered_map>

#include "AMDAIETargets.h"
#include "PeanoDriver.h"
#include "aievec/Passes.h"
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree/compiler/Utils/ToolUtils.h"
@@ -96,7 +98,7 @@ namespace {
FailureOr<std::string> getTargetDir(const std::string &npuVersion) {
if (npuVersion == "npu1") return std::string{"target_aie_ml"};
if (npuVersion == "npu4") return std::string{"target_aie2p"};
llvm::errs() << "unsupported NPUVersion: " << npuVersion;
llvm::errs() << "unsupported NPUVersion: " << npuVersion << "\n";
return failure();
}

@@ -152,7 +154,8 @@ FailureOr<Path> findVitis(std::optional<Path> &vitisDir,
return failure();
}
if (!std::filesystem::exists(licenseFile)) {
llvm::errs() << "ERROR: license file" << licenseFile << " does not exist";
llvm::errs() << "ERROR: license file" << licenseFile << " does not exist"
<< "\n";
return failure();
}
}
@@ -215,7 +218,7 @@ std::pair<std::string, std::vector<std::string>> makeChessArgs(
archVersion = "21";
modelDir = "aie2p";
} else {
llvm::errs() << "unsupported NPU version: " << npuVersion;
llvm::errs() << "unsupported NPU version: " << npuVersion << "\n";
llvm::report_fatal_error("unsupported NPU version");
}

@@ -410,27 +413,6 @@ static LogicalResult assembleFileUsingChess(
return runTool(xChessCCExe, args, verbose, env);
}

std::vector<std::string> makePeanoOptArgs() {
return {
// peano has no proper vectorization cost model for AIE
"-vectorize-loops=false",
//
"-vectorize-slp=false",
// An if-then-else cascade requires at least 5 delay slots for
// evaluating the condition and 5 delay slots for one of the
// branches, thus speculating 10 instructions should be fine
"--two-entry-phi-node-folding-threshold=10",
// Make sure to perform most optimizations before mandatory
// inlinings, otherwise noalias attributes can get lost and
// hurt AA results.
"-mandatory-inlining-before-opt=false",
// complete AA analysis on phi nodes.
"-basic-aa-full-phi-analysis=true",
// Extend the max limit of the search depth in BasicAA
"-basic-aa-max-lookup-search-depth=10",
};
}

static LogicalResult assembleFileUsingPeano(
const std::string &inputFile, const std::string &outputFile,
const std::vector<std::string> &extraArgs, Path &_tempDir, Path &peanoDir,
@@ -439,21 +421,13 @@ static LogicalResult assembleFileUsingPeano(
args.reserve(args.size() + std::distance(extraArgs.begin(), extraArgs.end()));
args.insert(args.end(), extraArgs.begin(), extraArgs.end());
args.emplace_back("-O2");

// TODO(max): pipe target arch in somehow
args.emplace_back("--target=aie2-none-unknown-elf");
std::vector<std::string> peanoArgs = makePeanoOptArgs();
args.reserve(args.size() + peanoArgs.size());
for (const std::string &item : peanoArgs) {
args.emplace_back("-mllvm");
args.emplace_back(item);
}
args.emplace_back("-fno-use-init-array");
// Pass -fno-threadsafe-statics to prevent dependence on lock acquire/release
// handling for static local variables.
args.emplace_back("-fno-threadsafe-statics");
// Don't pull in system headers from /usr/include or /usr/local/include.
// All of the basic headers that we need come from the compiler.
args.emplace_back("-nostdsysteminc");
std::string target = "aie2-none-unknown-elf";
addClangTargetOptions(args, target);
AddClangSystemIncludeArgs(args, peanoDir, target);
addLibCxxIncludePaths(args, peanoDir, target);

args.emplace_back("-c");
args.emplace_back(inputFile);
args.emplace_back("-o");
@@ -475,7 +449,7 @@ static FailureOr<Path> assembleStringUsing(
if (auto maybeErr = dumpStrToDisk(inputFileStr, inputFile.string());
maybeErr.has_value()) {
llvm::errs() << "Failed to dump to disk " << inputFile.string()
<< " because: " << maybeErr;
<< " because: " << maybeErr << "\n";
return failure();
}

@@ -487,7 +461,8 @@ static FailureOr<Path> assembleStringUsing(
}
if (failed(assembler(inputFile.string(), outputFile.string(), extraArgs,
workDir, toolDir, npuVersion, verbose))) {
llvm::errs() << "Failed to assemble " << outputFileName << ".o";
llvm::errs() << "Failed to assemble " << outputFileName << ".o"
<< "\n";
return failure();
}
return outputFile;
@@ -532,22 +507,31 @@ static LogicalResult generateCoreElfFiles(
Path cwd = std::filesystem::current_path();
FailureOr<Path> mmObjectFilePath;
if (ukernel && (ukernel == "mm" || ukernel == "all")) {
FailureOr<Path> maybeVitisDir = findVitis(vitisDir, npuVersion);
if (failed(maybeVitisDir)) {
llvm::errs() << "compiling ukernels currently requires chess (even if "
"you're using peano)";
return failure();
}
if (!std::filesystem::exists(cwd / "mm.o")) {
mmObjectFilePath = assembleStringUsingChess(
/*inputFileStr=*/_MM_CC,
/*inputFileName=*/"mm.cc",
/*outputFileName=*/"mm.o",
/*outputDir=*/cwd,
/*extraArgs*/ std::vector<std::string>{},
/*workDir=*/tempDir,
/*vitisDir=*/*maybeVitisDir,
/*npuVersion*/ npuVersion, verbose);
if (useChess) {
if (verbose) llvm::outs() << "using chess for ukernel codegen\n";
FailureOr<Path> maybeVitisDir = findVitis(vitisDir, npuVersion);
mmObjectFilePath = assembleStringUsingChess(
/*inputFileStr=*/_MM_CC,
/*inputFileName=*/"mm.cc",
/*outputFileName=*/"mm.o",
/*outputDir=*/cwd,
/*extraArgs*/ std::vector<std::string>{},
/*workDir=*/tempDir,
/*vitisDir=*/*maybeVitisDir,
/*npuVersion*/ npuVersion, verbose);
} else {
if (verbose) llvm::outs() << "using peano for ukernel codegen\n";
mmObjectFilePath = assembleStringUsingPeano(
/*inputFileStr=*/_MM_CC,
/*inputFileName=*/"mm.cc",
/*outputFileName=*/"mm.o",
/*outputDir=*/cwd,
/*extraArgs*/ std::vector<std::string>{},
/*workDir=*/tempDir,
/*peanoDir=*/peanoDir,
/*npuVersion*/ npuVersion, verbose);
}
if (failed(mmObjectFilePath)) return failure();
} else {
mmObjectFilePath = cwd / "mm.o";
@@ -579,13 +563,14 @@ static LogicalResult generateCoreElfFiles(
{
auto bcfOutput = openOutputFile(bcfPath.string(), &errorMessage);
if (!bcfOutput) {
llvm::errs() << "failed to open bcf file because: " << errorMessage;
llvm::errs() << "failed to open bcf file because: " << errorMessage
<< "\n";
return failure();
}

if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToBCF(
deviceOp, bcfOutput->os(), col, row))) {
llvm::errs() << "Failed to generate BCF";
llvm::errs() << "Failed to generate BCF\n";
return failure();
}
bcfOutput->keep();
@@ -614,7 +599,7 @@ static LogicalResult generateCoreElfFiles(
openOutputFile(ldscriptPath.string(), &errorMessage);
if (!ldscriptOutput) {
llvm::errs() << "Failed to open ldscript file because: "
<< errorMessage;
<< errorMessage << "\n";
return failure();
}
if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToLdScript(
@@ -654,7 +639,7 @@ static LogicalResult generateCDO(MLIRContext *context, AIE::DeviceOp deviceOp,
deviceOp = *copy.getOps<AIE::DeviceOp>().begin();
if (failed(mlir::iree_compiler::AMDAIE::AIETranslateToCDODirect(
deviceOp, tempDir.string()))) {
llvm::errs() << "failed to emit CDO";
llvm::errs() << "failed to emit CDO\n";
return failure();
}
copy->erase();
@@ -750,7 +735,7 @@ static LogicalResult generateXCLBin(
dumpStrToDisk(memTopologyData, memTopologyJsonFile.string());
maybeErr.has_value()) {
llvm::errs() << "failed to dump to disk mem_topology.json because: "
<< *maybeErr;
<< *maybeErr << "\n";
return failure();
}
}
@@ -797,7 +782,7 @@ static LogicalResult generateXCLBin(
dumpStrToDisk(aiePartitionJsonData, aiePartitionJsonFile.string());
maybeErr.has_value()) {
llvm::errs() << "failed to dump to disk aie_partition.json because: "
<< *maybeErr;
<< *maybeErr << "\n";
return failure();
}
}
@@ -816,7 +801,7 @@ static LogicalResult generateXCLBin(
if (auto maybeErr = dumpStrToDisk(kernelStr, kernelsJsonFile.string());
maybeErr.has_value()) {
llvm::errs() << "failed to dump to disk kernels.json because: "
<< *maybeErr;
<< *maybeErr << "\n";
return failure();
}
}
@@ -825,7 +810,8 @@ static LogicalResult generateXCLBin(
{
auto designBifOut = openOutputFile(designBifFile.string(), &errorMessage);
if (!designBifOut) {
llvm::errs() << "failed to open design.bif because: " << errorMessage;
llvm::errs() << "failed to open design.bif because: " << errorMessage
<< "\n";
return failure();
}

@@ -868,7 +854,7 @@ static LogicalResult generateXCLBin(
}
if (iree_aie_bootgen_main(cstrings.size(),
const_cast<const char **>(&cstrings[0]))) {
llvm::errs() << "failed to execute bootgen";
llvm::errs() << "failed to execute bootgen\n";
return failure();
}
}
@@ -892,14 +878,14 @@ static LogicalResult generateXCLBin(
"--force", "--input", *inputXclbin};

if (failed(runTool(xclbinutilBin.value().string(), inputFlags, verbose))) {
llvm::errs() << "failed to execute xclbinutil";
llvm::errs() << "failed to execute xclbinutil\n";
return failure();
}
auto aieInputPartitionOut =
openInputFile(aieInputPartitionJsonFile.string(), &errorMessage);
if (!aieInputPartitionOut) {
llvm::errs() << "failed to open aie_input_partition.json because: "
<< errorMessage;
<< errorMessage << "\n";
return failure();
}
Expected<json::Value> aieInputPartitionOutValue =
@@ -913,7 +899,7 @@ static LogicalResult generateXCLBin(
if (!aiePartitionOut) {
llvm::errs() << "failed to open aie aie_input_partition.json for "
"output because: "
<< errorMessage;
<< errorMessage << "\n";
return failure();
}
llvm::Expected<llvm::json::Value> aiePartitionOutValue =
@@ -931,7 +917,7 @@ static LogicalResult generateXCLBin(
maybeErr.has_value()) {
llvm::errs()
<< "failed to dump to disk aie_input_partition.json because: "
<< errorMessage;
<< errorMessage << "\n";
return failure();
}
flags.insert(flags.end(), {"--input", *inputXclbin});
@@ -1040,14 +1026,14 @@ static LogicalResult generateUnifiedObject(
}

if (failed(pm.run(moduleOpCopy))) {
llvm::errs() << "Failed to lower to LLVM";
llvm::errs() << "Failed to lower to LLVM\n";
return failure();
}

llvm::LLVMContext llvmContext;
auto llvmModule = translateModuleToLLVMIR(moduleOpCopy, llvmContext);
if (!llvmModule) {
llvm::errs() << "Failed to translate module to LLVMIR";
llvm::errs() << "Failed to translate module to LLVMIR\n";
return failure();
}

@@ -1081,7 +1067,7 @@ static LogicalResult generateUnifiedObject(
if (auto maybeErr = dumpStrToDisk(inputLLStr, LLVMIRFile.string());
maybeErr.has_value()) {
llvm::errs() << "Failed to dump to disk input.ll"
<< " because: " << maybeErr;
<< " because: " << maybeErr << "\n";
return failure();
}
Path peanoOptBin = peanoDir / "bin" / "opt";
@@ -1092,11 +1078,12 @@ static LogicalResult generateUnifiedObject(
"-O2", "--inline-threshold=10", "-S", LLVMIRFile.string(),
// missing from libc
"--disable-builtin=memset", "-o", OptLLVMIRFile.string()};
std::vector<std::string> peanoArgs = makePeanoOptArgs();
std::vector<std::string> peanoArgs;
addOptTargetOptions(peanoArgs);
args.reserve(args.size() + peanoArgs.size());
args.insert(args.end(), peanoArgs.begin(), peanoArgs.end());
if (failed(runTool(peanoOptBin.string(), args, verbose))) {
llvm::errs() << "Failed to optimize ll with peano";
llvm::errs() << "Failed to optimize ll with peano\n";
return failure();
}

0 comments on commit 75148af

Please sign in to comment.