diff --git a/.clinerules b/.clinerules new file mode 100644 index 000000000000..78c9248cbb4e --- /dev/null +++ b/.clinerules @@ -0,0 +1,242 @@ +# Cline Rules for llvm-aie Repository + +## Personal User Settings + +You can include personal Cline rules from your user profile by creating a file at: +- `~/.config/cline/user.clinerules` (Linux/macOS) +- `%USERPROFILE%\.config\cline\user.clinerules` (Windows) + +**Important:** If this file doesn't exist, it will be silently ignored - no error will occur. + +### Codebase Identification + +When your personal rules file is loaded, the following environment variable will be available: +- `CLINE_CODEBASE=llvm-aie` + +You can use this in your personal rules to apply codebase-specific customizations. For example: + +```markdown +# My Personal Cline Rules + +## General Preferences +- Always use trailing commas in multi-line arrays and objects + +## Codebase-Specific Rules +When CLINE_CODEBASE=llvm-aie: +- Pay extra attention to LLVM coding standards +- Always verify changes with `ninja check-llvm-codegen-aie` + +When CLINE_CODEBASE=my-other-project: +- Use different style preferences +``` + +The personal rules file can contain any additional instructions you want to apply across all your projects. These personal rules will be merged with the project-specific rules below. In case of conflicts, the project-specific rules in this `.clinerules` file take precedence. + +--- + +# Project-Specific Rules + +The following rules are specific to the llvm-aie repository: + +## Copyright Headers + +When updating any file in this repository, always: +1. Add a copyright header if the file doesn't have one +2. Update the year in existing copyright messages to include the current year (2025) + +### Standard Copyright Header Format + +For files with AMD copyright: +``` +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates +``` + +For C/C++ files: +```cpp +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates +``` + +### Examples + +- If a file has `(c) Copyright 2023-2024`, update it to `(c) Copyright 2023-2025` +- If a file has `(c) Copyright 2024`, update it to `(c) Copyright 2024-2025` +- If a file has no copyright header and you're making substantial changes, add the appropriate header + +## Project-Specific Guidelines + +- This is an LLVM-based project, so follow LLVM coding standards +- AIE-specific code is located in `llvm/lib/Target/AIE/` and `llvm/test/CodeGen/AIE/` +- When modifying test infrastructure, prefer localized changes (e.g., in `lit.local.cfg` files) over global changes when possible + +## C++ Class Member Ordering + +When defining C++ classes, organize members in the following order to minimize the need for access specifiers: +1. Private members (data and helper methods) first +2. Public interface last + +This allows the class to start without an explicit access specifier (defaulting to private for classes) and requires only a single `public:` label before the public interface. Avoid using multiple `private:` labels by grouping all private members together at the beginning of the class definition. 
+ +Example: +```cpp +class MyClass { + // Private data members + int PrivateData; + + // Private helper methods + void helperMethod(); + +public: + // Public interface + MyClass(); + void publicMethod(); +}; +``` + +## Const Correctness + +Make local variable declarations `const` whenever possible to improve code clarity and prevent accidental modifications: + +- Use `const` for variables that are initialized once and never modified +- Use `const` for references and pointers when the referenced/pointed-to data should not be modified +- This makes the code's intent clearer and helps catch bugs at compile time + +Example: +```cpp +// Good - const when possible +const unsigned NumSlots = getNumSlots(); +const auto &Config = getConfig(); + +// Avoid - non-const when not needed +unsigned NumSlots = getNumSlots(); // Will not be modified +auto &Config = getConfig(); // Will not be modified +``` + +## Eliminate Unused Variables + +Always eliminate unused variables to keep code clean and avoid compiler warnings: + +- Remove any variable declarations that are not used +- If a variable is only needed for its side effects, use `(void)variable;` to explicitly mark it as intentionally unused +- Do not leave unused variables in the code, even temporarily + +Example: +```cpp +// Bad - unused variable +const unsigned NumSlots = getNumSlots(); +doSomething(); // NumSlots never used + +// Good - variable removed +doSomething(); + +// Good - explicitly marked as unused (rare cases) +const bool Success = tryOperation(); +(void)Success; // Used in assert in debug builds +assert(Success && "Operation must succeed"); +``` + +## TableGen Backend Development + +When extending TableGen backends (e.g., CodeGenFormat): + +### Use ConstTable for Emitting Data Tables + +- Prefer `ConstTable` abstraction over raw `raw_ostream` for emitting data tables +- ConstTable provides utilities for managing table indices, references, and array slices +- Example: + ```cpp + ConstTable MyTable("MyType", 
"MyTableName"); + MyTable << "value1"; + MyTable.next(); + MyTable << "value2"; + MyTable.next(); + MyTable.finish(); + o << MyTable; // Emit to output stream + ``` + +### Parameterized Implementation Headers + +When creating reusable implementations that need to be instantiated for multiple architectures (AIE1, AIE2, AIE2P), use macro-based parameterization: + +- Define a macro parameter (e.g., `SLOT_STRUCTURE_NAMESPACE`) that must be set before including the header +- Use macro concatenation to generate architecture-specific names +- Include the appropriate generated `.inc` file using the macro +- Clean up macros at the end of the header + +Example pattern: +```cpp +#ifndef PARAM_NAMESPACE +#error "PARAM_NAMESPACE must be defined before including this file" +#endif + +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define GENERATED_INC CONCAT(PARAM_NAMESPACE, GenFormats.inc) + +// Include generated tables +#define GET_SOME_TABLE +#include GENERATED_INC + +// Implementation using generated data +class CONCAT(PARAM_NAMESPACE, ClassName) { + // ... +}; + +// Clean up +#undef GENERATED_INC +#undef CONCAT +#undef CONCAT_IMPL +``` + +## Build Verification + +After making changes to AIE target code, verify the changes by running the AIE CodeGen tests: + +```bash +cd Release && ninja check-llvm-codegen-aie +``` + +This ensures that your changes don't break existing functionality and that all AIE-specific code generation tests pass. 
+ +## Documentation for Significant Changes + +For significant changes or new features, create documentation in `llvm/lib/Target/AIE/docs/`: +- Use Markdown format (`.md` files) with lines wrapped at 80 columns +- Name the file descriptively (e.g., `SlotStructureUnification.md`, `NewFeatureName.md`) +- Include: + - Overview and motivation + - Key concepts and design decisions + - Interface usage examples + - Implementation file locations + - Test results + - Future work considerations + +This helps maintain institutional knowledge and aids future developers. + +## Git Commit Guidelines + +**NEVER add build trees to git commits:** +- Do not add `Release/`, `Debug/`, or any other build directories +- Do not add `.cline_storage/` or other IDE-specific directories +- When committing, explicitly specify only the source files that were modified +- Use `git add path/to/changed/file` (listing each modified file explicitly) instead of `git add -A` or `git add .` + +**Only make commits when explicitly requested by the user:** +- Do not automatically commit changes +- Wait for the user to review and approve changes before committing +- When the user requests a commit, include the documentation file if one was created + +Example of correct commit workflow: +```bash +git add llvm/lib/Target/AIE/MyFile.cpp llvm/lib/Target/AIE/MyFile.h +git add llvm/lib/Target/AIE/docs/MyFeature.md # If documentation was created +git commit -m "Description of changes" +``` diff --git a/clang/cmake/caches/Peano-AIE.cmake b/clang/cmake/caches/Peano-AIE.cmake index 0853bf044ee8..34ba364c45b0 100644 --- a/clang/cmake/caches/Peano-AIE.cmake +++ b/clang/cmake/caches/Peano-AIE.cmake @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates # This file sets up a CMakeCache for the Peano AIE toolchain build.
@@ -63,7 +63,7 @@ endif() # Switch it on if you have Z3 installed and want to use the solver mode # of the postpipeliner -# option(LLVM_ENABLE_Z3_SOLVER "" ON) +option(LLVM_ENABLE_Z3_SOLVER "" ON) # there's some bug here where if you list(APPEND ...) to a CACHE variable # it doesn't work (neither libLLVM nor clang-cpp were being successfully installed) diff --git a/llvm/lib/Target/AIE/AIEHazardRecognizer.h b/llvm/lib/Target/AIE/AIEHazardRecognizer.h index d145a8423bb6..e0919878e46e 100644 --- a/llvm/lib/Target/AIE/AIEHazardRecognizer.h +++ b/llvm/lib/Target/AIE/AIEHazardRecognizer.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // This file defines the hazard recognizer for scheduling on AIE. @@ -60,6 +60,60 @@ const_bundled_instrs(const MachineInstr &MI, bool IncludeRoot = false) { using ResourceSet = StaticBitSet; +/// Wrapper around SlotBits to provide a SlotOccupancy-like interface. +/// This is a transitional class to prepare for migrating from SlotBits +/// to SlotOccupancy. It wraps uint64_t and provides methods that match +/// the SlotOccupancy API. 
+class SlotBitsWrapper { + uint64_t Bits; + +public: + SlotBitsWrapper() : Bits(0) {} + explicit SlotBitsWrapper(uint64_t Bits) : Bits(Bits) {} + + /// Check if empty + bool isEmpty() const { return Bits == 0; } + + /// Clear all bits + void clear() { Bits = 0; } + + /// Block all slots + void blockResources() { Bits = ~uint64_t(0); } + + /// Merge with another + SlotBitsWrapper &operator|=(const SlotBitsWrapper &Other) { + Bits |= Other.Bits; + return *this; + } + + /// Combine two slot sets + SlotBitsWrapper operator|(const SlotBitsWrapper &Other) const { + return SlotBitsWrapper(Bits | Other.Bits); + } + + /// Check for overlap (used in conflict detection) + bool overlaps(const SlotBitsWrapper &Other) const { + return (Bits & Other.Bits) != 0; + } + + /// Equality comparison + bool operator==(const SlotBitsWrapper &Other) const { + return Bits == Other.Bits; + } + + /// Get the underlying bits (for format interface) + uint64_t getBits() const { return Bits; } + + /// Implicit conversion to uint64_t for backward compatibility + operator uint64_t() const { return Bits; } + + /// Assignment from uint64_t for backward compatibility + SlotBitsWrapper &operator=(uint64_t NewBits) { + Bits = NewBits; + return *this; + } +}; + // To be merged with AIEResourceCycle class FuncUnitWrapper { /// The format interface to interpret bundle constraints diff --git a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp index 26438b8e5b9f..18cc026b02ea 100644 --- a/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp +++ b/llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp @@ -67,6 +67,10 @@ static cl::opt EnableMultiSlotInstrMaterialization( cl::desc("Statically materialize Multi-Slot Pseudo Instructions in " "loops.")); +static cl::opt + MaterializeAll("aie-materialize-all", cl::Hidden, cl::init(false), + cl::desc("Materialize all Multi-Slot Pseudo Instructions.")); + static cl::opt PostPipelinerMaxTryII( "aie-postpipeliner-maxtry-ii", 
cl::init(20), cl::desc("[AIE] Maximum II steps to be tried in the post-ra pipeliner")); @@ -1184,8 +1188,9 @@ void BlockState::initInterBlock(const MachineSchedContext &Context, // perform static assignment of multi-slot pseudos if (EnableMultiSlotInstrMaterialization && - PostSWP->isPostPipelineCandidate(*TheBlock)) - staticallyMaterializeMultiSlotInstructions(*TheBlock, HR); + PostSWP->isPostPipelineCandidate(*TheBlock)) { + staticallyMaterializeMultiSlotInstructions(*TheBlock, HR, MaterializeAll); + } } // We are called just after the first round of scheduling a block. diff --git a/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp index 589fbda67074..13cef94ec5b9 100644 --- a/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp +++ b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp @@ -15,6 +15,10 @@ #include "AIEMultiSlotInstrMaterializer.h" #include "AIEHazardRecognizer.h" +#include "AIESlotStatistics.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -24,6 +28,7 @@ static cl::opt SkipSingleSlotAssignment( "aie-skip-single-slot-assignment", cl::Hidden, cl::init(true), cl::desc("Skip preassigning if all multi-slot instr are assigned to the " "same Slot.")); + namespace llvm::AIE { class SlotMapping { @@ -189,6 +194,84 @@ bool assignSlots(SlotMapping &SlotToBanks, const MachineBasicBlock &MBB, return true; } +namespace { +void materializeMSP(MachineInstr *MSP, SlotStatistics &Statistics, + const AIEBaseInstrInfo *TII) { + + SlotCounts Counts = Statistics.MSPSlotCounts[MSP]; + + // 0. If we can't materialize in a given slot, we are worse + // 1. If Fixed is smaller than an alternative, we won't be increasing the + // slot requirements + // 2. If Free is smaller than an alternative, we lower the probability that + // we force another MSP past the current maximum. 
+ auto Better = [&Statistics, &Counts](int A, int B) { + if (!Counts.at(A)) { + return false; + } + if (!Counts.at(B)) { + return true; + } + if (Statistics.Fixed.at(A) == Statistics.Fixed.at(B)) { + return Statistics.Free.at(A) < Statistics.Free.at(B); + } + return Statistics.Fixed.at(A) < Statistics.Fixed.at(B); + }; + int BestSlot = 0; + for (int S = 1; S < Counts.size(); S++) { + if (Better(S, BestSlot)) { + BestSlot = S; + } + } + // We should always find an alternative, even if it's not perfect. + assert(Counts.at(BestSlot)); + // Reverse lookup of the alternative that matches BestSlot. + auto FindOpcode = [TII, Opcode = MSP->getOpcode()](int BestSlot) { + const AIEBaseMCFormats *Formats = TII->getFormatInterface(); + auto *Alternatives = Formats->getAlternateInstsOpcode(Opcode); + for (auto Opcode : *Alternatives) { + if (Formats->getSlotKind(Opcode) == MCSlotKind(BestSlot)) { + return Opcode; + } + } + llvm_unreachable("BestSlot alternative not found"); + }; + + LLVM_DEBUG(dbgs() << "Materializing " << *MSP); + + // Materialize MSP + MSP->setDesc(TII->get(FindOpcode(BestSlot))); + // Update statistics + Statistics.Fixed[BestSlot] += SlotStatistics::Unit; + Statistics.Free -= Counts; + LLVM_DEBUG(dbgs() << " to " << *MSP); + LLVM_DEBUG(dbgs() << "New Fixed:\n" << Statistics.Fixed << "\n"); +} + +void materializeToMinimizeSlotTotals(MachineBasicBlock &MBB, + const AIEBaseInstrInfo *TII) { + SlotStatistics Statistics = computeSlotStatistics(MBB, TII); + // Sort the list by increasing alternative count + llvm::sort(Statistics.MSPs, [Formats = TII->getFormatInterface()]( + MachineInstr *A, MachineInstr *B) { + auto *AltA = Formats->getAlternateInstsOpcode(A->getOpcode()); + auto *AltB = Formats->getAlternateInstsOpcode(B->getOpcode()); + + return AltA->size() < AltB->size(); + }); + LLVM_DEBUG(dbgs() << "Statistics:\n"); + LLVM_DEBUG(Statistics.dump()); + LLVM_DEBUG(dbgs() << "----\n"); + + // This is still pretty greedy. 
Just materialize each instruction + // based on the current total slotcounts. + for (auto *MSP : Statistics.MSPs) { + materializeMSP(MSP, Statistics, TII); + } +} + +} // namespace + /// Materialise \p MI into its slot assigned by \p SlotToBanks . void materializeInstr(MachineInstr &MI, const SlotMapping &SlotToBanks, const AIEBaseInstrInfo *TII, @@ -223,7 +306,8 @@ void materializeSlots(const SlotMapping &SlotToBanks, MachineBasicBlock &MBB, } void staticallyMaterializeMultiSlotInstructions(MachineBasicBlock &MBB, - const AIEHazardRecognizer &HR) { + const AIEHazardRecognizer &HR, + bool MaterializeAll) { LLVM_DEBUG(dbgs() << "Statically Assigning multi slot pseudos for " << MBB.getName() << "\n"); @@ -232,14 +316,17 @@ void staticallyMaterializeMultiSlotInstructions(MachineBasicBlock &MBB, auto SlotToBanks = getAssignedSlots(MBB, TII, HR); - if (!assignSlots(SlotToBanks, MBB, TII, HR)) { + if (assignSlots(SlotToBanks, MBB, TII, HR)) { + materializeSlots(SlotToBanks, MBB, TII, HR); + } else { LLVM_DEBUG( dbgs() - << "Could not find Slot Assignments, Skipping materialization\n"); - return; + << "Could not find slot assignments, skipping bank materialization\n"); } - materializeSlots(SlotToBanks, MBB, TII, HR); + if (MaterializeAll) { + materializeToMinimizeSlotTotals(MBB, TII); + } } } // namespace llvm::AIE // diff --git a/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h index 84799906353c..0e4888d33fe1 100644 --- a/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h +++ b/llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.h @@ -12,11 +12,11 @@ // block loop to help loop pipelining. // //===----------------------------------------------------------------------===// -#include "AIEBaseInstrInfo.h" namespace llvm { class AIEHazardRecognizer; -} +class MachineBasicBlock; +} // namespace llvm namespace llvm::AIE { @@ -24,6 +24,7 @@ namespace llvm::AIE { /// \p MBB . 
/// FIXME: Currently we are only handling multi-slot memory load pseudos. void staticallyMaterializeMultiSlotInstructions(MachineBasicBlock &MBB, - const AIEHazardRecognizer &HR); + const AIEHazardRecognizer &HR, + bool MaterializeAll = false); } // namespace llvm::AIE diff --git a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp index 35322a7dc1ec..c690f104427e 100644 --- a/llvm/lib/Target/AIE/AIEPostPipeliner.cpp +++ b/llvm/lib/Target/AIE/AIEPostPipeliner.cpp @@ -1189,10 +1189,12 @@ SolverData PostPipeliner::createSolverData() { const SUnit &SU = DAG->SUnits[N]; MachineInstr *const MI = SU.getInstr(); auto SlotKind = TII->getSlotKind(MI->getOpcode()); + const auto *SlotInfo = TII->getSlotInfo(SlotKind); + const uint64_t SlotConflicts = SlotInfo->getConflictSet(); const uint64_t MemoryBanks = HR.getMemoryBanks(MI); - const int Id = - Data.addInstruction(SlotKind, MemoryBanks, !isSideEffectFree(MI)); + const int Id = Data.addInstruction(SlotKind, SlotConflicts, MemoryBanks, + !isSideEffectFree(MI)); assert(unsigned(Id) == SU.NodeNum); for (auto Dep : SU.Preds) { const int From = Dep.getSUnit()->NodeNum; diff --git a/llvm/lib/Target/AIE/AIESWPSolver.cpp b/llvm/lib/Target/AIE/AIESWPSolver.cpp index e222d1d378ad..adf4a27e24f3 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.cpp +++ b/llvm/lib/Target/AIE/AIESWPSolver.cpp @@ -63,11 +63,12 @@ Slot &SolverData::addSlot(int N) { return It->second; } -int SolverData::addInstruction(int SlotNumber, uint64_t MemoryBanks, - bool HasSideEffect) { +int SolverData::addInstruction(int SlotNumber, uint64_t SlotConflicts, + uint64_t MemoryBanks, bool HasSideEffect) { Slot *const Slot = &addSlot(SlotNumber); const int Id = Instructions.size(); - Instructions.emplace_back(Id, Slot, MemoryBanks, HasSideEffect); + Instructions.emplace_back(Id, Slot, SlotConflicts, MemoryBanks, + HasSideEffect); Slot->Instructions.insert(Id); return Id; } @@ -153,6 +154,26 @@ void SWPSolver::latencies(const SolverData 
&Data) { } void SWPSolver::conflicts(const SolverData &Data) { + // Non-trivial Slot Conflicts. + // If there's overlap between one instruction's slot bit and another + // instruction's conflict bits, report a conflict + for (const auto &I : Data.getInstructions()) { + const int M = I.Id; + const uint64_t ISlotBit = uint64_t(1) << I.TheSlot->SlotNumber; + for (const auto &J : Data.getInstructions()) { + const int N = J.Id; + if (N >= M) { + break; + } + const uint64_t JSlotBit = uint64_t(1) << J.TheSlot->SlotNumber; + if ((J.SlotConflicts & ISlotBit) || (I.SlotConflicts & JSlotBit)) { + LLVM_DEBUG(dbgs() << "Slot conflict(" << M << ", " << N << ")\n"); + genConflict(M, N); + } + } + } + + // Memory bank conflicts for (const auto &I : Data.getInstructions()) { const int M = I.Id; if (!I.MemoryBanks) { diff --git a/llvm/lib/Target/AIE/AIESWPSolver.h b/llvm/lib/Target/AIE/AIESWPSolver.h index 2816f233fb2f..ea7803e10b08 100644 --- a/llvm/lib/Target/AIE/AIESWPSolver.h +++ b/llvm/lib/Target/AIE/AIESWPSolver.h @@ -60,13 +60,15 @@ class Instruction { public: const int Id; const Slot *const TheSlot; + const uint64_t SlotConflicts; const uint64_t MemoryBanks = 0; const bool HasSideEffect = true; int Depth = 0; int Height = 0; - Instruction(int Id, Slot *S, uint64_t MemoryBanks, bool HasSideEffect) - : Id(Id), TheSlot(S), MemoryBanks(MemoryBanks), - HasSideEffect(HasSideEffect) {} + Instruction(int Id, Slot *S, uint64_t SlotConflicts, uint64_t MemoryBanks, + bool HasSideEffect) + : Id(Id), TheSlot(S), SlotConflicts(SlotConflicts), + MemoryBanks(MemoryBanks), HasSideEffect(HasSideEffect) {} }; class ProblemSize { @@ -104,7 +106,8 @@ class SolverData { public: // Add an instruction to the problem. It returns a unique Id - int addInstruction(int Slot, uint64_t MemoryBanks, bool HasSideEffect); + int addInstruction(int Slot, uint64_t SlotConflicts, uint64_t MemoryBanks, + bool HasSideEffect); // Add a latency between two instructions to the problem. 
// Distance represents the iteration distance, i.e. the number of // cfg backedges it spans. diff --git a/llvm/lib/Target/AIE/AIESlotOccupancy.cpp b/llvm/lib/Target/AIE/AIESlotOccupancy.cpp new file mode 100644 index 000000000000..669d5d156666 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotOccupancy.cpp @@ -0,0 +1,386 @@ +//===--- AIESlotOccupancy.cpp - Generalized Slot Occupancy Model ---------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIESlotOccupancy.h" +#include "AIESlotStructure.h" +#include "MCTargetDesc/AIEMCFormats.h" +#include "llvm/Support/Debug.h" +#include + +#define DEBUG_TYPE "slot-occupancy" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// AIESlotStructure Implementation +//===----------------------------------------------------------------------===// + +SlotOccupancy AIESlotStructure::getCapacityBounds() const { + SlotOccupancy Bounds; + for (unsigned I = 0; I < MaxSlotClasses; ++I) { + Bounds.setCount(I, getCapacity(I)); + } + return Bounds; +} + +//===----------------------------------------------------------------------===// +// SlotOccupancy Implementation +//===----------------------------------------------------------------------===// + +SlotOccupancy::SlotOccupancy(SlotBits Slots) : Counts{} { + // Expand each bit in Slots to an occupation count of 1 + // This unifies regular slots (capacity 1) and MSP slot classes (capacity > 1) + // Each bit position corresponds to a slot class index + for (unsigned ClassIdx = 0; ClassIdx < MaxSlotClasses && Slots != 0; + ++ClassIdx) { + if (Slots & (SlotBits(1) << ClassIdx)) { + Counts[ClassIdx] = 1; + } + } 
+} + +bool SlotOccupancy::isEmpty() const { + return std::all_of(Counts.begin(), Counts.end(), + [](uint8_t C) { return C == 0; }); +} + +void SlotOccupancy::clear() { std::fill(Counts.begin(), Counts.end(), 0); } + +void SlotOccupancy::blockResources() { + // Set all class counts to maximum to prevent any scheduling + std::fill(Counts.begin(), Counts.end(), UINT8_MAX); +} + +SlotOccupancy &SlotOccupancy::operator|=(const SlotOccupancy &Other) { + // Sum occupation counts element-wise, saturating at UINT8_MAX so that a + // blocked occupancy (see blockResources) can never wrap around to zero. + for (unsigned I = 0; I < MaxSlotClasses; ++I) { + Counts[I] = std::min<unsigned>(Counts[I] + Other.Counts[I], UINT8_MAX); + } + return *this; +} + +SlotOccupancy SlotOccupancy::operator|(const SlotOccupancy &Other) const { + SlotOccupancy Result(*this); + Result |= Other; + return Result; +} + +bool SlotOccupancy::conflict(const SlotOccupancy &Other, + const AIEBaseMCFormats &FormatInterface) const { + // Get slot structure from format interface + const AIESlotStructure &SlotStructure = FormatInterface.getSlotStructure(); + + // Combine the two occupancies + const SlotOccupancy Combined = *this | Other; + + // Quick pruning 1: Check if all counts are within valid range + const SlotOccupancy CapacityBounds = SlotStructure.getCapacityBounds(); + if (!Combined.boundedBy(CapacityBounds)) + return true; + + const unsigned NumRealSlots = SlotStructure.getNumRealSlots(); + + // Quick pruning 2: Sum of occupancies must not exceed number of real slots + if (Combined.total() > NumRealSlots) + return true; + + // Start with real slot occupancies (these are fixed) + SlotBits RealSlotOccupancy = 0; + for (unsigned I = 0; I < NumRealSlots; ++I) { + if (Combined.Counts[I] > 0) { + RealSlotOccupancy |= (SlotBits(1) << I); + } + } + + // Try to materialize MSPs to real slots + return !tryMaterializeMSPs(FormatInterface, Combined, RealSlotOccupancy); +} + +namespace { + +/// Helper to assign MSP instances to available slots +/// \tparam AssignmentHandler Functor called 
for each assignment: (MSPClassIdx, +/// InstanceIdx, Bit) +/// \param AvailableSlots Bitmask of slots available for this MSP +/// \param Count Number of instances to assign +/// \param MSPClassIdx The MSP class being assigned +/// \param NewRealSlots Output: updated real slot occupancy +/// \param Handler Functor invoked for every assignment (may be a no-op) +/// \return true if all instances were successfully assigned +template <typename AssignmentHandler> +bool assignMSPInstances(SlotBits AvailableSlots, uint8_t Count, + unsigned MSPClassIdx, SlotBits &NewRealSlots, + AssignmentHandler &&Handler) { + SlotBits Available = AvailableSlots; + SlotBits BitMask = 1; + unsigned Bit = 0; + + for (unsigned Assigned = 0; Assigned < Count; ++Assigned) { + if (Available == 0) { + // Not enough free slots in the composition - cannot materialize + return false; + } + + // Find next available slot + while (!(Available & BitMask)) { + BitMask <<= 1; + ++Bit; + } + + NewRealSlots |= BitMask; + Available &= ~BitMask; // Consume this slot + + // Report this assignment to the handler + Handler(MSPClassIdx, Assigned, Bit); + + BitMask <<= 1; // Advance to next bit for next iteration + ++Bit; + } + + return true; +} + +} // anonymous namespace + +bool SlotOccupancy::tryMaterializeMSPs(const AIEBaseMCFormats &FormatInterface, + const SlotOccupancy &RemainingOccupancy, + SlotBits CurrentRealSlots) const { + const AIESlotStructure &SlotStructure = FormatInterface.getSlotStructure(); + const unsigned NumRealSlots = SlotStructure.getNumRealSlots(); + + // Find the first MSP class with non-zero count + unsigned MSPClassIdx = MaxSlotClasses; + for (unsigned I = NumRealSlots; I < MaxSlotClasses; ++I) { + if (RemainingOccupancy.at(I) > 0) { + MSPClassIdx = I; + break; + } + } + + // Base case: no more MSPs to materialize, check if result is feasible + if (MSPClassIdx == MaxSlotClasses) { + return FormatInterface.isFormatAvailable(CurrentRealSlots); + } + + // Get the composition for this MSP + const SlotBits Composition = 
SlotStructure.getMSPComposition(MSPClassIdx); + const uint8_t Count = RemainingOccupancy.at(MSPClassIdx); + + // Find available slots from the composition + const SlotBits AvailableSlots = Composition & ~CurrentRealSlots; + + // Greedily assign the MSP instances to available slots + SlotBits NewRealSlots = CurrentRealSlots; + if (!assignMSPInstances(AvailableSlots, Count, MSPClassIdx, NewRealSlots, + [](unsigned, unsigned, unsigned) {})) { + return false; + } + + // Create new remaining occupancy with this MSP class zeroed out + SlotOccupancy NewRemaining = RemainingOccupancy; + NewRemaining.setCount(MSPClassIdx, 0); + + // Recursively try to materialize remaining MSPs + return tryMaterializeMSPs(FormatInterface, NewRemaining, NewRealSlots); +} + +void SlotOccupancy::dump() const { + dbgs() << "SlotOccupancy: ["; + bool First = true; + for (unsigned I = 0; I < MaxSlotClasses; ++I) { + if (Counts[I] > 0) { + if (!First) + dbgs() << ", "; + dbgs() << I << ":" << static_cast(Counts[I]); + First = false; + } + } + dbgs() << "]\n"; +} + +bool SlotOccupancy::operator==(const SlotOccupancy &Other) const { + return Counts == Other.Counts; +} + +bool SlotOccupancy::boundedBy(const SlotOccupancy &Bounds) const { + for (unsigned I = 0; I < MaxSlotClasses; ++I) { + if (Counts[I] > Bounds.at(I)) + return false; + } + return true; +} + +unsigned SlotOccupancy::total() const { + unsigned Total = 0; + for (unsigned I = 0; I < MaxSlotClasses; ++I) { + Total += Counts[I]; + } + return Total; +} + +//===----------------------------------------------------------------------===// +// MSPSlotMapping Implementation +//===----------------------------------------------------------------------===// + +MSPSlotMapping::MSPSlotMapping() + : InstanceCounters{}, NumRealSlots(0), FormatInterface(nullptr) {} + +MSPSlotMapping::MSPSlotMapping(const SlotOccupancy &Occupancy, + const AIEBaseMCFormats &FI) + : CurrentOccupancy(Occupancy), InstanceCounters{}, NumRealSlots(0), + FormatInterface(&FI) { + 
const bool Success = computeMapping(Occupancy, FI); + (void)Success; + assert(Success && "Failed to compute mapping for feasible occupancy"); +} + +bool MSPSlotMapping::computeMapping(const SlotOccupancy &Occupancy, + const AIEBaseMCFormats &FI) { + // Clear any existing assignments + Assignments.clear(); + std::fill(InstanceCounters.begin(), InstanceCounters.end(), 0); + FormatInterface = &FI; + + const AIESlotStructure &SlotStructure = FI.getSlotStructure(); + NumRealSlots = SlotStructure.getNumRealSlots(); + + // Start with real slot occupancies (these are fixed) + SlotBits RealSlotOccupancy = 0; + for (unsigned I = 0; I < NumRealSlots; ++I) { + if (Occupancy.at(I) > 0) { + RealSlotOccupancy |= (SlotBits(1) << I); + } + } + + // Try to materialize MSPs to real slots + return tryMaterializeMSPsWithMapping(FI, Occupancy, RealSlotOccupancy, + Assignments); +} + +bool MSPSlotMapping::tryMaterializeMSPsWithMapping( + const AIEBaseMCFormats &FI, const SlotOccupancy &RemainingOccupancy, + SlotBits CurrentRealSlots, std::vector &Assignments) { + const AIESlotStructure &SlotStructure = FI.getSlotStructure(); + const unsigned NumRealSlots = SlotStructure.getNumRealSlots(); + + // Find the first MSP class with non-zero count + unsigned MSPClassIdx = MaxSlotClasses; + for (unsigned I = NumRealSlots; I < MaxSlotClasses; ++I) { + if (RemainingOccupancy.at(I) > 0) { + MSPClassIdx = I; + break; + } + } + + // Base case: no more MSPs to materialize, check if result is feasible + if (MSPClassIdx == MaxSlotClasses) { + return FI.isFormatAvailable(CurrentRealSlots); + } + + // Get the composition for this MSP + const SlotBits Composition = SlotStructure.getMSPComposition(MSPClassIdx); + const uint8_t Count = RemainingOccupancy.at(MSPClassIdx); + + // Find available slots from the composition + const SlotBits AvailableSlots = Composition & ~CurrentRealSlots; + + // Greedily assign the MSP instances to available slots + // Store the assignments for later retrieval + SlotBits 
NewRealSlots = CurrentRealSlots; + if (!assignMSPInstances(AvailableSlots, Count, MSPClassIdx, NewRealSlots, + [&Assignments](unsigned MSPClassIdx, + unsigned InstanceIdx, unsigned Bit) { + Assignments.push_back( + {MSPClassIdx, InstanceIdx, Bit}); + })) { + return false; + } + + // Create new remaining occupancy with this MSP class zeroed out + SlotOccupancy NewRemaining = RemainingOccupancy; + NewRemaining.setCount(MSPClassIdx, 0); + + // Recursively try to materialize remaining MSPs + return tryMaterializeMSPsWithMapping(FI, NewRemaining, NewRealSlots, + Assignments); +} + +unsigned MSPSlotMapping::materializeAlternative(unsigned SlotClassIdx) { + // For real slots, they materialize to themselves + if (SlotClassIdx < NumRealSlots) { + // Verify precondition: must have at least one instance + assert(CurrentOccupancy.at(SlotClassIdx) > 0 && + "Cannot materialize slot class with zero count"); + + // Real slots don't need transformation, just return the slot index + // The occupancy already reflects the real slot + return SlotClassIdx; + } + + // For MSPs, find the assignment + const uint8_t InstanceIdx = InstanceCounters[SlotClassIdx]; + + // Verify precondition: must have at least one unmaterialized instance + assert(CurrentOccupancy.at(SlotClassIdx) > InstanceIdx && + "Cannot materialize slot class with no remaining instances"); + + // Find the assignment for this MSP class and instance + for (const auto &Assignment : Assignments) { + if (Assignment.MSPClassIdx == SlotClassIdx && + Assignment.InstanceIdx == InstanceIdx) { + // Found the assignment - increment counter + InstanceCounters[SlotClassIdx]++; + + // Update current occupancy: add the real slot + CurrentOccupancy |= SlotOccupancy(SlotBits(1) << Assignment.RealSlotIdx); + + return Assignment.RealSlotIdx; + } + } + + // Should never reach here if precondition is satisfied + llvm_unreachable("No assignment found for slot class instance"); +} + +void MSPSlotMapping::clear() { + Assignments.clear(); + 
CurrentOccupancy.clear(); + std::fill(InstanceCounters.begin(), InstanceCounters.end(), 0); +} + +void MSPSlotMapping::dump() const { + dbgs() << "MSPSlotMapping:\n"; + if (Assignments.empty()) { + dbgs() << " (empty)\n"; + return; + } + + for (const auto &Assignment : Assignments) { + dbgs() << " MSP[" << Assignment.MSPClassIdx << "][" + << Assignment.InstanceIdx << "] -> Slot " << Assignment.RealSlotIdx + << "\n"; + } + + dbgs() << "Current Occupancy: "; + CurrentOccupancy.dump(); + + dbgs() << "Instance Counters: ["; + bool First = true; + for (unsigned I = 0; I < MaxSlotClasses; ++I) { + if (InstanceCounters[I] > 0) { + if (!First) + dbgs() << ", "; + dbgs() << I << ":" << static_cast(InstanceCounters[I]); + First = false; + } + } + dbgs() << "]\n"; +} diff --git a/llvm/lib/Target/AIE/AIESlotOccupancy.h b/llvm/lib/Target/AIE/AIESlotOccupancy.h new file mode 100644 index 000000000000..d6edc7f3991c --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotOccupancy.h @@ -0,0 +1,205 @@ +//===--- AIESlotOccupancy.h - Generalized Slot Occupancy Model ---*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file defines a generalized slot occupancy model that unifies regular +// slots and multi-slot pseudos (MSPs) using occupation counts. 
+// +// Key concepts: +// - Regular slots are treated as slot classes with capacity 1 +// - MSPs are slot classes with capacity > 1 +// - Conflict detection uses precomputed capacity bounds and format feasibility +// - No local materialization decisions during hazard checking +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIESLOTOCCUPANCY_H +#define LLVM_LIB_TARGET_AIE_AIESLOTOCCUPANCY_H + +#include "AIEBaseSubtarget.h" +#include "AIESlotStructure.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace llvm { + +// Forward declarations +class AIEBaseMCFormats; + +/// SlotOccupancy represents the occupancy state of slots in a cycle. +/// It unifies regular slots (capacity 1) and multi-slot pseudos (capacity > 1) +/// using occupation counts per slot class. +class SlotOccupancy { +public: + SlotOccupancy() : Counts{} {} + + /// Construct with concrete slot bits (for regular instructions) + explicit SlotOccupancy(SlotBits Slots); + + /// Check if this occupancy is empty + bool isEmpty() const; + + /// Clear all occupancy + void clear(); + + /// Block all resources (for reserved cycles) + void blockResources(); + + /// Merge another occupancy into this one + SlotOccupancy &operator|=(const SlotOccupancy &Other); + + /// Combine two occupancies (by-value) + SlotOccupancy operator|(const SlotOccupancy &Other) const; + + /// Check if adding Other would create a conflict + /// Uses three-layer checking: + /// 1. Quick prune: All counts within capacity bounds + /// 2. Quick prune: Total occupancy within real slot count + /// 3. 
Materialization check: MSPs can be placed on real slots in feasible + /// format + /// \param Other The occupancy to check for conflict + /// \param FormatInterface Interface providing slot structure and format + /// feasibility + bool conflict(const SlotOccupancy &Other, + const AIEBaseMCFormats &FormatInterface) const; + + /// Dump a human-readable representation + void dump() const; + + /// Equality comparison (for testing/debugging) + bool operator==(const SlotOccupancy &Other) const; + + /// Check if all counts are bounded by the given bounds + /// \param Bounds SlotOccupancy representing the upper bounds for each slot + /// class + /// \return true if all counts are within the bounds + bool boundedBy(const SlotOccupancy &Bounds) const; + + /// Get the total of all occupation counts + /// \return Sum of all counts + unsigned total() const; + + /// Get the occupation count for a specific slot class (STL-style) + /// \param ClassIdx The slot class index to query + /// \return The occupation count for that class + uint8_t at(unsigned ClassIdx) const { return Counts.at(ClassIdx); } + + /// Set the occupation count for a specific slot class + /// \param ClassIdx The slot class index to set + /// \param Count The new count value + void setCount(unsigned ClassIdx, uint8_t Count) { + assert(ClassIdx < MaxSlotClasses && "Class index out of bounds"); + Counts[ClassIdx] = Count; + } + +private: + /// Occupation counts per slot class + /// For regular slots: 0 or 1 + /// For MSP classes: 0 to capacity + std::array Counts; + + /// Helper to recursively try materializing MSPs to real slots + /// \param FormatInterface Interface for checking format feasibility + /// \param RemainingOccupancy Occupancy still to be materialized + /// \param CurrentRealSlots Current real slot occupancy (bit mask) + /// \return true if a feasible materialization exists + bool tryMaterializeMSPs(const AIEBaseMCFormats &FormatInterface, + const SlotOccupancy &RemainingOccupancy, + SlotBits 
CurrentRealSlots) const; +}; + +/// MSPSlotMapping stores the materialization of MSP instances to real slots. +/// This allows iterative querying to determine which real slot each MSP +/// instance should use. The class maintains internal counters to track which +/// instance of each MSP class is being queried, and updates a SlotOccupancy +/// to reflect materializations as they are retrieved. +class MSPSlotMapping { + /// Structure to hold assignment information for a single MSP instance + struct MSPAssignment { + unsigned MSPClassIdx; // Which MSP class + unsigned InstanceIdx; // Which instance of that class (0-based) + unsigned RealSlotIdx; // Which real slot it's assigned to + }; + + /// List of all MSP assignments + /// Ordered by MSP class index, then instance index + std::vector Assignments; + + /// Current occupancy state reflecting materializations retrieved so far + SlotOccupancy CurrentOccupancy; + + /// Counters tracking how many instances of each MSP class have been queried + std::array InstanceCounters; + + /// Number of real slots (cached from SlotStructure) + unsigned NumRealSlots; + + /// Reference to format interface (needed for materializeAlternative) + const AIEBaseMCFormats *FormatInterface; + + /// Compute a legal materialization for the given occupancy + /// \param Occupancy The slot occupancy to materialize + /// \param FormatInterface Interface for checking format feasibility and + /// providing slot structure + /// \return true if a legal materialization was found + bool computeMapping(const SlotOccupancy &Occupancy, + const AIEBaseMCFormats &FormatInterface); + + /// Helper to recursively compute MSP materializations + /// \param FormatInterface Interface for checking format feasibility + /// \param RemainingOccupancy Occupancy still to be materialized + /// \param CurrentRealSlots Current real slot occupancy (bit mask) + /// \param Assignments Output vector to store assignments + /// \return true if a feasible materialization exists + 
static bool + tryMaterializeMSPsWithMapping(const AIEBaseMCFormats &FormatInterface, + const SlotOccupancy &RemainingOccupancy, + SlotBits CurrentRealSlots, + std::vector &Assignments); + +public: + /// Default constructor creates an empty mapping + MSPSlotMapping(); + + /// Construct and compute mapping for the given occupancy + /// \param Occupancy The slot occupancy to materialize + /// \param FormatInterface Interface for checking format feasibility and + /// providing slot structure + MSPSlotMapping(const SlotOccupancy &Occupancy, + const AIEBaseMCFormats &FormatInterface); + + /// Materialize the next instance of a slot class (MSP or real slot) to a + /// real slot. For an MSP class this advances its instance counter and + /// merges the assigned real slot into the current occupancy. + /// \param SlotClassIdx The slot class index (can be MSP or real slot) + /// \return The real slot index this instance materializes to + /// \pre The slot class must have at least one unmaterialized instance + unsigned materializeAlternative(unsigned SlotClassIdx); + + /// Get the current occupancy state reflecting all materializations retrieved + /// so far + /// \return The current occupancy with MSP instances materialized to real + /// slots + const SlotOccupancy &getCurrentOccupancy() const { return CurrentOccupancy; } + + /// Check if the mapping is empty (no MSPs materialized) + bool isEmpty() const { return Assignments.empty(); } + + /// Clear all mappings and reset state + void clear(); + + /// Dump a human-readable representation + void dump() const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AIE_AIESLOTOCCUPANCY_H diff --git a/llvm/lib/Target/AIE/AIESlotStructure.h b/llvm/lib/Target/AIE/AIESlotStructure.h new file mode 100644 index 000000000000..650487417c92 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESlotStructure.h @@ -0,0 +1,75 @@ +//===--- AIESlotStructure.h - Slot Structure Query Interface ----*- C++ -*-===// +// +// This file is 
licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file defines an interface for querying slot structure information +// needed by the SlotOccupancy conflict detection system. +// +// The interface provides generator-supplied data: +// - Slot definitions (real slots come first, then MSP classes) +// - MSP compositions (which real slots each MSP can use) +// - Feasible formats (valid combinations of real slots) +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AIE_AIESLOTSTRUCTURE_H +#define LLVM_LIB_TARGET_AIE_AIESLOTSTRUCTURE_H + +#include "MCTargetDesc/AIEFormat.h" +#include +#include + +namespace llvm { + +// Forward declaration +class SlotOccupancy; + +// MaxSlotClasses is defined here rather than in AIESlotOccupancy.h, which +// includes this header; that avoids a circular include dependency +constexpr unsigned MaxSlotClasses = 32; + +/// Interface for querying slot structure information. +/// This provides generator-supplied data about slot classes, MSP compositions, +/// and feasible formats. The feasibility logic itself is implemented in +/// SlotOccupancy::conflict(). +/// +/// Complete unification: Real slots are MSPs with composition = themselves. +/// For example, slot A (index 0) has composition = (1 << 0). +/// Capacity is computed as popcount(composition). 
+class AIESlotStructure { +public: + virtual ~AIESlotStructure() = default; + + /// Get the number of real slots (non-MSP slot classes) + /// Real slots always come first in the slot class indexing + virtual unsigned getNumRealSlots() const = 0; + + /// Get the composition for any slot class (real or MSP) + /// For real slots: returns (1 << ClassIdx) + /// For MSP classes: returns bitmask of real slots it can use + /// \param ClassIdx The slot class index + /// \return Bitmask of real slots this class can occupy + virtual SlotBits getMSPComposition(unsigned ClassIdx) const = 0; + + /// Get the capacity (max occupancy) for a slot class + /// Computed as popcount(getMSPComposition(ClassIdx)) + /// \param ClassIdx The slot class index + /// \return The maximum occupation count for this class + uint8_t getCapacity(unsigned ClassIdx) const { + return llvm::popcount(getMSPComposition(ClassIdx)); + } + + /// Get the capacity bounds for all slot classes as a SlotOccupancy + /// \return SlotOccupancy with each slot class set to its capacity + SlotOccupancy getCapacityBounds() const; +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AIE_AIESLOTSTRUCTURE_H diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 5e5f42d9e4ab..37319c30154f 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -118,6 +118,7 @@ add_llvm_target(AIECodeGen AIERegClassConstrainer.cpp AIERegMemEventTracker.cpp AIESlotCounts.cpp + AIESlotOccupancy.cpp AIESlotStatistics.cpp AIESlotUtils.cpp AIESplitInstructionRewriter.cpp diff --git a/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp b/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp index be62b5711c49..8046284ea083 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp +++ b/llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include "../AIESlotStructure.h" 
#include "AIE2MCTargetDesc.h" #include "AIEMCFormats.h" @@ -21,13 +22,34 @@ namespace llvm { #define GET_FORMATS_SLOTINFOS_MAPPING #define GET_OPCODE_FORMATS_INDEX_FUNC #define GET_ALTERNATE_INST_OPCODE_FUNC +#define GET_SLOTSTRUCTURE_NUMREALSLOTS +#define GET_SLOTSTRUCTURE_COMPOSITIONS +#define GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS +#define GET_SLOTSTRUCTURE_MSP_MATERIALIZATION #include "AIE2GenFormats.inc" + namespace AIE2 { #define GET_FORMATS_FORMATS_DEFS #include "AIE2GenFormats.inc" } // namespace AIE2 -/***************** AIEMCFormats *******************/ +/// AIE2-specific SlotStructure implementation +class AIE2SlotStructureImpl final : public AIESlotStructure { +public: + unsigned getNumRealSlots() const override { return NumRealSlots; } + + SlotBits getMSPComposition(unsigned ClassIdx) const override { + if (ClassIdx >= TotalSlotClasses) { + return 0; + } + return SlotCompositions[ClassIdx]; + } +}; + +// Static instance +static const AIE2SlotStructureImpl SlotStructureInstance; + +/***************** AIE2MCFormats *******************/ const MCFormatDesc *AIE2MCFormats::getMCFormats() const { return AIE2::Formats; @@ -39,8 +61,21 @@ ArrayRef AIE2MCFormats::getIsFormatAvailable() const { return FormatAvailable; } +const AIESlotStructure &AIE2MCFormats::getSlotStructure() const { + return SlotStructureInstance; +} + SmallVector AIE2MCFormats::getLoadSlotKinds() const { return {MCSlotKind::AIE2_SLOT_LDB, MCSlotKind::AIE2_SLOT_LDA}; } +MultiSlotClass AIE2MCFormats::getMultiSlotClass(unsigned Opcode) const { + return getMSPClassIndexForOpcode(Opcode); +} + +unsigned AIE2MCFormats::getMaterializedOpcode(unsigned Opcode, + unsigned SlotIdx) const { + return getMaterializedOpcodeImpl(Opcode, SlotIdx); +} + } // end namespace llvm diff --git a/llvm/lib/Target/AIE/MCTargetDesc/AIEBaseMCFormats.cpp b/llvm/lib/Target/AIE/MCTargetDesc/AIEBaseMCFormats.cpp index c986aab70d75..39014133fa3b 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/AIEBaseMCFormats.cpp +++ 
b/llvm/lib/Target/AIE/MCTargetDesc/AIEBaseMCFormats.cpp @@ -7,6 +7,7 @@ // (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// +#include "../AIESlotStructure.h" #include "AIEMCFormats.h" #undef DEBUG_TYPE @@ -14,6 +15,29 @@ namespace llvm { +// Stub implementation of AIESlotStructure for default behavior +// This will be replaced by concrete implementations in each architecture +namespace { +class StubSlotStructure : public AIESlotStructure { +public: + unsigned getNumRealSlots() const override { + llvm_unreachable("SlotStructure not implemented for this architecture"); + } + + SlotBits getMSPComposition(unsigned ClassIdx) const override { + llvm_unreachable("SlotStructure not implemented for this architecture"); + } +}; + +static StubSlotStructure DefaultSlotStructure; +} // anonymous namespace + +const AIESlotStructure &AIEBaseMCFormats::getSlotStructure() const { + // Default implementation returns stub + // Concrete format classes will override this + return DefaultSlotStructure; +} + const MCSlotKind AIEPacketFormat::getSlot(unsigned Idx) const { // NOTE: we can't directly retrieve the slot by querying the SlotsMap as the // map doesn't keep the VLIW ordering (using the integer ordering of the enum diff --git a/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.cpp b/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.cpp index ae6863890d1f..2765193d6872 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.cpp +++ b/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// #include "AIEMCFormats.h" +#include "../AIESlotStructure.h" #include "AIE.h" #undef DEBUG_TYPE @@ -20,6 +21,10 @@ namespace llvm { #define GET_FORMATS_SLOTINFOS_MAPPING #define GET_OPCODE_FORMATS_INDEX_FUNC #define GET_ALTERNATE_INST_OPCODE_FUNC +#define GET_SLOTSTRUCTURE_NUMREALSLOTS +#define 
GET_SLOTSTRUCTURE_COMPOSITIONS +#define GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS +#define GET_SLOTSTRUCTURE_MSP_MATERIALIZATION #include "AIEGenFormats.inc" namespace AIE { @@ -28,6 +33,22 @@ namespace AIE { #include "AIEGenFormats.inc" } // end namespace AIE +/// AIE1-specific SlotStructure implementation +class AIESlotStructureImpl final : public AIESlotStructure { +public: + unsigned getNumRealSlots() const override { return NumRealSlots; } + + SlotBits getMSPComposition(unsigned ClassIdx) const override { + if (ClassIdx >= TotalSlotClasses) { + return 0; + } + return SlotCompositions[ClassIdx]; + } +}; + +// Static instance +static const AIESlotStructureImpl SlotStructureInstance; + /***************** AIEInstFormat *******************/ bool AIEInstFormat::hasSingleSlot() const { @@ -55,7 +76,6 @@ const MCSlotKind AIEInstFormat::getSlot() const { return SlotsMap.begin()->SlotKind; } - /***************** AIEMCFormats *******************/ const MCFormatDesc *AIEMCFormats::getMCFormats() const { return AIE::Formats; } @@ -66,4 +86,17 @@ ArrayRef AIEMCFormats::getIsFormatAvailable() const { return FormatAvailable; } +const AIESlotStructure &AIEMCFormats::getSlotStructure() const { + return SlotStructureInstance; +} + +MultiSlotClass AIEMCFormats::getMultiSlotClass(unsigned Opcode) const { + return getMSPClassIndexForOpcode(Opcode); +} + +unsigned AIEMCFormats::getMaterializedOpcode(unsigned Opcode, + unsigned SlotIdx) const { + return getMaterializedOpcodeImpl(Opcode, SlotIdx); +} + } // end namespace llvm diff --git a/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h b/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h index 32b4c1e59cbc..07a2bfab42db 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h +++ b/llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h @@ -26,6 +26,19 @@ namespace llvm { +// Forward declarations +class AIESlotStructure; + +/// MultiSlotClass represents a unified slot class index. +/// Real slots occupy indices [0..NumRealSlots-1]. 
+/// MSP classes occupy indices [NumRealSlots..TotalSlotClasses-1]. +/// This type unifies both regular instructions (which use a single real slot) +/// and multi-slot pseudos (which can materialize to multiple slots). +enum class MultiSlotClass : int { + NoClass = -1 // Invalid/unknown class + // Actual class indices are architecture-specific and start from 0 +}; + using SlotBits = uint64_t; class MCSlotInfo; class MCSlotKind { @@ -396,7 +409,31 @@ class AIEBaseMCFormats { llvm_unreachable("Target didn't implement getLoadSlotKinds()"); } - bool isFormatAvailable(uint64_t SlotSet) const; + virtual bool isFormatAvailable(uint64_t SlotSet) const; + + /// Get the slot structure interface for this architecture + /// \return Reference to the slot structure providing slot definitions, + /// MSP compositions, and capacity information + /// Default implementation returns a stub that will be overridden by + /// concrete format classes when slot structure is implemented + virtual const AIESlotStructure &getSlotStructure() const; + + /// Get the MultiSlotClass for an instruction opcode + /// \param Opcode The instruction opcode (can be MSP or real instruction) + /// \return The MultiSlotClass, or MultiSlotClass::NoClass if unknown + virtual MultiSlotClass getMultiSlotClass(unsigned Opcode) const { + return MultiSlotClass::NoClass; + } + + /// Get the materialized real instruction opcode for an MSP in a specific slot + /// \param MSPOpcode The MSP pseudo opcode + /// \param SlotIdx The target slot index (0-based, corresponding to real + /// slots) + /// \return The real instruction opcode, or 0 if invalid + virtual unsigned getMaterializedOpcode(unsigned MSPOpcode, + unsigned SlotIdx) const { + return 0; // Default: no materialization + } protected: /// Check if the Instruction is indeed into the Tables. 
@@ -413,6 +450,10 @@ class AIEMCFormats : public AIEBaseMCFormats { const MCFormatDesc *getMCFormats() const override; ArrayRef getIsFormatAvailable() const override; const PacketFormats &getPacketFormats() const override; + const AIESlotStructure &getSlotStructure() const override; + MultiSlotClass getMultiSlotClass(unsigned Opcode) const override; + unsigned getMaterializedOpcode(unsigned Opcode, + unsigned SlotIdx) const override; }; class AIE2MCFormats : public AIEBaseMCFormats { @@ -426,6 +467,10 @@ class AIE2MCFormats : public AIEBaseMCFormats { const PacketFormats &getPacketFormats() const override; ArrayRef getIsFormatAvailable() const override; SmallVector getLoadSlotKinds() const override; + const AIESlotStructure &getSlotStructure() const override; + MultiSlotClass getMultiSlotClass(unsigned Opcode) const override; + unsigned getMaterializedOpcode(unsigned Opcode, + unsigned SlotIdx) const override; }; class AIE2PMCFormats : public AIEBaseMCFormats { @@ -439,6 +484,10 @@ class AIE2PMCFormats : public AIEBaseMCFormats { const PacketFormats &getPacketFormats() const override; ArrayRef getIsFormatAvailable() const override; SmallVector getLoadSlotKinds() const override; + const AIESlotStructure &getSlotStructure() const override; + MultiSlotClass getMultiSlotClass(unsigned Opcode) const override; + unsigned getMaterializedOpcode(unsigned Opcode, + unsigned SlotIdx) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp b/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp index a272a51d1b94..7a2e88ec09de 100644 --- a/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp +++ b/llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include "../../AIESlotStructure.h" #include "AIE2PMCTargetDesc.h" #include "AIEMCFormats.h" @@ -21,13 +22,34 @@ namespace llvm { #define 
GET_FORMATS_SLOTINFOS_MAPPING #define GET_OPCODE_FORMATS_INDEX_FUNC #define GET_ALTERNATE_INST_OPCODE_FUNC +#define GET_SLOTSTRUCTURE_NUMREALSLOTS +#define GET_SLOTSTRUCTURE_COMPOSITIONS +#define GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS +#define GET_SLOTSTRUCTURE_MSP_MATERIALIZATION #include "AIE2PGenFormats.inc" + namespace AIE2P { #define GET_FORMATS_FORMATS_DEFS #include "AIE2PGenFormats.inc" } // namespace AIE2P -/***************** AIEMCFormats *******************/ +/// AIE2P-specific SlotStructure implementation +class AIE2PSlotStructureImpl final : public AIESlotStructure { +public: + unsigned getNumRealSlots() const override { return NumRealSlots; } + + SlotBits getMSPComposition(unsigned ClassIdx) const override { + if (ClassIdx >= TotalSlotClasses) { + return 0; + } + return SlotCompositions[ClassIdx]; + } +}; + +// Static instance +static const AIE2PSlotStructureImpl SlotStructureInstance; + +/***************** AIE2PMCFormats *******************/ const MCFormatDesc *AIE2PMCFormats::getMCFormats() const { return AIE2P::Formats; @@ -41,8 +63,21 @@ ArrayRef AIE2PMCFormats::getIsFormatAvailable() const { return FormatAvailable; } +const AIESlotStructure &AIE2PMCFormats::getSlotStructure() const { + return SlotStructureInstance; +} + SmallVector AIE2PMCFormats::getLoadSlotKinds() const { return {MCSlotKind::AIE2P_SLOT_LDB, MCSlotKind::AIE2P_SLOT_LDA}; } +MultiSlotClass AIE2PMCFormats::getMultiSlotClass(unsigned Opcode) const { + return getMSPClassIndexForOpcode(Opcode); +} + +unsigned AIE2PMCFormats::getMaterializedOpcode(unsigned Opcode, + unsigned SlotIdx) const { + return getMaterializedOpcodeImpl(Opcode, SlotIdx); +} + } // end namespace llvm diff --git a/llvm/lib/Target/AIE/docs/SlotStructureUnification.md b/llvm/lib/Target/AIE/docs/SlotStructureUnification.md new file mode 100644 index 000000000000..0812663a466b --- /dev/null +++ b/llvm/lib/Target/AIE/docs/SlotStructureUnification.md @@ -0,0 +1,246 @@ + + +# SlotStructure and MultiSlotPseudo Unification + 
+## Overview + +This document describes the unification of MultiSlotPseudos (MSPs) with +regular slots through a unified `MultiSlotClass` index domain. This +unification enables the `SlotStructure` interface to drive `SlotOccupancy` +conflict detection and MSP materialization with zero runtime computation +beyond table lookups. + +## Motivation + +Previously, MSPs and regular slots were handled separately: +- Regular instructions had a fixed slot assignment +- MSPs used `AlternateOpcodes` arrays to list possible materializations +- Slot conflict detection required runtime computation of MSP compositions + +The new design unifies these concepts: +- Both MSPs and regular slots are represented as slot classes with a + unified index +- MSP compositions are precomputed and stored in generated tables +- Materialization is a simple table lookup: `(MSP opcode, target slot) → + real opcode` + +## Key Concepts + +### MultiSlotClass + +A `MultiSlotClass` is a unified slot class index that can represent either: +- **Real slot** (indices 0..NumRealSlots-1): A single physical VLIW slot +- **MSP class** (indices NumRealSlots..TotalSlotClasses-1): A set of slots + an MSP can materialize to + +```cpp +enum class MultiSlotClass : int { + NoClass = -1 // Invalid/unknown class + // Actual indices are architecture-specific and start from 0 +}; +``` + +### Composition + +Each slot class has a **composition**: a bitmask of real slots it can +occupy. +- For real slots: composition = (1 << slotIdx) - a singleton +- For MSP classes: composition = OR of all real slot bits the MSP can use + +Example (AIE2): +```cpp +SlotCompositions[0] = 1 // Alu slot (bit 0) +SlotCompositions[1] = 2 // Lda slot (bit 1) +SlotCompositions[7] = 26 // MSP class: Lda|Mv|Lng (bits 1,3,4) +``` + +### Deduplication + +MSPs with identical compositions share a single `MultiSlotClass` index. + +Example: If `MOV_A` and `MOV_B` both materialize to {Lda, Mv, Lng}, they +get the same class index. 
+ +## Generated Tables + +The CodeGenFormat TableGen backend generates four new table sections per +architecture: + +### 1. Slot Class Counts + +```cpp +static constexpr unsigned NumRealSlots = 7; // AIE2 example +static constexpr unsigned NumMSPClasses = 9; +static constexpr unsigned TotalSlotClasses = 16; +``` + +### 2. Composition Table + +```cpp +static constexpr uint64_t const SlotCompositions[] = { + 1ULL /* Real slot: Alu */, + 2ULL /* Real slot: Lda */, + 4ULL /* Real slot: Ldb */, + 8ULL /* Real slot: Lng */, + 16ULL /* Real slot: Mv */, + 64ULL /* Real slot: St */, + 128ULL /* Real slot: Vec */, + 26ULL /* MSP class 7: Lda|Mv|Lng (0x1A = bits 1,3,4) */, + 10ULL /* MSP class 8: Lda|Lng (0x0A = bits 1,3) */, + // ... more MSP classes +}; +``` + +### 3. MSP Opcode → MultiSlotClass Mapping + +```cpp +static MultiSlotClass getMSPClassIndexForOpcode(unsigned Opcode) { + switch (Opcode) { + default: + return MultiSlotClass::NoClass; + case AIE2::MOV_PD_imm10_pseudo: + return static_cast<MultiSlotClass>(7); + // ... + } +} +``` + +### 4. MSP Materialization Mapping + +```cpp +static unsigned getMaterializedOpcodeImpl(unsigned Opcode, + unsigned SlotIdx) { + switch (Opcode) { + default: + return 0; // Not an MSP or invalid + case AIE2::MOV_PD_imm10_pseudo: + switch (SlotIdx) { + case 1: return AIE2::MOVA_lda_cg; // Lda slot + case 4: return AIE2::MOV_mv_cg; // Mv slot + case 3: return AIE2::MOVXM_lng_cg; // Lng slot + } + // ... + } +} +``` + +## Interface Usage + +### SlotStructure Interface + +```cpp +const AIESlotStructure &SS = FormatInterface.getSlotStructure(); + +// Get composition for any class (real or MSP) +SlotBits Composition = SS.getMSPComposition(ClassIdx); + +// Get capacity (number of slots in composition) +uint8_t Capacity = SS.getCapacity(ClassIdx); // popcount(composition) + +// Get number of real slots +unsigned NumReal = SS.getNumRealSlots(); +``` + +### FormatInterface Extensions + +```cpp +const AIEBaseMCFormats &FI = /* ... 
*/; + +// Get the MultiSlotClass for any instruction +MultiSlotClass Class = FI.getMultiSlotClass(Opcode); + +// Materialize an MSP to a specific slot +unsigned RealOpcode = FI.getMaterializedOpcode(MSPOpcode, SlotIdx); +``` + +## SlotOccupancy Integration + +`SlotOccupancy` uses `SlotStructure` for conflict detection: + +1. Combine occupancies by summing counts per class +2. Check capacity bounds using `SlotStructure.getCapacityBounds()` +3. Try materializing MSPs: + - Get composition via `SlotStructure.getMSPComposition(ClassIdx)` + - Greedily assign to available real slots + - Check feasibility via `FormatInterface.isFormatAvailable(realSlotBits)` + +All operations are table lookups - no runtime composition computation. + +## Migration Path from AlternateOpcodes + +The new design enables deprecation of `AlternateOpcodes`: + +**Old approach:** +```cpp +const std::vector<unsigned> *Alts = + FI.getAlternateInstsOpcode(MSPOpcode); +for (unsigned AltOpcode : *Alts) { + MCSlotKind Slot = FI.getSlotKind(AltOpcode); + // ... +} +``` + +**New approach:** +```cpp +MultiSlotClass Class = FI.getMultiSlotClass(MSPOpcode); +SlotBits Composition = FI.getSlotStructure().getMSPComposition( + static_cast<unsigned>(Class)); +// Iterate over set bits in Composition +for (unsigned SlotIdx = 0; SlotIdx < NumRealSlots; ++SlotIdx) { + if (Composition & (1ULL << SlotIdx)) { + unsigned RealOpcode = FI.getMaterializedOpcode(MSPOpcode, SlotIdx); + // ... 
+ } +} +``` + +## Implementation Files + +### TableGen Backend +- `llvm/utils/TableGen/CodeGenFormat.cpp` - Table generation logic +- `llvm/utils/TableGen/CodeGenFormat.h` - Backend interface + +### Generated Tables +- `Release/lib/Target/AIE/AIEGenFormats.inc` - AIE1 tables +- `Release/lib/Target/AIE/AIE2GenFormats.inc` - AIE2 tables +- `Release/lib/Target/AIE/AIE2PGenFormats.inc` - AIE2P tables + +### Runtime Interfaces +- `llvm/lib/Target/AIE/AIESlotStructure.h` - SlotStructure interface +- `llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.h` - FormatInterface with + MultiSlotClass +- `llvm/lib/Target/AIE/MCTargetDesc/AIEMCFormats.cpp` - AIE1 implementation +- `llvm/lib/Target/AIE/MCTargetDesc/AIE2MCFormats.cpp` - AIE2 implementation +- `llvm/lib/Target/AIE/MCTargetDesc/aie2p/AIE2PMCFormats.cpp` - AIE2P + implementation + +### Consumers +- `llvm/lib/Target/AIE/AIESlotOccupancy.cpp` - Uses SlotStructure for + conflict detection +- `llvm/lib/Target/AIE/AIEMultiSlotInstrMaterializer.cpp` - Can use new + materialization API + +## Future Work + +1. **Migrate Materializer**: Update `AIEMultiSlotInstrMaterializer` to use + `getMaterializedOpcode()` instead of `AlternateOpcodes` +2. **Deprecate AlternateOpcodes**: Once all consumers migrate, mark + `getAlternateInstsOpcode()` as deprecated +3. **Runtime Helpers**: Add convenience methods to build slot→opcodes maps + lazily if needed +4. **Extended Metadata**: Consider adding MSP class names or other metadata + to generated tables for debugging + +## Testing + +Test results with these changes: +- SlotOccupancy unit tests: 20/20 (100%) +- AIE CodeGen tests: 1802/1803 (99.94%) + +The single failing test is pre-existing and unrelated to these changes. 
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 1e3acaaae4fa..af9e64e9ef19 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -1,3 +1,10 @@ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Modifications (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + llvm_canonicalize_cmake_booleans( BUILD_SHARED_LIBS HAVE_OCAMLOPT @@ -29,6 +36,7 @@ llvm_canonicalize_cmake_booleans( LLVM_INCLUDE_SPIRV_TOOLS_TESTS LLVM_APPEND_VC_REV LLVM_HAS_LOGF128 + LLVM_ENABLE_Z3_SOLVER ) configure_lit_site_cfg( diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir index 4891e5e1a90d..bac8b6e1a405 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc --mtriple=aie2 -O2 --start-before=postmisched %s \ # RUN: --debug-only=postpipeliner-summary -o - | FileCheck %s @@ -60,9 +60,9 @@ ; CHECK-NEXT: vshuffle x6, x8, x0, r3; vmac.f bml6, bml6, x10, x7, r2 ; CHECK-NEXT: vshuffle x2, x8, x0, r16; vmac.f bml5, bml5, x9, x7, r2 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 ; CHECK-NEXT: vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 ; CHECK-NEXT: vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 ; CHECK-NEXT: vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 diff --git a/llvm/test/CodeGen/AIE/aie2p/elongate/zol_phdr_loopbundles_112bytes.mir b/llvm/test/CodeGen/AIE/aie2p/elongate/zol_phdr_loopbundles_112bytes.mir index ad81b4739c0f..9a1f6d089d8b 100644 --- a/llvm/test/CodeGen/AIE/aie2p/elongate/zol_phdr_loopbundles_112bytes.mir +++ b/llvm/test/CodeGen/AIE/aie2p/elongate/zol_phdr_loopbundles_112bytes.mir @@ -43,9 +43,9 @@ ; CHECK-NEXT: vldb x0, [p0], #64; add r0, r0, #-1; vadd.16 x6, x4, x1 ; CHECK-NEXT: vldb x2, [p0], #64; vadd.16 x4, x6, x0 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: vldb x1, [p0], #128 + ; CHECK-NEXT: nopa ; vldb x1, [p0], #128; nops ; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup - ; CHECK-NEXT: vadd.16 x6, x4, x1 + ; CHECK-NEXT: nops ; vadd.16 x6, x4, x1 ; CHECK-NEXT: vadd.16 x4, x6, x0 ; CHECK-NEXT: nop ; CHECK-NEXT: vadd.16 x6, x4, x1 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir index 5376373100a2..7fb11e40e013 100644 --- 
a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v10-solver.mir @@ -22,65 +22,64 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm + ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm ; nops ; CHECK-NEXT: vldb.3d x7, [p7], d0; movs p4, p7 ; CHECK-NEXT: paddb [p4], m4 - ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: vldb x5, [p4, #64] + ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0 + ; CHECK-NEXT: vldb x9, [p4, #0]; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; add.nc lc, r0, #-3; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopx ; mov p5, p6; nops ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: 
vconv.bfp16ebs8.fp32 ex3, dm4; vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; nopv + ; CHECK-NEXT: padda [p5], m5; nopb ; nopx ; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] + ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle 
x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: mov p5, p6 - ; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; nopx ; vshuffle x9, x9, x5, r1; vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] + ; CHECK-NEXT: mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 
cml4, [p5, #0] ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: nop ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir index 719ace1080ad..91415faea0b5 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v6-solver.mir @@ -10,6 +10,7 @@ # RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \ # RUN: --start-before=postmisched \ # RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-solver-timeout=10000 \ # RUN: -o - | FileCheck %s # derived from GEMM_Bfp16_opt_0 @@ -32,22 +33,22 @@ ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm ; movs p4, p7 ; CHECK-NEXT: vldb.3d x7, [p7], d0 ; CHECK-NEXT: paddb [p4], m4 - ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: vldb x5, [p4, #64] + ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: movs p4, p7; vldb x4, [p7, #64]; add.nc lc, r0, #-3 ; CHECK-NEXT: 
vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: paddb [p4], m4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vldb x9, [p4, #0]; mov p5, p6 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vldb x5, [p4, #64]; mov p5, p6 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; movxm ls, #.LBB0_1 ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vldb x4, [p7, #64]; movs p4, p7; movxm le, #.L_LEnd0; vmul.f dm4, y4, y5, r2 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 ; CHECK-NEXT: paddb [p4], m4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vldb x9, [p4, #0]; mov p5, p6 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vldb x5, [p4, #64]; mov p5, p6 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: .p2align 4 @@ -56,8 +57,8 @@ ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vldb x4, [p7, #64]; movs p4, p7; nopxm ; vmul.f dm4, y4, y5, r2 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vldb.3d x7, [p7], d0; vconv.bfp16ebs8.fp32 ex2, dm4; nopx ; vshuffle x6, x7, x4, r0; nopv ; CHECK-NEXT: paddb [p4], m4; vshuffle x7, x7, x4, r1; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vldb x9, [p4, #0]; mov p5, p6; vmac.f dm2, dm2, ex0, ex3, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0; vmul.f 
dm4, y3, y5, r2 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vldb x5, [p4, #64]; mov p5, p6; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0; vmul.f dm4, y3, y5, r2 ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x9, x9, x5, r1; vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: padda [p5], m5; nopb ; vconv.bfp16ebs8.fp32 ex3, dm4; nopxm ; vmac.f dm0, dm0, ex2, ex3, r3 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir index 49e5498ad6d0..e0fffcf3895a 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/gemm-bfp16-v7-solver.mir @@ -22,65 +22,64 @@ ; CHECK-LABEL: gemm: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm + ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm ; nops ; CHECK-NEXT: vldb.3d x7, [p7], d0; movs p4, p7 ; CHECK-NEXT: paddb [p4], m4 - ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: vldb x5, [p4, #64] + ; CHECK-NEXT: vldb x9, [p4, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vldb x4, [p7, #64]; vshuffle x6, x7, x4, r0 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x7, x7, x4, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; mov p5, p6 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vshuffle x8, x9, 
x5, r0 + ; CHECK-NEXT: vldb x9, [p4, #0]; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; add.nc lc, r0, #-3; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopx ; vshuffle x6, x7, x4, r0; nops + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x7, x7, x4, r1 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; mov p5, p6 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6 - ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3 
- ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; nopv + ; CHECK-NEXT: padda [p5], m5; nopb ; nopx ; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] + ; CHECK-NEXT: vldb x4, [p7, #64]; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 + ; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x7, x7, x4, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; mov p5, p6; vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopa ; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: - ; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: mov p5, p6 + ; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; nopx ; vshuffle x9, x9, x5, r1; vconv.bfp16ebs8.fp32 ex3, dm4 + ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] ; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; 
vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vshuffle x7, x7, x4, r1; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; mov p5, p6; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1 ; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 - ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64] - ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2 + ; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0] ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm1, dm1, ex2, ex1, r3 - ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm0, dm0, ex2, ex3, r3 + ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4 ; CHECK-NEXT: nop ; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4 ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3 - ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 - ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 ; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3 + ; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3 + ; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3 ; CHECK-NEXT: ret lr ; CHECK-NEXT: 
nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/xmconflict.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/xmconflict.mir new file mode 100644 index 000000000000..8a106dafa37f --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/xmconflict.mir @@ -0,0 +1,112 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + + +# This is a case that fits in II=2, but where the solver fails to find the +# schedule if it doesn't take the conflict between XM and X and M into account. +# We deliberately select one fixed heuristic that doesn't find it either +# in order to not short-circuit the solver -- perhaps we should have a +# --runsolver-first option. +# Please ignore the uselessness of the code -- I know it is mainly +# loop invariant, but that is not the subject of this test. 
+ +# REQUIRES: enable_z3_solver +# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \ +# RUN: --start-before=postmisched \ +# RUN: --aie-postpipeliner-heuristic-runs=1 \ +# RUN: --aie-postpipeliner-heuristic=3 \ +# RUN: -o - | FileCheck %s + +--- | + define dso_local void @movxm(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr { + ; CHECK-LABEL: movxm: + ; CHECK: .p2align 4 + ; CHECK-NEXT: // %bb.0: // %entry + ; CHECK-NEXT: add.nc lc, r0, #-1 + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; mov r2, #4; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; movxm r1, #1234567; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_1: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopa ; nopb ; nops ; or r3, r1, r2; mov r2, #4; nopv + ; CHECK-NEXT: .L_LEnd0: + ; CHECK-NEXT: nopa ; nopb ; st r3, [p0, #0]; movxm r1, #1234567; nopv + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: // %bb.2: + ; CHECK-NEXT: nopa ; nopb ; or r3, r1, r2; nopm ; nops + ; CHECK-NEXT: st r3, [p0, #0] + ; CHECK-NEXT: ret lr + ; CHECK-NEXT: nop // Delay Slot 5 + ; CHECK-NEXT: nop // Delay Slot 4 + ; CHECK-NEXT: nop // Delay Slot 3 + ; CHECK-NEXT: nop // Delay Slot 2 + ; CHECK-NEXT: nop // Delay Slot 1 + entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %n) + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %d.addr.07 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %d, %for.body.preheader ] + %s.addr.06 = phi ptr addrspace(6) [ %incdec.ptr1, %for.body ], [ %s, 
%for.body.preheader ] + %0 = load i32, ptr addrspace(6) %s.addr.06, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, ptr addrspace(5) %d.addr.07, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %d.addr.07, i20 1 + %incdec.ptr1 = getelementptr inbounds i32, ptr addrspace(6) %s.addr.06, i20 1 + %1 = call i1 @llvm.loop.decrement.i32(i32 1) + br i1 %1, label %for.body, label %for.cond.cleanup, !llvm.loop !0 + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i1 @llvm.loop.decrement.i32(i32) + + !0 = distinct !{!0, !1, !2, !3} + !1 = !{!"llvm.loop.mustprogress"} + !2 = !{!"llvm.loop.itercount.range", i64 8} + !3 = !{!"llvm.loop.pipeline.initiationinterval", i64 2} +... +--- +name: movxm +alignment: 16 +tracksRegLiveness: true +body: | + bb.0.entry (align 16): + successors: %bb.2 + liveins: $p0, $r0 + + $lc = ADD_NC_mv_add_ri $r0, 0 + $ls = MOVXM %bb.2 + $le = MOVXM + + bb.2.for.body (align 16): + successors: %bb.2, %bb.3 + liveins: $p0 + $r1 = MOVXM 1234567 + $r2 = MOV_alu_mv_mv_mv_cg 4 + $r3 = OR $r1, $r2 + ST_dms_sts_idx_imm $r3, $p0, 0 + PseudoLoopEnd , %bb.2 + + bb.3 (align 16): + RET implicit $lr + DelayedSchedBarrier + +... diff --git a/llvm/test/CodeGen/AIE/lit.local.cfg b/llvm/test/CodeGen/AIE/lit.local.cfg index 3e574de4cbca..938cc458bb91 100644 --- a/llvm/test/CodeGen/AIE/lit.local.cfg +++ b/llvm/test/CodeGen/AIE/lit.local.cfg @@ -4,8 +4,16 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates if not 'AIE' in config.root.targets: config.unsupported = True config.substitutions.insert(0, ('%topdown-multi', '--issue-limit=6 --aie-bottomup-cycles=0')) + +# Z3 Solver support for AIE tests +# The enable_z3_solver config variable is set in llvm/test/lit.site.cfg.py.in +# based on the LLVM_ENABLE_Z3_SOLVER CMake option. We use hasattr() to safely +# check for its presence in case this lit.local.cfg is used in a context where +# the parent configuration hasn't been loaded. +if hasattr(config, 'enable_z3_solver') and config.enable_z3_solver: + config.available_features.add("enable_z3_solver") diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index 0d02920323d2..0c7218718d6d 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -1,3 +1,10 @@ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Modifications (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates + @LIT_SITE_CFG_IN_HEADER@ import sys @@ -60,6 +67,7 @@ config.llvm_raevict_model_autogenerated = @LLVM_RAEVICT_MODEL_AUTOGENERATED@ config.expensive_checks = @LLVM_ENABLE_EXPENSIVE_CHECKS@ config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@ config.dxil_tests = @LLVM_INCLUDE_DXIL_TESTS@ +config.enable_z3_solver = @LLVM_ENABLE_Z3_SOLVER@ config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ config.have_vc_rev = @LLVM_APPEND_VC_REV@ diff --git a/llvm/unittests/Target/AIE/CMakeLists.txt b/llvm/unittests/Target/AIE/CMakeLists.txt index 23a1a7105dc6..8446b5e7d457 100644 --- a/llvm/unittests/Target/AIE/CMakeLists.txt +++ b/llvm/unittests/Target/AIE/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_target_unittest(AIETests HazardRecognizerTest.cpp StaticBitSetTest.cpp SlotCountsTest.cpp + SlotOccupancyTest.cpp SlotStatisticsTest.cpp VirtUnrollAliasAnalysisTest.cpp ) diff --git a/llvm/unittests/Target/AIE/SlotOccupancyTest.cpp b/llvm/unittests/Target/AIE/SlotOccupancyTest.cpp new file mode 100644 index 000000000000..9bf6077aa046 --- /dev/null +++ b/llvm/unittests/Target/AIE/SlotOccupancyTest.cpp @@ -0,0 +1,445 @@ +//===- SlotOccupancyTest.cpp -----------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// +// Unit tests for SlotOccupancy. 
+// +//===----------------------------------------------------------------------===// + +#include "AIESlotOccupancy.h" +#include "AIESlotStructure.h" +#include "MCTargetDesc/AIEMCFormats.h" +#include "llvm/Support/MathExtras.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +// Mock slot structure mimicking AIE2P architecture +// Real slots (indices 0-6): A, B, X, M, S, V, L +// MSP classes (indices 7-10): +// - MSP_AB (index 7): can use A OR B +// - MSP_AXM (index 8): can use A OR X OR M +// - MSP_ABS (index 9): can use A OR B OR S +// - MSP_ABXM (index 10): can use A OR B OR X OR M +class MockSlotStructure : public AIESlotStructure { +public: + unsigned getNumRealSlots() const override { return 7; } + + SlotBits getMSPComposition(unsigned ClassIdx) const override { + // Real slots: composition = (1 << ClassIdx) + if (ClassIdx < 7) + return (1ULL << ClassIdx); + + // MSP classes + switch (ClassIdx) { + case 7: // MSP_AB: A + B + return (1ULL << 0) | (1ULL << 1); + case 8: // MSP_AXM: A + X + M + return (1ULL << 0) | (1ULL << 2) | (1ULL << 3); + case 9: // MSP_ABS: A + B + S + return (1ULL << 0) | (1ULL << 1) | (1ULL << 4); + case 10: // MSP_ABXM: A + B + X + M + return (1ULL << 0) | (1ULL << 1) | (1ULL << 2) | (1ULL << 3); + default: + return 0; + } + } +}; + +// Mock format interface +class MockFormatInterface : public AIEBaseMCFormats { + MockSlotStructure SlotStruct; + +public: + const AIESlotStructure &getSlotStructure() const override { + return SlotStruct; + } + + bool isFormatAvailable(uint64_t SlotSet) const override { + // Format rules: + // - Any combination of A, B, S, X, M, V is valid + // - L conflicts with X and M only + + bool HasL = (SlotSet & (1ULL << 6)) != 0; + bool HasX = (SlotSet & (1ULL << 2)) != 0; + bool HasM = (SlotSet & (1ULL << 3)) != 0; + + // L conflicts with X or M + if (HasL && (HasX || HasM)) + return false; + + return true; + } + + // Stub implementations for pure virtual methods + std::optional + 
getFormatDescIndex(unsigned int Opcode) const override { + return std::nullopt; + } + + const std::vector * + getAlternateInstsOpcode(unsigned int Opcode) const override { + return nullptr; + } + + const MCSlotInfo *getSlotInfo(const MCSlotKind Kind) const override { + return nullptr; + } + + const MCFormatDesc *getMCFormats() const override { return nullptr; } + + const PacketFormats &getPacketFormats() const override { + llvm_unreachable("Not implemented in mock"); + } + + ArrayRef getIsFormatAvailable() const override { return {}; } +}; + +// Utility function to create SlotOccupancy with a count for a specific class +SlotOccupancy makeOccupancy(unsigned ClassIdx, uint8_t Count) { + SlotOccupancy Result; + for (uint8_t I = 0; I < Count; ++I) { + Result |= SlotOccupancy(SlotBits(1) << ClassIdx); + } + return Result; +} + +} // anonymous namespace + +// Basic functionality tests +TEST(SlotOccupancy, ConstructorFromSlotBits) { + SlotOccupancy Occ(0b1010); + EXPECT_FALSE(Occ.isEmpty()); +} + +TEST(SlotOccupancy, ConstructorFromClassIndex) { + SlotOccupancy Occ = makeOccupancy(7, 2); + EXPECT_FALSE(Occ.isEmpty()); +} + +TEST(SlotOccupancy, Total) { + SlotOccupancy Occ1(0b0111); + EXPECT_EQ(Occ1.total(), 3u); + + SlotOccupancy Occ2 = makeOccupancy(7, 2); + EXPECT_EQ(Occ2.total(), 2u); +} + +TEST(SlotOccupancy, CapacityComputation) { + MockSlotStructure SS; + + // Real slots have capacity 1 (composition = single bit) + EXPECT_EQ(SS.getCapacity(0), 1u); // A + EXPECT_EQ(SS.getCapacity(1), 1u); // B + + // MSPs have capacity = popcount(composition) + EXPECT_EQ(SS.getCapacity(7), 2u); // MSP_AB {A,B} + EXPECT_EQ(SS.getCapacity(8), 3u); // MSP_AXM {A,X,M} + EXPECT_EQ(SS.getCapacity(9), 3u); // MSP_ABS {A,B,S} + EXPECT_EQ(SS.getCapacity(10), 4u); // MSP_ABXM {A,B,X,M} +} + +// MSP Materialization - MSP can use ANY slot from its composition + +TEST(SlotOccupancy, MSP_AB_NoConflictWithFixedA) { + MockFormatInterface FI; + // MSP_AB can use B since A is occupied + SlotOccupancy 
FixedA(0b0001); + SlotOccupancy MSP_AB = makeOccupancy(7, 1); + EXPECT_FALSE(MSP_AB.conflict(FixedA, FI)); +} + +TEST(SlotOccupancy, MSP_AB_NoConflictWithFixedB) { + MockFormatInterface FI; + // MSP_AB can use A since B is occupied + SlotOccupancy FixedB(0b0010); + SlotOccupancy MSP_AB = makeOccupancy(7, 1); + EXPECT_FALSE(MSP_AB.conflict(FixedB, FI)); +} + +TEST(SlotOccupancy, MSP_AB_ConflictsWithFixedAB) { + MockFormatInterface FI; + // MSP_AB cannot materialize if both A and B are occupied + SlotOccupancy FixedAB(0b0011); + SlotOccupancy MSP_AB = makeOccupancy(7, 1); + EXPECT_TRUE(MSP_AB.conflict(FixedAB, FI)); +} + +TEST(SlotOccupancy, TwoMSP_AB_NoConflict) { + MockFormatInterface FI; + // Two MSP_ABs can use A and B respectively + SlotOccupancy MSP1 = makeOccupancy(7, 1); + SlotOccupancy MSP2 = makeOccupancy(7, 1); + EXPECT_FALSE(MSP1.conflict(MSP2, FI)); +} + +TEST(SlotOccupancy, ThreeMSP_AB_ExceedsCapacity) { + MockFormatInterface FI; + // Three MSP_ABs exceed capacity of 2 + SlotOccupancy MSP1 = makeOccupancy(7, 2); + SlotOccupancy MSP2 = makeOccupancy(7, 1); + EXPECT_TRUE(MSP1.conflict(MSP2, FI)); +} + +TEST(SlotOccupancy, MSP_AXM_WithOneSlotOccupied) { + MockFormatInterface FI; + // MSP_AXM {A,X,M} can still use 2 other slots if one is occupied + SlotOccupancy FixedA(0b0001); + SlotOccupancy MSP_AXM = makeOccupancy(8, 1); + EXPECT_FALSE(MSP_AXM.conflict(FixedA, FI)); +} + +TEST(SlotOccupancy, MSP_AXM_WithTwoSlotsOccupied) { + MockFormatInterface FI; + // MSP_AXM can still use M if A and X are occupied + SlotOccupancy FixedAX(0b0101); + SlotOccupancy MSP_AXM = makeOccupancy(8, 1); + EXPECT_FALSE(MSP_AXM.conflict(FixedAX, FI)); +} + +TEST(SlotOccupancy, MSP_AXM_ConflictsWithAllSlotsOccupied) { + MockFormatInterface FI; + // MSP_AXM cannot materialize if A, X, and M are all occupied + SlotOccupancy FixedAXM(0b1101); + SlotOccupancy MSP_AXM = makeOccupancy(8, 1); + EXPECT_TRUE(MSP_AXM.conflict(FixedAXM, FI)); +} + +TEST(SlotOccupancy, 
TwoMSPs_AB_AXM_NoConflict) { + MockFormatInterface FI; + // MSP_AB can use B, MSP_AXM can use X or M - no conflict + SlotOccupancy MSP_AB = makeOccupancy(7, 1); + SlotOccupancy MSP_AXM = makeOccupancy(8, 1); + EXPECT_FALSE(MSP_AB.conflict(MSP_AXM, FI)); +} + +TEST(SlotOccupancy, MSP_ABXM_WithThreeSlotsOccupied) { + MockFormatInterface FI; + // MSP_ABXM can use the one remaining slot + SlotOccupancy FixedABX(0b0111); + SlotOccupancy MSP_ABXM = makeOccupancy(10, 1); + EXPECT_FALSE(MSP_ABXM.conflict(FixedABX, FI)); +} + +TEST(SlotOccupancy, FourMSP_ABXM_FillAllSlots) { + MockFormatInterface FI; + // Four MSP_ABXMs can each use one of A, B, X, M + SlotOccupancy MSP = makeOccupancy(10, 4); + EXPECT_FALSE(MSP.isEmpty()); + // Four instances must stay within the capacity bounds + MockSlotStructure SS; + const SlotOccupancy Bounds = SS.getCapacityBounds(); + EXPECT_TRUE(MSP.boundedBy(Bounds)); +} + +// L slot conflicts + +TEST(SlotOccupancy, LConflictsWithX) { + MockFormatInterface FI; + SlotOccupancy FixedL(0b1000000); + SlotOccupancy FixedX(0b0000100); + EXPECT_TRUE(FixedL.conflict(FixedX, FI)); +} + +TEST(SlotOccupancy, LConflictsWithM) { + MockFormatInterface FI; + SlotOccupancy FixedL(0b1000000); + SlotOccupancy FixedM(0b0001000); + EXPECT_TRUE(FixedL.conflict(FixedM, FI)); +} + +TEST(SlotOccupancy, MSP_AXM_NoConflictWithFixedL) { + MockFormatInterface FI; + // MSP_AXM can materialize to A, which doesn't conflict with L + SlotOccupancy FixedL(0b1000000); + SlotOccupancy MSP_AXM = makeOccupancy(8, 1); + EXPECT_FALSE(MSP_AXM.conflict(FixedL, FI)); +} + +TEST(SlotOccupancy, MSP_AB_NoConflictWithFixedL) { + MockFormatInterface FI; + // MSP_AB uses A or B, which don't conflict with L + SlotOccupancy FixedL(0b1000000); + SlotOccupancy MSP_AB = makeOccupancy(7, 1); + EXPECT_FALSE(MSP_AB.conflict(FixedL, FI)); +} + +TEST(SlotOccupancy, PureMSPs_AB_ABS_CannotMaterialize) { + MockFormatInterface FI; + // 2x MSP_AB will use A and B + // 2x MSP_ABS needs 2 slots from {A,B,S}, but A and B 
are taken + // Only S is available, which is insufficient for 2 instances + SlotOccupancy TwoMSP_AB = makeOccupancy(7, 2); // Uses A and B + SlotOccupancy TwoMSP_ABS = + makeOccupancy(9, 2); // Needs 2 from {A,B,S}, only S left + EXPECT_TRUE(TwoMSP_AB.conflict(TwoMSP_ABS, FI)); +} + +// MSPSlotMapping Tests + +TEST(MSPSlotMapping, EmptyMapping) { + MSPSlotMapping Mapping; + EXPECT_TRUE(Mapping.isEmpty()); + EXPECT_TRUE(Mapping.getCurrentOccupancy().isEmpty()); +} + +TEST(MSPSlotMapping, RealSlotsOnly) { + MockFormatInterface FI; + // Bundle with only real slots: A, B, X + SlotOccupancy Bundle(0b0111); + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_TRUE(Mapping.isEmpty()); // No MSPs to materialize + + // Real slots materialize to themselves + EXPECT_EQ(Mapping.materializeAlternative(0), 0u); // A -> A + EXPECT_EQ(Mapping.materializeAlternative(1), 1u); // B -> B + EXPECT_EQ(Mapping.materializeAlternative(2), 2u); // X -> X +} + +TEST(MSPSlotMapping, SingleMSP_AB) { + MockFormatInterface FI; + // Bundle with one MSP_AB instance + SlotOccupancy Bundle = makeOccupancy(7, 1); + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_FALSE(Mapping.isEmpty()); + + // MSP_AB should materialize to A (lowest available slot) + const unsigned Slot = Mapping.materializeAlternative(7); + EXPECT_EQ(Slot, 0u); // Should be A + + // Current occupancy should now reflect A being used + const SlotOccupancy &Current = Mapping.getCurrentOccupancy(); + EXPECT_EQ(Current.at(0), 1u); // A is occupied +} + +TEST(MSPSlotMapping, TwoMSP_AB_Instances) { + MockFormatInterface FI; + // Bundle with two MSP_AB instances + SlotOccupancy Bundle = makeOccupancy(7, 2); + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_FALSE(Mapping.isEmpty()); + + // First instance should get A, second should get B + const unsigned Slot1 = Mapping.materializeAlternative(7); + EXPECT_EQ(Slot1, 0u); // A + + const unsigned Slot2 = Mapping.materializeAlternative(7); + EXPECT_EQ(Slot2, 1u); // B + + // Current occupancy should 
reflect both A and B being used + const SlotOccupancy &Current = Mapping.getCurrentOccupancy(); + EXPECT_EQ(Current.at(0), 1u); // A is occupied + EXPECT_EQ(Current.at(1), 1u); // B is occupied +} + +TEST(MSPSlotMapping, MixedRealAndMSP) { + MockFormatInterface FI; + // Bundle with real slot A and MSP_AB + SlotOccupancy RealA(0b0001); + SlotOccupancy MSP_AB = makeOccupancy(7, 1); + SlotOccupancy Bundle = RealA | MSP_AB; + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_FALSE(Mapping.isEmpty()); + + // Real slot A materializes to itself + EXPECT_EQ(Mapping.materializeAlternative(0), 0u); + + // MSP_AB should materialize to B (A is already occupied) + const unsigned Slot = Mapping.materializeAlternative(7); + EXPECT_EQ(Slot, 1u); // B + + // Current occupancy should reflect both A and B + const SlotOccupancy &Current = Mapping.getCurrentOccupancy(); + EXPECT_EQ(Current.at(0), 1u); // A + EXPECT_EQ(Current.at(1), 1u); // B +} + +TEST(MSPSlotMapping, MultipleMSPClasses) { + MockFormatInterface FI; + // Bundle with MSP_AB and MSP_AXM + SlotOccupancy MSP_AB = makeOccupancy(7, 1); // Can use A or B + SlotOccupancy MSP_AXM = makeOccupancy(8, 1); // Can use A, X, or M + SlotOccupancy Bundle = MSP_AB | MSP_AXM; + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_FALSE(Mapping.isEmpty()); + + // MSP_AB should get A (lowest available) + const unsigned Slot1 = Mapping.materializeAlternative(7); + EXPECT_EQ(Slot1, 0u); // A + + // MSP_AXM should get X (A is taken, X is next lowest) + const unsigned Slot2 = Mapping.materializeAlternative(8); + EXPECT_EQ(Slot2, 2u); // X + + // Current occupancy should reflect A and X + const SlotOccupancy &Current = Mapping.getCurrentOccupancy(); + EXPECT_EQ(Current.at(0), 1u); // A + EXPECT_EQ(Current.at(2), 1u); // X +} + +TEST(MSPSlotMapping, ComplexBundle) { + MockFormatInterface FI; + // Complex bundle: Real slot S + 1x MSP_AB + 1x MSP_AXM + // This is feasible: MSP_AB can use A or B, MSP_AXM can use A, X, or M + SlotOccupancy RealS(0b10000); + 
SlotOccupancy OneMSP_AB = makeOccupancy(7, 1); + SlotOccupancy OneMSP_AXM = makeOccupancy(8, 1); + SlotOccupancy Bundle = RealS | OneMSP_AB | OneMSP_AXM; + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_FALSE(Mapping.isEmpty()); + + // Materialize real slot + EXPECT_EQ(Mapping.materializeAlternative(4), 4u); // S -> S + + // Materialize MSP_AB (should get A, lowest available) + const unsigned AB = Mapping.materializeAlternative(7); + EXPECT_EQ(AB, 0u); // A + + // MSP_AXM should get X (A is taken, X is next lowest in composition) + const unsigned AXM = Mapping.materializeAlternative(8); + EXPECT_EQ(AXM, 2u); // X + + const SlotOccupancy &Current = Mapping.getCurrentOccupancy(); + EXPECT_EQ(Current.at(0), 1u); // A + EXPECT_EQ(Current.at(2), 1u); // X + EXPECT_EQ(Current.at(4), 1u); // S +} + +TEST(MSPSlotMapping, IterativeRetrieval) { + MockFormatInterface FI; + // Bundle with 3x MSP_AXM (can use A, X, or M) + SlotOccupancy Bundle = makeOccupancy(8, 3); + + MSPSlotMapping Mapping(Bundle, FI); + EXPECT_FALSE(Mapping.isEmpty()); + + // Retrieve all three materializations + const unsigned Slot1 = Mapping.materializeAlternative(8); + const unsigned Slot2 = Mapping.materializeAlternative(8); + const unsigned Slot3 = Mapping.materializeAlternative(8); + + // Should get A, X, M in that order (greedy assignment uses lowest first) + EXPECT_EQ(Slot1, 0u); // A + EXPECT_EQ(Slot2, 2u); // X + EXPECT_EQ(Slot3, 3u); // M + + // All three slots should be occupied + const SlotOccupancy &Current = Mapping.getCurrentOccupancy(); + EXPECT_EQ(Current.at(0), 1u); // A + EXPECT_EQ(Current.at(2), 1u); // X + EXPECT_EQ(Current.at(3), 1u); // M +} diff --git a/llvm/utils/TableGen/CodeGenFormat.cpp b/llvm/utils/TableGen/CodeGenFormat.cpp index 6e4cbe5b1544..0eefc177b10d 100644 --- a/llvm/utils/TableGen/CodeGenFormat.cpp +++ b/llvm/utils/TableGen/CodeGenFormat.cpp @@ -163,6 +163,10 @@ void CodeGenFormat::run(raw_ostream &o) { o << " }\n}\n"; o << "#endif // 
GET_ALTERNATE_INST_OPCODE_FUNC\n\n"; + // Emit SlotStructure tables for unified slot class indexing + CodeGenFormat::emitSlotStructureTables(o, Target, Slots, InstFormats, + PseudoInstFormats); + if (InstFormats.size() > 0 && Slots.size() > 0) { o << "#ifdef GET_FORMATS_FORMATS_DEFS\n" << "#undef GET_FORMATS_FORMATS_DEFS\n\n"; @@ -1240,5 +1244,179 @@ bool TGFieldIterator::operator==(const TGFieldIterator &Other) const { return FieldCurrent == Other.FieldCurrent; } +void CodeGenFormat::emitSlotStructureTables( + raw_ostream &o, CodeGenTarget &Target, const TGTargetSlots &Slots, + const std::vector &InstFormats, + const std::vector &PseudoInstFormats) { + + const std::string TargetName = Target.getName().str(); + + // Count real slots (non-default, non-artificial) + unsigned NumRealSlots = 0; + for (const auto &[_, Slot] : Slots) { + if (!Slot.isDefaultSlot() && !Slot.isArtificial()) { + NumRealSlots++; + } + } + + // Build a map from instruction name to its slot bits + std::map InstrToSlotBits; + for (const TGInstrLayout &Inst : InstFormats) { + if (!Inst.hasMultipleSlotOptions() && !Inst.isPacketFormat()) { + // Find the slot for this instruction + for (const auto *SlotField : Inst.slots()) { + const TGTargetSlot *Slot = SlotField->getSlot(); + if (Slot && !Slot->isDefaultSlot()) { + InstrToSlotBits[Inst.getInstrName()] = Slot->getSlotBits(); + break; + } + } + } + } + + // Compute MSP compositions and deduplicate by composition + std::map CompositionToClassIdx; + std::vector> MSPCompositions; + + for (const TGInstrLayout &PseudoInst : PseudoInstFormats) { + uint64_t Composition = 0; + const std::vector &AltInsts = PseudoInst.getAlternateInsts(); + + // Compute composition from alternate instructions + for (const std::string &AltInstFull : AltInsts) { + // Extract instruction name from "Target::InstrName" + size_t ColonPos = AltInstFull.find("::"); + std::string AltInstName = (ColonPos != std::string::npos) + ? 
AltInstFull.substr(ColonPos + 2) + : AltInstFull; + + auto It = InstrToSlotBits.find(AltInstName); + if (It != InstrToSlotBits.end()) { + Composition |= It->second; + } + } + + if (Composition != 0) { + MSPCompositions.push_back({PseudoInst.getInstrName(), Composition}); + + // Assign class index if this composition is new + if (CompositionToClassIdx.find(Composition) == + CompositionToClassIdx.end()) { + CompositionToClassIdx[Composition] = + NumRealSlots + CompositionToClassIdx.size(); + } + } + } + + const unsigned NumMSPClasses = CompositionToClassIdx.size(); + const unsigned TotalClasses = NumRealSlots + NumMSPClasses; + + // Emit GET_SLOTSTRUCTURE_NUMREALSLOTS + o << "#ifdef GET_SLOTSTRUCTURE_NUMREALSLOTS\n" + << "#undef GET_SLOTSTRUCTURE_NUMREALSLOTS\n" + << "static constexpr unsigned NumRealSlots = " << NumRealSlots << ";\n" + << "static constexpr unsigned NumMSPClasses = " << NumMSPClasses << ";\n" + << "static constexpr unsigned TotalSlotClasses = " << TotalClasses << ";\n" + << "#endif // GET_SLOTSTRUCTURE_NUMREALSLOTS\n\n"; + + // Emit GET_SLOTSTRUCTURE_COMPOSITIONS using ConstTable + o << "#ifdef GET_SLOTSTRUCTURE_COMPOSITIONS\n" + << "#undef GET_SLOTSTRUCTURE_COMPOSITIONS\n"; + + ConstTable Compositions("uint64_t", "SlotCompositions"); + + // Real slots: composition is (1 << slotIdx) + for (const auto &[_, Slot] : Slots) { + if (!Slot.isDefaultSlot() && !Slot.isArtificial()) { + Compositions << Slot.getSlotBits() + << "ULL /* Real slot: " << Slot.getSlotName() << " */"; + Compositions.next(); + } + } + + // MSP classes: emit in order of class index + std::vector> MSPClassEntries; + for (const auto &[Composition, ClassIdx] : CompositionToClassIdx) { + MSPClassEntries.push_back({ClassIdx, Composition}); + } + std::sort(MSPClassEntries.begin(), MSPClassEntries.end()); + + for (const auto &[ClassIdx, Composition] : MSPClassEntries) { + Compositions << Composition << "ULL /* MSP class " << ClassIdx << " */"; + Compositions.next(); + } + + 
Compositions.finish(); + o << Compositions; + o << "#endif // GET_SLOTSTRUCTURE_COMPOSITIONS\n\n"; + + // Emit GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS + o << "#ifdef GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS\n" + << "#undef GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS\n" + << "static MultiSlotClass getMSPClassIndexForOpcode(unsigned Opcode) {\n" + << " switch (Opcode) {\n" + << " default:\n" + << " return MultiSlotClass::NoClass; // Not an MSP\n"; + + for (const auto &[MSPName, Composition] : MSPCompositions) { + unsigned ClassIdx = CompositionToClassIdx[Composition]; + o << " case " << TargetName << "::" << MSPName << ":\n" + << " return static_cast(" << ClassIdx << ");\n"; + } + + o << " }\n" + << "}\n" + << "#endif // GET_SLOTSTRUCTURE_MSP_OPCODE_TO_CLASS\n\n"; + + // Emit GET_SLOTSTRUCTURE_MSP_MATERIALIZATION + // This provides (opcode, slot index) -> real instruction opcode mapping + o << "#ifdef GET_SLOTSTRUCTURE_MSP_MATERIALIZATION\n" + << "#undef GET_SLOTSTRUCTURE_MSP_MATERIALIZATION\n" + << "static unsigned getMaterializedOpcodeImpl(unsigned Opcode, unsigned " + "SlotIdx) {\n" + << " switch (Opcode) {\n" + << " default:\n" + << " return 0; // Not an MSP or invalid\n"; + + // For each MSP, emit a nested switch on slot index + for (const TGInstrLayout &PseudoInst : PseudoInstFormats) { + const std::vector &AltInsts = PseudoInst.getAlternateInsts(); + if (AltInsts.empty()) + continue; + + o << " case " << TargetName << "::" << PseudoInst.getInstrName() << ":\n" + << " switch (SlotIdx) {\n" + << " default: return 0;\n"; + + // Map each alternate instruction to its slot index + for (const std::string &AltInstFull : AltInsts) { + // Extract instruction name + size_t ColonPos = AltInstFull.find("::"); + std::string AltInstName = (ColonPos != std::string::npos) + ? 
AltInstFull.substr(ColonPos + 2) + : AltInstFull; + + // Look up the slot bits recorded for this alternate instruction + auto It = InstrToSlotBits.find(AltInstName); + if (It != InstrToSlotBits.end()) { + uint64_t SlotBit = It->second; + // Slot index = position of the highest set bit in SlotBit + unsigned SlotIndex = 0; + while (SlotBit > 1) { + SlotBit >>= 1; + SlotIndex++; + } + o << " case " << SlotIndex << ": return " << AltInstFull << ";\n"; + } + } + + o << " }\n"; + } + + o << " }\n" + << "}\n" + << "#endif // GET_SLOTSTRUCTURE_MSP_MATERIALIZATION\n\n"; +} + static TableGen::Emitter::OptClass X("gen-instr-format", "Instruction Format Emitter"); diff --git a/llvm/utils/TableGen/CodeGenFormat.h b/llvm/utils/TableGen/CodeGenFormat.h index 6bcabd3d2ad3..e94c91febf32 100644 --- a/llvm/utils/TableGen/CodeGenFormat.h +++ b/llvm/utils/TableGen/CodeGenFormat.h @@ -134,6 +134,11 @@ class CodeGenFormat { unsigned BitPos); static void computeSlotSets(TGTargetSlots &Slots, std::vector &InstFormats); + static void + emitSlotStructureTables(raw_ostream &o, CodeGenTarget &Target, + const TGTargetSlots &Slots, + const std::vector &InstFormats, + const std::vector &PseudoInstFormats); }; /// Main class abstracting a CodeGenInstruction (CGI).