[AIE2] Combiners for 8x8->8x8 and 8x4->4x8 matrix transposes #76

Open · wants to merge 12 commits into base: vvandebe.shufflevector.pattern.optimization
29 changes: 22 additions & 7 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -27,6 +27,7 @@
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/InstrTypes.h"
#include <functional>
#include <optional>

namespace llvm {

@@ -245,19 +246,33 @@ class CombinerHelper {
/// or an implicit_def if \p Ops is empty.
void applyCombineShuffleConcat(MachineInstr &MI, SmallVector<Register> &Ops);

/// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
/// A function type that returns either the next value in a
/// shufflemask or an empty value. Each iteration should return
/// one value, like a Python iterator or a Lisp stream.
using GeneratorType = std::function<std::optional<int32_t>()>;
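
// A minimal illustration (not part of this patch) of how such a generator can
// be written and consumed; the concrete generators used by these combines
// live in CombinerHelper.cpp:
//
//   GeneratorType Gen = [I = 0]() mutable {
//     return I < 4 ? std::optional<int32_t>(I++) : std::nullopt;
//   };
//   while (std::optional<int32_t> Val = Gen()) {
//     // The loop body sees 0, 1, 2, 3 and stops once Gen is exhausted.
//   }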

/// Try to combine G_SHUFFLE_VECTOR into more efficient opcodes.
/// Returns true if MI changed.
///
/// \pre MI.getOpcode() == G_SHUFFLE_VECTOR.
bool tryCombineShuffleVector(MachineInstr &MI);
/// Check if the G_SHUFFLE_VECTOR \p MI can be replaced by a
/// concat_vectors.
/// \p Ops will contain the operands needed to produce the flattened
/// concat_vectors.
/// Check whether the shuffle mask of the G_SHUFFLE_VECTOR \p MI matches the
/// sequence produced by \p Generator.
///
/// \pre MI.getOpcode() == G_SHUFFLE_VECTOR.
bool matchCombineShuffleVector(MachineInstr &MI,
SmallVectorImpl<Register> &Ops);
bool matchCombineShuffleVector(MachineInstr &MI, GeneratorType Generator,
const size_t TargetDstSize);
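
// A usage sketch (illustrative only; adderGenerator is a file-local helper
// defined in CombinerHelper.cpp):
//
//   // Match a mask that counts 0, 1, ..., DstNumElts - 1, i.e. a plain
//   // concatenation of the two source vectors.
//   GeneratorType CountUp = adderGenerator(0, DstNumElts - 1, 1);
//   if (matchCombineShuffleVector(MI, CountUp, 2 * SrcNumElts)) {
//     // Rewrite MI, e.g. into a G_CONCAT_VECTORS.
//   }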

/// Create G_UNMERGE_VALUES instructions until the source has been narrowed
/// down to the target vector size.
///
/// Requires that the destination type divides the source register evenly.
/// \p DestinationIndex selects which of the destination-sized slices of the
/// source to extract.
Register createUnmergeValue(MachineInstr &MI, const Register SrcReg,
const Register DstReg, uint8_t DestinationIndex,
const uint32_t Start, const uint32_t End);
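
// For example (illustrative only), extracting the second destination-sized
// slice of SrcReg into DstReg:
//
//   createUnmergeValue(MI, SrcReg, DstReg, /*DestinationIndex=*/1,
//                      /*Start=*/0, /*End=*/SrcNumElts);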

/// Replace \p MI with a concat_vectors with \p Ops.
void applyCombineShuffleVector(MachineInstr &MI,
const ArrayRef<Register> Ops);
249 changes: 214 additions & 35 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -42,6 +42,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
#include <cstdint>
#include <functional>
#include <optional>
#include <tuple>

@@ -384,17 +386,221 @@ void CombinerHelper::applyCombineShuffleConcat(MachineInstr &MI,
MI.eraseFromParent();
}

// Create a generator that counts from From to To (inclusive) in increments of
// StepSize, and then yields an empty value.
CombinerHelper::GeneratorType
adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) {
int32_t Counter = From;
return [Counter, To, StepSize]() mutable {
std::optional<int32_t> OldCount = std::optional<int32_t>(Counter);
Counter += StepSize;
if (OldCount == (To + StepSize))
OldCount = {};
return OldCount;
};
}
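
// For example, adderGenerator(0, 3, 1) yields 0, 1, 2, 3 and then an empty
// optional, while adderGenerator(4, 7, 1) yields 4, 5, 6, 7. A caller would
// typically drain it like this (illustrative only):
//
//   CombinerHelper::GeneratorType Gen = adderGenerator(0, 3, 1);
//   while (std::optional<int32_t> Val = Gen()) {
//     // Use *Val; the loop sees 0, 1, 2, 3.
//   }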

// Advance to the next generator once the current one is exhausted, allowing
// generators to be chained.
CombinerHelper::GeneratorType
concatGenerators(SmallVector<CombinerHelper::GeneratorType> &Generators) {
auto *GeneratorIterator = Generators.begin();

return [GeneratorIterator, Generators]() mutable {
std::optional<int32_t> GenValue = (*GeneratorIterator)();
if (!GenValue.has_value() && GeneratorIterator != Generators.end()) {
GeneratorIterator++;
GenValue = (*GeneratorIterator)();
}
return GenValue;
};
}
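
// For example (illustrative only), chaining two adder generators:
//
//   SmallVector<CombinerHelper::GeneratorType> Gens = {
//       adderGenerator(0, 3, 1), adderGenerator(8, 11, 1)};
//   CombinerHelper::GeneratorType Chained = concatGenerators(Gens);
//   // Successive calls to Chained() yield 0, 1, 2, 3, 8, 9, 10, 11. Note
//   // that Gens must outlive Chained, since the lambda keeps an iterator
//   // into it.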

Register CombinerHelper::createUnmergeValue(
MachineInstr &MI, const Register SrcReg, const Register DstReg,
const uint8_t DestinationIndex, const uint32_t Start, const uint32_t End) {
Builder.setInsertPt(*MI.getParent(), MI);
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
assert((DstTy.isScalar() ||
(SrcTy.getNumElements() % DstTy.getNumElements()) == 0) &&
"destination vector must divide source cleanly");

const unsigned HalfElements = SrcTy.getNumElements() / 2;
const LLT ScalarTy = SrcTy.getScalarType();
const LLT HalfSizeTy = (HalfElements == 1)
? ScalarTy
: LLT::fixed_vector(HalfElements, ScalarTy);
const Register TmpReg = MRI.createGenericVirtualRegister(HalfSizeTy);
Register TargetReg = DstReg;
if (DstTy != HalfSizeTy) {
TargetReg = MRI.createGenericVirtualRegister(HalfSizeTy);
}

// Each destination fits n times into the source, and each iteration halves
// the source exactly. Therefore we need to pick which half to recurse into.
const uint32_t DstNumElements = DstTy.isVector() ? DstTy.getNumElements() : 1;
const uint32_t HalfWay = Start + ((End - Start) / 2);
const uint32_t Position = DestinationIndex * DstNumElements;

uint32_t NextStart, NextEnd;
if (Position < HalfWay) {
Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TargetReg, TmpReg},
{SrcReg});
NextStart = Start;
NextEnd = HalfWay;
} else {
Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TmpReg, TargetReg},
{SrcReg});
NextStart = HalfWay;
NextEnd = End;
}

if (HalfSizeTy.isVector() && DstTy != HalfSizeTy)
return createUnmergeValue(MI, TargetReg, DstReg, DestinationIndex,
NextStart, NextEnd);

return DstReg;
}
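
// For example (illustrative only), requesting destination index 2 of a
// <4 x s32> slice out of a <16 x s32> source proceeds in two steps:
//   1. Unmerge the <16 x s32> into two <8 x s32> halves and keep the upper
//      one (elements 8..15), because the requested slice starts at element 8.
//   2. Unmerge that <8 x s32> into two <4 x s32> halves and write the lower
//      one (elements 8..11) into DstReg.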

bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
const Register DstReg = MI.getOperand(0).getReg();
const Register SrcReg1 = MI.getOperand(1).getReg();
const Register SrcReg2 = MI.getOperand(2).getReg();

const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());

const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;

// This test is a bit silly, but it is required because some tests rely on
// the legalizer changing the type of the shufflevector.
if (DstTy.getScalarSizeInBits() == 1)
return false;

// {1, 2, ..., n} -> G_CONCAT_VECTOR
// Turns a shuffle vector that only increments into a concat vector
// instruction
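//
// For example (illustrative, not taken from the tests in this patch):
//   %d:_(<8 x s32>) = G_SHUFFLE_VECTOR %a(<4 x s32>), %b(<4 x s32>),
//                         shufflemask(0, 1, 2, 3, 4, 5, 6, 7)
// becomes
//   %d:_(<8 x s32>) = G_CONCAT_VECTORS %a(<4 x s32>), %b(<4 x s32>)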
GeneratorType CountUp = adderGenerator(0, DstNumElts - 1, 1);
SmallVector<Register, 4> Ops;
if (matchCombineShuffleVector(MI, Ops)) {

if (matchCombineShuffleVector(MI, CountUp, 2 * SrcNumElts)) {
// The shuffle is concatenating multiple vectors together.
// Collect the different operands for that.
Register UndefReg;
const Register Src1 = MI.getOperand(1).getReg();
const Register Src2 = MI.getOperand(2).getReg();

const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();

// The destination can be longer than the source, so we separate them into
// equal blocks and check them separately to see if one of the blocks can be
// copied whole.
unsigned NumConcat = DstNumElts / SrcNumElts;
unsigned Index = 0;
for (unsigned Concat = 0; Concat < NumConcat; Concat++) {
unsigned Target = (Concat + 1) * SrcNumElts;
while (Index < Target) {
int MaskElt = Mask[Index];
if (MaskElt >= 0) {
Ops.push_back((MaskElt < (int)SrcNumElts) ? Src1 : Src2);
break;
}
Index++;
}

if (Index == Target) {
if (!UndefReg) {
Builder.setInsertPt(*MI.getParent(), MI);
UndefReg = Builder.buildUndef(SrcTy).getReg(0);
}
Ops.push_back(UndefReg);
}

Index = Target;
}

applyCombineShuffleVector(MI, Ops);
return true;
}

// {1, 2, ..., |DstVector|} -> G_UNMERGE_VALUES
// Extracts the first chunk of the same size as the destination vector from
// the source
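//
// For example (illustrative only):
//   %d:_(<4 x s32>) = G_SHUFFLE_VECTOR %a(<8 x s32>), %b(<8 x s32>),
//                         shufflemask(0, 1, 2, 3)
// becomes
//   %d:_(<4 x s32>), %unused:_(<4 x s32>) = G_UNMERGE_VALUES %a(<8 x s32>)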
GeneratorType FirstQuarter = adderGenerator(0, DstNumElts - 1, 1);
if (matchCombineShuffleVector(MI, FirstQuarter, DstNumElts - 1)) {
// This optimization does not work if the target type is not a multiple of
// two; this can happen in some backends that support uneven vector types.
// We also need to make sure that the vector can be split into two.
if (SrcTy == DstTy || ((SrcNumElts / 2) % 2) != 0 ||
SrcNumElts % DstNumElts != 0)
return false;
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2;
createUnmergeValue(MI, TargetReg, DstReg, 0, 0, SrcNumElts);
MI.eraseFromParent();
return true;
}

// {|DstVector|+1, |DstVector|+2, ..., 2 * |DstVector|} -> G_UNMERGE_VALUES
// Extracts the second chunk of the same size as the destination vector from
// the source
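//
// For example (illustrative only):
//   %d:_(<4 x s32>) = G_SHUFFLE_VECTOR %a(<8 x s32>), %b(<8 x s32>),
//                         shufflemask(4, 5, 6, 7)
// becomes
//   %unused:_(<4 x s32>), %d:_(<4 x s32>) = G_UNMERGE_VALUES %a(<8 x s32>)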
GeneratorType SecondQuarter =
adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1);
if (matchCombineShuffleVector(MI, SecondQuarter, DstNumElts - 1)) {
if (((SrcNumElts / 2) % 2) != 0 || SrcNumElts % DstNumElts != 0)
return false;
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2;
createUnmergeValue(MI, TargetReg, DstReg, 1, 0, SrcNumElts);
MI.eraseFromParent();
return true;
}

// After this point, it is assumed that our shuffle vectors operate on vectors
// that can be split into two.
if ((DstNumElts % 2) != 0)
return false;

// {1, 2, ..., n/4, n/2+1, n/2+2, ..., 3n/4} -> G_UNMERGE_VALUES
// Take the first halves of the two vectors and concatenate them into one
// vector.
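//
// For example (illustrative only):
//   %d:_(<8 x s32>) = G_SHUFFLE_VECTOR %a(<8 x s32>), %b(<8 x s32>),
//                         shufflemask(0, 1, 2, 3, 8, 9, 10, 11)
// becomes (modulo register naming)
//   %alo:_(<4 x s32>), %ahi:_(<4 x s32>) = G_UNMERGE_VALUES %a(<8 x s32>)
//   %blo:_(<4 x s32>), %bhi:_(<4 x s32>) = G_UNMERGE_VALUES %b(<8 x s32>)
//   %d:_(<8 x s32>) = G_CONCAT_VECTORS %alo(<4 x s32>), %blo(<4 x s32>)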
GeneratorType FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1);
GeneratorType FirstEightB =
adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);

auto UnmergeMatcher = SmallVector<GeneratorType>{FirstEightA, FirstEightB};
GeneratorType FirstAndThird = concatGenerators(UnmergeMatcher);
if (matchCombineShuffleVector(MI, FirstAndThird, (DstNumElts / 2) - 1)) {
if (DstNumElts <= 2)
return false;
const Register DstReg = MI.getOperand(0).getReg();
const LLT HalfSrcTy =
LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
const Register HalfOfA = createUnmergeValue(
MI, MI.getOperand(1).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts);
const Register HalfOfB = createUnmergeValue(
MI, MI.getOperand(2).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts);

const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
if (Mask[0] <= 0) {
Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
} else {
Builder.buildMergeLikeInstr(DstReg, {HalfOfB, HalfOfA});
}

MI.eraseFromParent();
return true;
}

return false;
}

bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
SmallVectorImpl<Register> &Ops) {
GeneratorType Generator,
const size_t TargetDstSize) {
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
"Invalid instruction kind");
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
@@ -421,51 +627,24 @@ bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
//
// TODO: If the size between the source and destination don't match
// we could still emit an extract vector element in that case.
if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
return false;

// Check that the shuffle mask can be broken evenly between the
// different sources.
if (DstNumElts % SrcNumElts != 0)
if ((DstNumElts < TargetDstSize) && DstNumElts != 1)
return false;

// Mask length is a multiple of the source vector length.
// Check if the shuffle is some kind of concatenation of the input
// vectors.
unsigned NumConcat = DstNumElts / SrcNumElts;
SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
for (unsigned i = 0; i != DstNumElts; ++i) {
int Idx = Mask[i];
const int32_t ShiftIndex = Generator().value_or(-1);

// Undef value.
if (Idx < 0)
if (Idx < 0 || ShiftIndex < 0)
continue;

// Ensure the indices in each SrcType sized piece are sequential and that
// the same source is used for the whole piece.
if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
(ConcatSrcs[i / SrcNumElts] >= 0 &&
ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
if ((Idx % SrcNumElts != (ShiftIndex % SrcNumElts)))
return false;
// Remember which source this index came from.
ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
}

// The shuffle is concatenating multiple vectors together.
// Collect the different operands for that.
Register UndefReg;
Register Src2 = MI.getOperand(2).getReg();
for (auto Src : ConcatSrcs) {
if (Src < 0) {
if (!UndefReg) {
Builder.setInsertPt(*MI.getParent(), MI);
UndefReg = Builder.buildUndef(SrcType).getReg(0);
}
Ops.push_back(UndefReg);
} else if (Src == 0)
Ops.push_back(Src1);
else
Ops.push_back(Src2);
}
return true;
}

12 changes: 12 additions & 0 deletions llvm/lib/Target/AIE/AIE2InstrPatterns.td
@@ -597,6 +597,18 @@ def : Pat<(int_aie2_vshuffle VEC512:$s1, VEC512:$s2, eR:$mod),
def : Pat<(int_aie2_vshuffle_bf16 VEC512:$s1, VEC512:$s2, eR:$mod),
(VSHUFFLE VEC512:$s1, VEC512:$s2, eR:$mod)>;

// VSHUFFLE generic opcodes translation
def vshuffle_node : SDNode<"AIE2::G_AIE_VSHUFFLE",
SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>]>>;
def : GINodeEquiv<G_AIE_VSHUFFLE, vshuffle_node>;

def : Pat<(v16i32 (vshuffle_node (v16i32 VEC512:$v0), (v16i32 VEC512:$v1), (i32 eR:$mode))),
(VSHUFFLE VEC512:$v0, VEC512:$v1, i32:$mode)>;
def : Pat<(v32i16 (vshuffle_node (v32i16 VEC512:$v0), (v32i16 VEC512:$v1), (i32 eR:$mode))),
(VSHUFFLE VEC512:$v0, VEC512:$v1, i32:$mode)>;
def : Pat<(v64i8 (vshuffle_node (v64i8 VEC512:$v0), (v64i8 VEC512:$v1), (i32 eR:$mode))),
(VSHUFFLE VEC512:$v0, VEC512:$v1, i32:$mode)>;

// VSHIFT Intrinsic (shift/shiftx/shift_bytes)
def : Pat<(int_aie2_vshift_I512_I512 VEC512:$s1, VEC512:$s2, 0x0, eR:$shift),
(VSHIFT VEC512:$s1, VEC512:$s2, eR:$shift)>;