Skip to content

[AMDGPU] Use reverse iteration in CodeGenPrepare #145484

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: users/pierre-vh/rm-promote-i32-cgp
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 29 additions & 53 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
Expand Down Expand Up @@ -109,6 +110,7 @@ class AMDGPUCodeGenPrepareImpl
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
mutable Function *LdexpF32 = nullptr;
mutable SetVector<Value *> DeadVals;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this needs to be a set, the iteration shouldn't revisit the same instruction twice (at least other IR combiner passes seem to all use vectors)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't use a set because I thought we could visit twice, I used one because I call the recursive delete function and I need to avoid cases where it may delete another instruction in that set (and then we visit a dead pointer).

I can use a vector + find of course if you prefer


DenseMap<const PHINode *, bool> BreakPhiNodesCache;

Expand Down Expand Up @@ -285,28 +287,19 @@ bool AMDGPUCodeGenPrepareImpl::run() {
BreakPhiNodesCache.clear();
bool MadeChange = false;

Function::iterator NextBB;
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
BasicBlock *BB = &*FI;
NextBB = std::next(FI);

BasicBlock::iterator Next;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
I = Next) {
Next = std::next(I);

MadeChange |= visit(*I);

if (Next != E) { // Control flow changed
BasicBlock *NextInstBB = Next->getParent();
if (NextInstBB != BB) {
BB = NextInstBB;
E = BB->end();
FE = F.end();
}
}
for (BasicBlock &BB : reverse(F)) {
for (Instruction &I : make_early_inc_range(reverse(BB))) {
if (!DeadVals.contains(&I))
MadeChange |= visit(I);
}
}

while (!DeadVals.empty()) {
RecursivelyDeleteTriviallyDeadInstructions(
DeadVals.pop_back_val(), TLI, /*MSSAU*/ nullptr,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
DeadVals.pop_back_val(), TLI, /*MSSAU*/ nullptr,
DeadVals.pop_back_val(), TLI, /*MSSAU=*/ nullptr,

[&](Value *V) { DeadVals.remove(V); });
}

return MadeChange;
}

Expand Down Expand Up @@ -426,7 +419,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
Value *NewVal = insertValues(Builder, Ty, ResultVals);
NewVal->takeName(&I);
I.replaceAllUsesWith(NewVal);
I.eraseFromParent();
DeadVals.insert(&I);

return true;
}
Expand Down Expand Up @@ -500,10 +493,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
FoldedT, FoldedF);
NewSelect->takeName(&BO);
BO.replaceAllUsesWith(NewSelect);
BO.eraseFromParent();
DeadVals.insert(&BO);
if (CastOp)
CastOp->eraseFromParent();
Sel->eraseFromParent();
DeadVals.insert(CastOp);
DeadVals.insert(Sel);
return true;
}

Expand Down Expand Up @@ -900,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
if (NewVal) {
FDiv.replaceAllUsesWith(NewVal);
NewVal->takeName(&FDiv);
RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
DeadVals.insert(&FDiv);
}

return true;
Expand Down Expand Up @@ -1310,7 +1303,8 @@ within the byte are all 0.
static bool tryNarrowMathIfNoOverflow(Instruction *I,
const SITargetLowering *TLI,
const TargetTransformInfo &TTI,
const DataLayout &DL) {
const DataLayout &DL,
SetVector<Value *> &DeadVals) {
unsigned Opc = I->getOpcode();
Type *OldType = I->getType();

Expand Down Expand Up @@ -1365,7 +1359,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,

Value *Zext = Builder.CreateZExt(Arith, OldType);
I->replaceAllUsesWith(Zext);
I->eraseFromParent();
DeadVals.insert(I);
return true;
}

Expand All @@ -1376,7 +1370,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
TM.getTargetTransformInfo(F), DL))
TM.getTargetTransformInfo(F), DL, DeadVals))
return true;

bool Changed = false;
Expand Down Expand Up @@ -1441,7 +1435,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {

if (NewDiv) {
I.replaceAllUsesWith(NewDiv);
I.eraseFromParent();
DeadVals.insert(&I);
Changed = true;
}
}
Expand Down Expand Up @@ -1497,7 +1491,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
I.replaceAllUsesWith(ValOrig);
I.eraseFromParent();
DeadVals.insert(&I);
return true;
}

Expand Down Expand Up @@ -1539,7 +1533,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {

Fract->takeName(&I);
I.replaceAllUsesWith(Fract);
RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
DeadVals.insert(&I);
return true;
}

Expand Down Expand Up @@ -1827,7 +1821,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
}

I.replaceAllUsesWith(Vec);
I.eraseFromParent();
DeadVals.insert(&I);
return true;
}

Expand Down Expand Up @@ -1908,7 +1902,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
auto *Intrin = B.CreateIntrinsic(
I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
I.replaceAllUsesWith(Intrin);
I.eraseFromParent();
DeadVals.insert(&I);
return true;
}

Expand Down Expand Up @@ -2005,16 +1999,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *Fract = applyFractPat(Builder, FractArg);
Fract->takeName(&I);
I.replaceAllUsesWith(Fract);

RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
DeadVals.insert(&I);
return true;
}

static bool isOneOrNegOne(const Value *Val) {
const APFloat *C;
return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
}

// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
Type *Ty = Sqrt.getType()->getScalarType();
Expand All @@ -2035,18 +2023,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
if (ReqdAccuracy < 1.0f)
return false;

// FIXME: This is an ugly hack for this pass using forward iteration instead
// of reverse. If it worked like a normal combiner, the rsq would form before
// we saw a sqrt call.
auto *FDiv =
dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
FDiv->getFPAccuracy() >= 1.0f &&
canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
// TODO: We should also handle the arcp case for the fdiv with non-1 value
isOneOrNegOne(FDiv->getOperand(0)))
return false;

Value *SrcVal = Sqrt.getOperand(0);
bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

Expand All @@ -2070,7 +2046,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
NewSqrt->takeName(&Sqrt);
Sqrt.replaceAllUsesWith(NewSqrt);
Sqrt.eraseFromParent();
DeadVals.insert(&Sqrt);
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -726,16 +726,16 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE815:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE016:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE117:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE016]], double [[TMP11]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE218:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE117]], double [[TMP12]], i64 2
Expand All @@ -746,8 +746,8 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE39:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE28]], double [[TMP8]], i64 3
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE410:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE39]], double [[TMP9]], i64 4
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE420]], ptr [[OUT]], align 1
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
; CHECK-NEXT: ret void
;
entry:
Expand Down Expand Up @@ -1187,11 +1187,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE960:%.*]] = extractelement <5 x double> [[IN]], i64 4
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[COND5_END]]
; CHECK: cond5.end:
; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE046:%.*]] = insertelement <5 x double> poison, double [[TMP25]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE147:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE046]], double [[TMP26]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE248:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE147]], double [[TMP27]], i64 2
Expand All @@ -1204,11 +1204,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE859:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE450]], i64 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE061:%.*]] = insertelement <5 x double> poison, double [[TMP30]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE162:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE061]], double [[TMP31]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE263:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE162]], double [[TMP32]], i64 2
Expand Down
Loading
Loading