Skip to content

Commit 38200e9

Browse files
authored
[DAG] visitFREEZE - always allow freezing multiple operands (#145939)
Always try to fold freeze(op(....)) -> op(freeze(),freeze(),freeze(),...). This patch proposes we drop the opt-in limit for opcodes that are allowed to push a freeze through the op to freeze all its operands, through the tree towards the roots. I'm struggling to find a strong reason for this limit apart from the DAG freeze handling being immature for so long - as we've improved coverage in canCreateUndefOrPoison/isGuaranteedNotToBeUndefOrPoison it looks like the regressions are not as severe. Hopefully this will help some of the regression issues in #143102 etc.
1 parent 4c7a706 commit 38200e9

18 files changed

+1702
-1806
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16609,22 +16609,14 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1660916609
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
1661016610
// Try to push freeze through instructions that propagate but don't produce
1661116611
// poison as far as possible. If an operand of freeze meets these
16612-
// conditions 1) one-use, 2) does not produce poison, and 3) has all but one
16613-
// guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
16612+
// conditions 1) one-use, and 2) does not produce poison then push
1661416613
// the freeze through to the operands that are not guaranteed non-poison.
1661516614
// NOTE: we will strip poison-generating flags, so ignore them here.
1661616615
if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
1661716616
/*ConsiderFlags*/ false) ||
1661816617
N0->getNumValues() != 1 || !N0->hasOneUse())
1661916618
return SDValue();
1662016619

16621-
bool AllowMultipleMaybePoisonOperands =
16622-
N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
16623-
N0.getOpcode() == ISD::BUILD_VECTOR ||
16624-
N0.getOpcode() == ISD::BUILD_PAIR ||
16625-
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
16626-
N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
16627-
1662816620
// Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
1662916621
// ones" or "constant" into something that depends on FrozenUndef. We can
1663016622
// instead pick undef values to keep those properties, while at the same time
@@ -16657,10 +16649,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
1665716649
MaybePoisonOperandNumbers.push_back(OpNo);
1665816650
if (!HadMaybePoisonOperands)
1665916651
continue;
16660-
if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
16661-
// Multiple maybe-poison ops when not allowed - bail out.
16662-
return SDValue();
16663-
}
1666416652
}
1666516653
// NOTE: the whole op may be not guaranteed to not be undef or poison because
1666616654
// it could create undef or poison due to its poison-generating flags.
@@ -23184,13 +23172,16 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2318423172

2318523173
// Ensure all the operands are the same value type, fill any missing
2318623174
// operands with UNDEF and create the BUILD_VECTOR.
23187-
auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
23175+
auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops,
23176+
bool FreezeUndef = false) {
2318823177
assert(Ops.size() == NumElts && "Unexpected vector size");
23178+
SDValue UndefOp = FreezeUndef ? DAG.getFreeze(DAG.getUNDEF(MaxEltVT))
23179+
: DAG.getUNDEF(MaxEltVT);
2318923180
for (SDValue &Op : Ops) {
2319023181
if (Op)
2319123182
Op = VT.isInteger() ? DAG.getAnyExtOrTrunc(Op, DL, MaxEltVT) : Op;
2319223183
else
23193-
Op = DAG.getUNDEF(MaxEltVT);
23184+
Op = UndefOp;
2319423185
}
2319523186
return DAG.getBuildVector(VT, DL, Ops);
2319623187
};
@@ -23204,6 +23195,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
2320423195
if (CurVec.isUndef())
2320523196
return CanonicalizeBuildVector(Ops);
2320623197

23198+
// FREEZE(UNDEF) - build new BUILD_VECTOR from already inserted operands.
23199+
if (ISD::isFreezeUndef(CurVec.getNode()) && CurVec.hasOneUse())
23200+
return CanonicalizeBuildVector(Ops, /*FreezeUndef=*/true);
23201+
2320723202
// BUILD_VECTOR - insert unused operands and build new BUILD_VECTOR.
2320823203
if (CurVec.getOpcode() == ISD::BUILD_VECTOR && CurVec.hasOneUse()) {
2320923204
for (unsigned I = 0; I != NumElts; ++I)

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3237,6 +3237,9 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
32373237

32383238
SDValue Src = Op.getOperand(0);
32393239

3240+
// Freeze the source since we are increasing the number of uses.
3241+
Src = DAG.getFreeze(Src);
3242+
32403243
MVT ContainerVT = VT;
32413244
if (VT.isFixedLengthVector()) {
32423245
ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
@@ -3254,9 +3257,6 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
32543257
std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
32553258
}
32563259

3257-
// Freeze the source since we are increasing the number of uses.
3258-
Src = DAG.getFreeze(Src);
3259-
32603260
// We do the conversion on the absolute value and fix the sign at the end.
32613261
SDValue Abs = DAG.getNode(RISCVISD::FABS_VL, DL, ContainerVT, Src, Mask, VL);
32623262

llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
; CHECK: Legally typed node: [[VTWOA]]: v2f64 = BUILD_VECTOR
1313
; CHECK: Legalizing node: [[VTWOB:t.*]]: v2f64 = BUILD_VECTOR
1414
; CHECK: Legally typed node: [[VTWOB]]: v2f64 = BUILD_VECTOR
15-
; CHECK: Legalizing node: t30: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
15+
; CHECK: Legalizing node: t31: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
1616

1717
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
1818
target triple = "aarch64--linux-gnu"

llvm/test/CodeGen/AMDGPU/div_i128.ll

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -479,21 +479,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
479479
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
480480
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
481481
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
482-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
483-
; GFX9-O0-NEXT: s_nop 0
484-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
485482
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
486483
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
487484
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
488485
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
489-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
486+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
487+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
488+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
489+
; GFX9-O0-NEXT: s_nop 0
490+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
491+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
492+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
493+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
490494
; GFX9-O0-NEXT: s_nop 0
491-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
492-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
495+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
496+
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
497+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
493498
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
494-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
499+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
500+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
495501
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
496-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
502+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
503+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
497504
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
498505
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
499506
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -504,6 +511,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
504511
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
505512
; GFX9-O0-NEXT: s_mov_b32 s14, s13
506513
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
514+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
507515
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
508516
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
509517
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1036,10 +1044,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
10361044
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10371045
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10381046
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1039-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1040-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1041-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1042-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1047+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1048+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1049+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1050+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
10431051
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10441052
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10451053
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2654,21 +2662,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26542662
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26552663
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
26562664
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
2657-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2658-
; GFX9-O0-NEXT: s_nop 0
2659-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
26602665
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26612666
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
26622667
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
26632668
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
2664-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
2669+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
2670+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
2671+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
2672+
; GFX9-O0-NEXT: s_nop 0
2673+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
2674+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
2675+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
2676+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
26652677
; GFX9-O0-NEXT: s_nop 0
2666-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2667-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
2678+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
2679+
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
2680+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
26682681
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
2669-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
2682+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
2683+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
26702684
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
2671-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
2685+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
2686+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
26722687
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
26732688
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
26742689
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2679,6 +2694,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
26792694
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
26802695
; GFX9-O0-NEXT: s_mov_b32 s14, s13
26812696
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
2697+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
26822698
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
26832699
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
26842700
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3211,10 +3227,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
32113227
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
32123228
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
32133229
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
3214-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3215-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
3216-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3217-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3230+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
3231+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
3232+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
3233+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
32183234
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
32193235
; GFX9-O0-NEXT: s_mov_b32 s5, s6
32203236
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

llvm/test/CodeGen/AMDGPU/rem_i128.ll

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -517,21 +517,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
517517
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
518518
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
519519
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
520-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
521-
; GFX9-O0-NEXT: s_nop 0
522-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
523520
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
524521
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
525522
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
526523
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
527-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
524+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
525+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
526+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
528527
; GFX9-O0-NEXT: s_nop 0
529-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
530-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
528+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
529+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
530+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
531+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
532+
; GFX9-O0-NEXT: s_nop 0
533+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
534+
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
535+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
531536
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
532-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
537+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
538+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
533539
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
534-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
540+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
541+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
535542
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
536543
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
537544
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -542,6 +549,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
542549
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
543550
; GFX9-O0-NEXT: s_mov_b32 s14, s13
544551
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
552+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
545553
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
546554
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
547555
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1074,10 +1082,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
10741082
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
10751083
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
10761084
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
1077-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1078-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
1079-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1080-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1085+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
1086+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
1087+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
1088+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
10811089
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
10821090
; GFX9-O0-NEXT: s_mov_b32 s5, s6
10831091
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -1890,21 +1898,28 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
18901898
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
18911899
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
18921900
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
1893-
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
1894-
; GFX9-O0-NEXT: s_nop 0
1895-
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
18961901
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
18971902
; GFX9-O0-NEXT: ; implicit-def: $sgpr8
18981903
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
18991904
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
1900-
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
1905+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
1906+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
1907+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
19011908
; GFX9-O0-NEXT: s_nop 0
1902-
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1903-
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
1909+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
1910+
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
1911+
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
1912+
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
1913+
; GFX9-O0-NEXT: s_nop 0
1914+
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1915+
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
1916+
; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
19041917
; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
1905-
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
1918+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
1919+
; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
19061920
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
1907-
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
1921+
; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
1922+
; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
19081923
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
19091924
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
19101925
; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -1915,6 +1930,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
19151930
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
19161931
; GFX9-O0-NEXT: s_mov_b32 s14, s13
19171932
; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
1933+
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
19181934
; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
19191935
; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
19201936
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2447,10 +2463,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
24472463
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
24482464
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
24492465
; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
2450-
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
2451-
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
2452-
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
2453-
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
2466+
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
2467+
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
2468+
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
2469+
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
24542470
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
24552471
; GFX9-O0-NEXT: s_mov_b32 s5, s6
24562472
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)

0 commit comments

Comments
 (0)