Skip to content

Commit f2eb5d4

Browse files
authored
[SelectionDAG] Handle fneg/fabs/fcopysign in SimplifyDemandedBits (#139239)
1 parent 078475d commit f2eb5d4

File tree

9 files changed

+268
-271
lines changed

9 files changed

+268
-271
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 6 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -18403,49 +18403,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
1840318403
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, DL, VT, {N0, N1}))
1840418404
return C;
1840518405

18406-
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
18407-
const APFloat &V = N1C->getValueAPF();
18408-
// copysign(x, c1) -> fabs(x) iff ispos(c1)
18409-
// copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
18410-
if (!V.isNegative()) {
18411-
if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
18412-
return DAG.getNode(ISD::FABS, DL, VT, N0);
18413-
} else {
18414-
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
18415-
return DAG.getNode(ISD::FNEG, DL, VT,
18416-
DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
18417-
}
18418-
}
18419-
18420-
// copysign(fabs(x), y) -> copysign(x, y)
18421-
// copysign(fneg(x), y) -> copysign(x, y)
18422-
// copysign(copysign(x,z), y) -> copysign(x, y)
18423-
if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
18424-
N0.getOpcode() == ISD::FCOPYSIGN)
18425-
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
18426-
18427-
// copysign(x, abs(y)) -> abs(x)
18428-
if (N1.getOpcode() == ISD::FABS)
18429-
return DAG.getNode(ISD::FABS, DL, VT, N0);
18430-
18431-
// copysign(x, copysign(y,z)) -> copysign(x, z)
18432-
if (N1.getOpcode() == ISD::FCOPYSIGN)
18433-
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
18434-
1843518406
// copysign(x, fp_extend(y)) -> copysign(x, y)
1843618407
// copysign(x, fp_round(y)) -> copysign(x, y)
1843718408
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
1843818409
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(0));
1843918410

18440-
// We only take the sign bit from the sign operand.
18441-
EVT SignVT = N1.getValueType();
18442-
if (SimplifyDemandedBits(N1,
18443-
APInt::getSignMask(SignVT.getScalarSizeInBits())))
18444-
return SDValue(N, 0);
18445-
18446-
// We only take the non-sign bits from the value operand
18447-
if (SimplifyDemandedBits(N0,
18448-
APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
18411+
if (SimplifyDemandedBits(SDValue(N, 0)))
1844918412
return SDValue(N, 0);
1845018413

1845118414
return SDValue();
@@ -18972,6 +18935,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
1897218935
N0.getOperand(0));
1897318936
}
1897418937

18938+
if (SimplifyDemandedBits(SDValue(N, 0)))
18939+
return SDValue(N, 0);
18940+
1897518941
if (SDValue Cast = foldSignChangeInBitcast(N))
1897618942
return Cast;
1897718943

@@ -19045,14 +19011,8 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
1904519011
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
1904619012
return C;
1904719013

19048-
// fold (fabs (fabs x)) -> (fabs x)
19049-
if (N0.getOpcode() == ISD::FABS)
19050-
return N->getOperand(0);
19051-
19052-
// fold (fabs (fneg x)) -> (fabs x)
19053-
// fold (fabs (fcopysign x, y)) -> (fabs x)
19054-
if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
19055-
return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
19014+
if (SimplifyDemandedBits(SDValue(N, 0)))
19015+
return SDValue(N, 0);
1905619016

1905719017
if (SDValue Cast = foldSignChangeInBitcast(N))
1905819018
return Cast;

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2967,6 +2967,77 @@ bool TargetLowering::SimplifyDemandedBits(
29672967
}
29682968
break;
29692969
}
2970+
case ISD::FABS: {
2971+
SDValue Op0 = Op.getOperand(0);
2972+
APInt SignMask = APInt::getSignMask(BitWidth);
2973+
2974+
if (!DemandedBits.intersects(SignMask))
2975+
return TLO.CombineTo(Op, Op0);
2976+
2977+
if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
2978+
Depth + 1))
2979+
return true;
2980+
2981+
if (Known.isNonNegative())
2982+
return TLO.CombineTo(Op, Op0);
2983+
if (Known.isNegative())
2984+
return TLO.CombineTo(
2985+
Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0, Op->getFlags()));
2986+
2987+
Known.Zero |= SignMask;
2988+
Known.One &= ~SignMask;
2989+
2990+
break;
2991+
}
2992+
case ISD::FCOPYSIGN: {
2993+
SDValue Op0 = Op.getOperand(0);
2994+
SDValue Op1 = Op.getOperand(1);
2995+
2996+
unsigned BitWidth0 = Op0.getScalarValueSizeInBits();
2997+
unsigned BitWidth1 = Op1.getScalarValueSizeInBits();
2998+
APInt SignMask0 = APInt::getSignMask(BitWidth0);
2999+
APInt SignMask1 = APInt::getSignMask(BitWidth1);
3000+
3001+
if (!DemandedBits.intersects(SignMask0))
3002+
return TLO.CombineTo(Op, Op0);
3003+
3004+
if (SimplifyDemandedBits(Op0, ~SignMask0 & DemandedBits, DemandedElts,
3005+
Known, TLO, Depth + 1) ||
3006+
SimplifyDemandedBits(Op1, SignMask1, DemandedElts, Known2, TLO,
3007+
Depth + 1))
3008+
return true;
3009+
3010+
if (Known2.isNonNegative())
3011+
return TLO.CombineTo(
3012+
Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
3013+
3014+
if (Known2.isNegative())
3015+
return TLO.CombineTo(
3016+
Op, TLO.DAG.getNode(ISD::FNEG, dl, VT,
3017+
TLO.DAG.getNode(ISD::FABS, SDLoc(Op0), VT, Op0)));
3018+
3019+
Known.Zero &= ~SignMask0;
3020+
Known.One &= ~SignMask0;
3021+
break;
3022+
}
3023+
case ISD::FNEG: {
3024+
SDValue Op0 = Op.getOperand(0);
3025+
APInt SignMask = APInt::getSignMask(BitWidth);
3026+
3027+
if (!DemandedBits.intersects(SignMask))
3028+
return TLO.CombineTo(Op, Op0);
3029+
3030+
if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
3031+
Depth + 1))
3032+
return true;
3033+
3034+
if (!Known.isSignUnknown()) {
3035+
Known.Zero ^= SignMask;
3036+
Known.One ^= SignMask;
3037+
}
3038+
3039+
break;
3040+
}
29703041
default:
29713042
// We also ask the target about intrinsics (which could be specific to it).
29723043
if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||

llvm/test/CodeGen/AArch64/extract-vector-elt.ll

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -391,13 +391,10 @@ define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b
391391
; CHECK-SD: // %bb.0: // %entry
392392
; CHECK-SD-NEXT: sub sp, sp, #16
393393
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
394-
; CHECK-SD-NEXT: adrp x8, .LCPI16_0
395-
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
396-
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
397-
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
394+
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
398395
; CHECK-SD-NEXT: mov x8, sp
396+
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
399397
; CHECK-SD-NEXT: bfi x8, x0, #2, #2
400-
; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
401398
; CHECK-SD-NEXT: str q0, [sp]
402399
; CHECK-SD-NEXT: ldr s0, [x8]
403400
; CHECK-SD-NEXT: add sp, sp, #16
@@ -425,10 +422,7 @@ entry:
425422
define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
426423
; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
427424
; CHECK-SD: // %bb.0: // %entry
428-
; CHECK-SD-NEXT: adrp x8, .LCPI17_0
429-
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
430-
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
431-
; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
425+
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
432426
; CHECK-SD-NEXT: mov s0, v0.s[2]
433427
; CHECK-SD-NEXT: ret
434428
;

llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -4388,12 +4388,11 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
43884388
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
43894389
; GFX8: ; %bb.0:
43904390
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4391-
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
4392-
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
4393-
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
4394-
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
4391+
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
4392+
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
4393+
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
43954394
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
4396-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
4395+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
43974396
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
43984397
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
43994398
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -5267,13 +5266,12 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
52675266
;
52685267
; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
52695268
; GFX8: ; %bb.0:
5270-
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010
5271-
; GFX8-NEXT: s_add_i32 s4, s4, s1
5272-
; GFX8-NEXT: s_or_b32 s3, s1, 0x400000
5273-
; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff
5269+
; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10010
5270+
; GFX8-NEXT: s_add_i32 s3, s3, s1
5271+
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
52745272
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
52755273
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
5276-
; GFX8-NEXT: s_cselect_b32 s1, s3, s6
5274+
; GFX8-NEXT: s_cselect_b32 s1, s1, s3
52775275
; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
52785276
; GFX8-NEXT: s_add_i32 s3, s3, s2
52795277
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
@@ -6340,18 +6338,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
63406338
; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32:
63416339
; GFX8: ; %bb.0:
63426340
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6343-
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
6344-
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
6345-
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
6346-
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
6347-
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
6348-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc
63496341
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
6350-
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
63516342
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
6352-
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
6343+
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
63536344
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
63546345
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
6346+
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
6347+
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
6348+
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
6349+
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
6350+
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
63556351
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
63566352
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
63576353
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
@@ -7687,24 +7683,22 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
76877683
; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32:
76887684
; GFX8: ; %bb.0:
76897685
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7690-
; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
7691-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
7692-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
7693-
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
7686+
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
7687+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
7688+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
76947689
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
7695-
; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
7696-
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
7697-
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
7698-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
7699-
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
7700-
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2
7701-
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
7702-
; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
7690+
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
77037691
; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
7692+
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
77047693
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
77057694
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
77067695
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
77077696
; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
7697+
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
7698+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
7699+
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
7700+
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
7701+
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
77087702
; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
77097703
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
77107704
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6

0 commit comments

Comments
 (0)