diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5477c5eae9392..8a944c58a0ea7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -247,6 +247,48 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                                        ArgTys[0] = User->getType();
                                      });
         }
+
+        // Fold image.sample + cvt.pkrtz -> extractelement idx0 into a single
+        // d16 image sample.
+        // Pattern to match:
+        //   %sample = call float @llvm.amdgcn.image.sample...
+        //   %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample,
+        //                                                  float %any)
+        //   %low = extractelement <2 x half> %pack, i64 0
+        // Replacement:
+        //   call half @llvm.amdgcn.image.sample
+        //
+        // Folding criteria:
+        // 1. The only user of the image.sample intrinsic is amdgcn.cvt.pkrtz.
+        // 2. The sample is the first cvt.pkrtz operand, i.e. the one that is
+        //    converted into element 0 of the packed result.
+        // 3. That cvt.pkrtz call has exactly one use.
+        // 4. Its sole user is an extractelement instruction with index zero.
+        // Otherwise, folding is not performed, because D16 sampling only
+        // guarantees that the element at index 0 is defined; index 1 is
+        // undefined and using it will result in poison.
+        if (auto *CvtPkrtzCall = dyn_cast<IntrinsicInst>(User)) {
+          // The sample must be operand 0: operand 1 only feeds element 1,
+          // which the extractelement matched below never reads.
+          if (CvtPkrtzCall->getIntrinsicID() == Intrinsic::amdgcn_cvt_pkrtz &&
+              CvtPkrtzCall->getArgOperand(0) == &II &&
+              CvtPkrtzCall->hasOneUse()) {
+            // Unique use must be extractelement idx == 0
+            if (auto *Ext =
+                    dyn_cast<ExtractElementInst>(*CvtPkrtzCall->user_begin())) {
+              auto *Idx = dyn_cast<ConstantInt>(Ext->getIndexOperand());
+              if (Idx && Idx->isZero()) {
+                // Re-emit the sample with the cvt.pkrtz result type as its
+                // return type.
+                return modifyIntrinsicCall(
+                    II, *CvtPkrtzCall, ImageDimIntr->Intr, IC,
+                    [&](auto &Args, auto &ArgTys) {
+                      ArgTys[0] = CvtPkrtzCall->getType();
+                    });
+              }
+            }
+          }
+        }
       }
 
       // Only perform D16 folding if every user of the image sample is
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index ee5ccf5af987d..f4f74a84bcb8b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -239,6 +239,175 @@ main_body:
   ret bfloat %res
 }
 
+define amdgpu_ps float @image_sample_2d_single_pkrtz_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_single_pkrtz_to_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX7-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT:    [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX7-NEXT:    [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX7-NEXT:    [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX7-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT:    ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_to_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul reassoc arcp contract afn half [[SAMPLE]], [[SAMPLE]]
+; GFX81PLUS-NEXT:    [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[SAMPLE]]
+; GFX81PLUS-NEXT:    [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[SAMPLE]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT:    ret float [[RES]]
+;
+main_body:
+  %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
+  %h0 = extractelement <2 x half> %pack, i64 0
+  %mul = fmul reassoc arcp contract afn half %h0, %h0
+  %div = fdiv reassoc arcp contract afn half %mul, %h0
+  %add = fadd reassoc arcp contract afn half %div, %h0
+  %res = fpext half %add to float
+  ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_pkrtz_variable_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
+; GFX7-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT:    [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT:    [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX7-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT:    ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
+; GFX81PLUS-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT:    [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT:    ret float [[RES]]
+;
+main_body:
+  %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %v)
+  %h0 = extractelement <2 x half> %pack, i64 0
+  %h1 = extractelement <2 x half> %pack, i64 1
+  %mul = fmul half %h0, %h1
+  %add = fadd half %mul, %h0
+  %res = fpext half %add to float
+  ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_pkrtz_constant_no_fold(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX7-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT:    [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT:    [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX7-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT:    ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX81PLUS-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT:    [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT:    ret float [[RES]]
+;
+main_body:
+  %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
+  %h0 = extractelement <2 x half> %pack, i64 0
+  %h1 = extractelement <2 x half> %pack, i64 1
+  %mul = fmul half %h0, %h1
+  %add = fadd half %mul, %h0
+  %res = fpext half %add to float
+  ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_single_pkrtz_high_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX7-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT:    [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX7-NEXT:    [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX7-NEXT:    [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX7-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT:    ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX81PLUS-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX81PLUS-NEXT:    [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT:    [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT:    ret float [[RES]]
+;
+main_body:
+  %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %sample)
+  %h0 = extractelement <2 x half> %pack, i64 1
+  %mul = fmul reassoc arcp contract afn half %h0, %h0
+  %div = fdiv reassoc arcp contract afn half %mul, %h0
+  %add = fadd reassoc arcp contract afn half %div, %h0
+  %res = fpext half %add to float
+  ret float %res
+}
+
+; The sample feeds cvt.pkrtz operand 1, so element 0 is not derived from the sample: no d16 fold.
+define amdgpu_ps float @image_sample_2d_pkrtz_high_operand_low_extract_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_high_operand_low_extract_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX7-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT:    [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX7-NEXT:    [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX7-NEXT:    [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX7-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT:    ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_high_operand_low_extract_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX81PLUS-NEXT:    [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX81PLUS-NEXT:    [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT:    [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT:    ret float [[RES]]
+;
+main_body:
+  %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %sample)
+  %h0 = extractelement <2 x half> %pack, i64 0
+  %mul = fmul reassoc arcp contract afn half %h0, %h0
+  %div = fdiv reassoc arcp contract afn half %mul, %h0
+  %add = fadd reassoc arcp contract afn half %div, %h0
+  %res = fpext half %add to float
+  ret float %res
+}
+
 define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX7-LABEL: @image_gather4_2d_v4f32(
 ; GFX7-NEXT:  main_body: