Skip to content

Commit a6b6eb4

Browse files
committed
[AIEX] Add address chaining for fixed stack objects
1 parent be68803 commit a6b6eb4

File tree

10 files changed

+607
-499
lines changed

10 files changed

+607
-499
lines changed

llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// See https://llvm.org/LICENSE.txt for license information.
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
7-
// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
7+
// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
88
//
99
//===----------------------------------------------------------------------===//
1010
//
@@ -45,9 +45,11 @@
4545
//===----------------------------------------------------------------------===//
4646

4747
#include "AIE.h"
48+
#include "llvm/ADT/iterator_range.h"
4849
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
4950
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
5051
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
52+
#include "llvm/CodeGen/MachineFrameInfo.h"
5153
#include "llvm/CodeGen/MachineFunction.h"
5254
#include "llvm/CodeGen/MachineFunctionPass.h"
5355
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -82,6 +84,10 @@ static cl::opt<bool>
8284
cl::desc("Disable ptradd chaining that feed "
8385
"loads that are used in conditional jumps."));
8486

87+
// Hidden command-line switch "aie-chain-stack", initialised to true. When it
// is set, runOnMachineFunction calls convertFIToPtrAdd on every basic block
// before processBasicBlock.
static cl::opt<bool>
88+
EnableStackChaining("aie-chain-stack", cl::Hidden, cl::init(true),
89+
cl::desc("Enable pointer chaining for stack access."));
90+
8591
namespace {
8692

8793
LLT getLoadStoreType(const MachineInstr &MI) {
@@ -163,6 +169,10 @@ class AIEClusterBaseAddress : public MachineFunctionPass {
163169
bool processBasicBlock(MachineBasicBlock &MBB, MachineIRBuilder &MIB,
164170
GISelObserverWrapper &Observer);
165171

172+
// Create chaining opportunities related to FRAME_INDEX.
173+
bool convertFIToPtrAdd(MachineBasicBlock &MBB, MachineIRBuilder &MIB,
174+
GISelObserverWrapper &Observer);
175+
166176
// Get all candidates, i.e. groups of G_PTR_ADDs in the same
167177
// basic block that shares the same input pointer.
168178
RegUseMap collectPtrUses(MachineBasicBlock &MBB);
@@ -238,7 +248,10 @@ bool AIEClusterBaseAddress::runOnMachineFunction(MachineFunction &MF) {
238248
}
239249

240250
bool Changed = false;
251+
241252
for (MachineBasicBlock &MBB : MF) {
253+
if (EnableStackChaining)
254+
Changed |= convertFIToPtrAdd(MBB, MIB, Observer);
242255
Changed |= processBasicBlock(MBB, MIB, Observer);
243256
}
244257
return Changed;
@@ -347,6 +360,55 @@ bool AIEClusterBaseAddress::avoidPtrAdd(
347360
return LoadFeedCondBranch;
348361
}
349362

363+
// Rewrites the G_FRAME_INDEX instructions of fixed stack objects in \p MBB so
// that all but the first one become a G_PTR_ADD on the first one's result,
// using the difference of the two object offsets as a constant operand.
//
// \param MBB      Basic block to scan and rewrite.
// \param MIB      Builder used to emit the G_CONSTANT / G_PTR_ADD pairs.
// \param Observer Notified of each created G_PTR_ADD and each erased
//                 G_FRAME_INDEX.
// \returns true if at least one G_FRAME_INDEX was replaced.
bool AIEClusterBaseAddress::convertFIToPtrAdd(MachineBasicBlock &MBB,
                                              MachineIRBuilder &MIB,
                                              GISelObserverWrapper &Observer) {
  const MachineFrameInfo &FrameInfo = MBB.getParent()->getFrameInfo();

  // Gather, in block order, every G_FRAME_INDEX that names a fixed object.
  std::vector<MachineInstr *> FixedFIs;
  for (MachineInstr &MI : MBB) {
    if (MI.getOpcode() == TargetOpcode::G_FRAME_INDEX &&
        FrameInfo.isFixedObjectIndex(MI.getOperand(1).getIndex()))
      FixedFIs.push_back(&MI);
  }

  // A chain needs a base plus at least one other frame index.
  if (FixedFIs.size() < 2)
    return false;

  // The earliest frame index in the block is kept and serves as the base
  // pointer for all later ones.
  MachineInstr *BaseFI = FixedFIs.front();
  const Register BasePtr = BaseFI->getOperand(0).getReg();
  const int64_t BaseOffset =
      FrameInfo.getObjectOffset(BaseFI->getOperand(1).getIndex());

  for (size_t Idx = 1; Idx < FixedFIs.size(); ++Idx) {
    MachineInstr *FI = FixedFIs[Idx];
    const int64_t Delta =
        FrameInfo.getObjectOffset(FI->getOperand(1).getIndex()) - BaseOffset;

    // Emit the replacement right where the old G_FRAME_INDEX lives.
    MIB.setInstrAndDebugLoc(*FI);
    const Register DeltaReg =
        MIB.buildConstant(LLT::scalar(20), Delta).getReg(0);

    // The G_PTR_ADD defines the same register the G_FRAME_INDEX defined, so
    // existing users need no update.
    const Register DstReg = FI->getOperand(0).getReg();
    auto PtrAdd = MIB.buildInstr(TargetOpcode::G_PTR_ADD, {DstReg},
                                 {BasePtr, DeltaReg});
    Observer.createdInstr(*PtrAdd);

    Observer.erasingInstr(*FI);
    FI->eraseFromParent();
  }

  // The loop above ran at least once, so the block was modified.
  return true;
}
}
411+
350412
AIEClusterBaseAddress::RegUseMap
351413
AIEClusterBaseAddress::collectPtrUses(MachineBasicBlock &MBB) {
352414
// Initialize Load Instrs to avoid

llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-stack-chain.mir

Lines changed: 266 additions & 135 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AIE/aie2/GlobalISel/prologepilog-tail-call-opt.ll

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,7 @@ define dso_local noundef i32 @_Z5test3iiiiiiii(i32 noundef %a, i32 noundef %a1,
6464
; CHECK-NEXT: {{ $}}
6565
; CHECK-NEXT: $p0 = MOV_mv_scl $sp
6666
; CHECK-NEXT: $p0 = PADD_imm9_pseudo $p0, -4
67-
; CHECK-NEXT: renamable $r0 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (invariant load (s32) from %fixed-stack.1)
68-
; CHECK-NEXT: $p0 = MOV_mv_scl $sp
69-
; CHECK-NEXT: $p0 = PADD_imm9_pseudo $p0, -4
67+
; CHECK-NEXT: renamable $r0 = LDA_dms_lda_idx_imm renamable $p0, 0 :: (invariant load (s32) from %fixed-stack.1)
7068
; CHECK-NEXT: ST_dms_sts_idx_imm killed renamable $r0, killed renamable $p0, 0 :: (store (s32) into %fixed-stack.0, align 32)
7169
; CHECK-NEXT: PseudoJ_TCO_jump_imm @_Z5func1iiiiiiii, csr_aie2, implicit $r1, implicit $r2, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7
7270
entry:
@@ -83,13 +81,9 @@ define dso_local noundef i32 @_Z5test4iiiiiiiii(i32 noundef %a, i32 noundef %a1,
8381
; CHECK-NEXT: {{ $}}
8482
; CHECK-NEXT: $p0 = MOV_mv_scl $sp
8583
; CHECK-NEXT: $p0 = PADD_imm9_pseudo $p0, -4
86-
; CHECK-NEXT: renamable $r0 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (invariant load (s32) from %fixed-stack.2)
87-
; CHECK-NEXT: $p0 = MOV_mv_scl $sp
88-
; CHECK-NEXT: $p0 = PADD_imm9_pseudo $p0, -8
89-
; CHECK-NEXT: renamable $r8 = LDA_dms_lda_idx_imm killed renamable $p0, 0 :: (invariant load (s32) from %fixed-stack.1, align 8)
84+
; CHECK-NEXT: renamable $r0 = LDA_dms_lda_idx_imm renamable $p0, 0 :: (invariant load (s32) from %fixed-stack.2)
85+
; CHECK-NEXT: renamable $r8 = LDA_dms_lda_idx_imm renamable $p0, -4 :: (invariant load (s32) from %fixed-stack.1, align 8)
9086
; CHECK-NEXT: renamable $r0 = nsw ADD killed renamable $r8, killed renamable $r0, implicit-def dead $srcarry
91-
; CHECK-NEXT: $p0 = MOV_mv_scl $sp
92-
; CHECK-NEXT: $p0 = PADD_imm9_pseudo $p0, -4
9387
; CHECK-NEXT: ST_dms_sts_idx_imm killed renamable $r0, killed renamable $p0, 0 :: (store (s32) into %fixed-stack.0, align 32)
9488
; CHECK-NEXT: PseudoJ_TCO_jump_imm @_Z5func1iiiiiiii, csr_aie2, implicit $r1, implicit $r2, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7
9589
entry:

llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll

Lines changed: 42 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -34,48 +34,39 @@ declare { ptr, i20, i20 } @llvm.aie2.add.3d(ptr, i20, i20, i20, i20, i20, i20, i
3434
define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm2_data, ptr noalias %ofm_data, ptr %.out, ptr %conv.i.i.i.out, ptr %idx.ext9.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %conv.i.i.i.i.i.out, ptr %.out6, ptr %conv.i.i.i46.out, ptr %xtraiter.out, ptr %in_ptr1.051.unr.ce.out, ptr %in_ptr2.0.in50.unr.ce.out, ptr %out_ptr.049.unr.ce.out, ptr %itr_left_cnt0.048.unr.ce.out, ptr %itr_left_cnt1.047.unr.ce.out) #3 {
3535
; ASM-LABEL: add2d:
3636
; ASM: // %bb.0: // %newFuncRoot
37-
; ASM-NEXT: lda r2, [p0, #64]; paddb [p0], #40; nopm
37+
; ASM-NEXT: lda r2, [p0, #64]; paddb [p0], #40; nopxm ; nops
3838
; ASM-NEXT: lda m2, [p0], #-4
3939
; ASM-NEXT: lda m5, [p0], #8
4040
; ASM-NEXT: lda m4, [p0], #8
41-
; ASM-NEXT: lda m3, [p0], #-24; paddb [sp], #32
42-
; ASM-NEXT: lda r0, [p0], #4; st p6, [sp, #-28] // 4-byte Folded Spill
43-
; ASM-NEXT: lda r1, [p0], #-12; mov p6, sp
44-
; ASM-NEXT: lda r3, [p0], #40; paddb [p6], #-36
45-
; ASM-NEXT: lda p7, [p6, #0]; mov p6, sp
46-
; ASM-NEXT: paddb [p6], #-40
47-
; ASM-NEXT: lda r5, [p6, #0]; mov p6, sp
48-
; ASM-NEXT: lda m1, [p0], #36; paddb [p6], #-44
49-
; ASM-NEXT: lda p6, [p6, #0]
50-
; ASM-NEXT: lda m0, [p0], #-8; st p7, [sp, #-32] // 4-byte Folded Spill
51-
; ASM-NEXT: lda dn0, [p0], #-8; st r3, [p4, #0]
52-
; ASM-NEXT: lda dj0, [p0], #12; nez r4, r1; mov p4, sp
53-
; ASM-NEXT: lda dn4, [p0], #-8; paddb [p4], #-48; st r4, [p5, #0]
54-
; ASM-NEXT: lda p4, [p4, #0]; mov p5, sp
55-
; ASM-NEXT: lda dj4, [p0], #-36; st m1, [p7, #0]
56-
; ASM-NEXT: lda r1, [p0, #0]; mov p7, r5
57-
; ASM-NEXT: lda r5, [p0, #-36]; paddb [p5], #-52; mov p0, sp
58-
; ASM-NEXT: lda p5, [p5, #0]; st m0, [p7, #0]
59-
; ASM-NEXT: paddb [p0], #-72; mov p7, sp
60-
; ASM-NEXT: lda p0, [p0, #0]; paddb [p7], #-56; st dj0, [p6, #0]
61-
; ASM-NEXT: lda r6, [p7, #0]; mov p6, sp
62-
; ASM-NEXT: paddb [p6], #-60; mov p7, sp
63-
; ASM-NEXT: lda r7, [p6, #0]; paddb [p7], #-64; mov p6, sp
64-
; ASM-NEXT: lda p7, [p7, #0]; st dj4, [p4, #0]
65-
; ASM-NEXT: mov p4, sp
66-
; ASM-NEXT: paddb [p4], #-76; st dn0, [p5, #0]
67-
; ASM-NEXT: lda r11, [p4, #0]; paddb [p6], #-68; mov p4, sp
68-
; ASM-NEXT: lda r8, [p6, #0]; paddb [p4], #-80; mov p5, r6
69-
; ASM-NEXT: lda p6, [p4, #0]; mov p4, sp
70-
; ASM-NEXT: mova r6, #1; paddb [p4], #-84; nez r1, r1; st dn4, [p5, #0]
71-
; ASM-NEXT: lda r9, [p4, #0]; ne r6, r0, r6; mov p4, sp
72-
; ASM-NEXT: mova r0, #3; paddb [p4], #-88; add r7, r2, #-1; mov p5, r7
73-
; ASM-NEXT: lda r10, [p4, #0]; ltu r7, r7, r0; mov p4, sp
74-
; ASM-NEXT: jz r7, #.LBB0_2
75-
; ASM-NEXT: paddb [p4], #-92; st r1, [p5, #0] // Delay Slot 5
76-
; ASM-NEXT: lda p4, [p4, #0]; st r5, [p7, #0] // Delay Slot 4
77-
; ASM-NEXT: paddb [p2], m5; mov p7, r8 // Delay Slot 3
78-
; ASM-NEXT: st r6, [p7, #0]; paddb [p2], m4; and r8, r2, r0 // Delay Slot 2
41+
; ASM-NEXT: lda m3, [p0], #-24
42+
; ASM-NEXT: lda r0, [p0], #4
43+
; ASM-NEXT: lda r1, [p0], #-12
44+
; ASM-NEXT: lda r3, [p0], #40
45+
; ASM-NEXT: lda m1, [p0], #36
46+
; ASM-NEXT: lda m0, [p0], #-8
47+
; ASM-NEXT: lda dn0, [p0], #-8
48+
; ASM-NEXT: lda dj0, [p0], #12
49+
; ASM-NEXT: lda dn4, [p0], #-8; paddb [sp], #32
50+
; ASM-NEXT: lda dj4, [p0], #-36; st p7, [sp, #-32] // 4-byte Folded Spill
51+
; ASM-NEXT: lda r1, [p0, #0]; mov p7, sp
52+
; ASM-NEXT: lda r5, [p0, #-36]; paddb [p7], #-36
53+
; ASM-NEXT: lda p0, [p7], #-4
54+
; ASM-NEXT: lda p0, [p7], #-4
55+
; ASM-NEXT: lda p0, [p7], #-4
56+
; ASM-NEXT: lda p0, [p7], #-4
57+
; ASM-NEXT: lda p0, [p7], #-4; st p6, [sp, #-28]; nez r4, r1 // 4-byte Folded Spill
58+
; ASM-NEXT: lda p5, [p7], #-4; st r3, [p4, #0]
59+
; ASM-NEXT: lda p6, [p7], #-4; st r4, [p5, #0]
60+
; ASM-NEXT: lda p0, [p7], #-4; st m1, [p0, #0]; add r7, r2, #-1; mov r6, #1
61+
; ASM-NEXT: lda p4, [p7], #-4; st m0, [p0, #0]; ne r6, r0, r6
62+
; ASM-NEXT: lda r13, [p7], #-4; st dj0, [p0, #0]; movx r0, #3
63+
; ASM-NEXT: st dj4, [p0, #0]; ltu r7, r7, r0
64+
; ASM-NEXT: lda r9, [p7], #-4; st dn0, [p0, #0]; nez r1, r1
65+
; ASM-NEXT: lda r10, [p7], #-4; st dn4, [p5, #0]; jz r7, #.LBB0_2
66+
; ASM-NEXT: lda r11, [p7], #-4; st r1, [p6, #0] // Delay Slot 5
67+
; ASM-NEXT: lda p7, [p7, #-4]; st r5, [p0, #0] // Delay Slot 4
68+
; ASM-NEXT: paddb [p2], m5; st r6, [p4, #0] // Delay Slot 3
69+
; ASM-NEXT: lda r12, [p7, #0]; paddb [p2], m4; and r8, r2, r0; mov p0, r13 // Delay Slot 2
7970
; ASM-NEXT: padda [p1], m2; paddb [p2], m3; movx r0, #0; st r8, [p0, #0] // Delay Slot 1
8071
; ASM-NEXT: // %bb.1:
8172
; ASM-NEXT: j #.LBB0_5
@@ -123,18 +114,20 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
123114
; ASM-NEXT: nop
124115
; ASM-NEXT: nop
125116
; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32
126-
; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32; mov crUPSSign, #0
127-
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov r0, dc0
128-
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32; mov r1, dc4
117+
; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32
118+
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov crUPSSign, #0
119+
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32; mov r0, dc0
120+
; ASM-NEXT: mov r1, dc4
129121
; ASM-NEXT: mov crSRSSign, #0
130122
; ASM-NEXT: .LBB0_5: // %for.cond.cleanup.unr-lcssa.split
131-
; ASM-NEXT: nopx ; mov p0, r10
132-
; ASM-NEXT: lda p7, [sp, #-32]; st r1, [p4, #0] // 4-byte Folded Reload
133-
; ASM-NEXT: lda p6, [sp, #-28]; st r0, [p0, #0] // 4-byte Folded Reload
134-
; ASM-NEXT: ret lr ; mov p0, r9
135-
; ASM-NEXT: st p3, [p0, #0] // Delay Slot 5
136-
; ASM-NEXT: mov p0, r11 // Delay Slot 4
137-
; ASM-NEXT: st p2, [p6, #0] // Delay Slot 3
123+
; ASM-NEXT: st r1, [p7, #0]; nopx
124+
; ASM-NEXT: mov p0, r12
125+
; ASM-NEXT: lda p7, [sp, #-32]; st r0, [p0, #0] // 4-byte Folded Reload
126+
; ASM-NEXT: lda p6, [sp, #-28]; mov p0, r11 // 4-byte Folded Reload
127+
; ASM-NEXT: st p3, [p0, #0]; ret lr
128+
; ASM-NEXT: mov p0, r10 // Delay Slot 5
129+
; ASM-NEXT: st p2, [p0, #0] // Delay Slot 4
130+
; ASM-NEXT: mov p0, r9 // Delay Slot 3
138131
; ASM-NEXT: st p1, [p0, #0] // Delay Slot 2
139132
; ASM-NEXT: paddb [sp], #-32 // Delay Slot 1
140133
newFuncRoot:

0 commit comments

Comments
 (0)