https://github.com/EsmeYi updated https://github.com/llvm/llvm-project/pull/74415
>From f6d0ef8357540c61a9c20774e3b170a8db5b72ca Mon Sep 17 00:00:00 2001 From: esmeyi <esme...@ibm.com> Date: Tue, 5 Dec 2023 00:44:04 -0500 Subject: [PATCH 1/2] Exploit STMW and LMW in 32-bit big-endian mode. --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 76 ++++++++++++- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 4 + llvm/test/CodeGen/PowerPC/stm-lm-merge.ll | 110 +++++++++++++++++++ 3 files changed, 188 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/stm-lm-merge.ll diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index eb3bf3b2690b2..4d4ef6251a999 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -40,6 +40,12 @@ EnablePEVectorSpills("ppc-enable-pe-vector-spills", cl::desc("Enable spills in prologue to vector registers."), cl::init(false), cl::Hidden); +static cl::opt<bool> + EnableLoadStoreMultiple("ppc-enable-load-store-multiple", + cl::desc("Enable load/store multiple (only " + "support in 32-bit big-endian mode)."), + cl::init(false), cl::Hidden); + static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) { if (STI.isAIXABI()) return STI.isPPC64() ? 16 : 8; @@ -2407,6 +2413,30 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots( return AllSpilledToReg; } +static void findContinuousLoadStore(ArrayRef<CalleeSavedInfo> CSI, + Register &MergeFrom) { + CalleeSavedInfo BeginI = CSI[0]; + unsigned I = 1, E = CSI.size(); + for (; I < E; ++I) { + // Find continuous store/load. + unsigned RegDiff = CSI[I].getReg() - CSI[I - 1].getReg(); + unsigned FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx(); + Register BeginReg = BeginI.getReg(); + if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || BeginI.isSpilledToReg() || + RegDiff != 1 || FrameIdxDiff != 1) + BeginI = CSI[I]; + if (CSI[I].getReg() == PPC::R31) + break; + } + + if (I == E || BeginI.getReg() == PPC::R31) + return; + + // Record the first reg that STMW/LMW are going to merge since STMW/LMW save + // from rN to r31. + MergeFrom = BeginI.getReg(); +} + bool PPCFrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { @@ -2437,6 +2467,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( } } + Register MergeFrom = PPC::R31; + if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() && + !Subtarget.isPPC64()) + findContinuousLoadStore(CSI, MergeFrom); + for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); @@ -2521,7 +2556,23 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, TRI); - else + else if (MergeFrom < PPC::R31 && Reg == MergeFrom) { + // Build an STMW instruction. + int FrameIdx = I.getFrameIdx(); + MachineInstrBuilder MIB = + BuildMI(MBB, MBB.begin(), DL, TII.get(PPC::STMW)); + MIB.addReg(Reg, getKillRegState(!IsLiveIn)); + // Add frame reference. + MIB.addImm(0).addFrameIndex(FrameIdx); + const MachineFrameInfo &MFI = MF->getFrameInfo(); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlign(FrameIdx)); + MIB.addMemOperand(MMO); + } else if (Reg > MergeFrom && Reg <= PPC::R31) + continue; + else TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, TRI, Register()); } @@ -2615,6 +2666,11 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( unsigned CSIIndex = 0; BitVector Restored(TRI->getNumRegs()); + Register MergeFrom = PPC::R31; + if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() && + !Subtarget.isPPC64()) + findContinuousLoadStore(CSI, MergeFrom); + // Initialize insertion-point logic; we will be restoring in reverse // order of spill. MachineBasicBlock::iterator I = MI, BeforeI = I; @@ -2694,7 +2750,23 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); - else + else if (MergeFrom < PPC::R31 && Reg == MergeFrom) { + // Build an LMW instruction. + int FrameIdx = CSI[i].getFrameIdx(); + DebugLoc DL; + MachineInstrBuilder MIB = + BuildMI(MBB, MBB.begin(), DL, TII.get(PPC::LMW), Reg); + // Add frame reference. + MIB.addImm(0).addFrameIndex(FrameIdx); + const MachineFrameInfo &MFI = MF->getFrameInfo(); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlign(FrameIdx)); + MIB.addMemOperand(MMO); + } else if (Reg > MergeFrom && Reg <= PPC::R31) + continue; + else TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI, Register()); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 7d913a77cc715..6f5cc5e550e0f 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1676,6 +1676,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC); + // STMW and LMW only have immediate form. + if (OpC == PPC::STMW || OpC == PPC::LMW) + noImmForm = false; + // Now add the frame object offset to the offset from r1. int64_t Offset = MFI.getObjectOffset(FrameIndex); Offset += MI.getOperand(OffsetOperandNo).getImm(); diff --git a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll new file mode 100644 index 0000000000000..f486317f758a4 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll @@ -0,0 +1,110 @@ +; RUN: llc -mtriple=powerpc-unknown-aix-xcoff -verify-machineinstrs \ +; RUN: -mcpu=pwr4 -mattr=-altivec --ppc-enable-load-store-multiple < %s \ +; RUN: | FileCheck %s + +; CHECK: stmw 16, 64(1) # 4-byte Folded Spill +; CHECK: lmw 16, 64(1) # 4-byte Folded Reload + +@a = external local_unnamed_addr global i32, align 4 +@b = external local_unnamed_addr global i32, align 4 +@f = external local_unnamed_addr global i32, align 4 +@c = external local_unnamed_addr global i32, align 4 +@g = external local_unnamed_addr global i32, align 4 +@e = external local_unnamed_addr global i32, align 4 +@h = external local_unnamed_addr global i32, align 4 +@d = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind +define i32 @foo(ptr noundef %b1, ptr noundef %b2, i32 noundef %count) local_unnamed_addr #0 { +entry: + %invariant.gep = getelementptr i32, ptr %b2, i32 -1 + %cmp63 = icmp sgt i32 %count, 0 + br i1 %cmp63, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %sw.epilog, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add29, %sw.epilog ] + %0 = load i32, ptr @a, align 4 + %add30 = add nsw i32 %0, %sum.0.lcssa + %1 = load i32, ptr @b, align 4 + %add31 = add nsw i32 %add30, %1 + %2 = load i32, ptr @c, align 4 + %add32 = add nsw i32 %add31, %2 + %3 = load i32, ptr @d, align 4 + %add33 = add nsw i32 %add32, %3 + %4 = load i32, ptr @e, align 4 + %add34 = add nsw i32 %add33, %4 + ret i32 %add34 + +for.body: ; preds = %entry, %sw.epilog + %i.065 = phi i32 [ %inc, %sw.epilog ], [ 0, %entry ] + %sum.064 = phi i32 [ %add29, %sw.epilog ], [ 0, %entry ] + tail call void @foo1(ptr noundef %b1, ptr noundef %b2) #2 + %rem = and i32 %i.065, 3 + switch i32 %rem, label %for.body.unreachabledefault [ + i32 0, label %sw.bb + i32 1, label %sw.bb4 + i32 2, label %sw.bb11 + i32 3, label %sw.bb19 + ] + +sw.bb: ; preds = %for.body + %gep62 = getelementptr i32, ptr %invariant.gep, i32 %i.065 + %5 = load i32, ptr %gep62, align 4 + %6 = load i32, ptr @a, align 4 + %7 = load i32, ptr @b, align 4 + %reass.add = add i32 %7, %5 + %reass.mul = mul i32 %reass.add, 3 + %add2 = add i32 %reass.mul, %6 + br label %sw.epilog + +sw.bb4: ; preds = %for.body + %arrayidx5 = getelementptr inbounds i32, ptr %b2, i32 %i.065 + %8 = load i32, ptr %arrayidx5, align 4 + %mul6 = mul nsw i32 %8, 5 + %9 = load i32, ptr @b, align 4 + %10 = load i32, ptr @f, align 4 + %11 = shl i32 %10, 2 + %12 = add i32 %9, %11 + %sub9 = sub i32 %mul6, %12 + br label %sw.epilog + +sw.bb11: ; preds = %for.body + %gep = getelementptr i32, ptr %invariant.gep, i32 %i.065 + %13 = load i32, ptr %gep, align 4 + %mul14 = shl nsw i32 %13, 2 + %14 = load i32, ptr @c, align 4 + %mul15 = mul nsw i32 %mul14, %14 + %15 = load i32, ptr @g, align 4 + %mul16 = mul nsw i32 %15, 5 + %add17 = add nsw i32 %mul16, %mul15 + br label %sw.epilog + +sw.bb19: ; preds = %for.body + %arrayidx20 = getelementptr inbounds i32, ptr %b2, i32 %i.065 + %16 = load i32, ptr %arrayidx20, align 4 + %mul21 = mul nsw i32 %16, 6 + %17 = load i32, ptr @e, align 4 + %div = sdiv i32 %mul21, %17 + %div22 = udiv i32 6, %i.065 + %add23 = add nsw i32 %div22, %div + br label %sw.epilog + +for.body.unreachabledefault: ; preds = %for.body + unreachable + +sw.epilog: ; preds = %sw.bb19, %sw.bb11, %sw.bb4, %sw.bb + %add23.sink = phi i32 [ %add23, %sw.bb19 ], [ %add17, %sw.bb11 ], [ %sub9, %sw.bb4 ], [ %add2, %sw.bb ] + %arrayidx24 = getelementptr inbounds i32, ptr %b1, i32 %i.065 + store i32 %add23.sink, ptr %arrayidx24, align 4 + %arrayidx26 = getelementptr inbounds i32, ptr %b2, i32 %i.065 + %18 = load i32, ptr %arrayidx26, align 4 + %19 = load i32, ptr @h, align 4 + %add27 = add i32 %add23.sink, %sum.064 + %add28 = add i32 %add27, %18 + %add29 = add i32 %add28, %19 + %inc = add nuw nsw i32 %i.065, 1 + %exitcond.not = icmp eq i32 %inc, %count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare void @foo1(ptr noundef, ptr noundef) local_unnamed_addr #1 >From fcab87aa7c93cde06ffa5cf4e722896e370aa3f5 Mon Sep 17 00:00:00 2001 From: esmeyi <esme...@ibm.com> Date: Tue, 5 Dec 2023 03:44:06 -0500 Subject: [PATCH 2/2] Address ecnelises's comments. --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 35 +++++++++++--------- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 8 ++--- llvm/test/CodeGen/PowerPC/stm-lm-merge.ll | 32 ++++++++++++++++-- 3 files changed, 52 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index fdd1c6d508638..705dd9140c1b0 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2407,26 +2407,25 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots( static void findContinuousLoadStore(ArrayRef<CalleeSavedInfo> CSI, Register &MergeFrom) { - CalleeSavedInfo BeginI = CSI[0]; - unsigned I = 1, E = CSI.size(); + unsigned I = 1, E = CSI.size(), BeginI = 0; for (; I < E; ++I) { // Find continuous store/load. - unsigned RegDiff = CSI[I].getReg() - CSI[I - 1].getReg(); - unsigned FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx(); - Register BeginReg = BeginI.getReg(); - if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || BeginI.isSpilledToReg() || - RegDiff != 1 || FrameIdxDiff != 1) - BeginI = CSI[I]; + int RegDiff = CSI[I].getReg() - CSI[I - 1].getReg(); + int FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx(); + Register BeginReg = CSI[BeginI].getReg(); + if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || + CSI[BeginI].isSpilledToReg() || RegDiff != 1 || FrameIdxDiff != 1) + BeginI = I; if (CSI[I].getReg() == PPC::R31) break; } - if (I == E || BeginI.getReg() == PPC::R31) + if (I == E || CSI[BeginI].getReg() >= PPC::R31) return; // Record the first reg that STMW/LMW are going to merge since STMW/LMW save // from rN to r31. - MergeFrom = BeginI.getReg(); + MergeFrom = CSI[BeginI].getReg(); } bool PPCFrameLowering::spillCalleeSavedRegisters( @@ -2459,6 +2458,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( } } + // STMW save from rN to r31, where rN < r31. MergeFrom will be less than + // r31 if continuous store are found. Register MergeFrom = PPC::R31; if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() && !Subtarget.isPPC64()) @@ -2546,8 +2547,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // saved vector registers. if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, - I.getFrameIdx(), RC, TRI); + TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), + RC, TRI); else if (MergeFrom < PPC::R31 && Reg == MergeFrom) { // Build an STMW instruction. int FrameIdx = I.getFrameIdx(); @@ -2564,7 +2565,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( MIB.addMemOperand(MMO); } else if (Reg > MergeFrom && Reg <= PPC::R31) continue; - else + else TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, TRI, Register()); } @@ -2658,6 +2659,8 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( unsigned CSIIndex = 0; BitVector Restored(TRI->getNumRegs()); + // LMW save from rN to r31, where rN < r31. MergeFrom will be less than + // r31 if continuous load are found. Register MergeFrom = PPC::R31; if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() && !Subtarget.isPPC64()) @@ -2756,13 +2759,13 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), MFI.getObjectAlign(FrameIdx)); MIB.addMemOperand(MMO); - } else if (Reg > MergeFrom && Reg <= PPC::R31) + } else if (Reg > MergeFrom && Reg < PPC::R31) continue; - else + else TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI, Register()); - assert(I != MBB.begin() && + assert(I != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 6f5cc5e550e0f..f582f2d35b495 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1673,12 +1673,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). - bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && - OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC); - // STMW and LMW only have immediate form. - if (OpC == PPC::STMW || OpC == PPC::LMW) - noImmForm = false; + bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && + OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC) && + OpC != PPC::STMW && OpC != PPC::LMW; // Now add the frame object offset to the offset from r1. int64_t Offset = MFI.getObjectOffset(FrameIndex); diff --git a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll index f486317f758a4..94a96c7dd0a5e 100644 --- a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll +++ b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll @@ -2,8 +2,36 @@ ; RUN: -mcpu=pwr4 -mattr=-altivec --ppc-enable-load-store-multiple < %s \ ; RUN: | FileCheck %s -; CHECK: stmw 16, 64(1) # 4-byte Folded Spill -; CHECK: lmw 16, 64(1) # 4-byte Folded Reload +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stwu 1, -128(1) +; CHECK-NEXT: cmpwi 5, 0 +; CHECK-NEXT: stw 0, 136(1) +; CHECK-NEXT: stmw 16, 64(1) # 4-byte Folded Spill +; CHECK-NEXT: ble 0, L..BB0_11 + +; CHECK: L..BB0_12: # %for.cond.cleanup +; CHECK-NEXT: lwz 3, L..C0(2) # @a +; CHECK-NEXT: lwz 4, L..C1(2) # @b +; CHECK-NEXT: lwz 5, L..C4(2) # @c +; CHECK-NEXT: lwz 6, L..C7(2) # @d +; CHECK-NEXT: lwz 7, L..C6(2) # @e +; CHECK-NEXT: lmw 16, 64(1) # 4-byte Folded Reload +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: lwz 4, 0(4) +; CHECK-NEXT: add 3, 3, 28 +; CHECK-NEXT: lwz 5, 0(5) +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: lwz 6, 0(6) +; CHECK-NEXT: add 3, 3, 5 +; CHECK-NEXT: lwz 4, 0(7) +; CHECK-NEXT: add 3, 3, 6 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: lwz 31, 124(1) # 4-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 128 +; CHECK-NEXT: lwz 0, 8(1) +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: bl @a = external local_unnamed_addr global i32, align 4 @b = external local_unnamed_addr global i32, align 4 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits