https://github.com/EsmeYi updated 
https://github.com/llvm/llvm-project/pull/74415

>From f6d0ef8357540c61a9c20774e3b170a8db5b72ca Mon Sep 17 00:00:00 2001
From: esmeyi <esme...@ibm.com>
Date: Tue, 5 Dec 2023 00:44:04 -0500
Subject: [PATCH 1/2] Exploit STMW and LMW in 32-bit big-endian mode.

---
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp |  76 ++++++++++++-
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp  |   4 +
 llvm/test/CodeGen/PowerPC/stm-lm-merge.ll    | 110 +++++++++++++++++++
 3 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/stm-lm-merge.ll

diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index eb3bf3b2690b2..4d4ef6251a999 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -40,6 +40,12 @@ EnablePEVectorSpills("ppc-enable-pe-vector-spills",
                      cl::desc("Enable spills in prologue to vector registers."),
                      cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+    EnableLoadStoreMultiple("ppc-enable-load-store-multiple",
+                            cl::desc("Enable load/store multiple (only "
+                                     "support in 32-bit big-endian mode)."),
+                            cl::init(false), cl::Hidden);
+
 static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
   if (STI.isAIXABI())
     return STI.isPPC64() ? 16 : 8;
@@ -2407,6 +2413,30 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
   return AllSpilledToReg;
 }
 
+static void findContinuousLoadStore(ArrayRef<CalleeSavedInfo> CSI,
+                                    Register &MergeFrom) {
+  CalleeSavedInfo BeginI = CSI[0];
+  unsigned I = 1, E = CSI.size();
+  for (; I < E; ++I) {
+    // Find continuous store/load.
+    unsigned RegDiff = CSI[I].getReg() - CSI[I - 1].getReg();
+    unsigned FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx();
+    Register BeginReg = BeginI.getReg();
+    if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || BeginI.isSpilledToReg() ||
+        RegDiff != 1 || FrameIdxDiff != 1)
+      BeginI = CSI[I];
+    if (CSI[I].getReg() == PPC::R31)
+      break;
+  }
+
+  if (I == E || BeginI.getReg() == PPC::R31)
+    return;
+
+  // Record the first reg that STMW/LMW are going to merge since STMW/LMW save
+  // from rN to r31.
+  MergeFrom = BeginI.getReg();
+}
+
 bool PPCFrameLowering::spillCalleeSavedRegisters(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
@@ -2437,6 +2467,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
     }
   }
 
+  Register MergeFrom = PPC::R31;
+  if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
+      !Subtarget.isPPC64())
+    findContinuousLoadStore(CSI, MergeFrom);
+
   for (const CalleeSavedInfo &I : CSI) {
     Register Reg = I.getReg();
 
@@ -2521,7 +2556,23 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
             !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
           TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
                                        I.getFrameIdx(), RC, TRI);
-        else
+        else if (MergeFrom < PPC::R31 && Reg == MergeFrom) {
+          // Build an STMW instruction.
+          int FrameIdx = I.getFrameIdx();
+          MachineInstrBuilder MIB =
+              BuildMI(MBB, MBB.begin(), DL, TII.get(PPC::STMW));
+          MIB.addReg(Reg, getKillRegState(!IsLiveIn));
+          // Add frame reference.
+          MIB.addImm(0).addFrameIndex(FrameIdx);
+          const MachineFrameInfo &MFI = MF->getFrameInfo();
+          MachineMemOperand *MMO = MF->getMachineMemOperand(
+              MachinePointerInfo::getFixedStack(*MF, FrameIdx),
+              MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+              MFI.getObjectAlign(FrameIdx));
+          MIB.addMemOperand(MMO);
+        } else if (Reg > MergeFrom && Reg <= PPC::R31)
+          continue;
+       else
           TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC,
                                   TRI, Register());
       }
@@ -2615,6 +2666,11 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
   unsigned CSIIndex = 0;
   BitVector Restored(TRI->getNumRegs());
 
+  Register MergeFrom = PPC::R31;
+  if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
+      !Subtarget.isPPC64())
+    findContinuousLoadStore(CSI, MergeFrom);
+
   // Initialize insertion-point logic; we will be restoring in reverse
   // order of spill.
   MachineBasicBlock::iterator I = MI, BeforeI = I;
@@ -2694,7 +2750,23 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
             !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
           TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
                                         TRI);
-        else
+        else if (MergeFrom < PPC::R31 && Reg == MergeFrom) {
+          // Build an LMW instruction.
+          int FrameIdx = CSI[i].getFrameIdx();
+          DebugLoc DL;
+          MachineInstrBuilder MIB =
+              BuildMI(MBB, MBB.begin(), DL, TII.get(PPC::LMW), Reg);
+          // Add frame reference.
+          MIB.addImm(0).addFrameIndex(FrameIdx);
+          const MachineFrameInfo &MFI = MF->getFrameInfo();
+          MachineMemOperand *MMO = MF->getMachineMemOperand(
+              MachinePointerInfo::getFixedStack(*MF, FrameIdx),
+              MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+              MFI.getObjectAlign(FrameIdx));
+          MIB.addMemOperand(MMO);
+        } else if (Reg > MergeFrom && Reg <= PPC::R31)
+          continue;
+       else
           TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI,
                                    Register());
 
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 7d913a77cc715..6f5cc5e550e0f 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1676,6 +1676,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
                    OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
 
+  // STMW and LMW only have immediate form.
+  if (OpC == PPC::STMW || OpC == PPC::LMW)
+    noImmForm = false;
+
   // Now add the frame object offset to the offset from r1.
   int64_t Offset = MFI.getObjectOffset(FrameIndex);
   Offset += MI.getOperand(OffsetOperandNo).getImm();
diff --git a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
new file mode 100644
index 0000000000000..f486317f758a4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
@@ -0,0 +1,110 @@
+; RUN: llc -mtriple=powerpc-unknown-aix-xcoff -verify-machineinstrs \
+; RUN:     -mcpu=pwr4 -mattr=-altivec --ppc-enable-load-store-multiple < %s \
+; RUN:     | FileCheck %s
+
+; CHECK: stmw 16, 64(1)   # 4-byte Folded Spill
+; CHECK: lmw 16, 64(1)    # 4-byte Folded Reload
+
+@a = external local_unnamed_addr global i32, align 4
+@b = external local_unnamed_addr global i32, align 4
+@f = external local_unnamed_addr global i32, align 4
+@c = external local_unnamed_addr global i32, align 4
+@g = external local_unnamed_addr global i32, align 4
+@e = external local_unnamed_addr global i32, align 4
+@h = external local_unnamed_addr global i32, align 4
+@d = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind
+define i32 @foo(ptr noundef %b1, ptr noundef %b2, i32 noundef %count) local_unnamed_addr #0 {
+entry:
+  %invariant.gep = getelementptr i32, ptr %b2, i32 -1
+  %cmp63 = icmp sgt i32 %count, 0
+  br i1 %cmp63, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %sw.epilog, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add29, %sw.epilog ]
+  %0 = load i32, ptr @a, align 4
+  %add30 = add nsw i32 %0, %sum.0.lcssa
+  %1 = load i32, ptr @b, align 4
+  %add31 = add nsw i32 %add30, %1
+  %2 = load i32, ptr @c, align 4
+  %add32 = add nsw i32 %add31, %2
+  %3 = load i32, ptr @d, align 4
+  %add33 = add nsw i32 %add32, %3
+  %4 = load i32, ptr @e, align 4
+  %add34 = add nsw i32 %add33, %4
+  ret i32 %add34
+
+for.body:                                         ; preds = %entry, %sw.epilog
+  %i.065 = phi i32 [ %inc, %sw.epilog ], [ 0, %entry ]
+  %sum.064 = phi i32 [ %add29, %sw.epilog ], [ 0, %entry ]
+  tail call void @foo1(ptr noundef %b1, ptr noundef %b2) #2
+  %rem = and i32 %i.065, 3
+  switch i32 %rem, label %for.body.unreachabledefault [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb4
+    i32 2, label %sw.bb11
+    i32 3, label %sw.bb19
+  ]
+
+sw.bb:                                            ; preds = %for.body
+  %gep62 = getelementptr i32, ptr %invariant.gep, i32 %i.065
+  %5 = load i32, ptr %gep62, align 4
+  %6 = load i32, ptr @a, align 4
+  %7 = load i32, ptr @b, align 4
+  %reass.add = add i32 %7, %5
+  %reass.mul = mul i32 %reass.add, 3
+  %add2 = add i32 %reass.mul, %6
+  br label %sw.epilog
+
+sw.bb4:                                           ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, ptr %b2, i32 %i.065
+  %8 = load i32, ptr %arrayidx5, align 4
+  %mul6 = mul nsw i32 %8, 5
+  %9 = load i32, ptr @b, align 4
+  %10 = load i32, ptr @f, align 4
+  %11 = shl i32 %10, 2
+  %12 = add i32 %9, %11
+  %sub9 = sub i32 %mul6, %12
+  br label %sw.epilog
+
+sw.bb11:                                          ; preds = %for.body
+  %gep = getelementptr i32, ptr %invariant.gep, i32 %i.065
+  %13 = load i32, ptr %gep, align 4
+  %mul14 = shl nsw i32 %13, 2
+  %14 = load i32, ptr @c, align 4
+  %mul15 = mul nsw i32 %mul14, %14
+  %15 = load i32, ptr @g, align 4
+  %mul16 = mul nsw i32 %15, 5
+  %add17 = add nsw i32 %mul16, %mul15
+  br label %sw.epilog
+
+sw.bb19:                                          ; preds = %for.body
+  %arrayidx20 = getelementptr inbounds i32, ptr %b2, i32 %i.065
+  %16 = load i32, ptr %arrayidx20, align 4
+  %mul21 = mul nsw i32 %16, 6
+  %17 = load i32, ptr @e, align 4
+  %div = sdiv i32 %mul21, %17
+  %div22 = udiv i32 6, %i.065
+  %add23 = add nsw i32 %div22, %div
+  br label %sw.epilog
+
+for.body.unreachabledefault:                      ; preds = %for.body
+  unreachable
+
+sw.epilog:                                        ; preds = %sw.bb19, %sw.bb11, %sw.bb4, %sw.bb
+  %add23.sink = phi i32 [ %add23, %sw.bb19 ], [ %add17, %sw.bb11 ], [ %sub9, %sw.bb4 ], [ %add2, %sw.bb ]
+  %arrayidx24 = getelementptr inbounds i32, ptr %b1, i32 %i.065
+  store i32 %add23.sink, ptr %arrayidx24, align 4
+  %arrayidx26 = getelementptr inbounds i32, ptr %b2, i32 %i.065
+  %18 = load i32, ptr %arrayidx26, align 4
+  %19 = load i32, ptr @h, align 4
+  %add27 = add i32 %add23.sink, %sum.064
+  %add28 = add i32 %add27, %18
+  %add29 = add i32 %add28, %19
+  %inc = add nuw nsw i32 %i.065, 1
+  %exitcond.not = icmp eq i32 %inc, %count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare void @foo1(ptr noundef, ptr noundef) local_unnamed_addr #1

>From fcab87aa7c93cde06ffa5cf4e722896e370aa3f5 Mon Sep 17 00:00:00 2001
From: esmeyi <esme...@ibm.com>
Date: Tue, 5 Dec 2023 03:44:06 -0500
Subject: [PATCH 2/2] Address ecnelises's comments.

---
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 35 +++++++++++---------
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp  |  8 ++---
 llvm/test/CodeGen/PowerPC/stm-lm-merge.ll    | 32 ++++++++++++++++--
 3 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index fdd1c6d508638..705dd9140c1b0 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2407,26 +2407,25 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
 
 static void findContinuousLoadStore(ArrayRef<CalleeSavedInfo> CSI,
                                     Register &MergeFrom) {
-  CalleeSavedInfo BeginI = CSI[0];
-  unsigned I = 1, E = CSI.size();
+  unsigned I = 1, E = CSI.size(), BeginI = 0;
   for (; I < E; ++I) {
     // Find continuous store/load.
-    unsigned RegDiff = CSI[I].getReg() - CSI[I - 1].getReg();
-    unsigned FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx();
-    Register BeginReg = BeginI.getReg();
-    if (BeginReg < PPC::R0 || BeginReg > PPC::R31 || BeginI.isSpilledToReg() ||
-        RegDiff != 1 || FrameIdxDiff != 1)
-      BeginI = CSI[I];
+    int RegDiff = CSI[I].getReg() - CSI[I - 1].getReg();
+    int FrameIdxDiff = CSI[I - 1].getFrameIdx() - CSI[I].getFrameIdx();
+    Register BeginReg = CSI[BeginI].getReg();
+    if (BeginReg < PPC::R0 || BeginReg > PPC::R31 ||
+        CSI[BeginI].isSpilledToReg() || RegDiff != 1 || FrameIdxDiff != 1)
+      BeginI = I;
     if (CSI[I].getReg() == PPC::R31)
       break;
   }
 
-  if (I == E || BeginI.getReg() == PPC::R31)
+  if (I == E || CSI[BeginI].getReg() >= PPC::R31)
     return;
 
   // Record the first reg that STMW/LMW are going to merge since STMW/LMW save
   // from rN to r31.
-  MergeFrom = BeginI.getReg();
+  MergeFrom = CSI[BeginI].getReg();
 }
 
 bool PPCFrameLowering::spillCalleeSavedRegisters(
@@ -2459,6 +2458,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
     }
   }
 
+  // STMW saves rN through r31, where rN < r31. MergeFrom will be less than
+  // r31 if a contiguous run of stores is found.
   Register MergeFrom = PPC::R31;
   if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
       !Subtarget.isPPC64())
@@ -2546,8 +2547,8 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
         // saved vector registers.
         if (Subtarget.needsSwapsForVSXMemOps() &&
             !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
-          TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
-                                       I.getFrameIdx(), RC, TRI);
+          TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(),
+                                       RC, TRI);
         else if (MergeFrom < PPC::R31 && Reg == MergeFrom) {
           // Build an STMW instruction.
           int FrameIdx = I.getFrameIdx();
@@ -2564,7 +2565,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
           MIB.addMemOperand(MMO);
         } else if (Reg > MergeFrom && Reg <= PPC::R31)
           continue;
-       else
+        else
           TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC,
                                   TRI, Register());
       }
@@ -2658,6 +2659,8 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
   unsigned CSIIndex = 0;
   BitVector Restored(TRI->getNumRegs());
 
+  // LMW restores rN through r31, where rN < r31. MergeFrom will be less than
+  // r31 if a contiguous run of loads is found.
   Register MergeFrom = PPC::R31;
   if (EnableLoadStoreMultiple && !Subtarget.isLittleEndian() &&
       !Subtarget.isPPC64())
@@ -2756,13 +2759,13 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
               MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
               MFI.getObjectAlign(FrameIdx));
           MIB.addMemOperand(MMO);
-        } else if (Reg > MergeFrom && Reg <= PPC::R31)
+        } else if (Reg > MergeFrom && Reg < PPC::R31)
           continue;
-       else
+        else
           TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI,
                                    Register());
 
-        assert(I != MBB.begin() &&
+       assert(I != MBB.begin() &&
                "loadRegFromStackSlot didn't insert any code!");
       }
     }
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 6f5cc5e550e0f..f582f2d35b495 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1673,12 +1673,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
   // If the instruction is not present in ImmToIdxMap, then it has no immediate
   // form (and must be r+r).
-  bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
-                   OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC);
-
   // STMW and LMW only have immediate form.
-  if (OpC == PPC::STMW || OpC == PPC::LMW)
-    noImmForm = false;
+  bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
+                   OpC != TargetOpcode::PATCHPOINT && !ImmToIdxMap.count(OpC) &&
+                   OpC != PPC::STMW && OpC != PPC::LMW;
 
   // Now add the frame object offset to the offset from r1.
   int64_t Offset = MFI.getObjectOffset(FrameIndex);
diff --git a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
index f486317f758a4..94a96c7dd0a5e 100644
--- a/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
+++ b/llvm/test/CodeGen/PowerPC/stm-lm-merge.ll
@@ -2,8 +2,36 @@
 ; RUN:     -mcpu=pwr4 -mattr=-altivec --ppc-enable-load-store-multiple < %s \
 ; RUN:     | FileCheck %s
 
-; CHECK: stmw 16, 64(1)   # 4-byte Folded Spill
-; CHECK: lmw 16, 64(1)    # 4-byte Folded Reload
+; CHECK:      # %bb.0:                                # %entry
+; CHECK-NEXT:  mflr 0
+; CHECK-NEXT:  stwu 1, -128(1)
+; CHECK-NEXT:  cmpwi   5, 0
+; CHECK-NEXT:  stw 0, 136(1)
+; CHECK-NEXT:  stmw 16, 64(1)                          # 4-byte Folded Spill
+; CHECK-NEXT:  ble 0, L..BB0_11
+
+; CHECK:      L..BB0_12:                              # %for.cond.cleanup
+; CHECK-NEXT:  lwz 3, L..C0(2)                         # @a
+; CHECK-NEXT:  lwz 4, L..C1(2)                         # @b
+; CHECK-NEXT:  lwz 5, L..C4(2)                         # @c
+; CHECK-NEXT:  lwz 6, L..C7(2)                         # @d
+; CHECK-NEXT:  lwz 7, L..C6(2)                         # @e
+; CHECK-NEXT:  lmw 16, 64(1)                           # 4-byte Folded Reload
+; CHECK-NEXT:  lwz 3, 0(3)
+; CHECK-NEXT:  lwz 4, 0(4)
+; CHECK-NEXT:  add 3, 3, 28
+; CHECK-NEXT:  lwz 5, 0(5)
+; CHECK-NEXT:  add 3, 3, 4
+; CHECK-NEXT:  lwz 6, 0(6)
+; CHECK-NEXT:  add 3, 3, 5
+; CHECK-NEXT:  lwz 4, 0(7)
+; CHECK-NEXT:  add 3, 3, 6
+; CHECK-NEXT:  add 3, 3, 4
+; CHECK-NEXT:  lwz 31, 124(1)                          # 4-byte Folded Reload
+; CHECK-NEXT:  addi 1, 1, 128
+; CHECK-NEXT:  lwz 0, 8(1)
+; CHECK-NEXT:  mtlr 0
+; CHECK-NEXT:  bl
 
 @a = external local_unnamed_addr global i32, align 4
 @b = external local_unnamed_addr global i32, align 4

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to