llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Wei Xiao (williamweixiao)

<details>
<summary>Changes</summary>

Replace getCSRCost() and the global regalloc-csr-cost-scale cl::opt with
two target-overridable methods: getCSRFirstUseCost(MF) and
getCSRCostScale(MF). This makes CSR cost configuration target-specific
and simplifies initializeCSRCost() to a single formula:

  CSRCost = BaseCost * EntryFreq * Scale / 100

Target changes (new cost model):
- AArch64: getCSRFirstUseCost()=2 (save+restore), getCSRCostScale()=80.
  Previously used the legacy getCSRCost()=5 path which produced an
  effectively negligible CSRCost=5. The new model produces a meaningful
  cost that properly weighs CSR usage against spilling.
- RISCV: getCSRFirstUseCost()=2 (save+restore), getCSRCostScale()=80.
  Same change as AArch64.
- AMDGPU: getCSRFirstUseCost()=100. Previously used the legacy
  getCSRCost()=100 path which produced CSRCost=100. The new model
  produces CSRCost=100*EntryFreq, which more aggressively discourages
  CSR usage as intended ("stack access is very expensive").

Newly enabled:
- X86: getCSRFirstUseCost()=2 (push+pop), or 0 on 64-bit targets when PPX is
  available.

X86 SPEC CPU2017 rate performance (Intel Xeon Platinum 8280).
Exp = with this patch, Ref = without this patch (baseline).
A higher rate score is better. Changes within ±2% on a single benchmark are
considered run-to-run noise:

```
  Benchmark         Exp(rat)  Ref(rat)  Change
  500.perlbench_r    205.40    204.82    +0.29%
  502.gcc_r          232.12    231.99    +0.06%
  505.mcf_r          148.90    147.42    +1.00%
  508.namd_r         207.94    210.54    -1.23%
  510.parest_r       123.19    123.38    -0.15%
  511.povray_r       287.00    272.70    +5.24%
  520.omnetpp_r      159.85    160.85    -0.62%
  523.xalancbmk_r    210.55    208.41    +1.02%
  525.x264_r         514.22    518.01    -0.73%
  526.blender_r      247.95    251.25    -1.31%
  531.deepsjeng_r    221.74    220.70    +0.47%
  538.imagick_r      376.90    366.98    +2.70%
  541.leela_r        227.98    224.80    +1.42%
  544.nab_r          418.96    415.92    +0.73%
  557.xz_r           167.58    152.60    +9.81%
  Geometric mean:                        +1.21%
```

---

Patch is 12.41 MiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/202007.diff


152 Files Affected:

- (modified) clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c (+1-3) 
- (modified) clang/test/Frontend/stack-layout-remark.c (+3-3) 
- (modified) llvm/include/llvm/CodeGen/TargetRegisterInfo.h (+14-4) 
- (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+15-41) 
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.h (+5-8) 
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+3-1) 
- (modified) llvm/lib/Target/RISCV/RISCVRegisterInfo.h (+6-5) 
- (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+9) 
- (modified) llvm/lib/Target/X86/X86RegisterInfo.h (+2) 
- (modified) llvm/test/CodeGen/AArch64/cgp-usubo.ll (+11-10) 
- (modified) llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll (+17-20) 
- (modified) llvm/test/CodeGen/AArch64/pr51516.mir (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/ragreedy-csr2.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll 
(+12-10) 
- (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+21-14) 
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll (+17-15) 
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll 
(+19-15) 
- (modified) llvm/test/CodeGen/AArch64/spill-reload-remarks.ll (+100-1) 
- (modified) llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll (+12-11) 
- (modified) llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll 
(+252-315) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+54056-53059) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+81-84) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+392-381) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+6769-6769) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+189-192) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+1486-1248) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+2683-2141) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+3459-2838) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+4676-4194) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+6841-6526) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+8160-7469) 
- (modified) llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir (+16-22) 
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-select.ll (+78-82) 
- (modified) llvm/test/CodeGen/AMDGPU/function-resource-usage.ll (+1028-461) 
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll 
(+101-143) 
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+148-151) 
- (modified) llvm/test/CodeGen/AMDGPU/issue176578.ll (+58-60) 
- (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir 
(+110-137) 
- (modified) llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir 
(+14-17) 
- (modified) llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask-phi-extend.ll 
(+431-418) 
- (modified) llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll 
(+272-302) 
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll 
(+216-218) 
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll (+7-8) 
- (modified) llvm/test/CodeGen/MLRegAlloc/interactive-mode.ll (+3-6) 
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll (+21-23) 
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll (+25-22) 
- (modified) llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll (+4-4) 
- (modified) llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll (+4-4) 
- (modified) llvm/test/CodeGen/RISCV/bfloat-convert.ll (+31-28) 
- (modified) llvm/test/CodeGen/RISCV/bitint-fp-conv-200.ll (+396-401) 
- (modified) llvm/test/CodeGen/RISCV/condops.ll (+32-35) 
- (modified) llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll (+26-25) 
- (modified) llvm/test/CodeGen/RISCV/double-convert.ll (+39-38) 
- (modified) llvm/test/CodeGen/RISCV/double-round-conv-sat.ll (+66-60) 
- (modified) llvm/test/CodeGen/RISCV/double-stack-spill-restore.ll (+4-8) 
- (modified) llvm/test/CodeGen/RISCV/exception-pointer-register.ll (+18-30) 
- (modified) llvm/test/CodeGen/RISCV/float-convert.ll (+34-33) 
- (modified) llvm/test/CodeGen/RISCV/float-round-conv-sat.ll (+174-162) 
- (modified) llvm/test/CodeGen/RISCV/fp128.ll (+114-114) 
- (modified) llvm/test/CodeGen/RISCV/fpclamptosat.ll (+1416-1354) 
- (modified) llvm/test/CodeGen/RISCV/fpenv.ll (+14-10) 
- (modified) llvm/test/CodeGen/RISCV/half-convert.ll (+151-140) 
- (modified) llvm/test/CodeGen/RISCV/half-round-conv-sat.ll (+240-216) 
- (modified) 
llvm/test/CodeGen/RISCV/machine-outliner-and-machine-copy-propagation.ll (+7-3) 
- (modified) llvm/test/CodeGen/RISCV/machine-sink-load-immediate.ll (+62-63) 
- (modified) llvm/test/CodeGen/RISCV/overflow-intrinsics.ll (+43-65) 
- (modified) llvm/test/CodeGen/RISCV/rv32xtheadbb.ll (+26-25) 
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+26-25) 
- (modified) llvm/test/CodeGen/RISCV/rv64-double-convert.ll (+47-46) 
- (modified) llvm/test/CodeGen/RISCV/rv64-float-convert.ll (+47-46) 
- (modified) llvm/test/CodeGen/RISCV/rv64-half-convert.ll (+47-46) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll (+12-12) 
- (modified) llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll (+585-585) 
- (modified) llvm/test/CodeGen/RISCV/rvv/pr95865.ll (+33-33) 
- (modified) llvm/test/CodeGen/RISCV/rvv/shrinkwrap.ll (+28-32) 
- (modified) llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll (+33-32) 
- (modified) llvm/test/CodeGen/RISCV/select-cc.ll (+24-27) 
- (modified) llvm/test/CodeGen/RISCV/shrinkwrap.ll (+61-52) 
- (modified) llvm/test/CodeGen/RISCV/simplify-condbr.ll (+4-6) 
- (modified) llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll (+20-17) 
- (modified) llvm/test/CodeGen/X86/2007-02-16-BranchFold.ll (+11-10) 
- (modified) llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll (+26-31) 
- (modified) llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll (+18-18) 
- (modified) llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll (+39-31) 
- (modified) llvm/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll (+4-1) 
- (modified) llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll (+73-75) 
- (modified) llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll (+36-31) 
- (modified) llvm/test/CodeGen/X86/PR40322.ll (+13-15) 
- (modified) llvm/test/CodeGen/X86/andnot-patterns.ll (+24-22) 
- (modified) llvm/test/CodeGen/X86/andnot-sink-not.ll (+280-261) 
- (modified) llvm/test/CodeGen/X86/apx/add.ll (+43-43) 
- (modified) llvm/test/CodeGen/X86/apx/memfold-no-physreg.ll (+65-89) 
- (modified) llvm/test/CodeGen/X86/apx/memfold-origVNI-crash.ll (+24-31) 
- (modified) llvm/test/CodeGen/X86/apx/pr191368.ll (+40-41) 
- (modified) llvm/test/CodeGen/X86/apx/push2-pop2.ll (+24-24) 
- (modified) llvm/test/CodeGen/X86/atom-fixup-lea2.ll (+49-5) 
- (modified) llvm/test/CodeGen/X86/atomic-bit-test.ll (+8-8) 
- (modified) llvm/test/CodeGen/X86/atomic-rm-bit-test.ll (+169-150) 
- (modified) llvm/test/CodeGen/X86/avoid-sfb.ll (+62-110) 
- (modified) llvm/test/CodeGen/X86/block-placement.ll (+1719-227) 
- (modified) llvm/test/CodeGen/X86/bmi.ll (+146-120) 
- (modified) llvm/test/CodeGen/X86/bsf.ll (+46-44) 
- (modified) llvm/test/CodeGen/X86/bt-merge-fuse.ll (+31-25) 
- (modified) llvm/test/CodeGen/X86/btc_bts_btr.ll (+19-19) 
- (modified) llvm/test/CodeGen/X86/bypass-slow-division-32.ll (+12-12) 
- (modified) llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll (+5-5) 
- (modified) llvm/test/CodeGen/X86/callbr-asm-branch-folding.ll (+3-2) 
- (modified) llvm/test/CodeGen/X86/cgp-usubo.ll (+21-24) 
- (modified) llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll (+75-59) 
- (modified) llvm/test/CodeGen/X86/csr-split.ll (+33-37) 
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll 
(+233-235) 
- (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll 
(+201-199) 
- (modified) llvm/test/CodeGen/X86/extract-bits.ll (+565-478) 
- (modified) llvm/test/CodeGen/X86/extract-lowbits.ll (+68-68) 
- (modified) llvm/test/CodeGen/X86/fp128-cast.ll (+11-11) 
- (modified) llvm/test/CodeGen/X86/fptosi-sat-scalar.ll (+246-234) 
- (modified) llvm/test/CodeGen/X86/fptoui-sat-scalar.ll (+171-163) 
- (modified) llvm/test/CodeGen/X86/fshl.ll (+44-42) 
- (modified) llvm/test/CodeGen/X86/funnel-shift.ll (+18-18) 
- (modified) llvm/test/CodeGen/X86/i128-udiv.ll (+178-171) 
- (modified) llvm/test/CodeGen/X86/indirect-branch-tracking-eh.ll (+161-61) 
- (modified) 
llvm/test/CodeGen/X86/inline-spiller-impdef-on-implicit-def-regression.ll 
(+38-43) 
- (modified) llvm/test/CodeGen/X86/lrshrink.ll (+7-6) 
- (modified) llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll (+6-2192) 
- (modified) llvm/test/CodeGen/X86/memcmp-optsize-x32.ll (+4-400) 
- (modified) llvm/test/CodeGen/X86/memcmp-pgso-x32.ll (+4-400) 
- (modified) llvm/test/CodeGen/X86/memcmp-x32.ll (+6-1678) 
- (modified) llvm/test/CodeGen/X86/midpoint-int.ll (+90-80) 
- (modified) llvm/test/CodeGen/X86/mul-constant-result.ll (+19-19) 
- (modified) llvm/test/CodeGen/X86/no-split-size.ll (+29-38) 
- (modified) llvm/test/CodeGen/X86/optimize-max-0.ll (+171-178) 
- (modified) llvm/test/CodeGen/X86/peep-test-4.ll (+14-14) 
- (modified) llvm/test/CodeGen/X86/probe-stack-eflags.ll (+11-12) 
- (modified) llvm/test/CodeGen/X86/ragreedy-bug.ll (+233-26) 
- (modified) llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll (+36-36) 
- (modified) llvm/test/CodeGen/X86/sjlj-eh.ll (+187-60) 
- (modified) llvm/test/CodeGen/X86/speculative-load-hardening.ll (+71-80) 
- (modified) llvm/test/CodeGen/X86/split-reg-with-hint.ll (+17-38) 
- (modified) llvm/test/CodeGen/X86/statepoint-invoke.ll (+30-33) 
- (modified) llvm/test/CodeGen/X86/statepoint-vreg-details.ll (+10-187) 
- (modified) llvm/test/CodeGen/X86/statepoint-vreg-invoke.ll (+22-19) 
- (modified) llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll (+77-75) 
- (modified) llvm/test/CodeGen/X86/tail-opts.ll (+33-30) 
- (modified) llvm/test/CodeGen/X86/tailcall-cgp-dup.ll (+13-11) 
- (modified) llvm/test/CodeGen/X86/tbm_patterns.ll (+54-42) 
- (modified) llvm/test/CodeGen/X86/x86-shrink-wrapping.ll (+12-26) 
- (modified) llvm/test/CodeGen/X86/xmulo.ll (+6-8) 
- (modified) llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll (+13-12) 
- (modified) llvm/test/DebugInfo/KeyInstructions/X86/dwarf-ranks-blocks.ll 
(+15-14) 
- (modified) llvm/test/DebugInfo/RISCV/dw_op_entry_value_32bit.ll (+1-1) 
- (modified) llvm/test/DebugInfo/RISCV/dw_op_entry_value_64bit.ll (+1-1) 
- (modified) llvm/test/tools/llvm-locstats/locstats.ll (+5-5) 


``````````diff
diff --git a/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c 
b/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
index ff3a1a47288a6..49c01d80dc85c 100644
--- a/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
+++ b/clang/test/CodeGen/X86/ms-secure-hotpatch-globals.c
@@ -95,10 +95,8 @@ void hp5_phi_ptr_mixed(int x) NO_TAIL {
 // CHECK: test ecx, ecx
 // CHECK: mov rsi, qword ptr [rip + __ref_g_has_pointers]
 // CHECK: call do_side_effects
-// CHECK: jmp
 // CHECK: call do_other_side_effects
-// CHECK: lea rsi, [rip + g_this_is_const]
-// CHECK: mov rcx, rsi
+// CHECK: lea rcx, [rip + g_this_is_const]
 // CHECK: call take_data
 // CHECK: .seh_endproc
 
diff --git a/clang/test/Frontend/stack-layout-remark.c 
b/clang/test/Frontend/stack-layout-remark.c
index b0ed03c80f24a..94e1f8a827777 100644
--- a/clang/test/Frontend/stack-layout-remark.c
+++ b/clang/test/Frontend/stack-layout-remark.c
@@ -152,9 +152,9 @@ extern void use_dot_vector(struct Array *data);
 //      O3-DEBUG: Function: do_work
 // O3-DEBUG-NEXT: Offset: [SP-8], Type: Spill, Align: 16, Size: 8
 // O3-DEBUG-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-// O3-DEBUG-NEXT: Offset: [SP-24], Type: Spill, Align: 16, Size: 8
-// O3-DEBUG-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
-// O3-DEBUG-NEXT: Offset: [SP-40], Type: Spill, Align: 16, Size: 8
+//      O3-DEBUG: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+//      O3-DEBUG: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+//      O3-DEBUG: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
 int do_work(struct Array *A, struct Array *B, struct Result *out) {
   if (!A || !B)
     return -1;
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h 
b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 7c3c56552b82c..4db5c50c8bdae 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -1032,10 +1032,20 @@ class LLVM_ABI TargetRegisterInfo : public 
MCRegisterInfo {
 
   /// Allow the target to override the cost of using a callee-saved register for
   /// the first time. Default value of 0 means we will use a callee-saved
-  /// register if it is available.
-  virtual unsigned getCSRFirstUseCost() const { return 0; }
-  /// FIXME: We should deprecate this usage.
-  virtual unsigned getCSRCost() const { return 0; }
+  /// register if it is available. The returned value is multiplied by the entry
+  /// block frequency to produce the final CSR cost used by the greedy register
+  /// allocator. For example, a cost of 2 represents the cost of a push/pop
+  /// pair (2 memory accesses at entry frequency).
+  virtual unsigned getCSRFirstUseCost(const MachineFunction &MF) const {
+    return 0;
+  }
+
+  /// Allow the target to override the scale applied to the CSR first-use cost.
+  /// The scale is a percentage (e.g., 80 means 80% of the base cost).
+  /// Default value of 100 means no scaling.
+  virtual unsigned getCSRCostScale(const MachineFunction &MF) const {
+    return 100;
+  }
 
   /// Returns true if the target requires (and can make use of) the register
   /// scavenger.
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp 
b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 6b8a9b8190f9a..634ba582e2f93 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -115,11 +115,6 @@ CSRFirstTimeCost("regalloc-csr-first-time-cost",
               cl::desc("Cost for first time use of callee-saved register."),
               cl::init(0), cl::Hidden);
 
-static cl::opt<unsigned> CSRCostScale(
-    "regalloc-csr-cost-scale",
-    cl::desc("Scale for the callee-saved register cost, in percentage."),
-    cl::init(80), cl::Hidden);
-
 static cl::opt<unsigned long> GrowRegionComplexityBudget(
     "grow-region-complexity-budget",
     cl::desc("growRegion() does not scale with the number of BB edges, so "
@@ -2418,43 +2413,22 @@ void RAGreedy::aboutToRemoveInterval(const LiveInterval 
&LI) {
 }
 
 void RAGreedy::initializeCSRCost() {
-  if (!CSRCostScale.getNumOccurrences() &&
-      (CSRFirstTimeCost.getNumOccurrences() || TRI->getCSRCost())) {
-    // We should deprecate the usage of CSRFirstTimeCost!
-    // We use the command-line option if it is explicitly set, otherwise use the
-    // larger one out of the command-line option and the value reported by TRI.
-    CSRCost = BlockFrequency(
-        CSRFirstTimeCost.getNumOccurrences()
-            ? CSRFirstTimeCost
-            : std::max((unsigned)CSRFirstTimeCost, TRI->getCSRCost()));
-    if (!CSRCost.getFrequency())
-      return;
-
-    // Raw cost is relative to Entry == 2^14; scale it appropriately.
-    uint64_t ActualEntry = MBFI->getEntryFreq().getFrequency();
-    if (!ActualEntry) {
-      CSRCost = BlockFrequency(0);
-      return;
-    }
-    uint64_t FixedEntry = 1 << 14;
-    if (ActualEntry < FixedEntry) {
-      CSRCost *= BranchProbability(ActualEntry, FixedEntry);
-    } else if (ActualEntry <= UINT32_MAX) {
-      // Invert the fraction and divide.
-      CSRCost /= BranchProbability(FixedEntry, ActualEntry);
-    } else {
-      // Can't use BranchProbability in general, since it takes 32-bit numbers.
-      CSRCost =
-          BlockFrequency(CSRCost.getFrequency() * (ActualEntry / FixedEntry));
-    }
-  } else {
-    uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
-    CSRCost = BlockFrequency(TRI->getCSRFirstUseCost() * EntryFreq);
-    if (CSRCostScale < 100)
-      CSRCost *= BranchProbability(CSRCostScale, 100);
-    else
-      CSRCost /= BranchProbability(100, CSRCostScale);
+  unsigned BaseCost = CSRFirstTimeCost.getNumOccurrences()
+                          ? CSRFirstTimeCost
+                          : TRI->getCSRFirstUseCost(*MF);
+  if (!BaseCost) {
+    CSRCost = BlockFrequency(0);
+    return;
   }
+
+  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
+  CSRCost = BlockFrequency(BaseCost * EntryFreq);
+
+  unsigned Scale = TRI->getCSRCostScale(*MF);
+  if (Scale < 100)
+    CSRCost *= BranchProbability(Scale, 100);
+  else if (Scale > 100)
+    CSRCost /= BranchProbability(100, Scale);
 }
 
 /// Collect the hint info for \p Reg.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h 
b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index ac58d8d6b1cc7..b307df9cc932b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -53,16 +53,13 @@ class AArch64RegisterInfo final : public 
AArch64GenRegisterInfo {
   const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF,
                                              CallingConv::ID) const;
 
-  unsigned getCSRCost() const override {
-    // The cost will be compared against BlockFrequency where entry has the
-    // value of 1 << 14. A value of 5 will choose to spill or split really
-    // cold path instead of using a callee-saved register.
-    return 5;
-  }
-  unsigned getCSRFirstUseCost() const override {
-    // The cost of 2 means push and pop for each CSR.
+  unsigned getCSRFirstUseCost(const MachineFunction &MF) const override {
+    // The cost of save and restore (e.g. STP/LDP) for each CSR.
     return 2;
   }
+  unsigned getCSRCostScale(const MachineFunction &MF) const override {
+    return 80;
+  }
 
   const TargetRegisterClass *
   getSubClassWithSubReg(const TargetRegisterClass *RC,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h 
b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5e08e47ad4d83..239a8676c75ea 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -109,7 +109,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
 
   // Stack access is very expensive. CSRs are also the high registers, and we
   // want to minimize the number of used registers.
-  unsigned getCSRCost() const override { return 100; }
+  unsigned getCSRFirstUseCost(const MachineFunction &MF) const override {
+    return 100;
+  }
 
   // When building a block VGPR load, we only really transfer a subset of the
   // registers in the block, based on a mask. Liveness analysis is not aware of
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h 
b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3a77820d28bbd..4fabdb9d2ce3c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -68,11 +68,12 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID) const override;
 
-  unsigned getCSRCost() const override {
-    // The cost will be compared against BlockFrequency where entry has the
-    // value of 1 << 14. A value of 5 will choose to spill or split cold
-    // path instead of using a callee-saved register.
-    return 5;
+  unsigned getCSRFirstUseCost(const MachineFunction &MF) const override {
+    // The cost of save and restore (e.g. sd/ld) for each CSR.
+    return 2;
+  }
+  unsigned getCSRCostScale(const MachineFunction &MF) const override {
+    return 80;
   }
 
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const 
override;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp 
b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index c84e0f441a459..5101452a6f78a 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1281,3 +1281,12 @@ bool X86RegisterInfo::isNonRex2RegClass(const 
TargetRegisterClass *RC) const {
     return true;
   }
 }
+
+unsigned X86RegisterInfo::getCSRFirstUseCost(const MachineFunction &MF) const {
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  if (ST.is64Bit() && ST.hasPPX())
+    return 0;
+
+  // push + pop.
+  return 2;
+}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h 
b/llvm/lib/Target/X86/X86RegisterInfo.h
index e646591663aca..1418e2892768a 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -182,6 +182,8 @@ class X86RegisterInfo final : public X86GenRegisterInfo {
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
   }
+
+  unsigned getCSRFirstUseCost(const MachineFunction &MF) const override;
 };
 
 } // End llvm namespace
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll 
b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index f990920e2793a..35879b94e503b 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -280,29 +280,30 @@ end:
 define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) 
nounwind {
 ; CHECK-LABEL: usubo_ult_cmp_dominates_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov w19, w3
 ; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    tbz w3, #0, .LBB15_3
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    cmp x0, x1
-; CHECK-NEXT:    mov x22, x0
-; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    cset w21, lo
-; CHECK-NEXT:    mov x23, x1
-; CHECK-NEXT:    mov w0, w21
+; CHECK-NEXT:    mov x21, x0
+; CHECK-NEXT:    str x2, [sp, #8] // 8-byte Spill
+; CHECK-NEXT:    cset w20, lo
+; CHECK-NEXT:    mov x22, x1
+; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl call
-; CHECK-NEXT:    subs x8, x22, x23
+; CHECK-NEXT:    subs x8, x21, x22
 ; CHECK-NEXT:    b.hs .LBB15_3
 ; CHECK-NEXT:  // %bb.2: // %end
-; CHECK-NEXT:    mov w19, w21
-; CHECK-NEXT:    str x8, [x20]
+; CHECK-NEXT:    ldr x9, [sp, #8] // 8-byte Reload
+; CHECK-NEXT:    mov w19, w20
+; CHECK-NEXT:    str x8, [x9]
 ; CHECK-NEXT:  .LBB15_3: // %common.ret
 ; CHECK-NEXT:    and w0, w19, #0x1
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   br i1 %cond, label %t, label %f
diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll 
b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index 09e80ee936738..3bb09723381fa 100644
--- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -582,26 +582,30 @@ declare void @do_something() #1
 define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
 ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    adrp x19, :got:a
-; CHECK-NEXT:    ldr x19, [x19, :got_lo12:a]
-; CHECK-NEXT:    ldr w8, [x19]
+; CHECK-NEXT:    adrp x8, :got:a
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:a]
+; CHECK-NEXT:    ldr w8, [x8]
 ; CHECK-NEXT:    cmn w8, #2
 ; CHECK-NEXT:    b.gt .LBB10_4
 ; CHECK-NEXT:  // %bb.1: // %while.body.preheader
-; CHECK-NEXT:    sub w20, w8, #1
+; CHECK-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    sub w19, w8, #1
 ; CHECK-NEXT:  .LBB10_2: // %while.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    bl do_something
-; CHECK-NEXT:    adds w20, w20, #1
+; CHECK-NEXT:    adds w19, w19, #1
 ; CHECK-NEXT:    b.mi .LBB10_2
 ; CHECK-NEXT:  // %bb.3: // %while.cond.while.end_crit_edge
-; CHECK-NEXT:    ldr w8, [x19]
+; CHECK-NEXT:    adrp x8, :got:a
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:a]
+; CHECK-NEXT:    ldr w8, [x8]
+; CHECK-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:  .LBB10_4: // %while.end
 ; CHECK-NEXT:    cmp w8, #1
 ; CHECK-NEXT:    b.gt .LBB10_7
@@ -616,16 +620,9 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() 
#0 {
 ; CHECK-NEXT:    b.ne .LBB10_7
 ; CHECK-NEXT:  // %bb.6:
 ; CHECK-NEXT:    mov w0, #123 // =0x7b
-; CHECK-NEXT:    b .LBB10_8
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB10_7: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:  .LBB10_8: // %return
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
-; CHECK-NEXT:    .cfi_restore w19
-; CHECK-NEXT:    .cfi_restore w20
-; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i32, ptr @a, align 4
diff --git a/llvm/test/CodeGen/AArch64/pr51516.mir 
b/llvm/test/CodeGen/AArch64/pr51516.mir
index ae54ad0d5cef4..854de23f3d426 100644
--- a/llvm/test/CodeGen/AArch64/pr51516.mir
+++ b/llvm/test/CodeGen/AArch64/pr51516.mir
@@ -5,10 +5,10 @@
 # of ADDXri is killed by the STRXui in this block.
 
 # CHECK-LABEL: name: test
-# CHECK: bb.17:
-# CHECK:   STRXui
-# CHECK:   LDRXui
-# CHECK: bb.18:
+# CHECK: bb.9:
+# CHECK:   ADDXri
+# CHECK:   STRXui %{{[0-9]+}}, %stack.1
+# CHECK: bb.10:
 
 ---
 name:            test
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll 
b/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
index 2d5f5dbbf8f07..2ef7d1bacd288 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
-; RUN: llc < %s -regalloc-csr-cost-scale=80 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
 
 target triple = "aarch64"
 
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll 
b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index b9a542b330c0f..03ebba5a3e308 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -42,18 +42,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    addvl sp, sp, #-1
 ; NOPAIR-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; NOPAIR-NEXT:    mrs x19, SVCR
-; NOPAIR-NEXT:    tbz w19, #0, .LBB0_2
+; NOPAIR-NEXT:    mrs x8, SVCR
+; NOPAIR-NEXT:    tbz w8, #0, .LBB0_2
 ; NOPAIR-NEXT:  // %bb.1:
 ; NOPAIR-NEXT:    smstop sm
 ; NOPAIR-NEXT:  .LBB0_2:
-; NOPAIR-NEXT:    rdvl x8, #1
-; NOPAIR-NEXT:    addsvl x8, x8, #-1
-; NOPAIR-NEXT:    cbz x8, .LBB0_4
+; NOPAIR-NEXT:    rdvl x9, #1
+; NOPAIR-NEXT:    addsvl x9, x9, #-1
+; NOPAIR-NEXT:    cbz x9, .LBB0_4
 ; NOPAIR-NEXT:  // %bb.3:
 ; NOPAIR-NEXT:    brk #0x1
 ; NOPAIR-NEXT:  .LBB0_4:
 ; NOPAIR-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; NOPAIR-NEXT:    mov x19, x8
 ; NOPAIR-NEXT:    bl my_func2
 ; NOPAIR-NEXT:    tbz w19, #0, .LBB0_6
 ; NOPAIR-NEXT:  // %bb.5:
@@ -128,18 +129,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; PAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; PAIR-NEXT:    addvl sp, sp, #-1
 ; PAIR-NEXT:    str z0, [sp] // 16-byte Folded Spill
-; PAIR-NEXT:    mrs x19, SVCR
-; PAIR-NEXT:    tbz w19, #0, .LBB0_2
+; PAIR-NEXT:    mrs x8, SVCR
+; PAIR-NEXT:    tbz w8, #0, .LBB0_2
 ; PAIR-NEXT:  // %bb.1:
 ; PAIR-NEXT:    smstop sm
 ; PAIR-NEXT:  .LBB0_2:
-; PAIR-NEXT:    rdvl x8, #1
-; PAIR-NEXT:    addsvl x8, x8, #-1
-; PAIR-NEXT:    cbz x8, .LBB0_4
+; PAIR-NEXT:    rdvl x9, #1
+; PAIR-NEXT:    addsvl x9, x9, #-1
+; PAIR-NEXT:    cbz x9, .LBB0_4
 ; PAIR-NEXT:  // %bb.3:
 ; PAIR-NEXT:    brk #0x1
 ; PAIR-NEXT:  .LBB0_4:
 ; PAIR-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; PAIR-NEXT:    mov x19, x8
 ; PAIR-NEXT:    bl my_func2
 ; PAIR-NEXT:    tbz w19, #0, .LBB0_6
 ; PAIR-NEXT:  // %bb.5:
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll 
b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 36539d94338a0..850aa7a63e016 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -514,24 +514,27 @@ define void @test12() "aarch64_pstate_sm_body" {
 define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test13:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Spill
-; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    addsvl x8, x8, #-1
 ; CHECK-NEXT:    cbnz x8, .LBB14_2
 ; CHECK-NEXT:  // %bb.1:
-; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    str x0, [sp, #8] // 8-byte Spill
+; CHECK-NEXT:    ldr z0, [x8] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl callee_farg_fret
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    rdvl x8, #1
@@ -540,19 +543,23 @@ define void @test13(ptr %ptr) nounwind 
"aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:  .LBB14_2:
 ; CHECK-NEXT:    brk #0x1
 ; CHECK-NEXT:  .LBB14_3:
-; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    ldr z0, [x8] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl callee_farg_fret
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    ldr z0, [sp] // 16-by...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/202007
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to