https://github.com/cdevadas updated https://github.com/llvm/llvm-project/pull/174998
>From b180da617cad7cd5ebfb0db442b9a1ad00f78a2f Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <[email protected]>
Date: Wed, 7 Jan 2026 10:48:26 +0000
Subject: [PATCH 1/2] [AMDGPU] Make AMDGPURewriteAGPRCopyMFMA aware of subreg reload

The AMDGPURewriteAGPRCopyMFMA pass is currently not subreg-aware. In
particular, the logic that optimizes spills into COPY instructions
assumes full-register reloads. This becomes problematic when the reload
instruction only partially restores a tuple register. This patch
introduces the changes necessary to make the pass subreg-aware, in
preparation for a future patch that implements subreg reloads during RA.

---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |  3 ++
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       | 10 +++++
 .../AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp      | 42 ++++++++++++++++++-
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 35b14e8b8fd30..5c35cd338feb6 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -430,6 +430,9 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
     return SubRegIndexLaneMasks[SubIdx];
   }
 
+  /// Try to find a matching subreg index for the given lane mask.
+  unsigned getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const;
+
   /// Try to find one or more subregister indexes to cover \p LaneMask.
   ///
   /// If this is possible, returns true and appends the best matching set of
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index cffb3ed1b8779..2b3924e368ccd 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -546,6 +546,16 @@ TargetRegisterInfo::getRegSizeInBits(Register Reg,
   return getRegSizeInBits(*RC);
 }
 
+unsigned
+TargetRegisterInfo::getSubRegIdxFromLaneMask(LaneBitmask LaneMask) const {
+  for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
+    if (getSubRegIndexLaneMask(Idx) == LaneMask)
+      return Idx;
+  }
+
+  return 0 /*NoSubRegister*/;
+}
+
 bool TargetRegisterInfo::getCoveringSubRegIndexes(
     const TargetRegisterClass *RC, LaneBitmask LaneMask,
     SmallVectorImpl<unsigned> &NeededIndexes) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index ffbb1c183ca9e..b015198c02e8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -112,6 +112,17 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   bool tryFoldCopiesToAGPR(Register VReg, MCRegister AssignedAGPR) const;
   bool tryFoldCopiesFromAGPR(Register VReg, MCRegister AssignedAGPR) const;
 
+  /// Derives the subregister index from a spill reload pseudo instruction by
+  /// constructing a lane mask that covers the reloaded portion and finding
+  /// the matching subregister.
+  ///
+  /// \p MI the spill reload pseudo instruction containing the offset and
+  /// spill size info.
+  /// \p Reg the original virtual register being spilled (typically a tuple
+  /// register).
+  /// \return the subregister index corresponding to the reloaded portion.
+  unsigned getSubRegFromReload(MachineInstr &MI, Register Reg) const;
+
   /// Replace spill instruction \p SpillMI which loads/stores from/to \p SpillFI
   /// with a COPY to the replacement register value \p VReg.
   void replaceSpillWithCopyToVReg(MachineInstr &SpillMI, int SpillFI,
@@ -422,6 +433,33 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::tryFoldCopiesFromAGPR(
   return MadeChange;
 }
 
+unsigned
+AMDGPURewriteAGPRCopyMFMAImpl::getSubRegFromReload(MachineInstr &MI,
+                                                   Register Reg) const {
+  unsigned NumRegs = TRI.getRegSizeInBits(*MRI.getRegClass(Reg)) / 32;
+  unsigned SubReg = 0;
+  // SubReg accesses for the tuple registers are of interest here.
+  // Note: We don't support 16-bit subreg reloads. If that assuption is
+  // changed in the future, this function should be revised.
+  if (NumRegs == 1)
+    return SubReg;
+
+  unsigned NumSpilledRegs = TII.getNumSubRegsForSpillOp(MI);
+  // Skip if the entire tuple is reloaded.
+  if (NumRegs == NumSpilledRegs)
+    return SubReg;
+
+  // Construct the covering lanes for the reloaded portion.
+  unsigned SubRegIdx =
+      TII.getNamedOperand(MI, AMDGPU::OpName::offset)->getImm() / 4;
+  // Subreg lane masks are maintained in terms of regunits and each 32-bit
+  // register consists of two regunits.
+  uint64_t Lanes = (1ULL << NumSpilledRegs * 2) - 1;
+  LaneBitmask CoveringLanes = LaneBitmask(Lanes << SubRegIdx * 2);
+  SubReg = TRI.getSubRegIdxFromLaneMask(CoveringLanes);
+  return SubReg;
+}
+
 void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
     MachineInstr &SpillMI, int SpillFI, Register VReg) const {
   const DebugLoc &DL = SpillMI.getDebugLoc();
@@ -431,9 +469,11 @@ void AMDGPURewriteAGPRCopyMFMAImpl::replaceSpillWithCopyToVReg(
     NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY), VReg)
                   .add(SpillMI.getOperand(0));
   } else {
+    // Identify the subreg if SpillMI is really a subreg load.
+    unsigned SubReg = getSubRegFromReload(SpillMI, VReg);
     NewCopy = BuildMI(MBB, SpillMI, DL, TII.get(TargetOpcode::COPY))
                   .add(SpillMI.getOperand(0))
-                  .addReg(VReg);
+                  .addReg(VReg, 0, SubReg);
   }
 
   LIS.ReplaceMachineInstrInMaps(SpillMI, *NewCopy);

>From 0ebaff5fb69b54476549043327fb6d92b83e2cfb Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <[email protected]>
Date: Mon, 12 Jan 2026 12:59:34 +0000
Subject: [PATCH 2/2] Suggestions incorporated.

---
 llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index b015198c02e8d..de329a4083ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -437,9 +437,9 @@ unsigned
 AMDGPURewriteAGPRCopyMFMAImpl::getSubRegFromReload(MachineInstr &MI,
                                                    Register Reg) const {
   unsigned NumRegs = TRI.getRegSizeInBits(*MRI.getRegClass(Reg)) / 32;
-  unsigned SubReg = 0;
+  unsigned SubReg = AMDGPU::NoSubRegister;
   // SubReg accesses for the tuple registers are of interest here.
-  // Note: We don't support 16-bit subreg reloads. If that assuption is
+  // Note: We don't support 16-bit subreg reloads. If that assumption is
   // changed in the future, this function should be revised.
   if (NumRegs == 1)
     return SubReg;
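For readers following the lane-mask arithmetic in getSubRegFromReload, here
is a minimal standalone sketch. It mirrors the patch's math rather than
calling LLVM APIs; the function name and the example values are
hypothetical:

#include <cstdint>
#include <cstdio>

// Mirror of the patch's covering-lanes computation: each 32-bit register
// contributes two regunit lanes, and the byte offset of the reload selects
// the first reloaded 32-bit lane (offset / 4).
static uint64_t coveringLanes(unsigned OffsetBytes, unsigned NumSpilledRegs) {
  unsigned FirstLane = OffsetBytes / 4;
  uint64_t Lanes = (1ULL << (NumSpilledRegs * 2)) - 1;
  return Lanes << (FirstLane * 2);
}

int main() {
  // Reloading 2 registers of a larger tuple at byte offset 8 (i.e. starting
  // at sub2) yields lanes 0xF0, which would match a sub2_sub3-style index.
  printf("0x%llx\n", (unsigned long long)coveringLanes(8, 2));
  return 0;
}

With those inputs the mask is 0xF0: four lanes (two 32-bit registers, two
regunits each) shifted past the first two registers of the tuple.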

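The matching step performed by the new
TargetRegisterInfo::getSubRegIdxFromLaneMask is then a plain linear scan
over the per-index lane masks. A sketch under the same assumptions, with a
hypothetical table standing in for the tablegen-generated
SubRegIndexLaneMasks data:

#include <cstdint>
#include <cstdio>

// Hypothetical lane masks per subregister index; index 0 is NoSubRegister.
static const uint64_t SubRegIndexLaneMasks[] = {
    0x0,  // NoSubRegister
    0x3,  // e.g. sub0      (lanes 0-1)
    0xC,  // e.g. sub1      (lanes 2-3)
    0xF0, // e.g. sub2_sub3 (lanes 4-7)
};

// Linear scan for an exact match, as in the patch; 0 means no subregister
// index covers exactly this lane mask.
static unsigned getSubRegIdxFromLaneMask(uint64_t LaneMask) {
  const unsigned E = sizeof(SubRegIndexLaneMasks) / sizeof(uint64_t);
  for (unsigned Idx = 1; Idx < E; ++Idx)
    if (SubRegIndexLaneMasks[Idx] == LaneMask)
      return Idx;
  return 0; // NoSubRegister
}

int main() {
  printf("%u\n", getSubRegIdxFromLaneMask(0xF0)); // prints 3 in this toy table
  return 0;
}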