https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/174096
>From ac372303f262d4b8e6267c13c8525594ae52d47c Mon Sep 17 00:00:00 2001 From: vikhegde <[email protected]> Date: Wed, 31 Dec 2025 19:38:57 +0530 Subject: [PATCH] [AMDGPU][NPM] Complete fast regalloc pipeline --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38 +++++++++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 2 +- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d25b22b2b96dc..f8a83e72bc3ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -154,8 +154,10 @@ class AMDGPUCodeGenPassBuilder void addPostRegAlloc(PassManagerWrapper &PMW) const; void addPreEmitPass(PassManagerWrapper &PMWM) const; void addPreEmitRegAlloc(PassManagerWrapper &PMW) const; + Error addRegAssignmentFast(PassManagerWrapper &PMW) const; Error addRegAssignmentOptimized(PassManagerWrapper &PMW) const; void addPreRegAlloc(PassManagerWrapper &PMW) const; + Error addFastRegAlloc(PassManagerWrapper &PMW) const; void addOptimizedRegAlloc(PassManagerWrapper &PMW) const; void addPreSched2(PassManagerWrapper &PMW) const; void addPostBBSections(PassManagerWrapper &PMW) const; @@ -2311,6 +2313,42 @@ void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( addMachineFunctionPass(SIShrinkInstructionsPass(), PMW); } +Error AMDGPUCodeGenPassBuilder::addFastRegAlloc(PassManagerWrapper &PMW) const { + insertPass<PHIEliminationPass>(SILowerControlFlowPass()); + + insertPass<TwoAddressInstructionPass>(SIWholeQuadModePass()); + + return Base::addFastRegAlloc(PMW); +} + +Error AMDGPUCodeGenPassBuilder::addRegAssignmentFast( + PassManagerWrapper &PMW) const { + // TODO: handle default regalloc override error (with regalloc-npm) + + addMachineFunctionPass(GCNPreRALongBranchRegPass(), PMW); + + addMachineFunctionPass(RegAllocFastPass({onlyAllocateSGPRs, "sgpr", false}), + PMW); + + // Equivalent of PEI for SGPRs. + addMachineFunctionPass(SILowerSGPRSpillsPass(), PMW); + + // To Allocate wwm registers used in whole quad mode operations (for shaders). + addMachineFunctionPass(SIPreAllocateWWMRegsPass(), PMW); + + // For allocating other wwm register operands. + addMachineFunctionPass(RegAllocFastPass({onlyAllocateWWMRegs, "wwm", false}), + PMW); + + addMachineFunctionPass(SILowerWWMCopiesPass(), PMW); + addMachineFunctionPass(AMDGPUReserveWWMRegsPass(), PMW); + + // For allocating per-thread VGPRs. + addMachineFunctionPass(RegAllocFastPass({onlyAllocateVGPRs, "vgpr"}), PMW); + + return Error::success(); +} + void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( PassManagerWrapper &PMW) const { if (EnableDCEInRA) diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 4b7d0ce0d5787..3f896ef24a773 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -9,7 +9,7 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-ir-insts<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim)),require<amdgpu-argument-usage>,cgscc(function(amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,si-memory-legalizer,si-insert-waitcnts,si-mode-register,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,amdgpu-preload-kern-arg-prolog,stack-frame-layout,verify),free-machine-function)) +; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-ir-insts<O0>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim)),require<amdgpu-argument-usage>,cgscc(function(amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,si-lower-control-flow,two-address-instruction,si-wqm,amdgpu-pre-ra-long-branch-reg,regallocfast<filter=sgpr;no-clear-vregs>,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,regallocfast<filter=wwm;no-clear-vregs>,si-lower-wwm-copies,amdgpu-reserve-wwm-regs,regallocfast<filter=vgpr>,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,si-memory-legalizer,si-insert-waitcnts,si-mode-register,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,amdgpu-preload-kern-arg-prolog,stack-frame-layout,verify),free-machine-function)) ; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,require<runtime-libcall-info>,pre-isel-intrinsic-lowering,function(expand-ir-insts<O2>),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt,amdgpu-uniform-intrinsic-combine),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-lower-exec-sync,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim)),require<amdgpu-argument-usage>,cgscc(function(flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,amdgpu-rewrite-agpr-copy-mfma,virt-reg-rewriter,amdgpu-mark-last-scratch-load,stack-slot-coloring,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,machine-latecleanup,branch-folder,tailduplication,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-mode-register,si-insert-hard-clauses,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,amdgpu-preload-kern-arg-prolog,stack-frame-layout,verify),free-machine-function)) _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
