[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-08 Thread Gang Chen via llvm-branch-commits


@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 
-amdgpu-enable-machine-level-inliner < %s | FileCheck %s
+
+declare !callback !0 i32 @llvm.amdgcn.call.whole.wave.i32.p0(ptr, ...)
+
+define amdgpu_cs void @inline_simple_wwf(i32 %input, ptr addrspace(1) %output) 
{
+; CHECK-LABEL: inline_simple_wwf:
+; CHECK:   ; %bb.0:
+; CHECK-NEXT:s_mov_b32 s1, simple_whole_wave_func@abs32@hi
+; CHECK-NEXT:s_mov_b32 s0, simple_whole_wave_func@abs32@lo
+; CHECK-NEXT:s_mov_b32 s32, 0
+; CHECK-NEXT:v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
+; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:global_store_b32 v[40:41], v0, off
+; CHECK-NEXT:s_endpgm
+  %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr 
@simple_whole_wave_func, i32 %input)
+  store i32 %result, ptr addrspace(1) %output
+  ret void
+}
+
+define amdgpu_gfx_whole_wave i32 @simple_whole_wave_func(i1 %active, i32 %x) {
+  %result = add i32 %x, 42
+  ret i32 %result
+}
+
+define amdgpu_gfx_whole_wave i32 @another_whole_wave_func(i1 %active, i32 %a, 
i32 %b) {
+  %sum = add i32 %a, %b
+  %result = mul i32 %sum, 2
+  ret i32 %result
+}
+
+define amdgpu_cs void @inline_multiple_wwf(i32 %x, i32 %y, ptr addrspace(1) 
%out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: inline_multiple_wwf:
+; CHECK:   ; %bb.0:
+; CHECK-NEXT:s_mov_b32 s1, simple_whole_wave_func@abs32@hi
+; CHECK-NEXT:s_mov_b32 s0, simple_whole_wave_func@abs32@lo
+; CHECK-NEXT:s_mov_b32 s32, 0
+; CHECK-NEXT:v_dual_mov_b32 v41, v5 :: v_dual_mov_b32 v44, v0
+; CHECK-NEXT:v_dual_mov_b32 v40, v4 :: v_dual_mov_b32 v43, v3
+; CHECK-NEXT:v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v45, v1
+; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:v_dual_mov_b32 v46, v0 :: v_dual_mov_b32 v1, v45
+; CHECK-NEXT:v_mov_b32_e32 v0, v44
+; CHECK-NEXT:s_mov_b32 s1, another_whole_wave_func@abs32@hi
+; CHECK-NEXT:s_mov_b32 s0, another_whole_wave_func@abs32@lo
+; CHECK-NEXT:s_wait_alu 0xfffe
+; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:global_store_b32 v[42:43], v46, off
+; CHECK-NEXT:global_store_b32 v[40:41], v0, off
+; CHECK-NEXT:s_endpgm
+  %result1 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr 
@simple_whole_wave_func, i32 %x)
+  %result2 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr 
@another_whole_wave_func, i32 %x, i32 %y)
+  store i32 %result1, ptr addrspace(1) %out1
+  store i32 %result2, ptr addrspace(1) %out2
+  ret void
+}
+

cmc-rep wrote:

Could we also add a test where one WWF is called by multiple kernels? I
believe it should work, but having a test like that would still be helpful.

https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-08 Thread Gang Chen via llvm-branch-commits


@@ -0,0 +1,262 @@
+//===-- AMDGPUMachineLevelInliner.cpp - AMDGPU Machine Level Inliner ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "AMDGPUMachineLevelInliner.h"
+#include "AMDGPU.h"
+#include "AMDGPUMachineModuleInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassTimingInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-machine-level-inliner"
+
+namespace {
+class AMDGPUInliningPassManager : public FPPassManager {
+public:
+  static char ID;
+
+  explicit AMDGPUInliningPassManager() : FPPassManager(ID) {}
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool doFinalization(Module &M) override;
+
+  StringRef getPassName() const override {
+return "AMDGPU Inlining Pass Manager";
+  }
+};
+
+/// AMDGPUInliningAnchor - A machine function pass that serves as an anchor for
+/// setting up the AMDGPU inlining pass manager infrastructure. It makes sure
+/// the inliner is run via an AMDGPUInliningPassManager. It can be run well in
+/// advance of the inliner as long as there are only FunctionPasses in between.
+class AMDGPUInliningAnchor : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification
+
+  AMDGPUInliningAnchor() : MachineFunctionPass(ID) {}
+
+  // We don't really need to process any functions here.
+  bool runOnMachineFunction(MachineFunction &MF) override { return false; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  StringRef getPassName() const override;
+
+  /// Prepare the pass manager stack for the inliner. This will push an
+  /// `AMDGPUInliningPassManager` onto the stack.
+  void preparePassManager(PMStack &Stack) override;
+};
+
+} // end anonymous namespace.
+
+// Pass identification
+char AMDGPUMachineLevelInliner::ID = 0;
+char AMDGPUInliningPassManager::ID = 0;
+char AMDGPUInliningAnchor::ID = 0;
+
+char &llvm::AMDGPUMachineLevelInlinerID = AMDGPUMachineLevelInliner::ID;
+char &llvm::AMDGPUInliningAnchorID = AMDGPUInliningAnchor::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineLevelInliner, DEBUG_TYPE,
+  "AMDGPU Machine Level Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUInliningAnchor)
+INITIALIZE_PASS_END(AMDGPUMachineLevelInliner, DEBUG_TYPE,
+"AMDGPU Machine Level Inliner", false, false)
+
+INITIALIZE_PASS_BEGIN(AMDGPUInliningAnchor, "amdgpu-inlining-anchor",
+  "AMDGPU Inlining Anchor", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUInliningAnchor, "amdgpu-inlining-anchor",
+"AMDGPU Inlining Anchor", false, true)
+
+AMDGPUMachineLevelInliner::AMDGPUMachineLevelInliner()
+: MachineFunctionPass(ID) {
+  initializeAMDGPUMachineLevelInlinerPass(*PassRegistry::getPassRegistry());
+}
+
+void AMDGPUMachineLevelInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired();
+  AU.addRequired();
+  AU.addPreserved();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) {
+  MachineModuleInfo &MMI = 
getAnalysis().getMMI();
+
+  Function &F = MF.getFunction();
+  if (shouldInlineCallsTo(F)) {

cmc-rep wrote:

Suggestion: perhaps we should name this "mayInlineCallsTo(F)"? I am thinking
about a broader use of the InlinePassManager.

https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-08 Thread Gang Chen via llvm-branch-commits


@@ -0,0 +1,262 @@
+//===-- AMDGPUMachineLevelInliner.cpp - AMDGPU Machine Level Inliner ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--===//
+
+#include "AMDGPUMachineLevelInliner.h"
+#include "AMDGPU.h"
+#include "AMDGPUMachineModuleInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassTimingInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-machine-level-inliner"
+
+namespace {
+class AMDGPUInliningPassManager : public FPPassManager {
+public:
+  static char ID;
+
+  explicit AMDGPUInliningPassManager() : FPPassManager(ID) {}
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool doFinalization(Module &M) override;
+
+  StringRef getPassName() const override {
+return "AMDGPU Inlining Pass Manager";
+  }
+};
+
+/// AMDGPUInliningAnchor - A machine function pass that serves as an anchor for
+/// setting up the AMDGPU inlining pass manager infrastructure. It makes sure
+/// the inliner is run via an AMDGPUInliningPassManager. It can be run well in
+/// advance of the inliner as long as there are only FunctionPasses in between.
+class AMDGPUInliningAnchor : public MachineFunctionPass {
+public:
+  static char ID; // Pass identification
+
+  AMDGPUInliningAnchor() : MachineFunctionPass(ID) {}
+
+  // We don't really need to process any functions here.
+  bool runOnMachineFunction(MachineFunction &MF) override { return false; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  StringRef getPassName() const override;
+
+  /// Prepare the pass manager stack for the inliner. This will push an
+  /// `AMDGPUInliningPassManager` onto the stack.
+  void preparePassManager(PMStack &Stack) override;
+};
+
+} // end anonymous namespace.
+
+// Pass identification
+char AMDGPUMachineLevelInliner::ID = 0;
+char AMDGPUInliningPassManager::ID = 0;
+char AMDGPUInliningAnchor::ID = 0;
+
+char &llvm::AMDGPUMachineLevelInlinerID = AMDGPUMachineLevelInliner::ID;
+char &llvm::AMDGPUInliningAnchorID = AMDGPUInliningAnchor::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineLevelInliner, DEBUG_TYPE,
+  "AMDGPU Machine Level Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUInliningAnchor)
+INITIALIZE_PASS_END(AMDGPUMachineLevelInliner, DEBUG_TYPE,
+"AMDGPU Machine Level Inliner", false, false)
+
+INITIALIZE_PASS_BEGIN(AMDGPUInliningAnchor, "amdgpu-inlining-anchor",
+  "AMDGPU Inlining Anchor", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineModuleInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUInliningAnchor, "amdgpu-inlining-anchor",
+"AMDGPU Inlining Anchor", false, true)
+
+AMDGPUMachineLevelInliner::AMDGPUMachineLevelInliner()
+: MachineFunctionPass(ID) {
+  initializeAMDGPUMachineLevelInlinerPass(*PassRegistry::getPassRegistry());
+}
+
+void AMDGPUMachineLevelInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired();
+  AU.addRequired();
+  AU.addPreserved();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool AMDGPUMachineLevelInliner::runOnMachineFunction(MachineFunction &MF) {
+  MachineModuleInfo &MMI = 
getAnalysis().getMMI();
+
+  Function &F = MF.getFunction();
+  if (shouldInlineCallsTo(F)) {
+// Mark the function as machine-inlined in AMDGPUMachineModuleInfo. This
+// tells the inlining pass manager to stop processing it.
+auto &AMMMI = MMI.getObjFileInfo();
+AMMMI.addMachineInlinedFunction(F);

cmc-rep wrote:

Maybe the name could be "addMachineInliningCandidate(F)"? Again, I am thinking
about a potentially broader use of the InlinePassManager.

https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-08 Thread Gang Chen via llvm-branch-commits

https://github.com/cmc-rep edited 
https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-08 Thread Gang Chen via llvm-branch-commits

https://github.com/cmc-rep edited 
https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-08 Thread Gang Chen via llvm-branch-commits

https://github.com/cmc-rep edited 
https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Insert inliner anchor earlier (PR #169478)

2025-12-09 Thread Gang Chen via llvm-branch-commits

https://github.com/cmc-rep approved this pull request.


https://github.com/llvm/llvm-project/pull/169478
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Update machine frame info during inlining (PR #169477)

2025-12-09 Thread Gang Chen via llvm-branch-commits

cmc-rep wrote:

This one seems to be a small incremental change. Looks good to me.

https://github.com/llvm/llvm-project/pull/169477
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Insert inliner anchor earlier (PR #169478)

2025-12-09 Thread Gang Chen via llvm-branch-commits

cmc-rep wrote:

This one also looks good to me.

https://github.com/llvm/llvm-project/pull/169478
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [AMDGPU] Add machine-level inliner pass (PR #169476)

2025-12-12 Thread Gang Chen via llvm-branch-commits


@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 
-amdgpu-enable-machine-level-inliner < %s | FileCheck %s
+
+declare !callback !0 i32 @llvm.amdgcn.call.whole.wave.i32.p0(ptr, ...)
+
+define amdgpu_cs void @inline_simple_wwf(i32 %input, ptr addrspace(1) %output) 
{
+; CHECK-LABEL: inline_simple_wwf:
+; CHECK:   ; %bb.0:
+; CHECK-NEXT:s_mov_b32 s1, simple_whole_wave_func@abs32@hi
+; CHECK-NEXT:s_mov_b32 s0, simple_whole_wave_func@abs32@lo
+; CHECK-NEXT:s_mov_b32 s32, 0
+; CHECK-NEXT:v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
+; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:global_store_b32 v[40:41], v0, off
+; CHECK-NEXT:s_endpgm
+  %result = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr 
@simple_whole_wave_func, i32 %input)
+  store i32 %result, ptr addrspace(1) %output
+  ret void
+}
+
+define amdgpu_gfx_whole_wave i32 @simple_whole_wave_func(i1 %active, i32 %x) {
+  %result = add i32 %x, 42
+  ret i32 %result
+}
+
+define amdgpu_gfx_whole_wave i32 @another_whole_wave_func(i1 %active, i32 %a, 
i32 %b) {
+  %sum = add i32 %a, %b
+  %result = mul i32 %sum, 2
+  ret i32 %result
+}
+
+define amdgpu_cs void @inline_multiple_wwf(i32 %x, i32 %y, ptr addrspace(1) 
%out1, ptr addrspace(1) %out2) {
+; CHECK-LABEL: inline_multiple_wwf:
+; CHECK:   ; %bb.0:
+; CHECK-NEXT:s_mov_b32 s1, simple_whole_wave_func@abs32@hi
+; CHECK-NEXT:s_mov_b32 s0, simple_whole_wave_func@abs32@lo
+; CHECK-NEXT:s_mov_b32 s32, 0
+; CHECK-NEXT:v_dual_mov_b32 v41, v5 :: v_dual_mov_b32 v44, v0
+; CHECK-NEXT:v_dual_mov_b32 v40, v4 :: v_dual_mov_b32 v43, v3
+; CHECK-NEXT:v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v45, v1
+; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:v_dual_mov_b32 v46, v0 :: v_dual_mov_b32 v1, v45
+; CHECK-NEXT:v_mov_b32_e32 v0, v44
+; CHECK-NEXT:s_mov_b32 s1, another_whole_wave_func@abs32@hi
+; CHECK-NEXT:s_mov_b32 s0, another_whole_wave_func@abs32@lo
+; CHECK-NEXT:s_wait_alu 0xfffe
+; CHECK-NEXT:s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT:global_store_b32 v[42:43], v46, off
+; CHECK-NEXT:global_store_b32 v[40:41], v0, off
+; CHECK-NEXT:s_endpgm
+  %result1 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr 
@simple_whole_wave_func, i32 %x)
+  %result2 = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr 
@another_whole_wave_func, i32 %x, i32 %y)
+  store i32 %result1, ptr addrspace(1) %out1
+  store i32 %result2, ptr addrspace(1) %out2
+  ret void
+}
+

cmc-rep wrote:

Sounds good

https://github.com/llvm/llvm-project/pull/169476
___
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits