[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
jwanggit86 wrote: Implementation is moved to SIMemoryLegalizer pass. See pull req [79236](https://github.com/llvm/llvm-project/pull/79236). https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { jayfoad wrote: Yes but why? On GFX10+, why would you put s_waitcnt(0) after a store or s_waitcnt_vscnt(0) after a load? https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { +Builder = +BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); + } + OldWaitcntInstr = Builder.getInstr(); jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1847,6 +1862,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); + jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 01/11] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction ); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction ) { + bool Modified = false; + + for (auto : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { ST = (); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { MLI = (); PDT = (); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { jwanggit86 wrote: S_waitcnt(0) is inserted after each mem op, both stores and loads. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1847,6 +1862,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); + jayfoad wrote: Remove this https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 01/10] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction ); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction ) { + bool Modified = false; + + for (auto : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { ST = (); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { MLI = (); PDT = (); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -388,6 +388,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; + // bool insertWaitcntAfterMemOp(MachineFunction &MF); jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -0,0 +1,222 @@ +; Testing the -amdgpu-precise-memory-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 jwanggit86 wrote: Comment. Some testcases in this file won't run if mcpu=hawaii. In the latest commit, the test file has been split into 2. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/9] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction ); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction ) { + bool Modified = false; + + for (auto : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { ST = (); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { MLI = (); PDT = (); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -0,0 +1,222 @@ +; Testing the -amdgpu-precise-memory-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 jayfoad wrote: What is COM: ? https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); jayfoad wrote: On GFX10+ VMEM stores should have S_WAITCNT_VSCNT 0 as well as (or instead of) this. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = jayfoad wrote: Upper case B for Builder. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
jwanggit86 wrote: > > So, while it's possible to create a combined option, using a separate > > option also makes sense. Do we generally try to avoid creating new > > command-line options? > > Looking again, I see they are different and unrelated. I don't really > understand why we have amdgpu-waitcnt-forcezero, I'm not sure I've ever used > it. I always expected it to behave like this flag. So do you still think the new code should be integrated with the existing pass, or should it be separate? https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
arsenm wrote: > So, while it's possible to create a combined option, using a separate option > also makes sense. Do we generally try to avoid creating new command-line > options? Looking again, I see they are different and unrelated. I don't really understand why we have amdgpu-waitcnt-forcezero, I'm not sure I've ever used it. I always expected it to behave like this flag. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { arsenm wrote: Plus I think the two separate, but closely related cl::opts is confusing https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { arsenm wrote: I think it makes it harder to reason about the pass as a whole to have it as a totally separate phase https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { arsenm wrote: Should try to integrate with the rest of the logic instead of adding a separate pass over the function https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -52,6 +52,11 @@ static cl::opt<bool> ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt<bool> +PreciseMemOpFlag("amdgpu-precise-memory-op", + cl::desc("Emit s_waitcnt 0 after each memory operation"), + cl::init(false)); + arsenm wrote: I think this should be fused into an enum flag with the existing waitcnt flag. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From a87ba1892375ef67edb5d6f3bd537869203273a6 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/6] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction ); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction ) { + bool Modified = false; + + for (auto : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { ST = (); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { MLI = (); PDT = (); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/5] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction ); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction ) { + bool Modified = false; + + for (auto : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { ST = (); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { MLI = (); PDT = (); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/4] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction ); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction ) { + bool Modified = false; + + for (auto : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { ST = (); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { MLI = (); PDT = (); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT: