[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2024-01-23 Thread Jun Wang via cfe-commits

jwanggit86 wrote:

The implementation has been moved to the SIMemoryLegalizer pass. See pull request 
[79236](https://github.com/llvm/llvm-project/pull/79236).

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,19 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto Builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);
+  if (IsGFX10Plus) {

jayfoad wrote:

Yes but why? On GFX10+, why would you put s_waitcnt(0) after a store or 
s_waitcnt_vscnt(0) after a load?

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jun Wang via cfe-commits


@@ -1708,6 +1710,19 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto Builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);
+  if (IsGFX10Plus) {
+Builder =
+BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+.addImm(0);
+  }
+  OldWaitcntInstr = Builder.getInstr();

jwanggit86 wrote:

Done.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jun Wang via cfe-commits


@@ -1847,6 +1862,7 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
+

jwanggit86 wrote:

Done.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jun Wang via cfe-commits

https://github.com/jwanggit86 updated 
https://github.com/llvm/llvm-project/pull/68932

>From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001
From: Jun Wang 
Date: Thu, 12 Oct 2023 16:45:59 -0500
Subject: [PATCH 01/11] [AMDGPU] Emit a waitcnt instruction after each memory
 instruction

This patch implements a new command-line option for the backend, namely,
amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0"
instruction is generated after each memory load/store instruction.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  30 ++-
 .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll  | 222 ++
 2 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..728be7c61fa2217 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt EmitForAllMemOpFlag(
+"amdgpu-waitcnt-for-all-mem-op",
+cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false));
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet ReleaseVGPRInsts;
 
+  bool insertWaitcntAfterMemOp(MachineFunction );
+
 public:
   static char ID;
 
@@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {
+for (auto It = MBB.begin(); It != MBB.end();) {
+  bool IsMemOp = It->mayLoadOrStore();
+  ++It;
+  if (IsMemOp) {
+BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+Modified = true;
+  }
+}
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   ST = ();
   TII = ST->getInstrInfo();
@@ -1819,6 +1842,12 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   MLI = ();
   PDT = ();
 
+  bool Modified = false;
+
+  if (EmitForAllMemOpFlag) {
+Modified = insertWaitcntAfterMemOp(MF);
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
 ForceEmitWaitcnt[T] = false;
@@ -1847,7 +1876,6 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
-  bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
 // Wait for any outstanding memory operations that the input registers may
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000..4580b9074ada3cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,222 @@
+; Testing the -amdgpu-waitcnt-for-all-mem-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7
+; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 
-mattr=-flat-for-global,+enable-flat-scratch 
-amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jun Wang via cfe-commits


@@ -1708,6 +1710,19 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto Builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);
+  if (IsGFX10Plus) {

jwanggit86 wrote:

S_waitcnt(0) is inserted after each mem op, both stores and loads.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-22 Thread Jay Foad via cfe-commits


@@ -1847,6 +1862,7 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
+

jayfoad wrote:

Remove this

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jun Wang via cfe-commits

https://github.com/jwanggit86 updated 
https://github.com/llvm/llvm-project/pull/68932

>From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001
From: Jun Wang 
Date: Thu, 12 Oct 2023 16:45:59 -0500
Subject: [PATCH 01/10] [AMDGPU] Emit a waitcnt instruction after each memory
 instruction

This patch implements a new command-line option for the backend, namely,
amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0"
instruction is generated after each memory load/store instruction.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  30 ++-
 .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll  | 222 ++
 2 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..728be7c61fa2217 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt EmitForAllMemOpFlag(
+"amdgpu-waitcnt-for-all-mem-op",
+cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false));
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet ReleaseVGPRInsts;
 
+  bool insertWaitcntAfterMemOp(MachineFunction );
+
 public:
   static char ID;
 
@@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {
+for (auto It = MBB.begin(); It != MBB.end();) {
+  bool IsMemOp = It->mayLoadOrStore();
+  ++It;
+  if (IsMemOp) {
+BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+Modified = true;
+  }
+}
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   ST = ();
   TII = ST->getInstrInfo();
@@ -1819,6 +1842,12 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   MLI = ();
   PDT = ();
 
+  bool Modified = false;
+
+  if (EmitForAllMemOpFlag) {
+Modified = insertWaitcntAfterMemOp(MF);
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
 ForceEmitWaitcnt[T] = false;
@@ -1847,7 +1876,6 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
-  bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
 // Wait for any outstanding memory operations that the input registers may
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000..4580b9074ada3cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,222 @@
+; Testing the -amdgpu-waitcnt-for-all-mem-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7
+; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 
-mattr=-flat-for-global,+enable-flat-scratch 
-amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jun Wang via cfe-commits


@@ -388,6 +388,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet ReleaseVGPRInsts;
 
+  // bool insertWaitcntAfterMemOp(MachineFunction );

jwanggit86 wrote:

Done.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jun Wang via cfe-commits


@@ -1708,6 +1710,13 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto builder =

jwanggit86 wrote:

Done.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jun Wang via cfe-commits


@@ -1708,6 +1710,13 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);

jwanggit86 wrote:

Done.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jun Wang via cfe-commits


@@ -0,0 +1,222 @@
+; Testing the -amdgpu-precise-memory-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7

jwanggit86 wrote:

COM: marks the line as a comment, so it is not executed as a RUN line. Some test 
cases in this file won't run if mcpu=hawaii. In the latest commit, the test file 
has been split into two.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jun Wang via cfe-commits

https://github.com/jwanggit86 updated 
https://github.com/llvm/llvm-project/pull/68932

>From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001
From: Jun Wang 
Date: Thu, 12 Oct 2023 16:45:59 -0500
Subject: [PATCH 1/9] [AMDGPU] Emit a waitcnt instruction after each memory
 instruction

This patch implements a new command-line option for the backend, namely,
amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0"
instruction is generated after each memory load/store instruction.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  30 ++-
 .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll  | 222 ++
 2 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..728be7c61fa2217 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt EmitForAllMemOpFlag(
+"amdgpu-waitcnt-for-all-mem-op",
+cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false));
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet ReleaseVGPRInsts;
 
+  bool insertWaitcntAfterMemOp(MachineFunction );
+
 public:
   static char ID;
 
@@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {
+for (auto It = MBB.begin(); It != MBB.end();) {
+  bool IsMemOp = It->mayLoadOrStore();
+  ++It;
+  if (IsMemOp) {
+BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+Modified = true;
+  }
+}
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   ST = ();
   TII = ST->getInstrInfo();
@@ -1819,6 +1842,12 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   MLI = ();
   PDT = ();
 
+  bool Modified = false;
+
+  if (EmitForAllMemOpFlag) {
+Modified = insertWaitcntAfterMemOp(MF);
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
 ForceEmitWaitcnt[T] = false;
@@ -1847,7 +1876,6 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
-  bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
 // Wait for any outstanding memory operations that the input registers may
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000..4580b9074ada3cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,222 @@
+; Testing the -amdgpu-waitcnt-for-all-mem-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7
+; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 
-mattr=-flat-for-global,+enable-flat-scratch 
-amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -0,0 +1,222 @@
+; Testing the -amdgpu-precise-memory-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7

jayfoad wrote:

What is COM: ?

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,13 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto builder =
+  BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+  .addImm(0);

jayfoad wrote:

On GFX10+ VMEM stores should have S_WAITCNT_VSCNT 0 as well as (or instead of) 
this.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-21 Thread Jay Foad via cfe-commits


@@ -1708,6 +1710,13 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
 }
 
 ++Iter;
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+  auto builder =

jayfoad wrote:

Upper case B for Builder.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-15 Thread Jun Wang via cfe-commits

jwanggit86 wrote:

> > So, while it's possible to create a combined option, using a separate 
> > option also makes sense. Do we generally try to avoid creating new 
> > command-line options?
> 
> Looking again, I see they are different and unrelated. I don't really 
> understand why we have amdgpu-waitcnt-forcezero, I'm not sure I've ever used 
> it. I always expected it to behave like this flag.

So do you still think the new code should be integrated with the existing pass, 
or should it be separate?

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-15 Thread Matt Arsenault via cfe-commits

arsenm wrote:

> So, while it's possible to create a combined option, using a separate option 
> also makes sense. Do we generally try to avoid creating new command-line 
> options?

Looking again, I see they are different and unrelated. I don't really 
understand why we have amdgpu-waitcnt-forcezero, I'm not sure I've ever used 
it. I always expected it to behave like this flag.



https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-09 Thread Matt Arsenault via cfe-commits


@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {

arsenm wrote:

Plus, I think having two separate but closely related cl::opts is confusing.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-09 Thread Matt Arsenault via cfe-commits


@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {

arsenm wrote:

I think it makes it harder to reason about the pass as a whole to have it as a 
totally separate phase

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-08 Thread Matt Arsenault via cfe-commits


@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {

arsenm wrote:

Should try to integrate with the rest of the logic instead of adding a separate 
pass over the function 

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-08 Thread Matt Arsenault via cfe-commits


@@ -52,6 +52,11 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt
+PreciseMemOpFlag("amdgpu-precise-memory-op",
+ cl::desc("Emit s_waitcnt 0 after each memory operation"),
+ cl::init(false));
+

arsenm wrote:

I think this should be fused into an enum flag with the existing waitcnt flag.

https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-08 Thread Matt Arsenault via cfe-commits

https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/68932
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-06 Thread Jun Wang via cfe-commits

https://github.com/jwanggit86 updated 
https://github.com/llvm/llvm-project/pull/68932

>From a87ba1892375ef67edb5d6f3bd537869203273a6 Mon Sep 17 00:00:00 2001
From: Jun Wang 
Date: Thu, 12 Oct 2023 16:45:59 -0500
Subject: [PATCH 1/6] [AMDGPU] Emit a waitcnt instruction after each memory
 instruction

This patch implements a new command-line option for the backend, namely,
amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0"
instruction is generated after each memory load/store instruction.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  30 ++-
 .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll  | 222 ++
 2 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..728be7c61fa2217 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt EmitForAllMemOpFlag(
+"amdgpu-waitcnt-for-all-mem-op",
+cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false));
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet ReleaseVGPRInsts;
 
+  bool insertWaitcntAfterMemOp(MachineFunction );
+
 public:
   static char ID;
 
@@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {
+for (auto It = MBB.begin(); It != MBB.end();) {
+  bool IsMemOp = It->mayLoadOrStore();
+  ++It;
+  if (IsMemOp) {
+BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+Modified = true;
+  }
+}
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   ST = ();
   TII = ST->getInstrInfo();
@@ -1819,6 +1842,12 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   MLI = ();
   PDT = ();
 
+  bool Modified = false;
+
+  if (EmitForAllMemOpFlag) {
+Modified = insertWaitcntAfterMemOp(MF);
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
 ForceEmitWaitcnt[T] = false;
@@ -1847,7 +1876,6 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
-  bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
 // Wait for any outstanding memory operations that the input registers may
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000..4580b9074ada3cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,222 @@
+; Testing the -amdgpu-waitcnt-for-all-mem-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7
+; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 
-mattr=-flat-for-global,+enable-flat-scratch 
-amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-06 Thread Jun Wang via cfe-commits

https://github.com/jwanggit86 updated 
https://github.com/llvm/llvm-project/pull/68932

>From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001
From: Jun Wang 
Date: Thu, 12 Oct 2023 16:45:59 -0500
Subject: [PATCH 1/5] [AMDGPU] Emit a waitcnt instruction after each memory
 instruction

This patch implements a new command-line option for the backend, namely,
amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0"
instruction is generated after each memory load/store instruction.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  30 ++-
 .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll  | 222 ++
 2 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..728be7c61fa2217 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EmitForAllMemOpFlag(
+"amdgpu-waitcnt-for-all-mem-op",
+cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false));
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet<MachineInstr *> ReleaseVGPRInsts;
 
+  bool insertWaitcntAfterMemOp(MachineFunction &MF);
+
 public:
   static char ID;
 
@@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {
+for (auto It = MBB.begin(); It != MBB.end();) {
+  bool IsMemOp = It->mayLoadOrStore();
+  ++It;
+  if (IsMemOp) {
+BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+Modified = true;
+  }
+}
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1819,6 +1842,12 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
+  bool Modified = false;
+
+  if (EmitForAllMemOpFlag) {
+Modified = insertWaitcntAfterMemOp(MF);
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
 ForceEmitWaitcnt[T] = false;
@@ -1847,7 +1876,6 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
-  bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
 // Wait for any outstanding memory operations that the input registers may
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000..4580b9074ada3cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,222 @@
+; Testing the -amdgpu-waitcnt-for-all-mem-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7
+; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 
-mattr=-flat-for-global,+enable-flat-scratch 
-amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:

[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)

2023-11-06 Thread Jun Wang via cfe-commits

https://github.com/jwanggit86 updated 
https://github.com/llvm/llvm-project/pull/68932

>From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001
From: Jun Wang 
Date: Thu, 12 Oct 2023 16:45:59 -0500
Subject: [PATCH 1/4] [AMDGPU] Emit a waitcnt instruction after each memory
 instruction

This patch implements a new command-line option for the backend, namely,
amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0"
instruction is generated after each memory load/store instruction.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |  30 ++-
 .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll  | 222 ++
 2 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 
b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..728be7c61fa2217 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag(
   cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0)"),
   cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EmitForAllMemOpFlag(
+"amdgpu-waitcnt-for-all-mem-op",
+cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false));
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // message.
   DenseSet<MachineInstr *> ReleaseVGPRInsts;
 
+  bool insertWaitcntAfterMemOp(MachineFunction &MF);
+
 public:
   static char ID;
 
@@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
   return HasVMemLoad && UsesVgprLoadedOutside;
 }
 
+bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) {
+  bool Modified = false;
+
+  for (auto &MBB : MF) {
+for (auto It = MBB.begin(); It != MBB.end();) {
+  bool IsMemOp = It->mayLoadOrStore();
+  ++It;
+  if (IsMemOp) {
+BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+Modified = true;
+  }
+}
+  }
+
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1819,6 +1842,12 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
   MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
+  bool Modified = false;
+
+  if (EmitForAllMemOpFlag) {
+Modified = insertWaitcntAfterMemOp(MF);
+  }
+
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
 ForceEmitWaitcnt[T] = false;
@@ -1847,7 +1876,6 @@ bool 
SIInsertWaitcnts::runOnMachineFunction(MachineFunction ) {
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
-  bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
 // Wait for any outstanding memory operations that the input registers may
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll 
b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 000..4580b9074ada3cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,222 @@
+; Testing the -amdgpu-waitcnt-for-all-mem-op option
+; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7
+; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 
-mattr=-flat-for-global,+enable-flat-scratch 
-amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A:  ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A:  .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+  ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT: