https://github.com/jwanggit86 closed
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
jayfoad wrote:
No further comments.
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
jwanggit86 wrote:
@jayfoad Do you have any more comments?
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
jwanggit86 wrote:
Added a testcase that has flat_atomic_swap, which is an atomic without return.
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
jayfoad wrote:
Can you add at least one test for a VMEM (flat or scratch or global or buffer
or image) atomic without return? That should use vscnt on GFX10.
Apart from that the SIInsertWaitcnts.cpp and tests look good to me. I have not
reviewed the clang parts but it looks like @Pierre-vh
@@ -0,0 +1,1406 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a
@@ -0,0 +1,1406 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a
@@ -2594,12 +2594,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() ==
jwanggit86 wrote:
@jayfoad @arsenm Any other comments?
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -0,0 +1,1413 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a
@@ -0,0 +1,1413 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a
@@ -0,0 +1,1413 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a
@@ -0,0 +1,1413 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+ AMDGPU::Waitcnt Wait;
+ if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0,
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+ AMDGPU::Waitcnt Wait;
+ if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0,
@@ -0,0 +1,618 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010
@@ -0,0 +1,618 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010
https://github.com/arsenm edited https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+ AMDGPU::Waitcnt Wait;
+ if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0,
https://github.com/jwanggit86 edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+ AMDGPU::Waitcnt Wait;
+ if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0,
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled()) {
+ AMDGPU::Waitcnt Wait;
+ if (WCG == )
+Wait = AMDGPU::Waitcnt(0, 0, 0, 0);
jwanggit86 wrote:
The
@@ -0,0 +1,577 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010
@@ -0,0 +1,577 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010
@@ -0,0 +1,577 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s |
FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
+ AMDGPU::Waitcnt Wait;
+ if (ST->hasExtendedWaitCounts())
+Wait = AMDGPU::Waitcnt(0, 0, 0,
https://github.com/arsenm requested changes to this pull request.
Outstanding comments
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled()) {
+ AMDGPU::Waitcnt Wait;
+ if (WCG == )
+Wait = AMDGPU::Waitcnt(0, 0, 0, 0);
Pierre-vh wrote:
I was
@@ -2326,6 +2326,20 @@ bool
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction ,
}
#endif
+if (ST->isPreciseMemoryEnabled()) {
+ AMDGPU::Waitcnt Wait;
+ if (WCG == )
Pierre-vh wrote:
Use `ST->hasExtendedWaitCounts()` instead of
@@ -2594,12 +2594,10 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() ==
jwanggit86 wrote:
@jayfoad After trying the patch you provided above, it appears that this
feature can indeed be done in SIInsertWaitcnt instead of SIMemoryLegalizer.
Code has been updated accordingly. Pls take a look.
https://github.com/llvm/llvm-project/pull/79236
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
return Changed;
}
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction ) {
+ const GCNSubtarget = MF.getSubtarget();
+ const SIInstrInfo *TII =
t-tye wrote:
I am not clear why new functions need to be added for this, as I think there
are existing functions that already do this.
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://github.com/jayfoad edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -355,6 +356,18 @@ class SICacheControl {
MachineBasicBlock::iterator ) const {
return false;
}
+
+public:
+ // The following is for supporting precise memory mode. When the feature
+ // precise-memory is enabled, an s_waitcnt
@@ -2378,6 +2409,215 @@ bool
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator ) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr = *MI;
+ AMDGPU::Waitcnt
@@ -2378,6 +2409,215 @@ bool
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx6CacheControl::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator ) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr = *MI;
+ AMDGPU::Waitcnt
https://github.com/jayfoad requested changes to this pull request.
I've added _some_ inline comments, but really I don't want to spend the time to
review this properly (or maintain it, or extend it for new architectures in
future). All this logic already exists in SIInsertWaitcnts. Duplicating
https://github.com/Pierre-vh approved this pull request.
LGTM, but wait for @t-tye or @jayfoad to approve as well
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
jwanggit86 wrote:
@Pierre-vh Any further comments?
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
Thanks! This works. Code has been updated. Pls take a look.
https://github.com/jwanggit86 updated
https://github.com/llvm/llvm-project/pull/79236
>From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 23 Jan 2024 19:19:00 -0600
Subject: [PATCH 1/7] [AMDGPU] Emit a waitcnt instruction after each memory
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
Pierre-vh wrote:
Just remove `m_amdgpu_Features_Group` from your option's `SimpleMFlag`, follow
the same pattern as
https://github.com/jwanggit86 edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
A possible fix is to check `hasFlag` after calling `handleTargetFeaturesGroup`,
and if the check is
https://github.com/jwanggit86 edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
@Pierre-vh With the suggested change, the func `getAMDGPUTargetFeatures` looks
something like the
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
Pierre-vh wrote:
It's only called once per run by the driver, yes.
We already do this for wavefrontsize64, and pretty much
https://github.com/jwanggit86 edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
The function `hasFlag()` actually appears to be pretty expensive, and it's
going to be called every
@@ -355,6 +356,18 @@ class SICacheControl {
MachineBasicBlock::iterator ) const {
return false;
}
+
+public:
+ // The following is for supporting precise memory mode. When the option
+ // amdgpu-precise-memory is enabled, an s_waitcnt
jwanggit86 wrote:
> Did you try to move this to SIInsertWaitCnt, as suggested?
> Did you try to move this to SIInsertWaitCnt, as suggested?
Pls see my reply on Feb 15, which is copy-pasted below.
Regarding the question about SIInsertWaitcnt, initially the code was indeed put
there (see PR
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
Pierre-vh wrote:
The extra overhead is just 3 lines in `clang/lib/Driver/ToolChains/AMDGPU.cpp`,
it's negligible.
We
@@ -355,6 +356,18 @@ class SICacheControl {
MachineBasicBlock::iterator ) const {
return false;
}
+
+public:
+ // The following is for supporting precise memory mode. When the option
+ // amdgpu-precise-memory is enabled, an s_waitcnt
https://github.com/Pierre-vh edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
https://github.com/Pierre-vh requested changes to this pull request.
Did you try to move this to SIInsertWaitCnt, as suggested?
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://github.com/jwanggit86 edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -2378,6 +2456,221 @@ bool
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator ) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr = *MI;
+
@@ -2378,6 +2456,221 @@ bool
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator ) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr = *MI;
+
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
https://github.com/jwanggit86 updated
https://github.com/llvm/llvm-project/pull/79236
>From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 23 Jan 2024 19:19:00 -0600
Subject: [PATCH 1/6] [AMDGPU] Emit a waitcnt instruction after each memory
github-actions[bot] wrote:
:warning: C/C++ code formatter, clang-format found issues in your code.
:warning:
You can test this locally with the following command:
``bash
git-clang-format --diff 7fc25928233c133a4af1dadf0e060fb5d42ebd4e
bc7d09dedd199eb2ae739d625183d9370cac6436 --
https://github.com/jwanggit86 updated
https://github.com/llvm/llvm-project/pull/79236
>From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 23 Jan 2024 19:19:00 -0600
Subject: [PATCH 1/5] [AMDGPU] Emit a waitcnt instruction after each memory
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
Thanks for the suggestion. It looks like it would work. However, I'm not sure the
extra overhead is worth it
@@ -2378,6 +2456,221 @@ bool
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator ) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr = *MI;
+
@@ -2378,6 +2456,221 @@ bool
SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx6CacheControl ::handleNonAtomicForPreciseMemory(
+MachineBasicBlock::iterator ) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr = *MI;
+
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
@@ -603,14 +626,69 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile,
bool IsNonTemporal) const
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
Pierre-vh wrote:
I think you just need to add something like this in `AMDGPU.cpp` in
`getAMDGPUTargetFeatures`
```
if
https://github.com/Pierre-vh commented:
I also agree with Jay, can't this go in InsertWaitCnt? Why does it have to go
in SIMemoryLegalizer instead?
If it has to stay here, fine, but is it possible to merge some code with
SIInsertWaitCnt in a common helper somewhere?
https://github.com/Pierre-vh edited
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
https://github.com/jwanggit86 updated
https://github.com/llvm/llvm-project/pull/79236
>From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 23 Jan 2024 19:19:00 -0600
Subject: [PATCH 1/4] [AMDGPU] Emit a waitcnt instruction after each memory
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
+protected:
+ const GCNSubtarget
+ const SIInstrInfo *TII = nullptr;
+
+ IsaVersion
@@ -0,0 +1,362 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op < %s
| FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s
| FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
As it is, we have a clang command-line option "-mamdgpu-precise-memory-op".
When specified,
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
jwanggit86 wrote:
Merged with SICacheControl.
https://github.com/jwanggit86 updated
https://github.com/llvm/llvm-project/pull/79236
>From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001
From: Jun Wang
Date: Tue, 23 Jan 2024 19:19:00 -0600
Subject: [PATCH 1/3] [AMDGPU] Emit a waitcnt instruction after each memory
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
arsenm wrote:
The subtarget feature prefix should be removed. The subtarget feature name is
not the user facing component
@@ -0,0 +1,199 @@
+; Testing the -amdgpu-precise-memory-op option
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
+protected:
+ const GCNSubtarget
+ const SIInstrInfo *TII = nullptr;
+
+ IsaVersion
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
jwanggit86 wrote:
Ok, will merge with CacheControl.
jayfoad wrote:
> This logic would need updating again for GFX12. It seems like it's
> duplicating a lot of knowledge which is already implemented in
> SIInsertWaitcnts.
Just to demonstrate, you could implement this feature in SIInsertWaitcnts for
**all** supported architectures with
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
+protected:
+ const GCNSubtarget
+ const SIInstrInfo *TII = nullptr;
+
+ IsaVersion
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
t-tye wrote:
My initial thought had been that this would be part of
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
jwanggit86 wrote:
The name was the result of some discussions last year. I've forwarded you the
email.
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
+protected:
+ const GCNSubtarget
+ const SIInstrInfo *TII = nullptr;
+
+ IsaVersion
jwanggit86 wrote:
@t-tye Code has been updated based on your feedback. Pls take a look.
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -0,0 +1,199 @@
+; Testing the -amdgpu-precise-memory-op option
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op
-verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
return Changed;
}
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction ) {
+ const GCNSubtarget = MF.getSubtarget();
+ const SIInstrInfo *TII =
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
return Changed;
}
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction ) {
arsenm wrote:
can you just make this happen as a consequence of the
@@ -17,13 +17,16 @@
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
return Changed;
}
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction ) {
+ const GCNSubtarget = MF.getSubtarget();
+ const SIInstrInfo *TII =
@@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo ,
MachineBasicBlock::iterator );
+ bool GFX9InsertWaitcntForPreciseMem(MachineFunction );
@@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo ,
MachineBasicBlock::iterator );
+ bool GFX9InsertWaitcntForPreciseMem(MachineFunction );
t-tye
https://github.com/t-tye requested changes to this pull request.
https://github.com/llvm/llvm-project/pull/79236
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const
SIMemOpInfo ,
return Changed;
}
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction ) {
+ const GCNSubtarget = MF.getSubtarget();
+ const SIInstrInfo *TII =
1 - 100 of 102 matches
Mail list logo