When MI300 support was initially added, the cache operation emitted
before atomics was wrong: it was 'buffer_inv sc1' but should have been
buffer_wbl2 (wbl2 = write back L2). With this patch in place, most test
cases work on MI300A :-)

Without this change, there were several multi-teams issues.

MI300A testing shows: larger programs now work :-)

OK for mainline?

* * *

In libgomp testing, I see the following failures:

FAIL: libgomp.c/../libgomp.c-c++-common/declare-target-indirect-2.c execution 
test
FAIL: libgomp.c++/../libgomp.c-c++-common/declare-target-indirect-2.c execution 
test
FAIL: libgomp.fortran/declare-target-indirect-2.f90   -O…  execution tests

→ PR114445, I presume

FAIL: libgomp.c/interop-hsa.c execution test
FAIL: libgomp.c/omp_alloc-3.c execution test
FAIL: libgomp.c/target-52.c execution test
FAIL: libgomp.c/target-53.c execution test
FAIL: libgomp.c/target-54.c output pattern test
FAIL: libgomp.c/target-49.c output pattern test
FAIL: libgomp.c++/target-has-device-addr-2.C execution test
FAIL: libgomp.c++/target-has-device-addr-4.C execution test
FAIL: libgomp.c++/target-has-device-addr-5.C execution test
FAIL: libgomp.c++/target-has-device-addr-6.C execution test
FAIL: libgomp.c++/target-has-device-addr-8.C execution test
FAIL: libgomp.c++/target-has-device-addr-9.C execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/deep-copy-10.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O{0,2}  
execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/vprop.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O2  
(test for excess errors)
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/static-variable-1.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O{0,2}  
execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O{0,2}  
execution test
FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/deep-copy-10.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O{0,2}  
execution test
FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O{0,2}  
execution test
FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/static-variable-1.c 
-DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O{0,2}  
execution test
FAIL: libgomp.oacc-c++/pr96835-1.C -DACC_DEVICE_TYPE_radeon=1 
-DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa  -O2  (internal compiler error: 
verify_gimple failed)

→ To be checked - some are known issues, others seem to be true
issues.

* * *

Tobias

PS: I think we eventually have to revisit the atomics/scope topic.
In particular, we don't support system-global atomics properly.
(In OpenMP: 'memscope(all)'; the default is 'memscope(device)';
additionally, 'memscope(cgroup)' exists.)
Going over all of them and checking them would not do any harm, either.
gcn: Fix CDNA3 atomics' buffer invalidation

For device (agent) scope atomics - as needed when there is more than one team -
a buffer_wbl2 followed by s_waitcnt is required. During the initial porting,
the pre-atomic instruction was accidentally replaced by buffer_inv sc1, which is
not quite the right instruction.

gcc/ChangeLog:

	* config/gcn/gcn.md (atomic_load, atomic_store, atomic_exchange):
	Fix CDNA3 L2 cache write-back before atomic instructions.

diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 9172db08b2e..a8ca16cf167 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -2174,7 +2174,7 @@ (define_insn "atomic_load<mode>"
 		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\t0\;buffer_inv sc1"
 		    : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
@@ -2186,7 +2186,7 @@ (define_insn "atomic_load<mode>"
 		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
 		    : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2233,7 +2233,7 @@ (define_insn "atomic_store<mode>"
 		    : TARGET_WBINVL1_CACHE
 		    ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1"
 		    : "error: cache architectire unspecified");
 	  case 2:
 	    return (TARGET_GLn_CACHE
@@ -2241,7 +2241,7 @@ (define_insn "atomic_store<mode>"
 		    : TARGET_WBINVL1_CACHE
 		    ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1"
 		    : "error: cache architecture unspecified");
 	  }
 	break;
@@ -2261,7 +2261,8 @@ (define_insn "atomic_store<mode>"
 		    ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;"
+		      "flat_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\t0\;buffer_inv sc1"
 		    : "error: cache architecture unspecified");
 	  case 2:
@@ -2272,7 +2273,8 @@ (define_insn "atomic_store<mode>"
 		    ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
 		    : TARGET_TARGET_SC_CACHE
-		    ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
+		    ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
+		      "global_store%o1\t%A0, %1%O0 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
 		    : "error: cache architecture unspecified");
 	  }
@@ -2356,7 +2358,7 @@ (define_insn "atomic_exchange<mode>"
             ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0"
             : "error: cache architecture unspecified");
 	  case 2:
@@ -2369,7 +2371,7 @@ (define_insn "atomic_exchange<mode>"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)"
             : "error: cache architecture unspecified");
@@ -2391,7 +2393,7 @@ (define_insn "atomic_exchange<mode>"
             ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;"
 		      "s_waitcnt\t0\;buffer_inv sc1"
             : "error: cache architecture unspecified");
 	  case 2:
@@ -2404,7 +2406,7 @@ (define_insn "atomic_exchange<mode>"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
 	    : TARGET_TARGET_SC_CACHE
-            ? "buffer_inv sc1\;"
+            ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
             : "error: cache architecture unspecified");

Reply via email to