When MI300 support was initially added, the cache-maintenance instruction emitted before atomics was wrong: buffer_inv sc1 was used, but it should have been buffer_wbl2 (wbl2 = write back L2). With this patch in place, most test cases work on MI300A :-)
Without this change, there were several issues when running with multiple teams. MI300A testing shows: larger programs now work :-) OK for mainline? * * * For libgomp testing, I see the following failures: FAIL: libgomp.c/../libgomp.c-c++-common/declare-target-indirect-2.c execution test FAIL: libgomp.c++/../libgomp.c-c++-common/declare-target-indirect-2.c execution test FAIL: libgomp.fortran/declare-target-indirect-2.f90 -O… execution tests → PR114445, I presume FAIL: libgomp.c/interop-hsa.c execution test FAIL: libgomp.c/omp_alloc-3.c execution test FAIL: libgomp.c/target-52.c execution test FAIL: libgomp.c/target-53.c execution test FAIL: libgomp.c/target-54.c output pattern test FAIL: libgomp.c/target-49.c output pattern test FAIL: libgomp.c++/target-has-device-addr-2.C execution test FAIL: libgomp.c++/target-has-device-addr-4.C execution test FAIL: libgomp.c++/target-has-device-addr-5.C execution test FAIL: libgomp.c++/target-has-device-addr-6.C execution test FAIL: libgomp.c++/target-has-device-addr-8.C execution test FAIL: libgomp.c++/target-has-device-addr-9.C execution test FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/deep-copy-10.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O{0,2} execution test FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/vprop.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O2 (test for excess errors) FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/static-variable-1.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O{0,2} execution test FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O{0,2} execution test FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/deep-copy-10.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O{0,2} execution test FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 
-foffload=amdgcn-amdhsa -O{0,2} execution test FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/static-variable-1.c -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O{0,2} execution test FAIL: libgomp.oacc-c++/pr96835-1.C -DACC_DEVICE_TYPE_radeon=1 -DACC_MEM_SHARED=0 -foffload=amdgcn-amdhsa -O2 (internal compiler error: verify_gimple failed) → To be checked - some are known issues, others seem to be genuine issues. * * * Tobias PS: I think we eventually have to revisit the atomics/scope topic. In particular, we don't support system-global atomics properly. (In OpenMP: 'memscope(all)'; the default is 'memscope(device)'; additionally, 'memscope(cgroup)' exists.) Going over all the atomics and checking them would not hurt in general, either.
gcn: Fix CDNA3 atomics' buffer invalidation For device (agent) scope atomics - as needed when there is more than one teams, a buffer_wbl2 followed by s_waitcnt is required. When doing the initial porting, the pre-atomic instruction got accidentally replaced by buffer_inv sc1, which is not quite the right instruction. gcc/ChangeLog: * config/gcn/gcn.md (atomic_load, atomic_store, atomic_exchange): Fix CDNA3 L2 cache write-back before atomic instructions. diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 9172db08b2e..a8ca16cf167 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -2174,7 +2174,7 @@ (define_insn "atomic_load<mode>" ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_load%o0\t%0, %A1%O1 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol"); @@ -2186,7 +2186,7 @@ (define_insn "atomic_load<mode>" ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"); @@ -2233,7 +2233,7 @@ (define_insn "atomic_store<mode>" : TARGET_WBINVL1_CACHE ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_store%o1\t%A0, %1%O0 %G1" : "error: cache architectire unspecified"); case 2: return (TARGET_GLn_CACHE @@ -2241,7 +2241,7 @@ (define_insn "atomic_store<mode>" : TARGET_WBINVL1_CACHE ? 
"buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;global_store%o1\t%A0, %1%O0 %G1" : "error: cache architecture unspecified"); } break; @@ -2261,7 +2261,8 @@ (define_insn "atomic_store<mode>" ? "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_store%o1\t%A0, %1%O0 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;" + "flat_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "error: cache architecture unspecified"); case 2: @@ -2272,7 +2273,8 @@ (define_insn "atomic_store<mode>" ? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" + "global_store%o1\t%A0, %1%O0 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "error: cache architecture unspecified"); } @@ -2356,7 +2358,7 @@ (define_insn "atomic_exchange<mode>" ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0" : "error: cache architecture unspecified"); case 2: @@ -2369,7 +2371,7 @@ (define_insn "atomic_exchange<mode>" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)" : "error: cache architecture unspecified"); @@ -2391,7 +2393,7 @@ (define_insn "atomic_exchange<mode>" ? "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" + ? 
"buffer_wbl2\tsc0\;s_waitcnt\t0\;flat_atomic_swap<X>\t%0, %1, %2 %G1\;" "s_waitcnt\t0\;buffer_inv sc1" : "error: cache architecture unspecified"); case 2: @@ -2404,7 +2406,7 @@ (define_insn "atomic_exchange<mode>" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol" : TARGET_TARGET_SC_CACHE - ? "buffer_inv sc1\;" + ? "buffer_wbl2\tsc0\;s_waitcnt\tvmcnt(0)\;" "global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;" "s_waitcnt\tvmcnt(0)\;buffer_inv sc1" : "error: cache architecture unspecified");