Hi,

Wilco pointed out in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105162#c7? 
that

"Only __sync needs the extra full barrier, but __atomic does not."

The attached patch does that by adding out-of-line functions for 
MEMMODEL_SYNC_*.

Those new functions contain a barrier on the path without LSE instructions.


I tested the patch on aarch64-linux with bootstrap and make check.


Sebastian


________________________________
From: Pop, Sebastian
Sent: Thursday, April 7, 2022 3:15 PM
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov
Subject: [AArch64] PR105162: emit barrier for __sync and __atomic builtins on 
CPUs without LSE


Hi,


With -moutline-atomics gcc stops generating a barrier for __sync builtins: 
https://gcc.gnu.org/PR105162<https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105162>

This is a problem on CPUs without LSE instructions where the ld/st exclusives 
do not guarantee a full barrier.

The attached patch adds the barrier to the outline-atomics functions on the 
path without LSE instructions.

In consequence, under -moutline-atomics __atomic and __sync builtins now behave 
the same with and without LSE instructions.

To complete the change, the second patch makes gcc emit the barrier for 
__atomic builtins as well, i.e., independently of is_mm_sync().


Sebastian
From b45842c209ddcb560d6de477ced444f4d948ea76 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s...@amazon.com>
Date: Mon, 18 Apr 2022 15:13:20 +0000
Subject: [PATCH] [AArch64] add barriers to ool __sync builtins

---
 gcc/config/aarch64/aarch64-protos.h           |  2 +-
 gcc/config/aarch64/aarch64.cc                 | 18 ++++++++--
 .../gcc.target/aarch64/sync-comp-swap-ool.c   |  6 ++++
 .../gcc.target/aarch64/sync-op-acquire-ool.c  |  6 ++++
 .../gcc.target/aarch64/sync-op-full-ool.c     |  9 +++++
 .../gcc.target/aarch64/target_attr_20.c       |  2 +-
 .../gcc.target/aarch64/target_attr_21.c       |  2 +-
 libgcc/config/aarch64/lse.S                   | 33 +++++++++++++++++--
 libgcc/config/aarch64/t-lse                   |  8 ++---
 9 files changed, 74 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 46bade28ed6..0b325e1e32b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1051,7 +1051,7 @@ bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
 struct atomic_ool_names
 {
-    const char *str[5][4];
+    const char *str[5][7];
 };
 
 rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 18f80499079..3b02ee379f5 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22670,14 +22670,14 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
-/* We store the names of the various atomic helpers in a 5x4 array.
+/* We store the names of the various atomic helpers in a 5x7 array.
    Return the libcall function given MODE, MODEL and NAMES.  */
 
 rtx
 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
 			const atomic_ool_names *names)
 {
-  memmodel model = memmodel_base (INTVAL (model_rtx));
+  memmodel model = memmodel_from_int (INTVAL (model_rtx));
   int mode_idx, model_idx;
 
   switch (mode)
@@ -22717,6 +22717,15 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
     case MEMMODEL_SEQ_CST:
       model_idx = 3;
       break;
+    case MEMMODEL_SYNC_ACQUIRE:
+      model_idx = 4;
+      break;
+    case MEMMODEL_SYNC_RELEASE:
+      model_idx = 5;
+      break;
+    case MEMMODEL_SYNC_SEQ_CST:
+      model_idx = 6;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -22729,7 +22738,10 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
   { "__aarch64_" #B #N "_relax", \
     "__aarch64_" #B #N "_acq", \
     "__aarch64_" #B #N "_rel", \
-    "__aarch64_" #B #N "_acq_rel" }
+    "__aarch64_" #B #N "_acq_rel", \
+    "__aarch64_" #B #N "_sync_acq", \
+    "__aarch64_" #B #N "_sync_rel", \
+    "__aarch64_" #B #N "_sync_seq" }
 
 #define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
 		 { NULL, NULL, NULL, NULL }
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c
new file mode 100644
index 00000000000..cb28db21d17
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap-ool.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -moutline-atomics" } */
+
+#include "sync-comp-swap.x"
+
+/* { dg-final { scan-assembler-times "bl.*__aarch64_cas4_sync_seq" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c
new file mode 100644
index 00000000000..bf8e7210e75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire-ool.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8-a+nolse -O2 -moutline-atomics" } */
+
+#include "sync-op-acquire.x"
+
+/* { dg-final { scan-assembler-times "bl.*__aarch64_swp4_sync_acq" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c b/gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c
new file mode 100644
index 00000000000..a50fd96fffb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sync-op-full-ool.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8-a+nolse -O2 -moutline-atomics" } */
+
+#include "sync-op-full.x"
+
+/* { dg-final { scan-assembler-times "bl.*__aarch64_ldadd4_sync_seq" 1 } } */
+/* { dg-final { scan-assembler-times "bl.*__aarch64_ldclr4_sync_seq" 1 } } */
+/* { dg-final { scan-assembler-times "bl.*__aarch64_ldeor4_sync_seq" 1 } } */
+/* { dg-final { scan-assembler-times "bl.*__aarch64_ldset4_sync_seq" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_20.c b/gcc/testsuite/gcc.target/aarch64/target_attr_20.c
index 509fb039e84..abf9f5cf230 100644
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_20.c
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_20.c
@@ -24,4 +24,4 @@ bar (void)
     }
 }
 
-/* { dg-final { scan-assembler-not "bl.*__aarch64_cas2_acq_rel" } } */
+/* { dg-final { scan-assembler-not "bl.*__aarch64_cas2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_21.c b/gcc/testsuite/gcc.target/aarch64/target_attr_21.c
index acace4c8f2a..02d9577c3d4 100644
--- a/gcc/testsuite/gcc.target/aarch64/target_attr_21.c
+++ b/gcc/testsuite/gcc.target/aarch64/target_attr_21.c
@@ -24,4 +24,4 @@ bar (void)
     }
 }
 
-/* { dg-final { scan-assembler-times "bl.*__aarch64_cas2_acq_rel" 1 } } */
+/* { dg-final { scan-assembler-times "bl.*__aarch64_cas2" 1 } } */
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index c353ec2215b..81cbfe92825 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -87,24 +87,49 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 # define L
 # define M     0x000000
 # define N     0x000000
+# define BARRIER
 #elif MODEL == 2
 # define SUFF  _acq
 # define A     a
 # define L
 # define M     0x400000
 # define N     0x800000
+# define BARRIER
 #elif MODEL == 3
 # define SUFF  _rel
 # define A
 # define L     l
 # define M     0x008000
 # define N     0x400000
+# define BARRIER
 #elif MODEL == 4
 # define SUFF  _acq_rel
 # define A     a
 # define L     l
 # define M     0x408000
 # define N     0xc00000
+# define BARRIER
+#elif MODEL == 5
+# define SUFF  _sync_acq
+# define A     a
+# define L
+# define M     0x400000
+# define N     0x800000
+# define BARRIER dmb		ish
+#elif MODEL == 6
+# define SUFF  _sync_rel
+# define A
+# define L     l
+# define M     0x008000
+# define N     0x400000
+# define BARRIER dmb		ish
+#elif MODEL == 7
+# define SUFF  _sync_seq
+# define A     a
+# define L     l
+# define M     0x408000
+# define N     0xc00000
+# define BARRIER dmb		ish
 #else
 # error
 #endif
@@ -183,7 +208,8 @@ STARTFN	NAME(cas)
 	bne		1f
 	STXR		w(tmp1), s(1), [x2]
 	cbnz		w(tmp1), 0b
-1:	ret
+1:	BARRIER
+	ret
 
 #else
 #define LDXP	glue3(ld, A, xp)
@@ -205,7 +231,8 @@ STARTFN	NAME(cas)
 	bne		1f
 	STXP		w(tmp2), x2, x3, [x4]
 	cbnz		w(tmp2), 0b
-1:	ret
+1:	BARRIER
+	ret
 
 #endif
 
@@ -229,6 +256,7 @@ STARTFN	NAME(swp)
 0:	LDXR		s(0), [x1]
 	STXR		w(tmp1), s(tmp0), [x1]
 	cbnz		w(tmp1), 0b
+	BARRIER
 	ret
 
 ENDFN	NAME(swp)
@@ -273,6 +301,7 @@ STARTFN	NAME(LDNM)
 	OP		s(tmp1), s(0), s(tmp0)
 	STXR		w(tmp2), s(tmp1), [x1]
 	cbnz		w(tmp2), 0b
+	BARRIER
 	ret
 
 ENDFN	NAME(LDNM)
diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse
index 790cada3315..aca874f0e31 100644
--- a/libgcc/config/aarch64/t-lse
+++ b/libgcc/config/aarch64/t-lse
@@ -18,13 +18,13 @@
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-# Compare-and-swap has 5 sizes and 4 memory models.
+# Compare-and-swap has 5 sizes and 7 memory models.
 S0 := $(foreach s, 1 2 4 8 16, $(addsuffix _$(s), cas))
-O0 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S0)))
+O0 := $(foreach m, 1 2 3 4 5 6 7, $(addsuffix _$(m)$(objext), $(S0)))
 
-# Swap, Load-and-operate have 4 sizes and 4 memory models
+# Swap, Load-and-operate have 4 sizes and 7 memory models
 S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), swp ldadd ldclr ldeor ldset))
-O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1)))
+O1 := $(foreach m, 1 2 3 4 5 6 7, $(addsuffix _$(m)$(objext), $(S1)))
 
 LSE_OBJS := $(O0) $(O1)
 
-- 
2.25.1

Reply via email to