This is a patch implementing builtins for an atomic exchange with full, acquire, and release memory barrier semantics. It is similar to __sync_lock_test_and_set(), but the target does not have the option of implementing only a reduced-functionality store of 1. Also, unlike __sync_lock_test_and_set(), we have all three memory barrier variants.

The compiler will fall back to a full barrier if the user requests an acquire/release and it is not available in the target. Also, if no variant is available, we will fall back to a compare and swap loop with a full barrier at the end.

The real reason for this patch is to implement atomic stores in the C++ runtime library, which can currently incorrectly move prior stores past an atomic store, thus invalidating the happens-before promise for the sequentially consistent model. I am attaching the corresponding patch to libstdc++ to show how I intend to use the builtin. This is not an official submission for the C++ library bits, as I have not yet fully tested the library. I will do so separately.

In a followup patch I will be implementing acq/rel/full variants for all the __sync_* builtins which we can use for the atomic loads and for some of the OpenMP atomics Jakub has been working on.

Oh yeah, I would gladly accept patterns/patches for other architectures :).

Tested on x86-64 Linux.

OK for mainline?
        * c-family/c-common.c (resolve_overloaded_builtin): Add
        BUILT_IN_SWAP_*_N variants.
        * doc/extend.texi: Document __sync_swap_* variants.
        * libgcc-std.ver: Add __sync_swap_*.
        * optabs.h: Add DOI_sync_swap*.
        Define sync_swap*_optab.
        * optabs.c (expand_sync_swap): New.
        * genopinit.c: Add sync_swap_{acq,rel,full}.
        * config/i386/sync.md ("sync_swap_full<mode>"): New.
        * config/i386/i386.md: Add UNSPECV_SWAP_FULL.
        * expr.h (enum membar_mode): New.
        * builtins.c (expand_builtin_swap): New.
        (expand_builtin): Add cases for BUILT_IN_SWAP_*.
        * sync-builtins.def (BUILT_IN_SWAP_*): New.
        * expr.h (expand_sync_swap): Protoize.
        (expand_builtin_synchronize): Same.

Index: doc/extend.texi
===================================================================
--- doc/extend.texi     (revision 173831)
+++ doc/extend.texi     (working copy)
@@ -6719,6 +6719,22 @@ speculated to) before the builtin, but p
 be globally visible yet, and previous memory loads may not yet be
 satisfied.
 
+@item @var{type} __sync_swap_full (@var{type} *ptr, @var{type} value, ...)
+@itemx @var{type} __sync_swap_acq (@var{type} *ptr, @var{type} value, ...)
+@itemx @var{type} __sync_swap_rel (@var{type} *ptr, @var{type} value, ...)
+@findex __sync_swap_full
+@findex __sync_swap_acq
+@findex __sync_swap_rel
+These builtins implement an atomic exchange operation.  They write
+@var{value} into @code{*@var{ptr}}, and return the previous contents
+of @code{*@var{ptr}}.  The different variants provide a full barrier,
+acquire barrier, or a release barrier respectively depending on the
+suffix.
+
+If the acquire or release variants of these operations are not
+available on the given target, the compiler will fall back to a full
+barrier.
+
 @item void __sync_lock_release (@var{type} *ptr, ...)
 @findex __sync_lock_release
 This builtin releases the lock acquired by @code{__sync_lock_test_and_set}.
Index: c-family/c-common.c
===================================================================
--- c-family/c-common.c (revision 173831)
+++ c-family/c-common.c (working copy)
@@ -9035,6 +9035,9 @@ resolve_overloaded_builtin (location_t l
     case BUILT_IN_VAL_COMPARE_AND_SWAP_N:
     case BUILT_IN_LOCK_TEST_AND_SET_N:
     case BUILT_IN_LOCK_RELEASE_N:
+    case BUILT_IN_SWAP_FULL_N:
+    case BUILT_IN_SWAP_ACQ_N:
+    case BUILT_IN_SWAP_REL_N:
       {
        int n = sync_resolve_size (function, params);
        tree new_function, first_param, result;
Index: optabs.c
===================================================================
--- optabs.c    (revision 173831)
+++ optabs.c    (working copy)
@@ -6988,6 +6988,70 @@ expand_sync_lock_test_and_set (rtx mem, 
 
   return NULL_RTX;
 }
+
+/* This function expands an atomic exchange operation: atomically store
+   VAL in MEM and return the previous value in MEM.
+
+   TARGET is an optional place to stick the return value.
+   MBMODE is the memory barrier type to use for the operation.  */
+
+rtx
+expand_sync_swap (rtx mem, rtx val, rtx target, enum membar_mode mbmode)
+{
+  enum machine_mode mode = GET_MODE (mem);
+  enum insn_code icode;
+  direct_optab op;
+
+  switch (mbmode)
+    {
+    case MEMBAR_MODE_ACQUIRE:
+      op = sync_swap_acq_optab;
+      break;
+    case MEMBAR_MODE_RELEASE:
+      op = sync_swap_rel_optab;
+      break;
+    case MEMBAR_MODE_FULL:
+      op = sync_swap_full_optab;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  /* If no variant is found, try the full barrier.  */
+  if (direct_optab_handler (op, mode) == CODE_FOR_nothing)
+    op = sync_swap_full_optab;
+
+  /* If the target supports the swap directly, great.  */
+  icode = direct_optab_handler (op, mode);
+  if (icode != CODE_FOR_nothing)
+    {
+      struct expand_operand ops[3];
+
+      create_output_operand (&ops[0], target, mode);
+      create_fixed_operand (&ops[1], mem);
+      /* VAL may have been promoted to a wider mode.  Shrink it if so.  */
+      create_convert_operand_to (&ops[2], val, mode, true);
+      if (maybe_expand_insn (icode, 3, ops))
+       return ops[0].value;
+    }
+
+  /* Otherwise, use a compare-and-swap loop for the exchange.  */
+  if (direct_optab_handler (sync_compare_and_swap_optab, mode)
+      != CODE_FOR_nothing)
+    {
+      if (!target || !register_operand (target, mode))
+       target = gen_reg_rtx (mode);
+      if (GET_MODE (val) != VOIDmode && GET_MODE (val) != mode)
+       val = convert_modes (mode, GET_MODE (val), val, 1);
+      if (expand_compare_and_swap_loop (mem, target, val, NULL_RTX))
+       {
+         /* Issue a full barrier.  */
+         expand_builtin_synchronize ();
+         return target;
+       }
+    }
+
+  return NULL_RTX;
+}
 
 /* Return true if OPERAND is suitable for operand number OPNO of
    instruction ICODE.  */
Index: optabs.h
===================================================================
--- optabs.h    (revision 173831)
+++ optabs.h    (working copy)
@@ -669,9 +669,19 @@ enum direct_optab_index
   /* Atomic compare and swap.  */
   DOI_sync_compare_and_swap,
 
-  /* Atomic exchange with acquire semantics.  */
+  /* Atomic exchange with acquire semantics.  Exchange not fully
+     guaranteed.  Some targets may only support a store of 1.  */
   DOI_sync_lock_test_and_set,
 
+  /* Atomic exchange with acquire semantics.  */
+  DOI_sync_swap_acq,
+
+  /* Atomic exchange with release semantics.  */
+  DOI_sync_swap_rel,
+
+  /* Atomic exchange with full barrier semantics.  */
+  DOI_sync_swap_full,
+
   /* Atomic clear with release semantics.  */
   DOI_sync_lock_release,
 
@@ -720,6 +730,12 @@ typedef struct direct_optab_d *direct_op
   (&direct_optab_table[(int) DOI_sync_compare_and_swap])
 #define sync_lock_test_and_set_optab \
   (&direct_optab_table[(int) DOI_sync_lock_test_and_set])
+#define sync_swap_acq_optab \
+  (&direct_optab_table[(int) DOI_sync_swap_acq])
+#define sync_swap_rel_optab \
+  (&direct_optab_table[(int) DOI_sync_swap_rel])
+#define sync_swap_full_optab \
+  (&direct_optab_table[(int) DOI_sync_swap_full])
 #define sync_lock_release_optab \
   (&direct_optab_table[(int) DOI_sync_lock_release])
 
Index: genopinit.c
===================================================================
--- genopinit.c (revision 173831)
+++ genopinit.c (working copy)
@@ -239,6 +239,9 @@ static const char * const optabs[] =
   "set_direct_optab_handler (sync_new_nand_optab, $A, 
CODE_FOR_$(sync_new_nand$I$a$))",
   "set_direct_optab_handler (sync_compare_and_swap_optab, $A, 
CODE_FOR_$(sync_compare_and_swap$I$a$))",
   "set_direct_optab_handler (sync_lock_test_and_set_optab, $A, 
CODE_FOR_$(sync_lock_test_and_set$I$a$))",
+  "set_direct_optab_handler (sync_swap_acq_optab, $A, 
CODE_FOR_$(sync_swap_acq$I$a$))",
+  "set_direct_optab_handler (sync_swap_rel_optab, $A, 
CODE_FOR_$(sync_swap_rel$I$a$))",
+  "set_direct_optab_handler (sync_swap_full_optab, $A, 
CODE_FOR_$(sync_swap_full$I$a$))",
   "set_direct_optab_handler (sync_lock_release_optab, $A, 
CODE_FOR_$(sync_lock_release$I$a$))",
   "set_optab_handler (vec_set_optab, $A, CODE_FOR_$(vec_set$a$))",
   "set_optab_handler (vec_extract_optab, $A, CODE_FOR_$(vec_extract$a$))",
Index: builtins.c
===================================================================
--- builtins.c  (revision 173831)
+++ builtins.c  (working copy)
@@ -5682,9 +5682,35 @@ expand_builtin_lock_test_and_set (enum m
   return expand_sync_lock_test_and_set (mem, val, target);
 }
 
+/* Expand the __sync_swap_* intrinsics.
+
+   EXP is the CALL_EXPR.
+   TARGET is an optional place for us to store the results.
+   MBMODE is the memory barrier mode to use.  */
+
+static rtx
+expand_builtin_swap (enum machine_mode mode, tree exp, rtx target,
+                    enum membar_mode mbmode)
+{
+  rtx val, mem;
+  enum machine_mode old_mode;
+
+  /* Expand the operands.  */
+  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
+  val = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, mode, EXPAND_NORMAL);
+  /* If VAL is promoted to a wider mode, convert it back to MODE.  Take care
+     of CONST_INTs, where we know the old_mode only from the call argument.  */
+  old_mode = GET_MODE (val);
+  if (old_mode == VOIDmode)
+    old_mode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp, 1)));
+  val = convert_modes (mode, old_mode, val, 1);
+
+  return expand_sync_swap (mem, val, target, mbmode);
+}
+
 /* Expand the __sync_synchronize intrinsic.  */
 
-static void
+void
 expand_builtin_synchronize (void)
 {
   gimple x;
@@ -6495,6 +6521,39 @@ expand_builtin (tree exp, rtx target, rt
        return target;
       break;
 
+    case BUILT_IN_SWAP_ACQ_1:
+    case BUILT_IN_SWAP_ACQ_2:
+    case BUILT_IN_SWAP_ACQ_4:
+    case BUILT_IN_SWAP_ACQ_8:
+    case BUILT_IN_SWAP_ACQ_16:
+      mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_ACQ_1);
+      target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_ACQUIRE);
+      if (target)
+       return target;
+      break;
+
+    case BUILT_IN_SWAP_REL_1:
+    case BUILT_IN_SWAP_REL_2:
+    case BUILT_IN_SWAP_REL_4:
+    case BUILT_IN_SWAP_REL_8:
+    case BUILT_IN_SWAP_REL_16:
+      mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_REL_1);
+      target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_RELEASE);
+      if (target)
+       return target;
+      break;
+
+    case BUILT_IN_SWAP_FULL_1:
+    case BUILT_IN_SWAP_FULL_2:
+    case BUILT_IN_SWAP_FULL_4:
+    case BUILT_IN_SWAP_FULL_8:
+    case BUILT_IN_SWAP_FULL_16:
+      mode = get_builtin_sync_mode (fcode - BUILT_IN_SWAP_FULL_1);
+      target = expand_builtin_swap (mode, exp, target, MEMBAR_MODE_FULL);
+      if (target)
+       return target;
+      break;
+
     case BUILT_IN_LOCK_TEST_AND_SET_1:
     case BUILT_IN_LOCK_TEST_AND_SET_2:
     case BUILT_IN_LOCK_TEST_AND_SET_4:
Index: sync-builtins.def
===================================================================
--- sync-builtins.def   (revision 173831)
+++ sync-builtins.def   (working copy)
@@ -235,6 +235,63 @@ DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND
 DEF_SYNC_BUILTIN (BUILT_IN_LOCK_TEST_AND_SET_16, "__sync_lock_test_and_set_16",
                  BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
 
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_N,
+                 "__sync_swap_acq",
+                 BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_1,
+                 "__sync_swap_acq_1",
+                 BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_2,
+                 "__sync_swap_acq_2",
+                 BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_4,
+                 "__sync_swap_acq_4",
+                 BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_8,
+                 "__sync_swap_acq_8",
+                 BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_ACQ_16,
+                 "__sync_swap_acq_16",
+                 BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_N,
+                 "__sync_swap_rel",
+                 BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_1,
+                 "__sync_swap_rel_1",
+                 BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_2,
+                 "__sync_swap_rel_2",
+                 BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_4,
+                 "__sync_swap_rel_4",
+                 BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_8,
+                 "__sync_swap_rel_8",
+                 BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_REL_16,
+                 "__sync_swap_rel_16",
+                 BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_N,
+                 "__sync_swap_full",
+                 BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_1,
+                 "__sync_swap_full_1",
+                 BT_FN_I1_VPTR_I1, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_2,
+                 "__sync_swap_full_2",
+                 BT_FN_I2_VPTR_I2, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_4,
+                 "__sync_swap_full_4",
+                 BT_FN_I4_VPTR_I4, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_8,
+                 "__sync_swap_full_8",
+                 BT_FN_I8_VPTR_I8, ATTR_NOTHROW_LEAF_LIST)
+DEF_SYNC_BUILTIN (BUILT_IN_SWAP_FULL_16,
+                 "__sync_swap_full_16",
+                 BT_FN_I16_VPTR_I16, ATTR_NOTHROW_LEAF_LIST)
+
 DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_N, "__sync_lock_release",
                  BT_FN_VOID_VAR, ATTR_NOTHROW_LEAF_LIST)
 DEF_SYNC_BUILTIN (BUILT_IN_LOCK_RELEASE_1, "__sync_lock_release_1",
Index: expr.h
===================================================================
--- expr.h      (revision 173831)
+++ expr.h      (working copy)
@@ -161,6 +161,14 @@ enum optab_methods
   OPTAB_MUST_WIDEN
 };
 
+/* Memory barrier type.  */
+enum membar_mode
+{
+  MEMBAR_MODE_RELEASE,
+  MEMBAR_MODE_ACQUIRE,
+  MEMBAR_MODE_FULL
+};
+
 /* Generate code for a simple binary or unary operation.  "Simple" in
    this case means "can be unambiguously described by a (mode, code)
    pair and mapped to a single optab."  */
@@ -217,6 +225,7 @@ rtx expand_bool_compare_and_swap (rtx, r
 rtx expand_sync_operation (rtx, rtx, enum rtx_code);
 rtx expand_sync_fetch_operation (rtx, rtx, enum rtx_code, bool, rtx);
 rtx expand_sync_lock_test_and_set (rtx, rtx, rtx);
+rtx expand_sync_swap (rtx, rtx, rtx, enum membar_mode);
 
 /* Functions from expmed.c:  */
 
@@ -248,6 +257,7 @@ extern void expand_builtin_setjmp_receiv
 extern rtx expand_builtin_saveregs (void);
 extern void expand_builtin_trap (void);
 extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, enum machine_mode);
+extern void expand_builtin_synchronize (void);
 
 /* Functions from expr.c:  */
 
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 173831)
+++ config/i386/i386.md (working copy)
@@ -250,6 +250,7 @@
   UNSPECV_MWAIT
   UNSPECV_CMPXCHG
   UNSPECV_XCHG
+  UNSPECV_SWAP_FULL
   UNSPECV_LOCK
   UNSPECV_PROLOGUE_USE
   UNSPECV_CLD
Index: config/i386/sync.md
===================================================================
--- config/i386/sync.md (revision 173831)
+++ config/i386/sync.md (working copy)
@@ -232,6 +232,15 @@
   return "lock{%;} add{<imodesuffix>}\t{%1, %0|%0, %1}";
 })
 
+(define_insn "sync_swap_full<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=<r>")
+       (unspec_volatile:SWI
+         [(match_operand:SWI 1 "memory_operand" "+m")] UNSPECV_SWAP_FULL))
+   (set (match_dup 1)
+       (match_operand:SWI 2 "register_operand" "0"))]
+  ""
+  "xchg{<imodesuffix>}\t{%1, %0|%0, %1}")
+
 ;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space.
 (define_insn "sync_lock_test_and_set<mode>"
   [(set (match_operand:SWI 0 "register_operand" "=<r>")
Index: libgcc-std.ver
===================================================================
--- libgcc-std.ver      (revision 173831)
+++ libgcc-std.ver      (working copy)
@@ -1919,3 +1919,26 @@ GCC_4.6.0 {
   __morestack_initial_sp
   __splitstack_find
 }
+
+%inherit GCC_4.7.0 GCC_4.6.0
+GCC_4.7.0 {
+  __sync_swap_acq_1
+  __sync_swap_rel_1
+  __sync_swap_full_1
+
+  __sync_swap_acq_2
+  __sync_swap_rel_2
+  __sync_swap_full_2
+
+  __sync_swap_acq_4
+  __sync_swap_rel_4
+  __sync_swap_full_4
+
+  __sync_swap_acq_8
+  __sync_swap_rel_8
+  __sync_swap_full_8
+
+  __sync_swap_acq_16
+  __sync_swap_rel_16
+  __sync_swap_full_16
+}
        (__sso_string_base<>::_M_assign): Likewise.
        * include/bits/atomic_2.h (_ITp<>::store): Use __sync_swap_full.
        (_ITp<>::store volatile): Same.
        (_PTp<>::store): Same.
        (_PTp<>::store volatile): Same.

Index: include/bits/atomic_2.h
===================================================================
--- include/bits/atomic_2.h     (revision 173831)
+++ include/bits/atomic_2.h     (working copy)
@@ -249,14 +249,12 @@ namespace __atomic2
        __glibcxx_assert(__m != memory_order_acq_rel);
        __glibcxx_assert(__m != memory_order_consume);
 
-       if (__m == memory_order_relaxed)
-         _M_i = __i;
+       if (__m == memory_order_seq_cst)
+         (void)__sync_swap_full (&_M_i, __i);
        else
          {
            // write_mem_barrier();
            _M_i = __i;
-           if (__m == memory_order_seq_cst)
-             __sync_synchronize();
          }
       }
 
@@ -267,14 +265,12 @@ namespace __atomic2
        __glibcxx_assert(__m != memory_order_acq_rel);
        __glibcxx_assert(__m != memory_order_consume);
 
-       if (__m == memory_order_relaxed)
-         _M_i = __i;
+       if (__m == memory_order_seq_cst)
+         (void)__sync_swap_full (&_M_i, __i);
        else
          {
            // write_mem_barrier();
            _M_i = __i;
-           if (__m == memory_order_seq_cst)
-             __sync_synchronize();
          }
       }
 
@@ -540,14 +536,12 @@ namespace __atomic2
        __glibcxx_assert(__m != memory_order_acq_rel);
        __glibcxx_assert(__m != memory_order_consume);
 
-       if (__m == memory_order_relaxed)
-         _M_p = __p;
+       if (__m == memory_order_seq_cst)
+         (void)__sync_swap_full (&_M_p, __p);
        else
          {
            // write_mem_barrier();
            _M_p = __p;
-           if (__m == memory_order_seq_cst)
-             __sync_synchronize();
          }
       }
 
@@ -559,14 +553,12 @@ namespace __atomic2
        __glibcxx_assert(__m != memory_order_acq_rel);
        __glibcxx_assert(__m != memory_order_consume);
 
-       if (__m == memory_order_relaxed)
-         _M_p = __p;
+       if (__m == memory_order_seq_cst)
+         (void)__sync_swap_full (&_M_p, __p);
        else
          {
            // write_mem_barrier();
            _M_p = __p;
-           if (__m == memory_order_seq_cst)
-             __sync_synchronize();
          }
       }
 

Reply via email to