atomic: Strengthen implementations with fences

Palmer Dabbelt Fri, 09 Mar 2018 10:54:54 -0800

On Fri, 09 Mar 2018 10:36:44 PST (-0800), [email protected] wrote:

On Fri, Mar 09, 2018 at 09:56:21AM -0800, Palmer Dabbelt wrote:

On Fri, 09 Mar 2018 04:13:40 PST (-0800), [email protected] wrote:
>Atomics present the same issue with locking: release and acquire
>variants need to be strengthened to meet the constraints defined
>by the Linux-kernel memory consistency model [1].
>
>Atomics present a further issue: implementations of atomics such
>as atomic_cmpxchg() and atomic_add_unless() rely on LR/SC pairs,
>which do not give full-ordering with .aqrl; for example, current
>implementations allow the "lr-sc-aqrl-pair-vs-full-barrier" test
>below to end up with the state indicated in the "exists" clause.
>
>In order to "synchronize" LKMM and RISC-V's implementation, this
>commit strengthens the implementations of the atomics operations
>by replacing .rl and .aq with the use of ("lightweigth") fences,
>and by replacing .aqrl LR/SC pairs in sequences such as:
>
>  0:      lr.w.aqrl  %0, %addr
>          bne        %0, %old, 1f
>          ...
>          sc.w.aqrl  %1, %new, %addr
>          bnez       %1, 0b
>  1:
>
>with sequences of the form:
>
>  0:      lr.w       %0, %addr
>          bne        %0, %old, 1f
>          ...
>          sc.w.rl    %1, %new, %addr   /* SC-release   */
>          bnez       %1, 0b
>          fence      rw, rw            /* "full" fence */
>  1:
>
>following Daniel's suggestion.
>
>These modifications were validated with simulation of the RISC-V
>memory consistency model.
>
>C lr-sc-aqrl-pair-vs-full-barrier
>
>{}
>
>P0(int *x, int *y, atomic_t *u)
>{
>    int r0;
>    int r1;
>
>    WRITE_ONCE(*x, 1);
>    r0 = atomic_cmpxchg(u, 0, 1);
>    r1 = READ_ONCE(*y);
>}
>
>P1(int *x, int *y, atomic_t *v)
>{
>    int r0;
>    int r1;
>
>    WRITE_ONCE(*y, 1);
>    r0 = atomic_cmpxchg(v, 0, 1);
>    r1 = READ_ONCE(*x);
>}
>
>exists (u=1 /\ v=1 /\ 0:r1=0 /\ 1:r1=0)
>
>[1] https://marc.info/?l=linux-kernel&m=151930201102853&w=2
>    
https://groups.google.com/a/groups.riscv.org/forum/#!topic/isa-dev/hKywNHBkAXM
>    https://marc.info/?l=linux-kernel&m=151633436614259&w=2
>
>Suggested-by: Daniel Lustig <[email protected]>
>Signed-off-by: Andrea Parri <[email protected]>
>Cc: Palmer Dabbelt <[email protected]>
>Cc: Albert Ou <[email protected]>
>Cc: Daniel Lustig <[email protected]>
>Cc: Alan Stern <[email protected]>
>Cc: Will Deacon <[email protected]>
>Cc: Peter Zijlstra <[email protected]>
>Cc: Boqun Feng <[email protected]>
>Cc: Nicholas Piggin <[email protected]>
>Cc: David Howells <[email protected]>
>Cc: Jade Alglave <[email protected]>
>Cc: Luc Maranget <[email protected]>
>Cc: "Paul E. McKenney" <[email protected]>
>Cc: Akira Yokosawa <[email protected]>
>Cc: Ingo Molnar <[email protected]>
>Cc: Linus Torvalds <[email protected]>
>Cc: [email protected]
>Cc: [email protected]
>---
> arch/riscv/include/asm/atomic.h  | 417 +++++++++++++++++++++++++--------------
> arch/riscv/include/asm/cmpxchg.h | 391 +++++++++++++++++++++++++++++-------
> 2 files changed, 588 insertions(+), 220 deletions(-)
>
>diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
>index e65d1cd89e28b..855115ace98c8 100644
>--- a/arch/riscv/include/asm/atomic.h
>+++ b/arch/riscv/include/asm/atomic.h
>@@ -24,6 +24,20 @@
> #include <asm/barrier.h>
>
> #define ATOMIC_INIT(i)     { (i) }
>+
>+#define __atomic_op_acquire(op, args...)                           \
>+({                                                                 \
>+   typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);         \
>+   __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory");        \
>+   __ret;                                                          \
>+})
>+
>+#define __atomic_op_release(op, args...)                           \
>+({                                                                 \
>+   __asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");        \
>+   op##_relaxed(args);                                             \
>+})
>+
> static __always_inline int atomic_read(const atomic_t *v)
> {
>    return READ_ONCE(v->counter);
>@@ -50,22 +64,23 @@ static __always_inline void atomic64_set(atomic64_t *v, 
long i)
>  * have the AQ or RL bits set.  These don't return anything, so there's only
>  * one version to worry about.
>  */
>-#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)                    
     \
>-static __always_inline void atomic##prefix##_##op(c_type i, 
atomic##prefix##_t *v) \
>-{                                                                             
     \
>-   __asm__ __volatile__ (                                                     
     \
>-           "amo" #asm_op "." #asm_type " zero, %1, %0"                        
       \
>-           : "+A" (v->counter)                                                
        \
>-           : "r" (I)                                                          
   \
>-           : "memory");                                                       
   \
>-}
>+#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)         \
>+static __always_inline                                                     \
>+void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)                \
>+{                                                                  \
>+   __asm__ __volatile__ (                                          \
>+           "  amo" #asm_op "." #asm_type " zero, %1, %0"     \
>+           : "+A" (v->counter)                                        \
>+           : "r" (I)                                             \
>+           : "memory");                                          \
>+}                                                                  \


Just to be sure: this is just a whitespace change, right?


This belongs to the "few style fixes" (in the specific, 80-chars lines)
mentioned in the cover letter; I could not resist ;-), but I'll remove
them in v3 if you like so.


No problem, just next time it's a bit easier to not mix the really complicated
stuff (memory model changes) with the really simple stuff (whitespace changes).


> #ifdef CONFIG_GENERIC_ATOMIC64
>-#define ATOMIC_OPS(op, asm_op, I)                  \
>+#define ATOMIC_OPS(op, asm_op, I)                                  \
>         ATOMIC_OP (op, asm_op, I, w,  int,   )
> #else
>-#define ATOMIC_OPS(op, asm_op, I)                  \
>-        ATOMIC_OP (op, asm_op, I, w,  int,   )     \
>+#define ATOMIC_OPS(op, asm_op, I)                                  \
>+        ATOMIC_OP (op, asm_op, I, w,  int,   )                             \
>         ATOMIC_OP (op, asm_op, I, d, long, 64)
> #endif
>
>@@ -79,75 +94,115 @@ ATOMIC_OPS(xor, xor,  i)
> #undef ATOMIC_OPS
>
> /*
>- * Atomic ops that have ordered, relaxed, acquire, and relese variants.
>+ * Atomic ops that have ordered, relaxed, acquire, and release variants.
>  * There's two flavors of these: the arithmatic ops have both fetch and return
>  * versions, while the logical ops only have fetch versions.
>  */
>-#define ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, asm_type, c_type, 
prefix)                             \
>-static __always_inline c_type atomic##prefix##_fetch_##op##c_or(c_type i, 
atomic##prefix##_t *v)   \
>-{                                                                             
                     \
>-   register c_type ret;                                                       
                     \
>-   __asm__ __volatile__ (                                                     
                     \
>-           "amo" #asm_op "." #asm_type #asm_or " %1, %2, %0"                  
               \
>-           : "+A" (v->counter), "=r" (ret)                                    
                      \
>-           : "r" (I)                                                          
                   \
>-           : "memory");                                                       
                   \
>-   return ret;                                                                
                     \
>+#define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix)   \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_fetch_##op##_relaxed(c_type i,                     \
>+                                        atomic##prefix##_t *v)     \
>+{                                                                  \
>+   register c_type ret;                                            \
>+   __asm__ __volatile__ (                                          \
>+           "  amo" #asm_op "." #asm_type " %1, %2, %0"       \
>+           : "+A" (v->counter), "=r" (ret)                          \
>+           : "r" (I)                                             \
>+           : "memory");                                          \
>+   return ret;                                                     \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)        \
>+{                                                                  \
>+   register c_type ret;                                            \
>+   __asm__ __volatile__ (                                          \
>+           "  amo" #asm_op "." #asm_type ".aqrl  %1, %2, %0" \
>+           : "+A" (v->counter), "=r" (ret)                          \
>+           : "r" (I)                                             \
>+           : "memory");                                          \
>+   return ret;                                                     \
> }
>
>-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, asm_type, c_type, 
prefix)                      \
>-static __always_inline c_type atomic##prefix##_##op##_return##c_or(c_type i, 
atomic##prefix##_t *v)        \
>-{                                                                             
                     \
>-        return atomic##prefix##_fetch_##op##c_or(i, v) c_op I;                
                             \
>+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix)    \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_##op##_return_relaxed(c_type i,                    \
>+                                         atomic##prefix##_t *v)    \
>+{                                                                  \
>+        return atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)     \
>+{                                                                  \
>+        return atomic##prefix##_fetch_##op(i, v) c_op I;           \
> }
>
> #ifdef CONFIG_GENERIC_ATOMIC64
>-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)                         
     \
>-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )   \
>-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )
>+#define ATOMIC_OPS(op, asm_op, c_op, I)                                    \
>+        ATOMIC_FETCH_OP( op, asm_op,       I, w,  int,   )         \
>+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w,  int,   )
> #else
>-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)                         
     \
>-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )   \
>-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )   \
>-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, d, long, 64)   \
>-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
>+#define ATOMIC_OPS(op, asm_op, c_op, I)                                    \
>+        ATOMIC_FETCH_OP( op, asm_op,       I, w,  int,   )         \
>+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w,  int,   )         \
>+        ATOMIC_FETCH_OP( op, asm_op,       I, d, long, 64)         \
>+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, long, 64)
> #endif
>
>-ATOMIC_OPS(add, add, +,  i,      , _relaxed)
>-ATOMIC_OPS(add, add, +,  i, .aq  , _acquire)
>-ATOMIC_OPS(add, add, +,  i, .rl  , _release)
>-ATOMIC_OPS(add, add, +,  i, .aqrl,         )
>+ATOMIC_OPS(add, add, +,  i)
>+ATOMIC_OPS(sub, add, +, -i)

Sorry, I must be missing something, but how do the acquire and release atomics
get generated with this new code?


This proposal relies on the generic definition,

   include/linux/atomic.h ,

and on the

   __atomic_op_acquire()
   __atomic_op_release()

above to build the acquire/release atomics (except for the xchg,cmpxchg,
where the ACQUIRE_BARRIER is inserted conditionally/on success).


I thought we wanted to use the AQ and RL bits for AMOs, just not for LR/SC
sequences.  IIRC the AMOs are safe with the current memory model, but I might
just have some version mismatches in my head.


>+
>+#define atomic_add_return_relaxed  atomic_add_return_relaxed
>+#define atomic_sub_return_relaxed  atomic_sub_return_relaxed
>+#define atomic_add_return          atomic_add_return
>+#define atomic_sub_return          atomic_sub_return
>
>-ATOMIC_OPS(sub, add, +, -i,      , _relaxed)
>-ATOMIC_OPS(sub, add, +, -i, .aq  , _acquire)
>-ATOMIC_OPS(sub, add, +, -i, .rl  , _release)
>-ATOMIC_OPS(sub, add, +, -i, .aqrl,         )
>+#define atomic_fetch_add_relaxed   atomic_fetch_add_relaxed
>+#define atomic_fetch_sub_relaxed   atomic_fetch_sub_relaxed
>+#define atomic_fetch_add           atomic_fetch_add
>+#define atomic_fetch_sub           atomic_fetch_sub
>+
>+#ifndef CONFIG_GENERIC_ATOMIC64
>+#define atomic64_add_return_relaxed        atomic64_add_return_relaxed
>+#define atomic64_sub_return_relaxed        atomic64_sub_return_relaxed
>+#define atomic64_add_return                atomic64_add_return
>+#define atomic64_sub_return                atomic64_sub_return
>+
>+#define atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed
>+#define atomic64_fetch_sub_relaxed atomic64_fetch_sub_relaxed
>+#define atomic64_fetch_add         atomic64_fetch_add
>+#define atomic64_fetch_sub         atomic64_fetch_sub
>+#endif
>
> #undef ATOMIC_OPS
>
> #ifdef CONFIG_GENERIC_ATOMIC64
>-#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)                            \
>-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )
>+#define ATOMIC_OPS(op, asm_op, I)                                  \
>+        ATOMIC_FETCH_OP(op, asm_op, I, w,  int,   )
> #else
>-#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)                            \
>-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )  \
>-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, d, long, 64)
>+#define ATOMIC_OPS(op, asm_op, I)                                  \
>+        ATOMIC_FETCH_OP(op, asm_op, I, w,  int,   )                        \
>+        ATOMIC_FETCH_OP(op, asm_op, I, d, long, 64)
> #endif
>
>-ATOMIC_OPS(and, and, i,      , _relaxed)
>-ATOMIC_OPS(and, and, i, .aq  , _acquire)
>-ATOMIC_OPS(and, and, i, .rl  , _release)
>-ATOMIC_OPS(and, and, i, .aqrl,         )
>+ATOMIC_OPS(and, and, i)
>+ATOMIC_OPS( or,  or, i)
>+ATOMIC_OPS(xor, xor, i)
>
>-ATOMIC_OPS( or,  or, i,      , _relaxed)
>-ATOMIC_OPS( or,  or, i, .aq  , _acquire)
>-ATOMIC_OPS( or,  or, i, .rl  , _release)
>-ATOMIC_OPS( or,  or, i, .aqrl,         )
>+#define atomic_fetch_and_relaxed   atomic_fetch_and_relaxed
>+#define atomic_fetch_or_relaxed            atomic_fetch_or_relaxed
>+#define atomic_fetch_xor_relaxed   atomic_fetch_xor_relaxed
>+#define atomic_fetch_and           atomic_fetch_and
>+#define atomic_fetch_or                    atomic_fetch_or
>+#define atomic_fetch_xor           atomic_fetch_xor
>
>-ATOMIC_OPS(xor, xor, i,      , _relaxed)
>-ATOMIC_OPS(xor, xor, i, .aq  , _acquire)
>-ATOMIC_OPS(xor, xor, i, .rl  , _release)
>-ATOMIC_OPS(xor, xor, i, .aqrl,         )
>+#ifndef CONFIG_GENERIC_ATOMIC64
>+#define atomic64_fetch_and_relaxed atomic64_fetch_and_relaxed
>+#define atomic64_fetch_or_relaxed  atomic64_fetch_or_relaxed
>+#define atomic64_fetch_xor_relaxed atomic64_fetch_xor_relaxed
>+#define atomic64_fetch_and         atomic64_fetch_and
>+#define atomic64_fetch_or          atomic64_fetch_or
>+#define atomic64_fetch_xor         atomic64_fetch_xor
>+#endif
>
> #undef ATOMIC_OPS
>
>@@ -157,22 +212,24 @@ ATOMIC_OPS(xor, xor, i, .aqrl,         )
> /*
>  * The extra atomic operations that are constructed from one of the core
>  * AMO-based operations above (aside from sub, which is easier to fit above).
>- * These are required to perform a barrier, but they're OK this way because
>- * atomic_*_return is also required to perform a barrier.
>+ * These are required to perform a full barrier, but they're OK this way
>+ * because atomic_*_return is also required to perform a full barrier.
>+ *
>  */
>-#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)                 \
>-static __always_inline bool atomic##prefix##_##op(c_type i, 
atomic##prefix##_t *v) \
>-{                                                                          \
>-   return atomic##prefix##_##func_op##_return(i, v) comp_op I;             \
>+#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)         \
>+static __always_inline                                                     \
>+bool atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)                \
>+{                                                                  \
>+   return atomic##prefix##_##func_op##_return(i, v) comp_op I;     \
> }
>
> #ifdef CONFIG_GENERIC_ATOMIC64
>-#define ATOMIC_OPS(op, func_op, comp_op, I)                        \
>-        ATOMIC_OP (op, func_op, comp_op, I,  int,   )
>+#define ATOMIC_OPS(op, func_op, comp_op, I)                                \
>+        ATOMIC_OP(op, func_op, comp_op, I,  int,   )
> #else
>-#define ATOMIC_OPS(op, func_op, comp_op, I)                        \
>-        ATOMIC_OP (op, func_op, comp_op, I,  int,   )              \
>-        ATOMIC_OP (op, func_op, comp_op, I, long, 64)
>+#define ATOMIC_OPS(op, func_op, comp_op, I)                                \
>+        ATOMIC_OP(op, func_op, comp_op, I,  int,   )                       \
>+        ATOMIC_OP(op, func_op, comp_op, I, long, 64)
> #endif
>
> ATOMIC_OPS(add_and_test, add, ==, 0)
>@@ -182,51 +239,87 @@ ATOMIC_OPS(add_negative, add,  <, 0)
> #undef ATOMIC_OP
> #undef ATOMIC_OPS
>
>-#define ATOMIC_OP(op, func_op, I, c_type, prefix)                          \
>-static __always_inline void atomic##prefix##_##op(atomic##prefix##_t *v)   \
>-{                                                                          \
>-   atomic##prefix##_##func_op(I, v);                                       \
>+#define ATOMIC_OP(op, func_op, I, c_type, prefix)                  \
>+static __always_inline                                                     \
>+void atomic##prefix##_##op(atomic##prefix##_t *v)                  \
>+{                                                                  \
>+   atomic##prefix##_##func_op(I, v);                               \
> }
>
>-#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)                       
             \
>-static __always_inline c_type atomic##prefix##_fetch_##op(atomic##prefix##_t 
*v)   \
>-{                                                                             
     \
>-   return atomic##prefix##_fetch_##func_op(I, v);                             
     \
>+#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)                    \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_fetch_##op##_relaxed(atomic##prefix##_t *v)        \
>+{                                                                  \
>+   return atomic##prefix##_fetch_##func_op##_relaxed(I, v);        \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)          \
>+{                                                                  \
>+   return atomic##prefix##_fetch_##func_op(I, v);                  \
> }
>
>-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)                 
             \
>-static __always_inline c_type 
atomic##prefix##_##op##_return(atomic##prefix##_t *v)        \
>-{                                                                             
     \
>-        return atomic##prefix##_fetch_##op(v) c_op I;                         
             \
>+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)              \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_##op##_return_relaxed(atomic##prefix##_t *v)       \
>+{                                                                  \
>+        return atomic##prefix##_fetch_##op##_relaxed(v) c_op I;            \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_type atomic##prefix##_##op##_return(atomic##prefix##_t *v)               \
>+{                                                                  \
>+        return atomic##prefix##_fetch_##op(v) c_op I;                      \
> }
>
> #ifdef CONFIG_GENERIC_ATOMIC64
>-#define ATOMIC_OPS(op, asm_op, c_op, I)                                       
     \
>-        ATOMIC_OP       (op, asm_op,       I,  int,   )                       
     \
>-        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )                       
     \
>+#define ATOMIC_OPS(op, asm_op, c_op, I)                                    \
>+        ATOMIC_OP(       op, asm_op,       I,  int,   )                    \
>+        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )                    \
>         ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )
> #else
>-#define ATOMIC_OPS(op, asm_op, c_op, I)                                       
     \
>-        ATOMIC_OP       (op, asm_op,       I,  int,   )                       
     \
>-        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )                       
     \
>-        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )                       
     \
>-        ATOMIC_OP       (op, asm_op,       I, long, 64)                       
     \
>-        ATOMIC_FETCH_OP (op, asm_op,       I, long, 64)                       
     \
>+#define ATOMIC_OPS(op, asm_op, c_op, I)                                    \
>+        ATOMIC_OP(       op, asm_op,       I,  int,   )                    \
>+        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )                    \
>+        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )                    \
>+        ATOMIC_OP(       op, asm_op,       I, long, 64)                    \
>+        ATOMIC_FETCH_OP( op, asm_op,       I, long, 64)                    \
>         ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)
> #endif
>
> ATOMIC_OPS(inc, add, +,  1)
> ATOMIC_OPS(dec, add, +, -1)
>
>+#define atomic_inc_return_relaxed  atomic_inc_return_relaxed
>+#define atomic_dec_return_relaxed  atomic_dec_return_relaxed
>+#define atomic_inc_return          atomic_inc_return
>+#define atomic_dec_return          atomic_dec_return
>+
>+#define atomic_fetch_inc_relaxed   atomic_fetch_inc_relaxed
>+#define atomic_fetch_dec_relaxed   atomic_fetch_dec_relaxed
>+#define atomic_fetch_inc           atomic_fetch_inc
>+#define atomic_fetch_dec           atomic_fetch_dec
>+
>+#ifndef CONFIG_GENERIC_ATOMIC64
>+#define atomic64_inc_return_relaxed        atomic64_inc_return_relaxed
>+#define atomic64_dec_return_relaxed        atomic64_dec_return_relaxed
>+#define atomic64_inc_return                atomic64_inc_return
>+#define atomic64_dec_return                atomic64_dec_return
>+
>+#define atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed
>+#define atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed
>+#define atomic64_fetch_inc         atomic64_fetch_inc
>+#define atomic64_fetch_dec         atomic64_fetch_dec
>+#endif
>+
> #undef ATOMIC_OPS
> #undef ATOMIC_OP
> #undef ATOMIC_FETCH_OP
> #undef ATOMIC_OP_RETURN
>
>-#define ATOMIC_OP(op, func_op, comp_op, I, prefix)                         \
>-static __always_inline bool atomic##prefix##_##op(atomic##prefix##_t *v)   \
>-{                                                                          \
>-   return atomic##prefix##_##func_op##_return(v) comp_op I;                \
>+#define ATOMIC_OP(op, func_op, comp_op, I, prefix)                 \
>+static __always_inline                                                     \
>+bool atomic##prefix##_##op(atomic##prefix##_t *v)                  \
>+{                                                                  \
>+   return atomic##prefix##_##func_op##_return(v) comp_op I;        \
> }
>
> ATOMIC_OP(inc_and_test, inc, ==, 0,   )
>@@ -238,19 +331,19 @@ ATOMIC_OP(dec_and_test, dec, ==, 0, 64)
>
> #undef ATOMIC_OP
>
>-/* This is required to provide a barrier on success. */
>+/* This is required to provide a full barrier on success. */
> static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
> {
>        int prev, rc;
>
>    __asm__ __volatile__ (
>-           "0:\n\t"
>-           "lr.w.aqrl  %[p],  %[c]\n\t"
>-           "beq        %[p],  %[u], 1f\n\t"
>-           "add       %[rc],  %[p], %[a]\n\t"
>-           "sc.w.aqrl %[rc], %[rc], %[c]\n\t"
>-           "bnez      %[rc], 0b\n\t"
>-           "1:"
>+           "0:        lr.w     %[p],  %[c]\n"
>+           "  beq      %[p],  %[u], 1f\n"
>+           "  add      %[rc], %[p], %[a]\n"
>+           "  sc.w.rl  %[rc], %[rc], %[c]\n"
>+           "  bnez     %[rc], 0b\n"
>+           "  fence    rw, rw\n"
>+           "1:\n"
>            : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>            : [a]"r" (a), [u]"r" (u)
>            : "memory");
>@@ -263,13 +356,13 @@ static __always_inline long 
__atomic64_add_unless(atomic64_t *v, long a, long u)
>        long prev, rc;
>
>    __asm__ __volatile__ (
>-           "0:\n\t"
>-           "lr.d.aqrl  %[p],  %[c]\n\t"
>-           "beq        %[p],  %[u], 1f\n\t"
>-           "add       %[rc],  %[p], %[a]\n\t"
>-           "sc.d.aqrl %[rc], %[rc], %[c]\n\t"
>-           "bnez      %[rc], 0b\n\t"
>-           "1:"
>+           "0:        lr.d     %[p],  %[c]\n"
>+           "  beq      %[p],  %[u], 1f\n"
>+           "  add      %[rc], %[p], %[a]\n"
>+           "  sc.d.rl  %[rc], %[rc], %[c]\n"
>+           "  bnez     %[rc], 0b\n"
>+           "  fence    rw, rw\n"
>+           "1:\n"
>            : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>            : [a]"r" (a), [u]"r" (u)
>            : "memory");
>@@ -300,37 +393,63 @@ static __always_inline long 
atomic64_inc_not_zero(atomic64_t *v)
>
> /*
>  * atomic_{cmp,}xchg is required to have exactly the same ordering semantics 
as
>- * {cmp,}xchg and the operations that return, so they need a barrier.
>- */
>-/*
>- * FIXME: atomic_cmpxchg_{acquire,release,relaxed} are all implemented by
>- * assigning the same barrier to both the LR and SC operations, but that might
>- * not make any sense.  We're waiting on a memory model specification to
>- * determine exactly what the right thing to do is here.
>+ * {cmp,}xchg and the operations that return, so they need a full barrier.
>  */
>-#define ATOMIC_OP(c_t, prefix, c_or, size, asm_or)                            
             \
>-static __always_inline c_t atomic##prefix##_cmpxchg##c_or(atomic##prefix##_t 
*v, c_t o, c_t n)     \
>-{                                                                             
             \
>-   return __cmpxchg(&(v->counter), o, n, size, asm_or, asm_or);               
              \
>-}                                                                             
             \
>-static __always_inline c_t atomic##prefix##_xchg##c_or(atomic##prefix##_t *v, 
c_t n)               \
>-{                                                                             
             \
>-   return __xchg(n, &(v->counter), size, asm_or);                             
              \
>+#define ATOMIC_OP(c_t, prefix, size)                                       \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n)            \
>+{                                                                  \
>+   return __xchg_relaxed(&(v->counter), n, size);                   \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n)            \
>+{                                                                  \
>+   return __xchg_acquire(&(v->counter), n, size);                   \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n)            \
>+{                                                                  \
>+   return __xchg_release(&(v->counter), n, size);                   \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)                    \
>+{                                                                  \
>+   return __xchg(&(v->counter), n, size);                           \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v,                \
>+                                c_t o, c_t n)                      \
>+{                                                                  \
>+   return __cmpxchg_relaxed(&(v->counter), o, n, size);             \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v,                \
>+                                c_t o, c_t n)                      \
>+{                                                                  \
>+   return __cmpxchg_acquire(&(v->counter), o, n, size);             \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v,                \
>+                                c_t o, c_t n)                      \
>+{                                                                  \
>+   return __cmpxchg_release(&(v->counter), o, n, size);             \
>+}                                                                  \
>+static __always_inline                                                     \
>+c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n)  \
>+{                                                                  \
>+   return __cmpxchg(&(v->counter), o, n, size);                     \
> }
>
> #ifdef CONFIG_GENERIC_ATOMIC64
>-#define ATOMIC_OPS(c_or, asm_or)                   \
>-   ATOMIC_OP( int,   , c_or, 4, asm_or)
>+#define ATOMIC_OPS()                                                       \
>+   ATOMIC_OP( int,   , 4)
> #else
>-#define ATOMIC_OPS(c_or, asm_or)                   \
>-   ATOMIC_OP( int,   , c_or, 4, asm_or)            \
>-   ATOMIC_OP(long, 64, c_or, 8, asm_or)
>+#define ATOMIC_OPS()                                                       \
>+   ATOMIC_OP( int,   , 4)                                          \
>+   ATOMIC_OP(long, 64, 8)
> #endif
>
>-ATOMIC_OPS(        , .aqrl)
>-ATOMIC_OPS(_acquire,   .aq)
>-ATOMIC_OPS(_release,   .rl)
>-ATOMIC_OPS(_relaxed,      )
>+ATOMIC_OPS()
>
> #undef ATOMIC_OPS
> #undef ATOMIC_OP
>@@ -340,13 +459,13 @@ static __always_inline int 
atomic_sub_if_positive(atomic_t *v, int offset)
>        int prev, rc;
>
>    __asm__ __volatile__ (
>-           "0:\n\t"
>-           "lr.w.aqrl  %[p],  %[c]\n\t"
>-           "sub       %[rc],  %[p], %[o]\n\t"
>-           "bltz      %[rc],    1f\n\t"
>-           "sc.w.aqrl %[rc], %[rc], %[c]\n\t"
>-           "bnez      %[rc],    0b\n\t"
>-           "1:"
>+           "0:        lr.w     %[p],  %[c]\n"
>+           "  sub      %[rc], %[p], %[o]\n"
>+           "  bltz     %[rc], 1f\n"
>+           "  sc.w.rl  %[rc], %[rc], %[c]\n"
>+           "  bnez     %[rc], 0b\n"
>+           "  fence    rw, rw\n"
>+           "1:\n"
>            : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>            : [o]"r" (offset)
>            : "memory");
>@@ -361,13 +480,13 @@ static __always_inline long 
atomic64_sub_if_positive(atomic64_t *v, int offset)
>        long prev, rc;
>
>    __asm__ __volatile__ (
>-           "0:\n\t"
>-           "lr.d.aqrl  %[p],  %[c]\n\t"
>-           "sub       %[rc],  %[p], %[o]\n\t"
>-           "bltz      %[rc],    1f\n\t"
>-           "sc.d.aqrl %[rc], %[rc], %[c]\n\t"
>-           "bnez      %[rc],    0b\n\t"
>-           "1:"
>+           "0:        lr.d     %[p],  %[c]\n"
>+           "  sub      %[rc], %[p], %[o]\n"
>+           "  bltz     %[rc], 1f\n"
>+           "  sc.d.rl  %[rc], %[rc], %[c]\n"
>+           "  bnez     %[rc], 0b\n"
>+           "  fence    rw, rw\n"
>+           "1:\n"
>            : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>            : [o]"r" (offset)
>            : "memory");
>diff --git a/arch/riscv/include/asm/cmpxchg.h 
b/arch/riscv/include/asm/cmpxchg.h
>index db249dbc7b976..c12833f7b6bd1 100644
>--- a/arch/riscv/include/asm/cmpxchg.h
>+++ b/arch/riscv/include/asm/cmpxchg.h
>@@ -17,45 +17,153 @@
> #include <linux/bug.h>
>
> #include <asm/barrier.h>
>+#include <asm/fence.h>
>
>-#define __xchg(new, ptr, size, asm_or)                             \
>-({                                                         \
>-   __typeof__(ptr) __ptr = (ptr);                          \
>-   __typeof__(new) __new = (new);                          \
>-   __typeof__(*(ptr)) __ret;                               \
>-   switch (size) {                                         \
>-   case 4:                                                 \
>-           __asm__ __volatile__ (                          \
>-                   "amoswap.w" #asm_or " %0, %2, %1"   \
>-                   : "=r" (__ret), "+A" (*__ptr)               \
>-                   : "r" (__new)                         \
>-                   : "memory");                          \
>-           break;                                          \
>-   case 8:                                                 \
>-           __asm__ __volatile__ (                          \
>-                   "amoswap.d" #asm_or " %0, %2, %1"   \
>-                   : "=r" (__ret), "+A" (*__ptr)               \
>-                   : "r" (__new)                         \
>-                   : "memory");                          \
>-           break;                                          \
>-   default:                                                \
>-           BUILD_BUG();                                    \
>-   }                                                       \
>-   __ret;                                                  \
>-})
>-
>-#define xchg(ptr, x)    (__xchg((x), (ptr), sizeof(*(ptr)), .aqrl))
>-
>-#define xchg32(ptr, x)                             \
>-({                                         \
>-   BUILD_BUG_ON(sizeof(*(ptr)) != 4);      \
>-   xchg((ptr), (x));                       \
>-})
>-
>-#define xchg64(ptr, x)                             \
>-({                                         \
>-   BUILD_BUG_ON(sizeof(*(ptr)) != 8);      \
>-   xchg((ptr), (x));                       \
>+#define __xchg_relaxed(ptr, new, size)                                     \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(new) __new = (new);                                  \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "  amoswap.w %0, %2, %1\n"                    \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "  amoswap.d %0, %2, %1\n"                    \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>+
>+#define xchg_relaxed(ptr, x)                                               \
>+({                                                                 \
>+   __typeof__(*(ptr)) _x_ = (x);                                   \
>+   (__typeof__(*(ptr))) __xchg_relaxed((ptr),                      \
>+                                       _x_, sizeof(*(ptr)));       \
>+})
>+
>+#define __xchg_acquire(ptr, new, size)                                     \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(new) __new = (new);                                  \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "  amoswap.w %0, %2, %1\n"                    \
>+                   RISCV_ACQUIRE_BARRIER                           \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "  amoswap.d %0, %2, %1\n"                    \
>+                   RISCV_ACQUIRE_BARRIER                           \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>+
>+#define xchg_acquire(ptr, x)                                               \
>+({                                                                 \
>+   __typeof__(*(ptr)) _x_ = (x);                                   \
>+   (__typeof__(*(ptr))) __xchg_acquire((ptr),                      \
>+                                       _x_, sizeof(*(ptr)));       \
>+})
>+
>+#define __xchg_release(ptr, new, size)                                     \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(new) __new = (new);                                  \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   RISCV_RELEASE_BARRIER                           \
>+                   "  amoswap.w %0, %2, %1\n"                    \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   RISCV_RELEASE_BARRIER                           \
>+                   "  amoswap.d %0, %2, %1\n"                    \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>+
>+#define xchg_release(ptr, x)                                               \
>+({                                                                 \
>+   __typeof__(*(ptr)) _x_ = (x);                                   \
>+   (__typeof__(*(ptr))) __xchg_release((ptr),                      \
>+                                       _x_, sizeof(*(ptr)));       \
>+})
>+
>+#define __xchg(ptr, new, size)                                             \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(new) __new = (new);                                  \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "  amoswap.w.aqrl %0, %2, %1\n"               \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "  amoswap.d.aqrl %0, %2, %1\n"               \
>+                   : "=r" (__ret), "+A" (*__ptr)                       \
>+                   : "r" (__new)                                 \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>+
>+#define xchg(ptr, x)                                                       \
>+({                                                                 \
>+   __typeof__(*(ptr)) _x_ = (x);                                   \
>+   (__typeof__(*(ptr))) __xchg((ptr), _x_, sizeof(*(ptr)));        \
>+})
>+
>+#define xchg32(ptr, x)                                                     \
>+({                                                                 \
>+   BUILD_BUG_ON(sizeof(*(ptr)) != 4);                              \
>+   xchg((ptr), (x));                                               \
>+})
>+
>+#define xchg64(ptr, x)                                                     \
>+({                                                                 \
>+   BUILD_BUG_ON(sizeof(*(ptr)) != 8);                              \
>+   xchg((ptr), (x));                                               \
> })
>
> /*
>@@ -63,7 +171,51 @@
>  * store NEW in MEM.  Return the initial value in MEM.  Success is
>  * indicated by comparing RETURN with OLD.
>  */
>-#define __cmpxchg(ptr, old, new, size, lrb, scb)                   \
>+#define __cmpxchg_relaxed(ptr, old, new, size)                             \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(*(ptr)) __old = (old);                               \
>+   __typeof__(*(ptr)) __new = (new);                               \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   register unsigned int __rc;                                     \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "0:        lr.w %0, %2\n"                             \
>+                   "  bne  %0, %z3, 1f\n"                        \
>+                   "  sc.w %1, %z4, %2\n"                        \
>+                   "  bnez %1, 0b\n"                             \
>+                   "1:\n"                                                \
>+                   : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>+                   : "rJ" (__old), "rJ" (__new)                        \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "0:        lr.d %0, %2\n"                             \
>+                   "  bne %0, %z3, 1f\n"                 \
>+                   "  sc.d %1, %z4, %2\n"                        \
>+                   "  bnez %1, 0b\n"                             \
>+                   "1:\n"                                                \
>+                   : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>+                   : "rJ" (__old), "rJ" (__new)                        \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>+
>+#define cmpxchg_relaxed(ptr, o, n)                                 \
>+({                                                                 \
>+   __typeof__(*(ptr)) _o_ = (o);                                   \
>+   __typeof__(*(ptr)) _n_ = (n);                                   \
>+   (__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),                   \
>+                                   _o_, _n_, sizeof(*(ptr)));      \
>+})
>+
>+#define __cmpxchg_acquire(ptr, old, new, size)                             \
> ({                                                                 \
>    __typeof__(ptr) __ptr = (ptr);                                  \
>    __typeof__(*(ptr)) __old = (old);                               \
>@@ -73,24 +225,24 @@
>    switch (size) {                                                 \
>    case 4:                                                         \
>            __asm__ __volatile__ (                                  \
>-           "0:"                                                  \
>-                   "lr.w" #scb " %0, %2\n"                             \
>-                   "bne         %0, %z3, 1f\n"                   \
>-                   "sc.w" #lrb " %1, %z4, %2\n"                        \
>-                   "bnez        %1, 0b\n"                                \
>-           "1:"                                                  \
>+                   "0:        lr.w %0, %2\n"                             \
>+                   "  bne  %0, %z3, 1f\n"                        \
>+                   "  sc.w %1, %z4, %2\n"                        \
>+                   "  bnez %1, 0b\n"                             \
>+                   RISCV_ACQUIRE_BARRIER                           \
>+                   "1:\n"                                                \
>                    : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>                    : "rJ" (__old), "rJ" (__new)                        \
>                    : "memory");                                  \
>            break;                                                  \
>    case 8:                                                         \
>            __asm__ __volatile__ (                                  \
>-           "0:"                                                  \
>-                   "lr.d" #scb " %0, %2\n"                             \
>-                   "bne         %0, %z3, 1f\n"                   \
>-                   "sc.d" #lrb " %1, %z4, %2\n"                        \
>-                   "bnez        %1, 0b\n"                                \
>-           "1:"                                                  \
>+                   "0:        lr.d %0, %2\n"                             \
>+                   "  bne %0, %z3, 1f\n"                 \
>+                   "  sc.d %1, %z4, %2\n"                        \
>+                   "  bnez %1, 0b\n"                             \
>+                   RISCV_ACQUIRE_BARRIER                           \
>+                   "1:\n"                                                \
>                    : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>                    : "rJ" (__old), "rJ" (__new)                        \
>                    : "memory");                                  \
>@@ -101,34 +253,131 @@
>    __ret;                                                          \
> })
>
>-#define cmpxchg(ptr, o, n) \
>-   (__cmpxchg((ptr), (o), (n), sizeof(*(ptr)), .aqrl, .aqrl))
>+#define cmpxchg_acquire(ptr, o, n)                                 \
>+({                                                                 \
>+   __typeof__(*(ptr)) _o_ = (o);                                   \
>+   __typeof__(*(ptr)) _n_ = (n);                                   \
>+   (__typeof__(*(ptr))) __cmpxchg_acquire((ptr),                   \
>+                                   _o_, _n_, sizeof(*(ptr)));      \
>+})
>
>-#define cmpxchg_local(ptr, o, n) \
>-   (__cmpxchg((ptr), (o), (n), sizeof(*(ptr)), , ))
>+#define __cmpxchg_release(ptr, old, new, size)                             \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(*(ptr)) __old = (old);                               \
>+   __typeof__(*(ptr)) __new = (new);                               \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   register unsigned int __rc;                                     \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   RISCV_RELEASE_BARRIER                           \
>+                   "0:        lr.w %0, %2\n"                             \
>+                   "  bne  %0, %z3, 1f\n"                        \
>+                   "  sc.w %1, %z4, %2\n"                        \
>+                   "  bnez %1, 0b\n"                             \
>+                   "1:\n"                                                \
>+                   : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>+                   : "rJ" (__old), "rJ" (__new)                        \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   RISCV_RELEASE_BARRIER                           \
>+                   "0:        lr.d %0, %2\n"                             \
>+                   "  bne %0, %z3, 1f\n"                 \
>+                   "  sc.d %1, %z4, %2\n"                        \
>+                   "  bnez %1, 0b\n"                             \
>+                   "1:\n"                                                \
>+                   : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>+                   : "rJ" (__old), "rJ" (__new)                        \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>+
>+#define cmpxchg_release(ptr, o, n)                                 \
>+({                                                                 \
>+   __typeof__(*(ptr)) _o_ = (o);                                   \
>+   __typeof__(*(ptr)) _n_ = (n);                                   \
>+   (__typeof__(*(ptr))) __cmpxchg_release((ptr),                   \
>+                                   _o_, _n_, sizeof(*(ptr)));      \
>+})
>+
>+#define __cmpxchg(ptr, old, new, size)                                     \
>+({                                                                 \
>+   __typeof__(ptr) __ptr = (ptr);                                  \
>+   __typeof__(*(ptr)) __old = (old);                               \
>+   __typeof__(*(ptr)) __new = (new);                               \
>+   __typeof__(*(ptr)) __ret;                                       \
>+   register unsigned int __rc;                                     \
>+   switch (size) {                                                 \
>+   case 4:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "0:        lr.w %0, %2\n"                             \
>+                   "  bne  %0, %z3, 1f\n"                        \
>+                   "  sc.w.rl %1, %z4, %2\n"                     \
>+                   "  bnez %1, 0b\n"                             \
>+                   "  fence rw, rw\n"                            \
>+                   "1:\n"                                                \
>+                   : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>+                   : "rJ" (__old), "rJ" (__new)                        \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   case 8:                                                         \
>+           __asm__ __volatile__ (                                  \
>+                   "0:        lr.d %0, %2\n"                             \
>+                   "  bne %0, %z3, 1f\n"                 \
>+                   "  sc.d.rl %1, %z4, %2\n"                     \
>+                   "  bnez %1, 0b\n"                             \
>+                   "  fence rw, rw\n"                            \
>+                   "1:\n"                                                \
>+                   : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)      \
>+                   : "rJ" (__old), "rJ" (__new)                        \
>+                   : "memory");                                  \
>+           break;                                                  \
>+   default:                                                        \
>+           BUILD_BUG();                                            \
>+   }                                                               \
>+   __ret;                                                          \
>+})
>
>-#define cmpxchg32(ptr, o, n)                       \
>-({                                         \
>-   BUILD_BUG_ON(sizeof(*(ptr)) != 4);      \
>-   cmpxchg((ptr), (o), (n));               \
>+#define cmpxchg(ptr, o, n)                                         \
>+({                                                                 \
>+   __typeof__(*(ptr)) _o_ = (o);                                   \
>+   __typeof__(*(ptr)) _n_ = (n);                                   \
>+   (__typeof__(*(ptr))) __cmpxchg((ptr),                           \
>+                                  _o_, _n_, sizeof(*(ptr)));       \
> })
>
>-#define cmpxchg32_local(ptr, o, n)         \
>-({                                         \
>-   BUILD_BUG_ON(sizeof(*(ptr)) != 4);      \
>-   cmpxchg_local((ptr), (o), (n));         \
>+#define cmpxchg_local(ptr, o, n)                                   \
>+   (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
>+
>+#define cmpxchg32(ptr, o, n)                                               \
>+({                                                                 \
>+   BUILD_BUG_ON(sizeof(*(ptr)) != 4);                              \
>+   cmpxchg((ptr), (o), (n));                                       \
> })
>
>-#define cmpxchg64(ptr, o, n)                       \
>-({                                         \
>-   BUILD_BUG_ON(sizeof(*(ptr)) != 8);      \
>-   cmpxchg((ptr), (o), (n));               \
>+#define cmpxchg32_local(ptr, o, n)                                 \
>+({                                                                 \
>+   BUILD_BUG_ON(sizeof(*(ptr)) != 4);                              \
>+   cmpxchg_relaxed((ptr), (o), (n))                                \
> })
>
>-#define cmpxchg64_local(ptr, o, n)         \
>-({                                         \
>-   BUILD_BUG_ON(sizeof(*(ptr)) != 8);      \
>-   cmpxchg_local((ptr), (o), (n));         \
>+#define cmpxchg64(ptr, o, n)                                               \
>+({                                                                 \
>+   BUILD_BUG_ON(sizeof(*(ptr)) != 8);                              \
>+   cmpxchg((ptr), (o), (n));                                       \
>+})
>+
>+#define cmpxchg64_local(ptr, o, n)                                 \
>+({                                                                 \
>+   BUILD_BUG_ON(sizeof(*(ptr)) != 8);                              \
>+   cmpxchg_relaxed((ptr), (o), (n));                               \
> })
>
> #endif /* _ASM_RISCV_CMPXCHG_H */

Re: [PATCH v2 2/2] riscv/atomic: Strengthen implementations with fences

Reply via email to