On Thu, 2024-03-21 at 14:03 +0100, Jan Beulich wrote:
> On 15.03.2024 19:06, Oleksii Kurochko wrote:
> > Initially the patch was introduced by Bobby, who took the header from
> > the Linux kernel.
> > 
> > The following changes were done on top of Linux kernel header:
> >  - atomic##prefix##_*xchg_*(atomic##prefix##_t *v, c_t n) were updated
> >    to use __*xchg_generic()
> >  - drop casts in write_atomic() as they are unnecessary
> >  - drop introduction of WRITE_ONCE() and READ_ONCE().
> >    Xen provides ACCESS_ONCE()
> 
> Here and in the code comment: While this may be describing what you did
> on top of what Bobby had, here you're describing differences to the
> Linux header.
> 
> >  - remove zero-length array access in read_atomic()
> >  - drop defines similar to pattern
> 
> pattern? Which one? Oh, wait, ...
> 
> >  - #define atomic_add_return_relaxed   atomic_add_return_relaxed
> 
> ... this line really isn't a separate bullet point.
Yes, '-' is not needed in this text.

> 
> > + */
> > +static always_inline void read_atomic_size(const volatile void *p,
> > +                                           void *res,
> > +                                           unsigned int size)
> > +{
> > +    switch ( size )
> > +    {
> > +    case 1: *(uint8_t *)res = readb(p); break;
> > +    case 2: *(uint16_t *)res = readw(p); break;
> > +    case 4: *(uint32_t *)res = readl(p); break;
> > +    case 8: *(uint32_t *)res  = readq(p); break;
> 
> Nit: Excess blank before =.
> 
> Also - no #ifdef here to be RV32-ready?
Because there is an #ifdef for RV32 around readq() in io.h.
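
Roughly along these lines (a sketch from memory; the exact guard and
helper names in io.h may differ):

    #ifdef CONFIG_RISCV_64
    /* 8-byte read accessor; only provided for RV64 builds. */
    static inline uint64_t readq(const volatile void *addr)
    {
        return __raw_readq(addr);
    }
    #endif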

> 
> > +    default: __bad_atomic_size(); break;
> > +    }
> > +}
> > +
> > +#define read_atomic(p) ({                                   \
> > +    union { typeof(*(p)) val; char c[sizeof(*(p))]; } x_;   \
> 
> One trailing underscore here, but ...
> 
> > +    read_atomic_size(p, x_.c, sizeof(*(p)));                \
> > +    x_.val;                                                 \
> > +})
> > +
> > +#define write_atomic(p, x)                              \
> > +({                                                      \
> > +    typeof(*(p)) x__ = (x);                             \
> 
> ... two here and ...
> 
> > +    switch ( sizeof(*(p)) )                             \
> > +    {                                                   \
> > +    case 1: writeb(x__, p); break;                      \
> > +    case 2: writew(x__, p); break;                      \
> > +    case 4: writel(x__, p); break;                      \
> > +    case 8: writeq(x__, p); break;                      \
> > +    default: __bad_atomic_size(); break;                \
> > +    }                                                   \
> > +    x__;                                                \
> > +})
> > +
> > +#define add_sized(p, x)                                 \
> > +({                                                      \
> > +    typeof(*(p)) x__ = (x);                             \
> 
> ... here?
I'll update these to use the same name.
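
For example, making read_atomic() use the same two-underscore name as the
other two (just a sketch of one possible direction; the final naming may
differ):

    #define read_atomic(p) ({                                   \
        union { typeof(*(p)) val; char c[sizeof(*(p))]; } x__;  \
        read_atomic_size(p, x__.c, sizeof(*(p)));               \
        x__.val;                                                \
    })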

> 
> > +    switch ( sizeof(*(p)) )                             \
> > +    {                                                   \
> > +    case 1: writeb(read_atomic(p) + x__, p); break;     \
> > +    case 2: writew(read_atomic(p) + x__, p); break;     \
> > +    case 4: writel(read_atomic(p) + x__, p); break;     \
> > +    case 8: writeq(read_atomic(p) + x__, p); break;     \
> > +    default: __bad_atomic_size(); break;                \
> > +    }                                                   \
> > +})
> > +
> > +#define __atomic_acquire_fence() \
> > +    asm volatile ( RISCV_ACQUIRE_BARRIER "" ::: "memory" )
> > +
> > +#define __atomic_release_fence() \
> > +    asm volatile ( RISCV_RELEASE_BARRIER "" ::: "memory" )
> > +
> > +/*
> > + * First, the atomic ops that have no ordering constraints and therefore
> > + * don't have the AQ or RL bits set.  These don't return anything, so
> > + * there's only one version to worry about.
> > + */
> > +#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)  \
> > +static inline                                               \
> > +void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v) \
> > +{                                                           \
> > +    asm volatile (                                          \
> > +        "   amo" #asm_op "." #asm_type " zero, %1, %0"      \
> > +        : "+A" (v->counter)                                 \
> > +        : "r" (I)                                           \
> 
> Btw, I consider this pretty confusing. At the 1st and 2nd glance this
> looks like a mistake, i.e. as if i was meant. Imo ...
> 
> > +        : "memory" );                                       \
> > +}                                                           \
> > +
> > +/*
> > + * Only CONFIG_GENERIC_ATOMIC64=y was ported to Xen; that is the reason
> > + * why the last argument for ATOMIC_OP isn't used.
> > + */
> > +#define ATOMIC_OPS(op, asm_op, I)                           \
> > +        ATOMIC_OP (op, asm_op, I, w, int,   )
> > +
> > +ATOMIC_OPS(add, add,  i)
> > +ATOMIC_OPS(sub, add, -i)
> > +ATOMIC_OPS(and, and,  i)
> > +ATOMIC_OPS( or,  or,  i)
> > +ATOMIC_OPS(xor, xor,  i)
> 
> ... here you want to only pass the (unary) operator (and leaving that
> blank is as fine as using +).
I agree that the interplay of 'i' and 'I' looks confusing, but I don't
really understand what is wrong with using 'i' here. The preprocessed
macros look fine:

    static inline void atomic_add(int i, atomic_t *v)
    {
        asm volatile ( "   amo" "add" "." "w" " zero, %1, %0"
                       : "+A" (v->counter) : "r" (i) : "memory" );
    }

    static inline void atomic_sub(int i, atomic_t *v)
    {
        asm volatile ( "   amo" "add" "." "w" " zero, %1, %0"
                       : "+A" (v->counter) : "r" (-i) : "memory" );
    }

> 
> > +#undef ATOMIC_OP
> > +#undef ATOMIC_OPS
> > +
> > +#include <asm-generic/atomic-ops.h>
> > +
> > +/*
> > + * Atomic ops that have ordered, relaxed, acquire, and release variants.
> 
> Only the first is implemented afaict; imo the comment would better
> reflect that one way or another.
> 
> > + * There are two flavors of these: the arithmetic ops have both fetch
> > + * and return versions, while the logical ops only have fetch versions.
> > + */
> > +#define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix)    \
> > +static inline                                                       \
> > +c_type atomic##prefix##_fetch_##op##_relaxed(c_type i,              \
> > +                         atomic##prefix##_t *v)                     \
> > +{                                                                   \
> > +    register c_type ret;                                            \
> > +    asm volatile (                                                  \
> > +        "   amo" #asm_op "." #asm_type " %1, %2, %0"                \
> > +        : "+A" (v->counter), "=r" (ret)                             \
> > +        : "r" (I)                                                   \
> > +        : "memory" );                                               \
> > +    return ret;                                                     \
> > +}                                                                   \
> 
> Actually a relaxed form is provided here, but does that have any user?
There is no user of the relaxed form; I just overlooked that.

> 
> > +static inline                                                       \
> > +c_type atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \
> > +{                                                                   \
> > +    register c_type ret;                                            \
> > +    asm volatile (                                                  \
> > +        "   amo" #asm_op "." #asm_type ".aqrl  %1, %2, %0"          \
> > +        : "+A" (v->counter), "=r" (ret)                             \
> > +        : "r" (I)                                                   \
> > +        : "memory" );                                               \
> > +    return ret;                                                     \
> > +}
> > +
> > +#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \
> > +static inline                                                           \
> > +c_type atomic##prefix##_##op##_return_relaxed(c_type i,                 \
> > +                          atomic##prefix##_t *v)                        \
> > +{                                                                       \
> > +        return atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;      \
> > +}                                                                       \
> > +static inline                                                           \
> > +c_type atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)  \
> > +{                                                                       \
> > +        return atomic##prefix##_fetch_##op(i, v) c_op I;                \
> 
> I (or whatever the replacement expression is going to be following the
> earlier comment) wants parenthesizing here.
> 
> > +}
> > +
> > +/*
> > + * Only CONFIG_GENERIC_ATOMIC64=y was ported to Xen; that is the reason
> > + * why the last argument of ATOMIC_FETCH_OP, ATOMIC_OP_RETURN isn't used.
> > + */
> > +#define ATOMIC_OPS(op, asm_op, c_op, I)                                 \
> > +        ATOMIC_FETCH_OP( op, asm_op,       I, w, int,   )               \
> > +        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int,   )
> > +
> > +ATOMIC_OPS(add, add, +,  i)
> > +ATOMIC_OPS(sub, add, +, -i)
> > +
> > +#undef ATOMIC_OPS
> > +
> > +#define ATOMIC_OPS(op, asm_op, I) \
> > +        ATOMIC_FETCH_OP(op, asm_op, I, w, int,   )
> > +
> > +ATOMIC_OPS(and, and, i)
> > +ATOMIC_OPS( or,  or, i)
> > +ATOMIC_OPS(xor, xor, i)
> > +
> > +#undef ATOMIC_OPS
> > +
> > +#undef ATOMIC_FETCH_OP
> > +#undef ATOMIC_OP_RETURN
> > +
> > +/* This is required to provide a full barrier on success. */
> > +static inline int atomic_add_unless(atomic_t *v, int a, int u)
> > +{
> > +       int prev, rc;
> > +
> > +    asm volatile (
> > +        "0: lr.w     %[p],  %[c]\n"
> > +        "   beq      %[p],  %[u], 1f\n"
> > +        "   add      %[rc], %[p], %[a]\n"
> > +        "   sc.w.rl  %[rc], %[rc], %[c]\n"
> > +        "   bnez     %[rc], 0b\n"
> > +        RISCV_FULL_BARRIER
> 
> With this and no .aq on the load, why the .rl on the store?
It is something that LKMM requires [1].

It is not fully clear to me what is so specific about LKMM, but according
to the spec:
   Ordering Annotation Fence-based Equivalent
   l{b|h|w|d|r}.aq     l{b|h|w|d|r}; fence r,rw
   l{b|h|w|d|r}.aqrl   fence rw,rw; l{b|h|w|d|r}; fence r,rw
   s{b|h|w|d|c}.rl     fence rw,w; s{b|h|w|d|c}
   s{b|h|w|d|c}.aqrl   fence rw,w; s{b|h|w|d|c}
   amo<op>.aq          amo<op>; fence r,rw
   amo<op>.rl          fence rw,w; amo<op>
   amo<op>.aqrl        fence rw,rw; amo<op>; fence rw,rw
   Table 2.2: Mappings from .aq and/or .rl to fence-based equivalents.
   An alternative mapping places a fence rw,rw after the existing
   s{b|h|w|d|c} mapping rather than at the front of the l{b|h|w|d|r}
   mapping.

   It is also safe to translate any .aq, .rl, or .aqrl annotation into
   the fence-based snippets of Table 2.2. These can also be used as a
   legal implementation of l{b|h|w|d} or s{b|h|w|d} pseudoinstructions
   for as long as those instructions are not added to the ISA.

So according to the spec, it should be:
 sc.w ...
 RISCV_FULL_BARRIER.
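
I.e. the loop would become something like this (untested sketch, keeping
the operand names from the patch):

    asm volatile (
        "0: lr.w     %[p],  %[c]\n"
        "   beq      %[p],  %[u], 1f\n"
        "   add      %[rc], %[p], %[a]\n"
        "   sc.w     %[rc], %[rc], %[c]\n"   /* plain sc.w, no .rl */
        "   bnez     %[rc], 0b\n"
        RISCV_FULL_BARRIER                   /* fence rw, rw on success */
        "1:\n"
        : [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (v->counter)
        : [a] "r" (a), [u] "r" (u)
        : "memory" );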

Considering [1] and how this code looked before, it seems to me that it is
safe to use lr.w.aq/sc.w.rl here, or a fence-based equivalent.

But in general these (a combination of fence, .aq and .rl) can be
considered equivalent in this context, so it is possible to leave this
function as is, keeping it in sync with the Linux kernel.

[1]https://lore.kernel.org/lkml/1520274276-21871-1-git-send-email-parri.and...@gmail.com/

~ Oleksii

> 
> > +        "1:\n"
> > +        : [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (v->counter)
> > +        : [a] "r" (a), [u] "r" (u)
> > +        : "memory");
> > +    return prev;
> > +}
> > +
> > +/*
> > + * atomic_{cmp,}xchg is required to have exactly the same ordering
> > + * semantics as {cmp,}xchg and the operations that return.
> > + */
> > +#define ATOMIC_OP(c_t, prefix, size)                            \
> > +static inline                                                   \
> > +c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)         \
> > +{                                                               \
> > +    return __xchg(&(v->counter), n, size);                      \
> 
> No need for the inner parentheses, just like ...
> 
> > +}                                                               \
> > +static inline                                                   \
> > +c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n) \
> > +{                                                               \
> > +    return __cmpxchg(&v->counter, o, n, size);                  \
> 
> ... you have it here.
> 
> > +}
> > +
> > +#define ATOMIC_OPS() \
> > +    ATOMIC_OP(int,   , 4)
> > +
> > +ATOMIC_OPS()
> > +
> > +#undef ATOMIC_OPS
> > +#undef ATOMIC_OP
> > +
> > +static inline int atomic_sub_if_positive(atomic_t *v, int offset)
> > +{
> > +       int prev, rc;
> > +
> > +    asm volatile (
> > +        "0: lr.w     %[p],  %[c]\n"
> > +        "   sub      %[rc], %[p], %[o]\n"
> > +        "   bltz     %[rc], 1f\n"
> > +        "   sc.w.rl  %[rc], %[rc], %[c]\n"
> > +        "   bnez     %[rc], 0b\n"
> > +        "   fence    rw, rw\n"
> > +        "1:\n"
> > +        : [p] "=&r" (prev), [rc] "=&r" (rc), [c] "+A" (v->counter)
> > +        : [o] "r" (offset)
> > +        : "memory" );
> > +    return prev - offset;
> > +}
> 
> This probably would be nicer if sitting next to atomic_add_unless().
> 
> > --- /dev/null
> > +++ b/xen/include/asm-generic/atomic-ops.h
> > @@ -0,0 +1,97 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * The header provides default implementations for every
> > + * xen/atomic.h-provided forward inline declaration that can be
> > + * synthesized from other atomic functions.
> 
> Or from scratch, as e.g. ...
> 
> > + */
> > +#ifndef _ASM_GENERIC_ATOMIC_OPS_H_
> > +#define _ASM_GENERIC_ATOMIC_OPS_H_
> > +
> > +#include <xen/atomic.h>
> > +#include <xen/lib.h>
> > +
> > +#ifndef ATOMIC_READ
> > +static inline int atomic_read(const atomic_t *v)
> > +{
> > +    return ACCESS_ONCE(v->counter);
> > +}
> > +#endif
> > +
> > +#ifndef _ATOMIC_READ
> > +static inline int _atomic_read(atomic_t v)
> > +{
> > +    return v.counter;
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_SET
> > +static inline void atomic_set(atomic_t *v, int i)
> > +{
> > +    ACCESS_ONCE(v->counter) = i;
> > +}
> > +#endif
> > +
> > +#ifndef _ATOMIC_SET
> > +static inline void _atomic_set(atomic_t *v, int i)
> > +{
> > +    v->counter = i;
> > +}
> > +#endif
> 
> ... all of these.
> 
> > +#ifndef ATOMIC_SUB_AND_TEST
> > +static inline int atomic_sub_and_test(int i, atomic_t *v)
> > +{
> > +    return atomic_sub_return(i, v) == 0;
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_INC
> > +static inline void atomic_inc(atomic_t *v)
> > +{
> > +    atomic_add(1, v);
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_INC_RETURN
> > +static inline int atomic_inc_return(atomic_t *v)
> > +{
> > +    return atomic_add_return(1, v);
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_DEC
> > +static inline void atomic_dec(atomic_t *v)
> > +{
> > +    atomic_sub(1, v);
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_DEC_RETURN
> > +static inline int atomic_dec_return(atomic_t *v)
> > +{
> > +    return atomic_sub_return(1, v);
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_DEC_AND_TEST
> > +static inline int atomic_dec_and_test(atomic_t *v)
> > +{
> > +    return atomic_sub_return(1, v) == 0;
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_ADD_NEGATIVE
> > +static inline int atomic_add_negative(int i, atomic_t *v)
> > +{
> > +    return atomic_add_return(i, v) < 0;
> > +}
> > +#endif
> > +
> > +#ifndef ATOMIC_INC_AND_TEST
> > +static inline int atomic_inc_and_test(atomic_t *v)
> > +{
> > +    return atomic_add_return(1, v) == 0;
> > +}
> > +#endif
> 
> Can this be moved up a little, perhaps next to the other inc-s (or else
> next to dec_and_test), please?
> 
> Jan
