arm64: add 128-bit atomic compare exchange

David Marchand Mon, 14 Oct 2019 08:44:04 -0700

On Wed, Aug 14, 2019 at 10:29 AM Phil Yang <[email protected]> wrote:
>
> Add 128-bit atomic compare exchange on aarch64.


A bit short, seeing the complexity of the code and the additional
RTE_ARM_FEATURE_ATOMICS config flag.


Comments inline.

>
> Suggested-by: Jerin Jacob <[email protected]>
> Signed-off-by: Phil Yang <[email protected]>
> Reviewed-by: Honnappa Nagarahalli <[email protected]>
> Tested-by: Honnappa Nagarahalli <[email protected]>
> Acked-by: Jerin Jacob <[email protected]>
> ---
>
> v9:
> Updated 19.11 release note.
>
> v8:
> Fixed "WARNING:LONG_LINE: line over 80 characters" warnings with latest kernel
> checkpatch.pl
>
> v7:
> 1. Adjust code comment.
>
> v6:
> 1. Put the RTE_ARM_FEATURE_ATOMICS flag into EAL group. (Jerin Jocob)
> 2. Keep rte_stack_lf_stubs.h doing nothing. (Gage Eads)
> 3. Fixed 32 bit build issue.
>
> v5:
> 1. Enable RTE_ARM_FEATURE_ATOMICS on octeontx2 in default. (Jerin Jocob)
> 2. Record the reason of introducing "rte_stack_lf_stubs.h" in git
> commit.
> (Jerin, Jocob)
> 3. Fixed a conditional MACRO error in rte_atomic128_cmp_exchange. (Jerin
> Jocob)
>
> v4:
> 1. Add RTE_ARM_FEATURE_ATOMICS flag to support LSE CASP instructions.
> (Jerin Jocob)
> 2. Fix possible arm64 ABI break by making casp_op_name noinline. (Jerin
> Jocob)
> 3. Add rte_stack_lf_stubs.h to reduce the ifdef clutter. (Gage
> Eads/Jerin Jocob)
>
> v3:
> 1. Avoid duplication code with macro. (Jerin Jocob)
> 2. Make invalid memory order to strongest barrier. (Jerin Jocob)
> 3. Update doc/guides/prog_guide/env_abstraction_layer.rst. (Gage Eads)
> 4. Fix 32-bit x86 builds issue. (Gage Eads)
> 5. Correct documentation issues in UT. (Gage Eads)
>
> v2:
> Initial version.
>
>  config/arm/meson.build                             |   2 +
>  config/common_base                                 |   3 +
>  config/defconfig_arm64-octeontx2-linuxapp-gcc      |   1 +
>  config/defconfig_arm64-thunderx2-linuxapp-gcc      |   1 +
>  .../common/include/arch/arm/rte_atomic_64.h        | 163 
> +++++++++++++++++++++
>  .../common/include/arch/x86/rte_atomic_64.h        |  12 --
>  lib/librte_eal/common/include/generic/rte_atomic.h |  17 ++-
>  7 files changed, 186 insertions(+), 13 deletions(-)
>
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 979018e..9f28271 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -71,11 +71,13 @@ flags_thunderx2_extra = [
>         ['RTE_CACHE_LINE_SIZE', 64],
>         ['RTE_MAX_NUMA_NODES', 2],
>         ['RTE_MAX_LCORE', 256],
> +       ['RTE_ARM_FEATURE_ATOMICS', true],
>         ['RTE_USE_C11_MEM_MODEL', true]]
>  flags_octeontx2_extra = [
>         ['RTE_MACHINE', '"octeontx2"'],
>         ['RTE_MAX_NUMA_NODES', 1],
>         ['RTE_MAX_LCORE', 24],
> +       ['RTE_ARM_FEATURE_ATOMICS', true],
>         ['RTE_EAL_IGB_UIO', false],
>         ['RTE_USE_C11_MEM_MODEL', true]]
>
> diff --git a/config/common_base b/config/common_base
> index 8ef75c2..2054480 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -82,6 +82,9 @@ CONFIG_RTE_MAX_LCORE=128
>  CONFIG_RTE_MAX_NUMA_NODES=8
>  CONFIG_RTE_MAX_HEAPS=32
>  CONFIG_RTE_MAX_MEMSEG_LISTS=64
> +
> +# Use ARM LSE ATOMIC instructions
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=n
>  # each memseg list will be limited to either RTE_MAX_MEMSEG_PER_LIST pages
>  # or RTE_MAX_MEM_MB_PER_LIST megabytes worth of memory, whichever is smaller
>  CONFIG_RTE_MAX_MEMSEG_PER_LIST=8192
> diff --git a/config/defconfig_arm64-octeontx2-linuxapp-gcc 
> b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> index f20da24..7687dbe 100644
> --- a/config/defconfig_arm64-octeontx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-octeontx2-linuxapp-gcc
> @@ -9,6 +9,7 @@ CONFIG_RTE_MACHINE="octeontx2"
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>  CONFIG_RTE_MAX_NUMA_NODES=1
>  CONFIG_RTE_MAX_LCORE=24
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
>
>  # Doesn't support NUMA
>  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> diff --git a/config/defconfig_arm64-thunderx2-linuxapp-gcc 
> b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> index cc5c64b..af4a89c 100644
> --- a/config/defconfig_arm64-thunderx2-linuxapp-gcc
> +++ b/config/defconfig_arm64-thunderx2-linuxapp-gcc
> @@ -9,3 +9,4 @@ CONFIG_RTE_MACHINE="thunderx2"
>  CONFIG_RTE_CACHE_LINE_SIZE=64
>  CONFIG_RTE_MAX_NUMA_NODES=2
>  CONFIG_RTE_MAX_LCORE=256
> +CONFIG_RTE_ARM_FEATURE_ATOMICS=y
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h 
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index 97060e4..14d869b 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2015 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
>
>  #ifndef _RTE_ATOMIC_ARM64_H_
> @@ -14,6 +15,9 @@ extern "C" {
>  #endif
>
>  #include "generic/rte_atomic.h"
> +#include <rte_branch_prediction.h>
> +#include <rte_compat.h>
> +#include <rte_debug.h>
>
>  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
>  #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
> @@ -40,6 +44,165 @@ extern "C" {
>
>  #define rte_cio_rmb() dmb(oshld)
>
> +/*------------------------ 128 bit atomic operations 
> -------------------------*/
> +
> +#define __HAS_ACQ(mo) ((mo) != __ATOMIC_RELAXED && (mo) != __ATOMIC_RELEASE)
> +#define __HAS_RLS(mo) ((mo) == __ATOMIC_RELEASE || (mo) == __ATOMIC_ACQ_REL 
> || \
> +                                         (mo) == __ATOMIC_SEQ_CST)
> +
> +#define __MO_LOAD(mo)  (__HAS_ACQ((mo)) ? __ATOMIC_ACQUIRE : 
> __ATOMIC_RELAXED)
> +#define __MO_STORE(mo) (__HAS_RLS((mo)) ? __ATOMIC_RELEASE : 
> __ATOMIC_RELAXED)

Those 4 first macros only make sense when LSE is not available (see below [1]).
Besides, they are used only once, why not directly use those
conditions where needed?


> +
> +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
> +#define __ATOMIC128_CAS_OP(cas_op_name, op_string)                          \
> +static __rte_noinline rte_int128_t                                          \
> +cas_op_name(rte_int128_t *dst, rte_int128_t old,                            \
> +               rte_int128_t updated)                                       \
> +{                                                                           \
> +       /* caspX instructions register pair must start from even-numbered
> +        * register at operand 1.
> +        * So, specify registers for local variables here.
> +        */                                                                 \
> +       register uint64_t x0 __asm("x0") = (uint64_t)old.val[0];            \
> +       register uint64_t x1 __asm("x1") = (uint64_t)old.val[1];            \
> +       register uint64_t x2 __asm("x2") = (uint64_t)updated.val[0];        \
> +       register uint64_t x3 __asm("x3") = (uint64_t)updated.val[1];        \
> +       asm volatile(                                                       \
> +               op_string " %[old0], %[old1], %[upd0], %[upd1], [%[dst]]"   \
> +               : [old0] "+r" (x0),                                         \
> +               [old1] "+r" (x1)                                            \
> +               : [upd0] "r" (x2),                                          \
> +               [upd1] "r" (x3),                                            \
> +               [dst] "r" (dst)                                             \
> +               : "memory");                                                \
> +       old.val[0] = x0;                                                    \
> +       old.val[1] = x1;                                                    \
> +       return old;                                                         \
> +}
> +
> +__ATOMIC128_CAS_OP(__rte_cas_relaxed, "casp")
> +__ATOMIC128_CAS_OP(__rte_cas_acquire, "caspa")
> +__ATOMIC128_CAS_OP(__rte_cas_release, "caspl")
> +__ATOMIC128_CAS_OP(__rte_cas_acq_rel, "caspal")

If LSE is available, we expose __rte_cas_XX (explicitely) *non*
inlined functions, while without LSE, we expose inlined __rte_ldr_XX
and __rte_stx_XX functions.
So we have a first disparity with non-inlined vs inlined functions
depending on a #ifdef.
Then, we have a second disparity with two sets of "apis" depending on
this #ifdef.

And we expose those sets with a rte_ prefix, meaning people will try
to use them, but those are not part of a public api.

Can't we do without them ? (see below [2] for a proposal with ldr/stx,
cas should be the same)


> +#else
> +#define __ATOMIC128_LDX_OP(ldx_op_name, op_string)                          \
> +static inline rte_int128_t                                                  \
> +ldx_op_name(const rte_int128_t *src)                                        \
> +{                                                                           \
> +       rte_int128_t ret;                                                   \
> +       asm volatile(                                                       \
> +                       op_string " %0, %1, %2"                             \
> +                       : "=&r" (ret.val[0]),                               \
> +                         "=&r" (ret.val[1])                                \
> +                       : "Q" (src->val[0])                                 \
> +                       : "memory");                                        \
> +       return ret;                                                         \
> +}
> +
> +__ATOMIC128_LDX_OP(__rte_ldx_relaxed, "ldxp")
> +__ATOMIC128_LDX_OP(__rte_ldx_acquire, "ldaxp")
> +
> +#define __ATOMIC128_STX_OP(stx_op_name, op_string)                          \
> +static inline uint32_t                                                      \
> +stx_op_name(rte_int128_t *dst, const rte_int128_t src)                      \
> +{                                                                           \
> +       uint32_t ret;                                                       \
> +       asm volatile(                                                       \
> +                       op_string " %w0, %1, %2, %3"                        \
> +                       : "=&r" (ret)                                       \
> +                       : "r" (src.val[0]),                                 \
> +                         "r" (src.val[1]),                                 \
> +                         "Q" (dst->val[0])                                 \
> +                       : "memory");                                        \
> +       /* Return 0 on success, 1 on failure */                             \
> +       return ret;                                                         \
> +}
> +
> +__ATOMIC128_STX_OP(__rte_stx_relaxed, "stxp")
> +__ATOMIC128_STX_OP(__rte_stx_release, "stlxp")
> +#endif
> +
> +static inline int __rte_experimental

The __rte_experimental tag comes first.


> +rte_atomic128_cmp_exchange(rte_int128_t *dst,
> +                               rte_int128_t *exp,
> +                               const rte_int128_t *src,
> +                               unsigned int weak,
> +                               int success,
> +                               int failure)
> +{
> +       /* Always do strong CAS */
> +       RTE_SET_USED(weak);
> +       /* Ignore memory ordering for failure, memory order for
> +        * success must be stronger or equal
> +        */
> +       RTE_SET_USED(failure);
> +       /* Find invalid memory order */
> +       RTE_ASSERT(success == __ATOMIC_RELAXED
> +                       || success == __ATOMIC_ACQUIRE
> +                       || success == __ATOMIC_RELEASE
> +                       || success == __ATOMIC_ACQ_REL
> +                       || success == __ATOMIC_SEQ_CST);
> +
> +#if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
> +       rte_int128_t expected = *exp;
> +       rte_int128_t desired = *src;
> +       rte_int128_t old;
> +
> +       if (success == __ATOMIC_RELAXED)
> +               old = __rte_cas_relaxed(dst, expected, desired);
> +       else if (success == __ATOMIC_ACQUIRE)
> +               old = __rte_cas_acquire(dst, expected, desired);
> +       else if (success == __ATOMIC_RELEASE)
> +               old = __rte_cas_release(dst, expected, desired);
> +       else
> +               old = __rte_cas_acq_rel(dst, expected, desired);
> +#else

1: the four first macros (on the memory ordering constraints) can be
moved here then undef'd once unused.
Or you can just do without them.


> +       int ldx_mo = __MO_LOAD(success);
> +       int stx_mo = __MO_STORE(success);
> +       uint32_t ret = 1;
> +       register rte_int128_t expected = *exp;
> +       register rte_int128_t desired = *src;
> +       register rte_int128_t old;
> +
> +       /* ldx128 can not guarantee atomic,
> +        * Must write back src or old to verify atomicity of ldx128;
> +        */
> +       do {
> +               if (ldx_mo == __ATOMIC_RELAXED)
> +                       old = __rte_ldx_relaxed(dst);
> +               else
> +                       old = __rte_ldx_acquire(dst);

2: how about using a simple macro that gets passed the op string?

Something like (untested):

#define __READ_128(op_string, src, dst) \
    asm volatile(                      \
        op_string " %0, %1, %2"    \
        : "=&r" (dst.val[0]),      \
          "=&r" (dst.val[1])       \
        : "Q" (src->val[0])        \
        : "memory")

Then used like this:

        if (ldx_mo == __ATOMIC_RELAXED)
            __READ_128("ldxp", dst, old);
        else
            __READ_128("ldaxp", dst, old);

#undef __READ_128

> +
> +               if (likely(old.int128 == expected.int128)) {
> +                       if (stx_mo == __ATOMIC_RELAXED)
> +                               ret = __rte_stx_relaxed(dst, desired);
> +                       else
> +                               ret = __rte_stx_release(dst, desired);
> +               } else {
> +                       /* In the failure case (since 'weak' is ignored and 
> only
> +                        * weak == 0 is implemented), expected should contain
> +                        * the atomically read value of dst. This means, 'old'
> +                        * needs to be stored back to ensure it was read
> +                        * atomically.
> +                        */
> +                       if (stx_mo == __ATOMIC_RELAXED)
> +                               ret = __rte_stx_relaxed(dst, old);
> +                       else
> +                               ret = __rte_stx_release(dst, old);

And:

#define __STORE_128(op_string, dst, val, ret) \
    asm volatile(                        \
        op_string " %w0, %1, %2, %3"     \
        : "=&r" (ret)                    \
        : "r" (val.val[0]),              \
          "r" (val.val[1]),              \
          "Q" (dst->val[0])              \
        : "memory")

Used like this:

        if (likely(old.int128 == expected.int128)) {
            if (stx_mo == __ATOMIC_RELAXED)
                __STORE_128("stxp", dst, desired, ret);
            else
                __STORE_128("stlxp", dst, desired, ret);
        } else {
            /* In the failure case (since 'weak' is ignored and only
             * weak == 0 is implemented), expected should contain
             * the atomically read value of dst. This means, 'old'
             * needs to be stored back to ensure it was read
             * atomically.
             */
            if (stx_mo == __ATOMIC_RELAXED)
                __STORE_128("stxp", dst, old, ret);
            else
                __STORE_128("stlxp", dst, old, ret);
        }

#undef __STORE_128


> +               }
> +       } while (unlikely(ret));
> +#endif
> +
> +       /* Unconditionally updating expected removes
> +        * an 'if' statement.
> +        * expected should already be in register if
> +        * not in the cache.
> +        */
> +       *exp = old;
> +
> +       return (old.int128 == expected.int128);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h 
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index 1335d92..cfe7067 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -183,18 +183,6 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>
>  /*------------------------ 128 bit atomic operations 
> -------------------------*/
>
> -/**
> - * 128-bit integer structure.
> - */
> -RTE_STD_C11
> -typedef struct {
> -       RTE_STD_C11
> -       union {
> -               uint64_t val[2];
> -               __extension__ __int128 int128;
> -       };
> -} __rte_aligned(16) rte_int128_t;
> -
>  __rte_experimental
>  static inline int
>  rte_atomic128_cmp_exchange(rte_int128_t *dst,
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h 
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index 24ff7dc..e6ab15a 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -1081,6 +1081,20 @@ static inline void rte_atomic64_clear(rte_atomic64_t 
> *v)
>
>  /*------------------------ 128 bit atomic operations 
> -------------------------*/
>
> +/**
> + * 128-bit integer structure.
> + */
> +RTE_STD_C11
> +typedef struct {
> +       RTE_STD_C11
> +       union {
> +               uint64_t val[2];
> +#ifdef RTE_ARCH_64
> +               __extension__ __int128 int128;
> +#endif

You hid this field for x86.
What is the reason?


> +       };
> +} __rte_aligned(16) rte_int128_t;
> +
>  #ifdef __DOXYGEN__
>
>  /**
> @@ -1093,7 +1107,8 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>   *     *exp = *dst
>   * @endcode
>   *
> - * @note This function is currently only available for the x86-64 platform.
> + * @note This function is currently available for the x86-64 and aarch64
> + * platforms.
>   *
>   * @note The success and failure arguments must be one of the __ATOMIC_* 
> values
>   * defined in the C++11 standard. For details on their behavior, refer to the
> --
> 2.7.4
>



--
David Marchand

Re: [dpdk-dev] [PATCH v9 1/3] eal/arm64: add 128-bit atomic compare exchange

Reply via email to