[lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
Replace the custom assembly code in include/urcu/uatomic/ with __atomic builtins provided by C11-compatible compiler. Signed-off-by: Ondřej Surý --- include/Makefile.am| 16 - include/urcu/uatomic.h | 84 +++-- include/urcu/uatomic/aarch64.h | 41 --- include/urcu/uatomic/alpha.h | 32 -- include/urcu/uatomic/arm.h | 57 --- include/urcu/uatomic/gcc.h | 46 --- include/urcu/uatomic/generic.h | 613 --- include/urcu/uatomic/hppa.h| 10 - include/urcu/uatomic/ia64.h| 41 --- include/urcu/uatomic/m68k.h| 44 --- include/urcu/uatomic/mips.h| 32 -- include/urcu/uatomic/nios2.h | 32 -- include/urcu/uatomic/ppc.h | 237 include/urcu/uatomic/riscv.h | 44 --- include/urcu/uatomic/s390.h| 170 - include/urcu/uatomic/sparc64.h | 81 - include/urcu/uatomic/tile.h| 41 --- include/urcu/uatomic/x86.h | 646 - 18 files changed, 53 insertions(+), 2214 deletions(-) delete mode 100644 include/urcu/uatomic/aarch64.h delete mode 100644 include/urcu/uatomic/alpha.h delete mode 100644 include/urcu/uatomic/arm.h delete mode 100644 include/urcu/uatomic/gcc.h delete mode 100644 include/urcu/uatomic/generic.h delete mode 100644 include/urcu/uatomic/hppa.h delete mode 100644 include/urcu/uatomic/ia64.h delete mode 100644 include/urcu/uatomic/m68k.h delete mode 100644 include/urcu/uatomic/mips.h delete mode 100644 include/urcu/uatomic/nios2.h delete mode 100644 include/urcu/uatomic/ppc.h delete mode 100644 include/urcu/uatomic/riscv.h delete mode 100644 include/urcu/uatomic/s390.h delete mode 100644 include/urcu/uatomic/sparc64.h delete mode 100644 include/urcu/uatomic/tile.h delete mode 100644 include/urcu/uatomic/x86.h diff --git a/include/Makefile.am b/include/Makefile.am index ba1fe60..53a28fd 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -59,24 +59,8 @@ nobase_include_HEADERS = \ urcu/syscall-compat.h \ urcu/system.h \ urcu/tls-compat.h \ - urcu/uatomic/aarch64.h \ - urcu/uatomic/alpha.h \ urcu/uatomic_arch.h \ - urcu/uatomic/arm.h \ - urcu/uatomic/gcc.h \ - urcu/uatomic/generic.h \ urcu/uatomic.h \ - urcu/uatomic/hppa.h \ - urcu/uatomic/ia64.h \ - urcu/uatomic/m68k.h \ - urcu/uatomic/mips.h \ - urcu/uatomic/nios2.h \ - urcu/uatomic/ppc.h \ - urcu/uatomic/riscv.h \ - urcu/uatomic/s390.h \ - urcu/uatomic/sparc64.h \ - urcu/uatomic/tile.h \ - urcu/uatomic/x86.h \ urcu/urcu-bp.h \ urcu/urcu-futex.h \ urcu/urcu.h \ diff --git a/include/urcu/uatomic.h b/include/urcu/uatomic.h index 2fb5fd4..4dbef4c 100644 --- a/include/urcu/uatomic.h +++ b/include/urcu/uatomic.h @@ -22,37 +22,59 @@ #define _URCU_UATOMIC_H #include +#include -#if defined(URCU_ARCH_X86) -#include -#elif defined(URCU_ARCH_PPC) -#include -#elif defined(URCU_ARCH_S390) -#include -#elif defined(URCU_ARCH_SPARC64) -#include -#elif defined(URCU_ARCH_ALPHA) -#include -#elif defined(URCU_ARCH_IA64) -#include -#elif defined(URCU_ARCH_ARM) -#include -#elif defined(URCU_ARCH_AARCH64) -#include -#elif defined(URCU_ARCH_MIPS) -#include -#elif defined(URCU_ARCH_NIOS2) -#include -#elif defined(URCU_ARCH_TILE) -#include -#elif defined(URCU_ARCH_HPPA) -#include -#elif defined(URCU_ARCH_M68K) -#include -#elif defined(URCU_ARCH_RISCV) -#include -#else -#error "Cannot build: unrecognized architecture, see ." -#endif +#define UATOMIC_HAS_ATOMIC_BYTE +#define UATOMIC_HAS_ATOMIC_SHORT + +#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE) + +#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME) + +#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_cmpxchg(addr, old, new) \ + ({ \ + __typeof__(*(addr)) __old = old; \ + __atomic_compare_exchange_n(addr, &__old, new, 0, \ + __ATOMIC_ACQ_REL, __ATOMIC_CONSUME);\ + __old; \ + }) + +#define uatomic_add_return(addr, v) \ + __atomic_add_fetch((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_add(addr, v) \ + (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_sub_return(addr, v) \ + __atomic_sub_fetch((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_sub(addr, v) \ + (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_and(addr, mask) \ + (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define uatomic_or(addr, mask) \ + (void)__atomic_or_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define
[lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
Replace the custom assembly code in include/urcu/uatomic/ with __atomic builtins provided by C11-compatible compiler. Signed-off-by: Ondřej Surý --- include/Makefile.am| 16 - include/urcu/uatomic.h | 84 +++-- include/urcu/uatomic/aarch64.h | 41 --- include/urcu/uatomic/alpha.h | 32 -- include/urcu/uatomic/arm.h | 57 --- include/urcu/uatomic/gcc.h | 46 --- include/urcu/uatomic/generic.h | 613 --- include/urcu/uatomic/hppa.h| 10 - include/urcu/uatomic/ia64.h| 41 --- include/urcu/uatomic/m68k.h| 44 --- include/urcu/uatomic/mips.h| 32 -- include/urcu/uatomic/nios2.h | 32 -- include/urcu/uatomic/ppc.h | 237 include/urcu/uatomic/riscv.h | 44 --- include/urcu/uatomic/s390.h| 170 - include/urcu/uatomic/sparc64.h | 81 - include/urcu/uatomic/tile.h| 41 --- include/urcu/uatomic/x86.h | 646 - 18 files changed, 53 insertions(+), 2214 deletions(-) delete mode 100644 include/urcu/uatomic/aarch64.h delete mode 100644 include/urcu/uatomic/alpha.h delete mode 100644 include/urcu/uatomic/arm.h delete mode 100644 include/urcu/uatomic/gcc.h delete mode 100644 include/urcu/uatomic/generic.h delete mode 100644 include/urcu/uatomic/hppa.h delete mode 100644 include/urcu/uatomic/ia64.h delete mode 100644 include/urcu/uatomic/m68k.h delete mode 100644 include/urcu/uatomic/mips.h delete mode 100644 include/urcu/uatomic/nios2.h delete mode 100644 include/urcu/uatomic/ppc.h delete mode 100644 include/urcu/uatomic/riscv.h delete mode 100644 include/urcu/uatomic/s390.h delete mode 100644 include/urcu/uatomic/sparc64.h delete mode 100644 include/urcu/uatomic/tile.h delete mode 100644 include/urcu/uatomic/x86.h diff --git a/include/Makefile.am b/include/Makefile.am index ba1fe60..53a28fd 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -59,24 +59,8 @@ nobase_include_HEADERS = \ urcu/syscall-compat.h \ urcu/system.h \ urcu/tls-compat.h \ - urcu/uatomic/aarch64.h \ - urcu/uatomic/alpha.h \ urcu/uatomic_arch.h \ - urcu/uatomic/arm.h \ - urcu/uatomic/gcc.h \ - urcu/uatomic/generic.h \ urcu/uatomic.h \ - urcu/uatomic/hppa.h \ - urcu/uatomic/ia64.h \ - urcu/uatomic/m68k.h \ - urcu/uatomic/mips.h \ - urcu/uatomic/nios2.h \ - urcu/uatomic/ppc.h \ - urcu/uatomic/riscv.h \ - urcu/uatomic/s390.h \ - urcu/uatomic/sparc64.h \ - urcu/uatomic/tile.h \ - urcu/uatomic/x86.h \ urcu/urcu-bp.h \ urcu/urcu-futex.h \ urcu/urcu.h \ diff --git a/include/urcu/uatomic.h b/include/urcu/uatomic.h index 2fb5fd4..0327810 100644 --- a/include/urcu/uatomic.h +++ b/include/urcu/uatomic.h @@ -22,37 +22,59 @@ #define _URCU_UATOMIC_H #include +#include -#if defined(URCU_ARCH_X86) -#include -#elif defined(URCU_ARCH_PPC) -#include -#elif defined(URCU_ARCH_S390) -#include -#elif defined(URCU_ARCH_SPARC64) -#include -#elif defined(URCU_ARCH_ALPHA) -#include -#elif defined(URCU_ARCH_IA64) -#include -#elif defined(URCU_ARCH_ARM) -#include -#elif defined(URCU_ARCH_AARCH64) -#include -#elif defined(URCU_ARCH_MIPS) -#include -#elif defined(URCU_ARCH_NIOS2) -#include -#elif defined(URCU_ARCH_TILE) -#include -#elif defined(URCU_ARCH_HPPA) -#include -#elif defined(URCU_ARCH_M68K) -#include -#elif defined(URCU_ARCH_RISCV) -#include -#else -#error "Cannot build: unrecognized architecture, see ." -#endif +#define UATOMIC_HAS_ATOMIC_BYTE +#define UATOMIC_HAS_ATOMIC_SHORT + +#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE) + +#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME) + +#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), __ATOMIC_SEQ_CST) + +#define uatomic_cmpxchg(addr, old, new) \ + ({ \ + __typeof__(*(addr)) __old = old; \ + __atomic_compare_exchange_n(addr, &__old, new, 0, \ + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);\ + __old; \ + }) + +#define uatomic_add_return(addr, v) \ + __atomic_add_fetch((addr), (v), __ATOMIC_SEQ_CST) + +#define uatomic_add(addr, v) \ + (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_sub_return(addr, v) \ + __atomic_sub_fetch((addr), (v), __ATOMIC_SEQ_CST) + +#define uatomic_sub(addr, v) \ + (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_and(addr, mask) \ + (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define uatomic_or(addr, mask) \ + (void)__atomic_or_fetch((addr), (mask), __ATOMIC_REL
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
On 2023-03-17 17:37, Ondřej Surý via lttng-dev wrote: Replace the custom assembly code in include/urcu/uatomic/ with __atomic builtins provided by C11-compatible compiler. [...] +#define UATOMIC_HAS_ATOMIC_BYTE +#define UATOMIC_HAS_ATOMIC_SHORT + +#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE) + +#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME) + +#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_cmpxchg(addr, old, new) \ + ({ \ + __typeof__(*(addr)) __old = old; \ + __atomic_compare_exchange_n(addr, &__old, new, 0, \ + __ATOMIC_ACQ_REL, __ATOMIC_CONSUME);\ In doc/uatomic-api.md, we document: "```c type uatomic_cmpxchg(type *addr, type old, type new); ``` An atomic read-modify-write operation that performs this sequence of operations atomically: check if `addr` contains `old`. If true, then replace the content of `addr` by `new`. Return the value previously contained by `addr`. This function implies a full memory barrier before and after the atomic operation." This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure rather than __ATOMIC_CONSUME". + __old; \ + }) + +#define uatomic_add_return(addr, v) \ + __atomic_add_fetch((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_add(addr, v) \ + (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_sub_return(addr, v) \ + __atomic_sub_fetch((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_sub(addr, v) \ + (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_and(addr, mask) \ + (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define uatomic_or(addr, mask) \ + (void)__atomic_or_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define uatomic_inc(addr) (void)__atomic_add_fetch((addr), 1, __ATOMIC_RELAXED) +#define uatomic_dec(addr) (void)__atomic_sub_fetch((addr), 1, __ATOMIC_RELAXED) + +#define cmm_smp_mb__before_uatomic_and() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__after_uatomic_and() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__before_uatomic_or() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__after_uatomic_or() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__before_uatomic_add() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__after_uatomic_add() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__before_uatomic_sub() cmm_smp_mb__before_uatomic_add() +#define cmm_smp_mb__after_uatomic_sub() cmm_smp_mb__after_uatomic_add() +#define cmm_smp_mb__before_uatomic_inc() cmm_smp_mb__before_uatomic_add() +#define cmm_smp_mb__after_uatomic_inc() cmm_smp_mb__after_uatomic_add() +#define cmm_smp_mb__before_uatomic_dec() cmm_smp_mb__before_uatomic_add() +#define cmm_smp_mb__after_uatomic_dec() cmm_smp_mb__after_uatomic_add() + +#define cmm_smp_mb() cmm_mb() While OK for the general case, I would recommend that we immediately implement something more efficient on x86 32/64 which takes into account that __ATOMIC_ACQ_REL atomic operations are implemented with LOCK prefixed atomic ops, which imply the barrier already, leaving the before/after_uatomic_*() as no-ops. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
On 2023-03-20 14:03, Mathieu Desnoyers via lttng-dev wrote: On 2023-03-17 17:37, Ondřej Surý via lttng-dev wrote: Replace the custom assembly code in include/urcu/uatomic/ with __atomic builtins provided by C11-compatible compiler. [...] +#define UATOMIC_HAS_ATOMIC_BYTE +#define UATOMIC_HAS_ATOMIC_SHORT + +#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE) + +#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME) + +#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_cmpxchg(addr, old, new) \ + ({ \ + __typeof__(*(addr)) __old = old; \ + __atomic_compare_exchange_n(addr, &__old, new, 0, \ + __ATOMIC_ACQ_REL, __ATOMIC_CONSUME);\ Actually, I suspect we'd want to change __ATOMIC_ACQ_REL to __ATOMIC_SEQ_CST everywhere, because we want total order. Thanks, Mathieu In doc/uatomic-api.md, we document: "```c type uatomic_cmpxchg(type *addr, type old, type new); ``` An atomic read-modify-write operation that performs this sequence of operations atomically: check if `addr` contains `old`. If true, then replace the content of `addr` by `new`. Return the value previously contained by `addr`. This function implies a full memory barrier before and after the atomic operation." This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure rather than __ATOMIC_CONSUME". + __old; \ + }) + +#define uatomic_add_return(addr, v) \ + __atomic_add_fetch((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_add(addr, v) \ + (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_sub_return(addr, v) \ + __atomic_sub_fetch((addr), (v), __ATOMIC_ACQ_REL) + +#define uatomic_sub(addr, v) \ + (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_and(addr, mask) \ + (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define uatomic_or(addr, mask) \ + (void)__atomic_or_fetch((addr), (mask), __ATOMIC_RELAXED) + +#define uatomic_inc(addr) (void)__atomic_add_fetch((addr), 1, __ATOMIC_RELAXED) +#define uatomic_dec(addr) (void)__atomic_sub_fetch((addr), 1, __ATOMIC_RELAXED) + +#define cmm_smp_mb__before_uatomic_and() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__after_uatomic_and() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__before_uatomic_or() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__after_uatomic_or() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__before_uatomic_add() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__after_uatomic_add() __atomic_thread_fence(__ATOMIC_ACQ_REL) +#define cmm_smp_mb__before_uatomic_sub() cmm_smp_mb__before_uatomic_add() +#define cmm_smp_mb__after_uatomic_sub() cmm_smp_mb__after_uatomic_add() +#define cmm_smp_mb__before_uatomic_inc() cmm_smp_mb__before_uatomic_add() +#define cmm_smp_mb__after_uatomic_inc() cmm_smp_mb__after_uatomic_add() +#define cmm_smp_mb__before_uatomic_dec() cmm_smp_mb__before_uatomic_add() +#define cmm_smp_mb__after_uatomic_dec() cmm_smp_mb__after_uatomic_add() + +#define cmm_smp_mb() cmm_mb() While OK for the general case, I would recommend that we immediately implement something more efficient on x86 32/64 which takes into account that __ATOMIC_ACQ_REL atomic operations are implemented with LOCK prefixed atomic ops, which imply the barrier already, leaving the before/after_uatomic_*() as no-ops. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
> On 20. 3. 2023, at 19:03, Mathieu Desnoyers > wrote: > > In doc/uatomic-api.md, we document: > > "```c > type uatomic_cmpxchg(type *addr, type old, type new); > ``` > > An atomic read-modify-write operation that performs this > sequence of operations atomically: check if `addr` contains `old`. > If true, then replace the content of `addr` by `new`. Return the > value previously contained by `addr`. This function implies a full > memory barrier before and after the atomic operation." > > This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure > rather than __ATOMIC_CONSUME". From: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html > If desired is written into *ptr then true is returned and memory is affected > according to the memory order specified by success_memorder. There are no > restrictions on what memory order can be used here. > > Otherwise, false is returned and memory is affected according to > failure_memorder. This memory order cannot be __ATOMIC_RELEASE nor > __ATOMIC_ACQ_REL. It also cannot be a stronger order than that specified by > success_memorder. I think it makes sense that the failure_memorder has the same memorder as uatomic_read(), but it definitelly cannot be __ATOMIC_ACQ_REL - it's same as with __atomic_load_n, only following are permitted: > The valid memory order variants are __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, > __ATOMIC_ACQUIRE, and __ATOMIC_CONSUME. Ondrej -- Ondřej Surý (He/Him) ond...@sury.org ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
On 2023-03-20 14:28, Ondřej Surý wrote: On 20. 3. 2023, at 19:03, Mathieu Desnoyers wrote: In doc/uatomic-api.md, we document: "```c type uatomic_cmpxchg(type *addr, type old, type new); ``` An atomic read-modify-write operation that performs this sequence of operations atomically: check if `addr` contains `old`. If true, then replace the content of `addr` by `new`. Return the value previously contained by `addr`. This function implies a full memory barrier before and after the atomic operation." This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure rather than __ATOMIC_CONSUME". From: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html If desired is written into *ptr then true is returned and memory is affected according to the memory order specified by success_memorder. There are no restrictions on what memory order can be used here. Otherwise, false is returned and memory is affected according to failure_memorder. This memory order cannot be __ATOMIC_RELEASE nor __ATOMIC_ACQ_REL. It also cannot be a stronger order than that specified by success_memorder. I think it makes sense that the failure_memorder has the same memorder as uatomic_read(), but it definitelly cannot be __ATOMIC_ACQ_REL - it's same as with __atomic_load_n, only following are permitted: The valid memory order variants are __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE, and __ATOMIC_CONSUME. Based on my other reply, we want "SEQ_CST" rather than ACQ_REL everywhere. Thanks, Mathieu Ondrej -- Ondřej Surý (He/Him) ond...@sury.org -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
On 2023-03-20 14:38, Mathieu Desnoyers via lttng-dev wrote: On 2023-03-20 14:28, Ondřej Surý wrote: On 20. 3. 2023, at 19:03, Mathieu Desnoyers wrote: In doc/uatomic-api.md, we document: "```c type uatomic_cmpxchg(type *addr, type old, type new); ``` An atomic read-modify-write operation that performs this sequence of operations atomically: check if `addr` contains `old`. If true, then replace the content of `addr` by `new`. Return the value previously contained by `addr`. This function implies a full memory barrier before and after the atomic operation." This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure rather than __ATOMIC_CONSUME". From: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html If desired is written into *ptr then true is returned and memory is affected according to the memory order specified by success_memorder. There are no restrictions on what memory order can be used here. Otherwise, false is returned and memory is affected according to failure_memorder. This memory order cannot be __ATOMIC_RELEASE nor __ATOMIC_ACQ_REL. It also cannot be a stronger order than that specified by success_memorder. I think it makes sense that the failure_memorder has the same memorder as uatomic_read(), but it definitelly cannot be __ATOMIC_ACQ_REL - it's same as with __atomic_load_n, only following are permitted: The valid memory order variants are __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE, and __ATOMIC_CONSUME. Based on my other reply, we want "SEQ_CST" rather than ACQ_REL everywhere. And it _would_ make sense to use the same memorder on cmpxchg failure as uatomic_read if we were exposing a new API, but we are modifying an already exposed documented API, so I would stick to SEQ_CST for both cmpxchg success/failure. If we want to expose a new cmpxchg_relaxed_failure with a relaxed memorder on failure that would be fine, but we cannot change the semantic that is already documented. Thanks, Mathieu Thanks, Mathieu Ondrej -- Ondřej Surý (He/Him) ond...@sury.org -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
Hi Mathieu, While OK for the general case, I would recommend that we immediately implement something more efficient on x86 32/64 which takes into account that __ATOMIC_ACQ_REL atomic operations are implemented with LOCK prefixed atomic ops, which imply the barrier already, leaving the before/after_uatomic_*() as no-ops. maybe first check whether the GCC optimizers merge them. I believe some optimizations of atomic primitives are allowed and implemented, but I couldn't say which ones. Best wishes, Duncan. ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
On 2023-03-21 09:30, Ondřej Surý via lttng-dev wrote: Replace the custom assembly code in include/urcu/uatomic/ with __atomic builtins provided by C11-compatible compiler. Signed-off-by: Ondřej Surý --- include/Makefile.am| 16 - include/urcu/uatomic.h | 84 +++-- include/urcu/uatomic/aarch64.h | 41 --- include/urcu/uatomic/alpha.h | 32 -- include/urcu/uatomic/arm.h | 57 --- include/urcu/uatomic/gcc.h | 46 --- include/urcu/uatomic/generic.h | 613 --- include/urcu/uatomic/hppa.h| 10 - include/urcu/uatomic/ia64.h| 41 --- include/urcu/uatomic/m68k.h| 44 --- include/urcu/uatomic/mips.h| 32 -- include/urcu/uatomic/nios2.h | 32 -- include/urcu/uatomic/ppc.h | 237 include/urcu/uatomic/riscv.h | 44 --- include/urcu/uatomic/s390.h| 170 - include/urcu/uatomic/sparc64.h | 81 - include/urcu/uatomic/tile.h| 41 --- include/urcu/uatomic/x86.h | 646 - 18 files changed, 53 insertions(+), 2214 deletions(-) delete mode 100644 include/urcu/uatomic/aarch64.h delete mode 100644 include/urcu/uatomic/alpha.h delete mode 100644 include/urcu/uatomic/arm.h delete mode 100644 include/urcu/uatomic/gcc.h delete mode 100644 include/urcu/uatomic/generic.h delete mode 100644 include/urcu/uatomic/hppa.h delete mode 100644 include/urcu/uatomic/ia64.h delete mode 100644 include/urcu/uatomic/m68k.h delete mode 100644 include/urcu/uatomic/mips.h delete mode 100644 include/urcu/uatomic/nios2.h delete mode 100644 include/urcu/uatomic/ppc.h delete mode 100644 include/urcu/uatomic/riscv.h delete mode 100644 include/urcu/uatomic/s390.h delete mode 100644 include/urcu/uatomic/sparc64.h delete mode 100644 include/urcu/uatomic/tile.h delete mode 100644 include/urcu/uatomic/x86.h diff --git a/include/Makefile.am b/include/Makefile.am index ba1fe60..53a28fd 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -59,24 +59,8 @@ nobase_include_HEADERS = \ urcu/syscall-compat.h \ urcu/system.h \ urcu/tls-compat.h \ - urcu/uatomic/aarch64.h \ - urcu/uatomic/alpha.h \ urcu/uatomic_arch.h \ - urcu/uatomic/arm.h \ - urcu/uatomic/gcc.h \ - urcu/uatomic/generic.h \ urcu/uatomic.h \ - urcu/uatomic/hppa.h \ - urcu/uatomic/ia64.h \ - urcu/uatomic/m68k.h \ - urcu/uatomic/mips.h \ - urcu/uatomic/nios2.h \ - urcu/uatomic/ppc.h \ - urcu/uatomic/riscv.h \ - urcu/uatomic/s390.h \ - urcu/uatomic/sparc64.h \ - urcu/uatomic/tile.h \ - urcu/uatomic/x86.h \ urcu/urcu-bp.h \ urcu/urcu-futex.h \ urcu/urcu.h \ diff --git a/include/urcu/uatomic.h b/include/urcu/uatomic.h index 2fb5fd4..0327810 100644 --- a/include/urcu/uatomic.h +++ b/include/urcu/uatomic.h @@ -22,37 +22,59 @@ #define _URCU_UATOMIC_H #include +#include -#if defined(URCU_ARCH_X86) -#include -#elif defined(URCU_ARCH_PPC) -#include -#elif defined(URCU_ARCH_S390) -#include -#elif defined(URCU_ARCH_SPARC64) -#include -#elif defined(URCU_ARCH_ALPHA) -#include -#elif defined(URCU_ARCH_IA64) -#include -#elif defined(URCU_ARCH_ARM) -#include -#elif defined(URCU_ARCH_AARCH64) -#include -#elif defined(URCU_ARCH_MIPS) -#include -#elif defined(URCU_ARCH_NIOS2) -#include -#elif defined(URCU_ARCH_TILE) -#include -#elif defined(URCU_ARCH_HPPA) -#include -#elif defined(URCU_ARCH_M68K) -#include -#elif defined(URCU_ARCH_RISCV) -#include -#else -#error "Cannot build: unrecognized architecture, see ." -#endif +#define UATOMIC_HAS_ATOMIC_BYTE +#define UATOMIC_HAS_ATOMIC_SHORT + +#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE) + +#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME) + +#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), __ATOMIC_SEQ_CST) + +#define uatomic_cmpxchg(addr, old, new) \ + ({ \ + __typeof__(*(addr)) __old = old; \ + __atomic_compare_exchange_n(addr, &__old, new, 0, \ + __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);\ + __old; \ + }) + +#define uatomic_add_return(addr, v) \ + __atomic_add_fetch((addr), (v), __ATOMIC_SEQ_CST) The extra parentheses around "addr" and "v" here are not needed due to operator priority of comma ",". Likewise elsewhere in this patch. Also, as mentioned earlier, please special-case the x86 implementation to include the __ATOMIC_SEQ_CST into atomic operations. Thanks, Mathieu + +#define uatomic_add(addr, v) \ + (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED) + +#define uatomic_sub_return(addr, v) \ + __ato
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
On 2023-03-20 15:38, Duncan Sands via lttng-dev wrote: Hi Mathieu, While OK for the general case, I would recommend that we immediately implement something more efficient on x86 32/64 which takes into account that __ATOMIC_ACQ_REL atomic operations are implemented with LOCK prefixed atomic ops, which imply the barrier already, leaving the before/after_uatomic_*() as no-ops. maybe first check whether the GCC optimizers merge them. I believe some optimizations of atomic primitives are allowed and implemented, but I couldn't say which ones. Best wishes, Duncan. Tested on godbolt.org with: int a; void fct(void) { (void) __atomic_add_fetch(&a, 1, __ATOMIC_RELAXED); __atomic_thread_fence(__ATOMIC_SEQ_CST); } x86-64 gcc 12.2 -O2 -std=c11: fct: lock addDWORD PTR a[rip], 1 lock or QWORD PTR [rsp], 0 ret a: .zero 4 x86-64 clang 16.0.0 -O2 -std=c11: fct:# @fct lockinc dword ptr [rip + a] mfence ret a: .long 0 So none of gcc/clang optimize this today, hence the need for an x86-specific implementation. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev
Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtis for implementation
Hi Mathieu, Tested on godbolt.org with: int a; void fct(void) { (void) __atomic_add_fetch(&a, 1, __ATOMIC_RELAXED); __atomic_thread_fence(__ATOMIC_SEQ_CST); } x86-64 gcc 12.2 -O2 -std=c11: fct: lock add DWORD PTR a[rip], 1 lock or QWORD PTR [rsp], 0 ret a: .zero 4 that's disappointing. It's the same if both use __ATOMIC_SEQ_CST: int a; void fct(void) { (void) __atomic_add_fetch(&a, 1, __ATOMIC_SEQ_CST); __atomic_thread_fence(__ATOMIC_SEQ_CST); } -> fct(): lock addDWORD PTR a[rip], 1 lock or QWORD PTR [rsp], 0 ret a: .zero 4 on x86-64 gcc 12.2 -O2 -std=c11. Clang also doesn't optimize the fence away. Best wishes, Duncan. ___ lttng-dev mailing list lttng-dev@lists.lttng.org https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev