[lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-17 Thread Ondřej Surý via lttng-dev
Replace the custom assembly code in include/urcu/uatomic/ with __atomic
builtins provided by C11-compatible compiler.

Signed-off-by: Ondřej Surý 
---
 include/Makefile.am|  16 -
 include/urcu/uatomic.h |  84 +++--
 include/urcu/uatomic/aarch64.h |  41 ---
 include/urcu/uatomic/alpha.h   |  32 --
 include/urcu/uatomic/arm.h |  57 ---
 include/urcu/uatomic/gcc.h |  46 ---
 include/urcu/uatomic/generic.h | 613 ---
 include/urcu/uatomic/hppa.h|  10 -
 include/urcu/uatomic/ia64.h|  41 ---
 include/urcu/uatomic/m68k.h|  44 ---
 include/urcu/uatomic/mips.h|  32 --
 include/urcu/uatomic/nios2.h   |  32 --
 include/urcu/uatomic/ppc.h | 237 
 include/urcu/uatomic/riscv.h   |  44 ---
 include/urcu/uatomic/s390.h| 170 -
 include/urcu/uatomic/sparc64.h |  81 -
 include/urcu/uatomic/tile.h|  41 ---
 include/urcu/uatomic/x86.h | 646 -
 18 files changed, 53 insertions(+), 2214 deletions(-)
 delete mode 100644 include/urcu/uatomic/aarch64.h
 delete mode 100644 include/urcu/uatomic/alpha.h
 delete mode 100644 include/urcu/uatomic/arm.h
 delete mode 100644 include/urcu/uatomic/gcc.h
 delete mode 100644 include/urcu/uatomic/generic.h
 delete mode 100644 include/urcu/uatomic/hppa.h
 delete mode 100644 include/urcu/uatomic/ia64.h
 delete mode 100644 include/urcu/uatomic/m68k.h
 delete mode 100644 include/urcu/uatomic/mips.h
 delete mode 100644 include/urcu/uatomic/nios2.h
 delete mode 100644 include/urcu/uatomic/ppc.h
 delete mode 100644 include/urcu/uatomic/riscv.h
 delete mode 100644 include/urcu/uatomic/s390.h
 delete mode 100644 include/urcu/uatomic/sparc64.h
 delete mode 100644 include/urcu/uatomic/tile.h
 delete mode 100644 include/urcu/uatomic/x86.h

diff --git a/include/Makefile.am b/include/Makefile.am
index ba1fe60..53a28fd 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -59,24 +59,8 @@ nobase_include_HEADERS = \
urcu/syscall-compat.h \
urcu/system.h \
urcu/tls-compat.h \
-   urcu/uatomic/aarch64.h \
-   urcu/uatomic/alpha.h \
urcu/uatomic_arch.h \
-   urcu/uatomic/arm.h \
-   urcu/uatomic/gcc.h \
-   urcu/uatomic/generic.h \
urcu/uatomic.h \
-   urcu/uatomic/hppa.h \
-   urcu/uatomic/ia64.h \
-   urcu/uatomic/m68k.h \
-   urcu/uatomic/mips.h \
-   urcu/uatomic/nios2.h \
-   urcu/uatomic/ppc.h \
-   urcu/uatomic/riscv.h \
-   urcu/uatomic/s390.h \
-   urcu/uatomic/sparc64.h \
-   urcu/uatomic/tile.h \
-   urcu/uatomic/x86.h \
urcu/urcu-bp.h \
urcu/urcu-futex.h \
urcu/urcu.h \
diff --git a/include/urcu/uatomic.h b/include/urcu/uatomic.h
index 2fb5fd4..4dbef4c 100644
--- a/include/urcu/uatomic.h
+++ b/include/urcu/uatomic.h
@@ -22,37 +22,59 @@
 #define _URCU_UATOMIC_H
 
 #include 
+#include 
 
-#if defined(URCU_ARCH_X86)
-#include 
-#elif defined(URCU_ARCH_PPC)
-#include 
-#elif defined(URCU_ARCH_S390)
-#include 
-#elif defined(URCU_ARCH_SPARC64)
-#include 
-#elif defined(URCU_ARCH_ALPHA)
-#include 
-#elif defined(URCU_ARCH_IA64)
-#include 
-#elif defined(URCU_ARCH_ARM)
-#include 
-#elif defined(URCU_ARCH_AARCH64)
-#include 
-#elif defined(URCU_ARCH_MIPS)
-#include 
-#elif defined(URCU_ARCH_NIOS2)
-#include 
-#elif defined(URCU_ARCH_TILE)
-#include 
-#elif defined(URCU_ARCH_HPPA)
-#include 
-#elif defined(URCU_ARCH_M68K)
-#include 
-#elif defined(URCU_ARCH_RISCV)
-#include 
-#else
-#error "Cannot build: unrecognized architecture, see ."
-#endif
+#define UATOMIC_HAS_ATOMIC_BYTE
+#define UATOMIC_HAS_ATOMIC_SHORT
+
+#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE)
+
+#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME)
+
+#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), 
__ATOMIC_ACQ_REL)
+
+#define uatomic_cmpxchg(addr, old, new) \
+   ({  
\
+   __typeof__(*(addr)) __old = old;
\
+   __atomic_compare_exchange_n(addr, &__old, new, 0,   
\
+   __ATOMIC_ACQ_REL, 
__ATOMIC_CONSUME);\
+   __old;  
\
+   })
+
+#define uatomic_add_return(addr, v) \
+   __atomic_add_fetch((addr), (v), __ATOMIC_ACQ_REL)
+
+#define uatomic_add(addr, v) \
+   (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_sub_return(addr, v) \
+   __atomic_sub_fetch((addr), (v), __ATOMIC_ACQ_REL)
+
+#define uatomic_sub(addr, v) \
+   (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_and(addr, mask) \
+   (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define uatomic_or(addr, mask) \
+   (void)__atomic_or_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define

[lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-21 Thread Ondřej Surý via lttng-dev
Replace the custom assembly code in include/urcu/uatomic/ with __atomic
builtins provided by C11-compatible compiler.

Signed-off-by: Ondřej Surý 
---
 include/Makefile.am|  16 -
 include/urcu/uatomic.h |  84 +++--
 include/urcu/uatomic/aarch64.h |  41 ---
 include/urcu/uatomic/alpha.h   |  32 --
 include/urcu/uatomic/arm.h |  57 ---
 include/urcu/uatomic/gcc.h |  46 ---
 include/urcu/uatomic/generic.h | 613 ---
 include/urcu/uatomic/hppa.h|  10 -
 include/urcu/uatomic/ia64.h|  41 ---
 include/urcu/uatomic/m68k.h|  44 ---
 include/urcu/uatomic/mips.h|  32 --
 include/urcu/uatomic/nios2.h   |  32 --
 include/urcu/uatomic/ppc.h | 237 
 include/urcu/uatomic/riscv.h   |  44 ---
 include/urcu/uatomic/s390.h| 170 -
 include/urcu/uatomic/sparc64.h |  81 -
 include/urcu/uatomic/tile.h|  41 ---
 include/urcu/uatomic/x86.h | 646 -
 18 files changed, 53 insertions(+), 2214 deletions(-)
 delete mode 100644 include/urcu/uatomic/aarch64.h
 delete mode 100644 include/urcu/uatomic/alpha.h
 delete mode 100644 include/urcu/uatomic/arm.h
 delete mode 100644 include/urcu/uatomic/gcc.h
 delete mode 100644 include/urcu/uatomic/generic.h
 delete mode 100644 include/urcu/uatomic/hppa.h
 delete mode 100644 include/urcu/uatomic/ia64.h
 delete mode 100644 include/urcu/uatomic/m68k.h
 delete mode 100644 include/urcu/uatomic/mips.h
 delete mode 100644 include/urcu/uatomic/nios2.h
 delete mode 100644 include/urcu/uatomic/ppc.h
 delete mode 100644 include/urcu/uatomic/riscv.h
 delete mode 100644 include/urcu/uatomic/s390.h
 delete mode 100644 include/urcu/uatomic/sparc64.h
 delete mode 100644 include/urcu/uatomic/tile.h
 delete mode 100644 include/urcu/uatomic/x86.h

diff --git a/include/Makefile.am b/include/Makefile.am
index ba1fe60..53a28fd 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -59,24 +59,8 @@ nobase_include_HEADERS = \
urcu/syscall-compat.h \
urcu/system.h \
urcu/tls-compat.h \
-   urcu/uatomic/aarch64.h \
-   urcu/uatomic/alpha.h \
urcu/uatomic_arch.h \
-   urcu/uatomic/arm.h \
-   urcu/uatomic/gcc.h \
-   urcu/uatomic/generic.h \
urcu/uatomic.h \
-   urcu/uatomic/hppa.h \
-   urcu/uatomic/ia64.h \
-   urcu/uatomic/m68k.h \
-   urcu/uatomic/mips.h \
-   urcu/uatomic/nios2.h \
-   urcu/uatomic/ppc.h \
-   urcu/uatomic/riscv.h \
-   urcu/uatomic/s390.h \
-   urcu/uatomic/sparc64.h \
-   urcu/uatomic/tile.h \
-   urcu/uatomic/x86.h \
urcu/urcu-bp.h \
urcu/urcu-futex.h \
urcu/urcu.h \
diff --git a/include/urcu/uatomic.h b/include/urcu/uatomic.h
index 2fb5fd4..0327810 100644
--- a/include/urcu/uatomic.h
+++ b/include/urcu/uatomic.h
@@ -22,37 +22,59 @@
 #define _URCU_UATOMIC_H
 
 #include 
+#include 
 
-#if defined(URCU_ARCH_X86)
-#include 
-#elif defined(URCU_ARCH_PPC)
-#include 
-#elif defined(URCU_ARCH_S390)
-#include 
-#elif defined(URCU_ARCH_SPARC64)
-#include 
-#elif defined(URCU_ARCH_ALPHA)
-#include 
-#elif defined(URCU_ARCH_IA64)
-#include 
-#elif defined(URCU_ARCH_ARM)
-#include 
-#elif defined(URCU_ARCH_AARCH64)
-#include 
-#elif defined(URCU_ARCH_MIPS)
-#include 
-#elif defined(URCU_ARCH_NIOS2)
-#include 
-#elif defined(URCU_ARCH_TILE)
-#include 
-#elif defined(URCU_ARCH_HPPA)
-#include 
-#elif defined(URCU_ARCH_M68K)
-#include 
-#elif defined(URCU_ARCH_RISCV)
-#include 
-#else
-#error "Cannot build: unrecognized architecture, see ."
-#endif
+#define UATOMIC_HAS_ATOMIC_BYTE
+#define UATOMIC_HAS_ATOMIC_SHORT
+
+#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE)
+
+#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME)
+
+#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), 
__ATOMIC_SEQ_CST)
+
+#define uatomic_cmpxchg(addr, old, new) \
+   ({  
\
+   __typeof__(*(addr)) __old = old;
\
+   __atomic_compare_exchange_n(addr, &__old, new, 0,   
\
+   __ATOMIC_SEQ_CST, 
__ATOMIC_SEQ_CST);\
+   __old;  
\
+   })
+
+#define uatomic_add_return(addr, v) \
+   __atomic_add_fetch((addr), (v), __ATOMIC_SEQ_CST)
+
+#define uatomic_add(addr, v) \
+   (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_sub_return(addr, v) \
+   __atomic_sub_fetch((addr), (v), __ATOMIC_SEQ_CST)
+
+#define uatomic_sub(addr, v) \
+   (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_and(addr, mask) \
+   (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define uatomic_or(addr, mask) \
+   (void)__atomic_or_fetch((addr), (mask), __ATOMIC_REL

Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-20 Thread Mathieu Desnoyers via lttng-dev

On 2023-03-17 17:37, Ondřej Surý via lttng-dev wrote:

Replace the custom assembly code in include/urcu/uatomic/ with __atomic
builtins provided by C11-compatible compiler.


[...]

+#define UATOMIC_HAS_ATOMIC_BYTE
+#define UATOMIC_HAS_ATOMIC_SHORT
+
+#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE)
+
+#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME)
+
+#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), 
__ATOMIC_ACQ_REL)
+
+#define uatomic_cmpxchg(addr, old, new) \
+   ({  
\
+   __typeof__(*(addr)) __old = old;
\
+   __atomic_compare_exchange_n(addr, &__old, new, 0,   \
+   __ATOMIC_ACQ_REL, 
__ATOMIC_CONSUME);\


In doc/uatomic-api.md, we document:

"```c
type uatomic_cmpxchg(type *addr, type old, type new);
```

An atomic read-modify-write operation that performs this
sequence of operations atomically: check if `addr` contains `old`.
If true, then replace the content of `addr` by `new`. Return the
value previously contained by `addr`. This function implies a full
memory barrier before and after the atomic operation."

This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure
rather than "__ATOMIC_CONSUME".


+   __old;  
\
+   })
+
+#define uatomic_add_return(addr, v) \
+   __atomic_add_fetch((addr), (v), __ATOMIC_ACQ_REL)
+
+#define uatomic_add(addr, v) \
+   (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_sub_return(addr, v) \
+   __atomic_sub_fetch((addr), (v), __ATOMIC_ACQ_REL)
+
+#define uatomic_sub(addr, v) \
+   (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_and(addr, mask) \
+   (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define uatomic_or(addr, mask) \
+   (void)__atomic_or_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define uatomic_inc(addr) (void)__atomic_add_fetch((addr), 1, __ATOMIC_RELAXED)
+#define uatomic_dec(addr) (void)__atomic_sub_fetch((addr), 1, __ATOMIC_RELAXED)
+
+#define cmm_smp_mb__before_uatomic_and()   
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__after_uatomic_and()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__before_uatomic_or()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__after_uatomic_or() 
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__before_uatomic_add()   
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__after_uatomic_add()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__before_uatomic_sub()   cmm_smp_mb__before_uatomic_add()
+#define cmm_smp_mb__after_uatomic_sub()
cmm_smp_mb__after_uatomic_add()
+#define cmm_smp_mb__before_uatomic_inc()   cmm_smp_mb__before_uatomic_add()
+#define cmm_smp_mb__after_uatomic_inc()
cmm_smp_mb__after_uatomic_add()
+#define cmm_smp_mb__before_uatomic_dec()   cmm_smp_mb__before_uatomic_add()
+#define cmm_smp_mb__after_uatomic_dec()
cmm_smp_mb__after_uatomic_add()
+
+#define cmm_smp_mb()   cmm_mb()


While OK for the general case, I would recommend that we immediately 
implement something more efficient on x86 32/64 which takes into account 
that __ATOMIC_ACQ_REL atomic operations are implemented with LOCK 
prefixed atomic ops, which imply the barrier already, leaving the 
before/after_uatomic_*() as no-ops.


Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-20 Thread Mathieu Desnoyers via lttng-dev

On 2023-03-20 14:03, Mathieu Desnoyers via lttng-dev wrote:

On 2023-03-17 17:37, Ondřej Surý via lttng-dev wrote:

Replace the custom assembly code in include/urcu/uatomic/ with __atomic
builtins provided by C11-compatible compiler.


[...]

+#define UATOMIC_HAS_ATOMIC_BYTE
+#define UATOMIC_HAS_ATOMIC_SHORT
+
+#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE)
+
+#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME)
+
+#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), 
__ATOMIC_ACQ_REL)

+
+#define uatomic_cmpxchg(addr, old, new) \
+    ({    \
+    __typeof__(*(addr)) __old = old;    \
+    __atomic_compare_exchange_n(addr, &__old, new, 0,    \
+    __ATOMIC_ACQ_REL, __ATOMIC_CONSUME);\




Actually, I suspect we'd want to change __ATOMIC_ACQ_REL to 
__ATOMIC_SEQ_CST everywhere, because we want total order.


Thanks,

Mathieu


In doc/uatomic-api.md, we document:

"```c
type uatomic_cmpxchg(type *addr, type old, type new);
```

An atomic read-modify-write operation that performs this
sequence of operations atomically: check if `addr` contains `old`.
If true, then replace the content of `addr` by `new`. Return the
value previously contained by `addr`. This function implies a full
memory barrier before and after the atomic operation."

This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure
rather than "__ATOMIC_CONSUME".


+    __old;    \
+    })
+
+#define uatomic_add_return(addr, v) \
+    __atomic_add_fetch((addr), (v), __ATOMIC_ACQ_REL)
+
+#define uatomic_add(addr, v) \
+    (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_sub_return(addr, v) \
+    __atomic_sub_fetch((addr), (v), __ATOMIC_ACQ_REL)
+
+#define uatomic_sub(addr, v) \
+    (void)__atomic_sub_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_and(addr, mask) \
+    (void)__atomic_and_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define uatomic_or(addr, mask)    \
+    (void)__atomic_or_fetch((addr), (mask), __ATOMIC_RELAXED)
+
+#define uatomic_inc(addr) (void)__atomic_add_fetch((addr), 1, 
__ATOMIC_RELAXED)
+#define uatomic_dec(addr) (void)__atomic_sub_fetch((addr), 1, 
__ATOMIC_RELAXED)

+
+#define cmm_smp_mb__before_uatomic_and()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__after_uatomic_and()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__before_uatomic_or()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__after_uatomic_or()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__before_uatomic_add()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__after_uatomic_add()
__atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_mb__before_uatomic_sub()
cmm_smp_mb__before_uatomic_add()
+#define cmm_smp_mb__after_uatomic_sub()
cmm_smp_mb__after_uatomic_add()
+#define cmm_smp_mb__before_uatomic_inc()
cmm_smp_mb__before_uatomic_add()
+#define cmm_smp_mb__after_uatomic_inc()
cmm_smp_mb__after_uatomic_add()
+#define cmm_smp_mb__before_uatomic_dec()
cmm_smp_mb__before_uatomic_add()
+#define cmm_smp_mb__after_uatomic_dec()
cmm_smp_mb__after_uatomic_add()

+
+#define cmm_smp_mb()    cmm_mb()


While OK for the general case, I would recommend that we immediately 
implement something more efficient on x86 32/64 which takes into account 
that __ATOMIC_ACQ_REL atomic operations are implemented with LOCK 
prefixed atomic ops, which imply the barrier already, leaving the 
before/after_uatomic_*() as no-ops.


Thanks,

Mathieu



--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-20 Thread Ondřej Surý via lttng-dev

> On 20. 3. 2023, at 19:03, Mathieu Desnoyers  
> wrote:
> 
> In doc/uatomic-api.md, we document:
> 
> "```c
> type uatomic_cmpxchg(type *addr, type old, type new);
> ```
> 
> An atomic read-modify-write operation that performs this
> sequence of operations atomically: check if `addr` contains `old`.
> If true, then replace the content of `addr` by `new`. Return the
> value previously contained by `addr`. This function implies a full
> memory barrier before and after the atomic operation."
> 
> This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure
> rather than "__ATOMIC_CONSUME".


From: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

> If desired is written into *ptr then true is returned and memory is affected 
> according to the memory order specified by success_memorder. There are no 
> restrictions on what memory order can be used here.
> 
> Otherwise, false is returned and memory is affected according to 
> failure_memorder. This memory order cannot be __ATOMIC_RELEASE nor 
> __ATOMIC_ACQ_REL. It also cannot be a stronger order than that specified by 
> success_memorder.

I think it makes sense that the failure_memorder has the same memorder as 
uatomic_read(), but it definitely cannot be __ATOMIC_ACQ_REL - it's same as 
with __atomic_load_n, only following are permitted:

> The valid memory order variants are __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, 
> __ATOMIC_ACQUIRE, and __ATOMIC_CONSUME.

Ondrej
--
Ondřej Surý (He/Him)
ond...@sury.org

___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-20 Thread Mathieu Desnoyers via lttng-dev

On 2023-03-20 14:28, Ondřej Surý wrote:



On 20. 3. 2023, at 19:03, Mathieu Desnoyers  
wrote:

In doc/uatomic-api.md, we document:

"```c
type uatomic_cmpxchg(type *addr, type old, type new);
```

An atomic read-modify-write operation that performs this
sequence of operations atomically: check if `addr` contains `old`.
If true, then replace the content of `addr` by `new`. Return the
value previously contained by `addr`. This function implies a full
memory barrier before and after the atomic operation."

This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure
rather than "__ATOMIC_CONSUME".



From: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html


If desired is written into *ptr then true is returned and memory is affected 
according to the memory order specified by success_memorder. There are no 
restrictions on what memory order can be used here.

Otherwise, false is returned and memory is affected according to 
failure_memorder. This memory order cannot be __ATOMIC_RELEASE nor 
__ATOMIC_ACQ_REL. It also cannot be a stronger order than that specified by 
success_memorder.


I think it makes sense that the failure_memorder has the same memorder as 
uatomic_read(), but it definitely cannot be __ATOMIC_ACQ_REL - it's same as 
with __atomic_load_n, only following are permitted:


The valid memory order variants are __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, 
__ATOMIC_ACQUIRE, and __ATOMIC_CONSUME.


Based on my other reply, we want "SEQ_CST" rather than ACQ_REL everywhere.

Thanks,

Mathieu



Ondrej
--
Ondřej Surý (He/Him)
ond...@sury.org



--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-20 Thread Mathieu Desnoyers via lttng-dev

On 2023-03-20 14:38, Mathieu Desnoyers via lttng-dev wrote:

On 2023-03-20 14:28, Ondřej Surý wrote:


On 20. 3. 2023, at 19:03, Mathieu Desnoyers 
 wrote:


In doc/uatomic-api.md, we document:

"```c
type uatomic_cmpxchg(type *addr, type old, type new);
```

An atomic read-modify-write operation that performs this
sequence of operations atomically: check if `addr` contains `old`.
If true, then replace the content of `addr` by `new`. Return the
value previously contained by `addr`. This function implies a full
memory barrier before and after the atomic operation."

This would map to a "__ATOMIC_ACQ_REL" semantic on cmpxchg failure
rather than "__ATOMIC_CONSUME".



From: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

If desired is written into *ptr then true is returned and memory is 
affected according to the memory order specified by success_memorder. 
There are no restrictions on what memory order can be used here.


Otherwise, false is returned and memory is affected according to 
failure_memorder. This memory order cannot be __ATOMIC_RELEASE nor 
__ATOMIC_ACQ_REL. It also cannot be a stronger order than that 
specified by success_memorder.


I think it makes sense that the failure_memorder has the same memorder 
as uatomic_read(), but it definitely cannot be __ATOMIC_ACQ_REL - 
it's same as with __atomic_load_n, only following are permitted:


The valid memory order variants are __ATOMIC_RELAXED, 
__ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE, and __ATOMIC_CONSUME.


Based on my other reply, we want "SEQ_CST" rather than ACQ_REL everywhere.


And it _would_ make sense to use the same memorder on cmpxchg failure as 
uatomic_read if we were exposing a new API, but we are modifying an 
already exposed documented API, so I would stick to SEQ_CST for both 
cmpxchg success/failure.


If we want to expose a new cmpxchg_relaxed_failure with a relaxed 
memorder on failure that would be fine, but we cannot change the 
semantic that is already documented.


Thanks,

Mathieu



Thanks,

Mathieu



Ondrej
--
Ondřej Surý (He/Him)
ond...@sury.org





--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-20 Thread Duncan Sands via lttng-dev

Hi Mathieu,

While OK for the general case, I would recommend that we immediately implement 
something more efficient on x86 32/64 which takes into account that 
__ATOMIC_ACQ_REL atomic operations are implemented with LOCK prefixed atomic 
ops, which imply the barrier already, leaving the before/after_uatomic_*() as 
no-ops.


maybe first check whether the GCC optimizers merge them.  I believe some 
optimizations of atomic primitives are allowed and implemented, but I couldn't 
say which ones.


Best wishes, Duncan.
___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-21 Thread Mathieu Desnoyers via lttng-dev

On 2023-03-21 09:30, Ondřej Surý via lttng-dev wrote:

Replace the custom assembly code in include/urcu/uatomic/ with __atomic
builtins provided by C11-compatible compiler.

Signed-off-by: Ondřej Surý 
---
  include/Makefile.am|  16 -
  include/urcu/uatomic.h |  84 +++--
  include/urcu/uatomic/aarch64.h |  41 ---
  include/urcu/uatomic/alpha.h   |  32 --
  include/urcu/uatomic/arm.h |  57 ---
  include/urcu/uatomic/gcc.h |  46 ---
  include/urcu/uatomic/generic.h | 613 ---
  include/urcu/uatomic/hppa.h|  10 -
  include/urcu/uatomic/ia64.h|  41 ---
  include/urcu/uatomic/m68k.h|  44 ---
  include/urcu/uatomic/mips.h|  32 --
  include/urcu/uatomic/nios2.h   |  32 --
  include/urcu/uatomic/ppc.h | 237 
  include/urcu/uatomic/riscv.h   |  44 ---
  include/urcu/uatomic/s390.h| 170 -
  include/urcu/uatomic/sparc64.h |  81 -
  include/urcu/uatomic/tile.h|  41 ---
  include/urcu/uatomic/x86.h | 646 -
  18 files changed, 53 insertions(+), 2214 deletions(-)
  delete mode 100644 include/urcu/uatomic/aarch64.h
  delete mode 100644 include/urcu/uatomic/alpha.h
  delete mode 100644 include/urcu/uatomic/arm.h
  delete mode 100644 include/urcu/uatomic/gcc.h
  delete mode 100644 include/urcu/uatomic/generic.h
  delete mode 100644 include/urcu/uatomic/hppa.h
  delete mode 100644 include/urcu/uatomic/ia64.h
  delete mode 100644 include/urcu/uatomic/m68k.h
  delete mode 100644 include/urcu/uatomic/mips.h
  delete mode 100644 include/urcu/uatomic/nios2.h
  delete mode 100644 include/urcu/uatomic/ppc.h
  delete mode 100644 include/urcu/uatomic/riscv.h
  delete mode 100644 include/urcu/uatomic/s390.h
  delete mode 100644 include/urcu/uatomic/sparc64.h
  delete mode 100644 include/urcu/uatomic/tile.h
  delete mode 100644 include/urcu/uatomic/x86.h

diff --git a/include/Makefile.am b/include/Makefile.am
index ba1fe60..53a28fd 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -59,24 +59,8 @@ nobase_include_HEADERS = \
urcu/syscall-compat.h \
urcu/system.h \
urcu/tls-compat.h \
-   urcu/uatomic/aarch64.h \
-   urcu/uatomic/alpha.h \
urcu/uatomic_arch.h \
-   urcu/uatomic/arm.h \
-   urcu/uatomic/gcc.h \
-   urcu/uatomic/generic.h \
urcu/uatomic.h \
-   urcu/uatomic/hppa.h \
-   urcu/uatomic/ia64.h \
-   urcu/uatomic/m68k.h \
-   urcu/uatomic/mips.h \
-   urcu/uatomic/nios2.h \
-   urcu/uatomic/ppc.h \
-   urcu/uatomic/riscv.h \
-   urcu/uatomic/s390.h \
-   urcu/uatomic/sparc64.h \
-   urcu/uatomic/tile.h \
-   urcu/uatomic/x86.h \
urcu/urcu-bp.h \
urcu/urcu-futex.h \
urcu/urcu.h \
diff --git a/include/urcu/uatomic.h b/include/urcu/uatomic.h
index 2fb5fd4..0327810 100644
--- a/include/urcu/uatomic.h
+++ b/include/urcu/uatomic.h
@@ -22,37 +22,59 @@
  #define _URCU_UATOMIC_H
  
  #include 

+#include 
  
-#if defined(URCU_ARCH_X86)

-#include 
-#elif defined(URCU_ARCH_PPC)
-#include 
-#elif defined(URCU_ARCH_S390)
-#include 
-#elif defined(URCU_ARCH_SPARC64)
-#include 
-#elif defined(URCU_ARCH_ALPHA)
-#include 
-#elif defined(URCU_ARCH_IA64)
-#include 
-#elif defined(URCU_ARCH_ARM)
-#include 
-#elif defined(URCU_ARCH_AARCH64)
-#include 
-#elif defined(URCU_ARCH_MIPS)
-#include 
-#elif defined(URCU_ARCH_NIOS2)
-#include 
-#elif defined(URCU_ARCH_TILE)
-#include 
-#elif defined(URCU_ARCH_HPPA)
-#include 
-#elif defined(URCU_ARCH_M68K)
-#include 
-#elif defined(URCU_ARCH_RISCV)
-#include 
-#else
-#error "Cannot build: unrecognized architecture, see ."
-#endif
+#define UATOMIC_HAS_ATOMIC_BYTE
+#define UATOMIC_HAS_ATOMIC_SHORT
+
+#define uatomic_set(addr, v) __atomic_store_n(addr, v, __ATOMIC_RELEASE)
+
+#define uatomic_read(addr) __atomic_load_n((addr), __ATOMIC_CONSUME)
+
+#define uatomic_xchg(addr, v) __atomic_exchange_n((addr), (v), 
__ATOMIC_SEQ_CST)
+
+#define uatomic_cmpxchg(addr, old, new) \
+   ({  
\
+   __typeof__(*(addr)) __old = old;
\
+   __atomic_compare_exchange_n(addr, &__old, new, 0,   \
+   __ATOMIC_SEQ_CST, 
__ATOMIC_SEQ_CST);\
+   __old;  
\
+   })
+
+#define uatomic_add_return(addr, v) \
+   __atomic_add_fetch((addr), (v), __ATOMIC_SEQ_CST)


The extra parentheses around "addr" and "v" here are not needed due to 
operator priority of comma ",". Likewise elsewhere in this patch.


Also, as mentioned earlier, please special-case the x86 implementation 
to include the __ATOMIC_SEQ_CST into atomic operations.


Thanks,

Mathieu


+
+#define uatomic_add(addr, v) \
+   (void)__atomic_add_fetch((addr), (v), __ATOMIC_RELAXED)
+
+#define uatomic_sub_return(addr, v) \
+   __ato

Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-21 Thread Mathieu Desnoyers via lttng-dev

On 2023-03-20 15:38, Duncan Sands via lttng-dev wrote:

Hi Mathieu,

While OK for the general case, I would recommend that we immediately 
implement something more efficient on x86 32/64 which takes into 
account that __ATOMIC_ACQ_REL atomic operations are implemented with 
LOCK prefixed atomic ops, which imply the barrier already, leaving the 
before/after_uatomic_*() as no-ops.


maybe first check whether the GCC optimizers merge them.  I believe some 
optimizations of atomic primitives are allowed and implemented, but I 
couldn't say which ones.


Best wishes, Duncan.


Tested on godbolt.org with:

int a;

void fct(void)
{
(void) __atomic_add_fetch(&a, 1, __ATOMIC_RELAXED);
__atomic_thread_fence(__ATOMIC_SEQ_CST);
}

x86-64 gcc 12.2 -O2 -std=c11:

fct:
lock addDWORD PTR a[rip], 1
lock or QWORD PTR [rsp], 0
ret
a:
.zero   4

x86-64 clang 16.0.0 -O2 -std=c11:

fct:# @fct
lockinc dword ptr [rip + a]
mfence
ret
a:
.long   0

So none of gcc/clang optimize this today, hence the need for an 
x86-specific implementation.


Thanks,

Mathieu


--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com

___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev


Re: [lttng-dev] [PATCH 2/7] Use gcc __atomic builtins for implementation

2023-03-22 Thread Duncan Sands via lttng-dev

Hi Mathieu,


Tested on godbolt.org with:

int a;

void fct(void)
{
     (void) __atomic_add_fetch(&a, 1, __ATOMIC_RELAXED);
     __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

x86-64 gcc 12.2 -O2 -std=c11:

fct:
     lock add    DWORD PTR a[rip], 1
     lock or QWORD PTR [rsp], 0
     ret
a:
     .zero   4


that's disappointing.  It's the same if both use __ATOMIC_SEQ_CST:

int a;

void fct(void)
{
(void) __atomic_add_fetch(&a, 1, __ATOMIC_SEQ_CST);
__atomic_thread_fence(__ATOMIC_SEQ_CST);
}

->

fct():
lock addDWORD PTR a[rip], 1
lock or QWORD PTR [rsp], 0
ret
a:
.zero   4

on x86-64 gcc 12.2 -O2 -std=c11.  Clang also doesn't optimize the fence away.

Best wishes, Duncan.
___
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev