Damien Zammit, on Wed. 07 Jan. 2026 03:08:27 +0000, wrote:
> To prevent GCC from clobbering one of the inputs and
> to make this cross-arch compatible, we use C11 atomics.
> The packed struct containing a lock is unpacked to fix alignment.
>
> ---
> i386/i386/irq.c | 2 +-
> i386/i386/lock.h | 26 +++++++++++---------------
> kern/lock.h | 4 ++--
> 3 files changed, 14 insertions(+), 18 deletions(-)
>
> diff --git a/i386/i386/irq.c b/i386/i386/irq.c
> index ea10e179..8b89b70d 100644
> --- a/i386/i386/irq.c
> +++ b/i386/i386/irq.c
> @@ -39,7 +39,7 @@ struct nested_irq {
> simple_lock_irq_data_t irq_lock; /* Protects ndisabled */
> int32_t ndisabled;
> uint32_t unused[14];
> -} __attribute__((packed)) nested_irqs[NINTR];
> +} nested_irqs[NINTR];
Actually, it would be better to replace the "unused" field (and the packed
attribute) with an aligned attribute; that will make it much clearer what we
want here.
(You can introduce the 64B cache line size as a macro in a header.)
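For concreteness, a rough sketch of what I mean, assuming a hypothetical
CACHE_LINE_SIZE macro (the macro name and which header it lives in are only
suggestions):

	/* In some shared i386 header -- name is only a suggestion. */
	#define CACHE_LINE_SIZE 64

	struct nested_irq {
		simple_lock_irq_data_t irq_lock;	/* Protects ndisabled */
		int32_t ndisabled;
	} __attribute__((aligned(CACHE_LINE_SIZE))) nested_irqs[NINTR];

With the aligned attribute on the type, sizeof(struct nested_irq) gets rounded
up to the cache line size, so each array entry still occupies its own cache
line without spelling the padding out by hand.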
> diff --git a/i386/i386/lock.h b/i386/i386/lock.h
> index 0384536a..c086e9b5 100644
> --- a/i386/i386/lock.h
> +++ b/i386/i386/lock.h
> @@ -33,11 +33,11 @@
> {.lock_data = 0}
>
> #if NCPUS > 1
> +#include <stdatomic.h>
> #include <i386/smp.h>
>
> /*
> - * All of the locking routines are built from calls on
> - * a locked-exchange operation. Values of the lock are
> + * Values of the lock are
> * 0 for unlocked, 1 for locked.
> */
>
> @@ -47,31 +47,27 @@
> * The code here depends on the GNU C compiler.
> */
>
> -#define _simple_lock_xchg_(lock, new_val) \
> -({ natural_t _old_val_; \
> - asm volatile("xchg %0, %2" \
> - : "=r" (_old_val_) \
> - : "0" ((natural_t)(new_val)), "m" (*(lock)) : "memory" \
> - ); \
> - _old_val_; \
> - })
> -
> #define simple_lock_init(l) \
> ((l)->lock_data = 0)
>
> #define _simple_lock(l) \
> ({ \
> - while(_simple_lock_xchg_(l, 1)) \
> - while (*(volatile natural_t *)&(l)->lock_data) \
> + for (;;) { \
> + if (!atomic_exchange_explicit(&(l)->lock_data, 1, __ATOMIC_ACQUIRE)) \
> + break; \
> + while (atomic_load_explicit(&(l)->lock_data, __ATOMIC_RELAXED)) \
> cpu_pause(); \
> + } \
> 0; \
> })
>
> #define _simple_unlock(l) \
> - (_simple_lock_xchg_(l, 0))
> + atomic_store_explicit(&(l)->lock_data, 0, __ATOMIC_RELEASE)
>
> #define _simple_lock_try(l) \
> - (!_simple_lock_xchg_(l, 1))
> + (!atomic_load_explicit(&(l)->lock_data, __ATOMIC_RELAXED) && \
> + !atomic_exchange_explicit(&(l)->lock_data, 1, __ATOMIC_ACQUIRE))
I wonder if the load is really useful. It indeed avoids a memory barrier
if the try fails, but in the success case it will trigger two coherency
protocol events: first a shared read, then an exclusive exchange.
This try macro is usually used when success is rather probable, so
I'd rather optimize for the success case.
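Just as a sketch of what I have in mind (same lock_data field, untested):

	#define _simple_lock_try(l) \
		(!atomic_exchange_explicit(&(l)->lock_data, 1, __ATOMIC_ACQUIRE))

i.e. drop the relaxed load and go straight for the exchange, so the common
(successful) path only pays for a single cache line transition.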
> +
>
> /*
> * General bit-lock routines.
> diff --git a/kern/lock.h b/kern/lock.h
> index 74f2e26e..38111ed0 100644
> --- a/kern/lock.h
> +++ b/kern/lock.h
> @@ -58,6 +58,7 @@
> * MACH_LDEBUG is set.
> */
>
> +#include <stdatomic.h>
> #include <machine/lock.h>/*XXX*/
> #if NCPUS > 1
> #if MACH_LOCK_MON == 0
> @@ -76,9 +77,8 @@
> /*
> * A simple spin lock.
> */
> -
> struct slock {
> - volatile natural_t lock_data; /* in general 1 bit is sufficient */
> + atomic_uint lock_data; /* in general 1 bit is sufficient */
> struct {} is_a_simple_lock;
> };
>
> --
> 2.45.2