On Tue, 27 Jun 2023 at 07:09, Andres Freund <and...@anarazel.de> wrote: > On 2023-06-27 15:33:57 +1200, Thomas Munro wrote: > > On Tue, Jun 27, 2023 at 2:05 PM Andres Freund <and...@anarazel.de> wrote: > > > Unfortunately it scaled way worse at first. This is not an inherent > > > issue, but > > > due to an implementation choice in ReadRecentBuffer(). Whereas the normal > > > BufferAlloc() path uses PinBuffer(), ReadRecentBuffer() first does > > > LockBufHdr(), checks if the buffer ID is the same and then uses > > > PinBuffer_Locked(). > > > > > > The problem with that is that PinBuffer() takes care to not hold the > > > buffer > > > header spinlock, it uses compare_exchange to atomically acquire the pin, > > > while > > > guaranteing nobody holds the lock. When holding the buffer header > > > spinlock, > > > there obviously is the risk of being scheduled out (or even just not have > > > exclusive access to the cacheline). > > > > Yeah. Aside from inherent nastiness of user-space spinlocks > > I've been wondering about making our backoff path use futexes, after some > adaptive spinning.
If you want to experiment, here is a rebased version of something I hacked up a couple of years back on the way to FOSDEM PGDay. I didn't pursue it further because I didn't have a use case where it showed a significant difference. -- Ants
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c index 327ac64f7c2..67a5e8a0246 100644 --- a/src/backend/storage/lmgr/s_lock.c +++ b/src/backend/storage/lmgr/s_lock.c @@ -92,6 +92,7 @@ s_lock_stuck(const char *file, int line, const char *func) int s_lock(volatile slock_t *lock, const char *file, int line, const char *func) { +#ifndef HAS_FUTEX SpinDelayStatus delayStatus; init_spin_delay(&delayStatus, file, line, func); @@ -104,6 +105,8 @@ s_lock(volatile slock_t *lock, const char *file, int line, const char *func) finish_spin_delay(&delayStatus); return delayStatus.delays; +#endif + elog(FATAL, "Should not be called"); } #ifdef USE_DEFAULT_S_UNLOCK @@ -230,6 +233,71 @@ update_spins_per_delay(int shared_spins_per_delay) return (shared_spins_per_delay * 15 + spins_per_delay) / 16; } +#ifdef HAS_FUTEX +#include <unistd.h> +#include <sys/syscall.h> +#include <linux/futex.h> + +static int +futex(volatile uint32 *uaddr, int futex_op, int val, + const struct timespec *timeout, int *uaddr2, int val3) +{ + return syscall(SYS_futex, uaddr, futex_op, val, + timeout, uaddr, val3); +} + +int +futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func) +{ + int i, s; + /* + * First lets wait for a bit without involving the kernel, it is quite likely + * the lock holder is still running. 
+ **/ + if (likely(current < 2)) + { + uint32 expected; + for (i = 0; i < DEFAULT_SPINS_PER_DELAY; i++) + { + SPIN_DELAY(); + expected = lock->value; + if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 1)) + return i; + } + + while (expected != 2 && !pg_atomic_compare_exchange_u32(lock, &expected, 2)) { + if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 2)) + return i; + } + } + + /* At this point lock value is 2 and we will get waken up */ + while (true) + { + uint32 expected = 0; + s = futex(&(lock->value), FUTEX_WAIT, 2, NULL, NULL, 0); + if (s == -1 && errno != EAGAIN) + elog(FATAL, "Futex wait failed with error: %m"); + + /* Maybe someone else was waiting too, we will try to wake them up. */ + if (pg_atomic_compare_exchange_u32(lock, &expected, 2)) + break; + + } + + return i; +} + +int futex_unlock(volatile slock_t *lock, uint32 current) +{ + lock->value = 0; + if (futex(&(lock->value), FUTEX_WAKE, 1, NULL, NULL, 0) == -1) + elog(FATAL, "Futex wake failed with error: %m"); + + return 0; +} + +#endif /* HAS_FUTEX */ /*****************************************************************************/ #if defined(S_LOCK_TEST) diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index c9fa84cc43c..6351ec0804e 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -205,6 +205,52 @@ spin_delay(void) #ifdef __x86_64__ /* AMD Opteron, Intel EM64T */ #define HAS_TEST_AND_SET +#if defined(__linux__) +#define HAS_FUTEX 1 /* TODO: move to configure to check for old kernels */ +#endif + +#ifdef HAS_FUTEX + +#include "port/atomics.h" + +typedef pg_atomic_uint32 slock_t; + +#define S_LOCK(lock) \ + do { \ + uint32 expected = 0; \ + if (unlikely(!pg_atomic_compare_exchange_u32((lock), &expected, 1))) \ + futex_lock((lock), expected, __FILE__, __LINE__, __func__); \ + } while (0) + + +#define S_UNLOCK(lock) \ + do { \ + uint32 actual = pg_atomic_exchange_u32((lock), 0); \ + if (unlikely(actual == 
2)) \ + futex_unlock((lock), actual); \ + } while (0) +extern int futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func); +extern int futex_unlock(volatile slock_t *lock, uint32 current); + +/* TAS only needed for regress */ +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register uint32 _res = 1; + + __asm__ __volatile__( + " lock \n" + " xchgl %0,%1 \n" +: "+q"(_res), "+m"(*lock) +: /* no inputs */ +: "memory", "cc"); + return (int) _res; +} + + +#else typedef unsigned char slock_t; #define TAS(lock) tas(lock) @@ -247,6 +293,7 @@ spin_delay(void) " rep; nop \n"); } +#endif /* HAS_FUTEX */ #endif /* __x86_64__ */ diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index bcbc6d910f1..b92f8542ae9 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -857,7 +857,11 @@ test_spinlock(void) S_UNLOCK(&struct_w_lock.lock); /* and that "contended" acquisition works */ +#ifdef HAS_FUTEX + futex_lock(&struct_w_lock.lock, 1, "testfile", 17, "testfunc"); +#else s_lock(&struct_w_lock.lock, "testfile", 17, "testfunc"); +#endif S_UNLOCK(&struct_w_lock.lock); /*