On Tue, 27 Jun 2023 at 07:09, Andres Freund <and...@anarazel.de> wrote:
> On 2023-06-27 15:33:57 +1200, Thomas Munro wrote:
> > On Tue, Jun 27, 2023 at 2:05 PM Andres Freund <and...@anarazel.de> wrote:
> > > Unfortunately it scaled way worse at first. This is not an inherent 
> > > issue, but
> > > due to an implementation choice in ReadRecentBuffer().  Whereas the normal
> > > BufferAlloc() path uses PinBuffer(), ReadRecentBuffer() first does
> > > LockBufHdr(), checks if the buffer ID is the same and then uses
> > > PinBuffer_Locked().
> > >
> > > The problem with that is that PinBuffer() takes care to not hold the 
> > > buffer
> > > header spinlock, it uses compare_exchange to atomically acquire the pin, 
> > > while
> > > guaranteing nobody holds the lock.  When holding the buffer header 
> > > spinlock,
> > > there obviously is the risk of being scheduled out (or even just not have
> > > exclusive access to the cacheline).
> >
> > Yeah.  Aside from inherent nastiness of user-space spinlocks
>
> I've been wondering about making our backoff path use futexes, after some
> adaptive spinning.

If you want to experiment, here is a rebased version of something I
hacked up a couple of years back on the way to Fosdem Pgday. I didn't
pursue it further because I didn't have a use case where it showed a
significant difference.

--
Ants
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index 327ac64f7c2..67a5e8a0246 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -92,6 +92,7 @@ s_lock_stuck(const char *file, int line, const char *func)
 int
 s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
 {
+#ifndef HAS_FUTEX
 	SpinDelayStatus delayStatus;
 
 	init_spin_delay(&delayStatus, file, line, func);
@@ -104,6 +105,8 @@ s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
 	finish_spin_delay(&delayStatus);
 
 	return delayStatus.delays;
+#endif
+	elog(FATAL, "Should not be called");
 }
 
 #ifdef USE_DEFAULT_S_UNLOCK
@@ -230,6 +233,71 @@ update_spins_per_delay(int shared_spins_per_delay)
 	return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
 }
 
+#ifdef HAS_FUTEX
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/futex.h>
+
+static int
+futex(volatile uint32 *uaddr, int futex_op, int val,
+	  const struct timespec *timeout, int *uaddr2, int val3)
+{
+	return syscall(SYS_futex, uaddr, futex_op, val,
+				   timeout, uaddr, val3);
+}
+
+int
+futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func)
+{
+	int i, s;
+	/*
+	 * First lets wait for a bit without involving the kernel, it is quite likely
+	 * the lock holder is still running.
+	 **/
+	if (likely(current < 2))
+	{
+		uint32 expected;
+		for (i = 0; i < DEFAULT_SPINS_PER_DELAY; i++)
+		{
+			SPIN_DELAY();
+			expected = lock->value;
+			if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 1))
+				return i;
+		}
+
+		while (expected != 2 && !pg_atomic_compare_exchange_u32(lock, &expected, 2)) {
+			if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 2))
+				return i;
+		}
+	}
+
+	/* At this point lock value is 2 and we will get waken up */
+	while (true)
+	{
+		uint32 expected = 0;
+		s = futex(&(lock->value), FUTEX_WAIT, 2, NULL, NULL, 0);
+		if (s == -1 && errno != EAGAIN)
+			elog(FATAL, "Futex wait failed with error: %m");
+
+		/* Maybe someone else was waiting too, we will try to wake them up. */
+		if (pg_atomic_compare_exchange_u32(lock, &expected, 2))
+			break;
+
+	}
+
+	return i;
+}
+
+int futex_unlock(volatile slock_t *lock, uint32 current)
+{
+	lock->value = 0;
+	if (futex(&(lock->value), FUTEX_WAKE, 1, NULL, NULL, 0) == -1)
+		elog(FATAL, "Futex wake failed with error: %m");
+
+	return 0;
+}
+
+#endif /* HAS_FUTEX */
 
 /*****************************************************************************/
 #if defined(S_LOCK_TEST)
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index c9fa84cc43c..6351ec0804e 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -205,6 +205,52 @@ spin_delay(void)
 #ifdef __x86_64__		/* AMD Opteron, Intel EM64T */
 #define HAS_TEST_AND_SET
 
+#if defined(__linux__)
+#define HAS_FUTEX 1 	/* TODO: move to configure to check for old kernels */
+#endif
+
+#ifdef HAS_FUTEX
+
+#include "port/atomics.h"
+
+typedef pg_atomic_uint32 slock_t;
+
+#define S_LOCK(lock) \
+	do { \
+		uint32 expected = 0; \
+		if (unlikely(!pg_atomic_compare_exchange_u32((lock), &expected, 1))) \
+			futex_lock((lock), expected, __FILE__, __LINE__, __func__); \
+	} while (0)
+
+
+#define S_UNLOCK(lock) \
+	do { \
+		uint32 actual = pg_atomic_exchange_u32((lock), 0); \
+		if (unlikely(actual == 2)) \
+			futex_unlock((lock), actual); \
+	} while (0)
+extern int futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func);
+extern int futex_unlock(volatile slock_t *lock, uint32 current);
+
+/* TAS only needed for regress */
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+	register uint32 _res = 1;
+
+	__asm__ __volatile__(
+		"	lock			\n"
+		"	xchgl	%0,%1	\n"
+:		"+q"(_res), "+m"(*lock)
+:		/* no inputs */
+:		"memory", "cc");
+	return (int) _res;
+}
+
+
+#else
 typedef unsigned char slock_t;
 
 #define TAS(lock) tas(lock)
@@ -247,6 +293,7 @@ spin_delay(void)
 		" rep; nop			\n");
 }
 
+#endif   /* HAS_FUTEX */
 #endif	 /* __x86_64__ */
 
 
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index bcbc6d910f1..b92f8542ae9 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -857,7 +857,11 @@ test_spinlock(void)
 		S_UNLOCK(&struct_w_lock.lock);
 
 		/* and that "contended" acquisition works */
+#ifdef HAS_FUTEX
+		futex_lock(&struct_w_lock.lock, 1, "testfile", 17, "testfunc");
+#else
 		s_lock(&struct_w_lock.lock, "testfile", 17, "testfunc");
+#endif
 		S_UNLOCK(&struct_w_lock.lock);
 
 		/*

Reply via email to