On Sun, Mar 17, 2024 at 09:47:33AM +0700, John Naylor wrote:
> I haven't looked at the patches, but the graphs look good.

I spent some more time on these patches.  Specifically, I reordered them to
demonstrate the effects on systems without AVX2 support.  I've also added a
shortcut to jump to the one-by-one approach when there aren't many
elements, as the overhead becomes quite noticeable otherwise.  Finally, I
ran the same benchmarks again on x86 and Arm out to 128 elements.
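
In rough terms, the new shortcut boils down to a check like this before
entering the vectorized loops (simplified from 0002; the label name matches
the patch):

    /* not enough elements for even one 2-register block */
    if (nelem < nelem_per_vector * 2)
        goto slow_path;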

Overall, I think 0001 and 0002 are in decent shape, although I'm wondering
if it's possible to improve the style a bit.  0003 at least needs a big
comment in simd.h, and it might need a note in the documentation, too.  If
the approach in this patch set seems reasonable, I'll spend some time on
that.

BTW I did try some other optimizations, such as processing the remaining
elements with only one vector and using the overlapping strategy with more
registers when relatively many elements remain.  These approaches all added
a lot of complexity and began hurting performance, and I've probably already
spent way too much time optimizing a linear search, so this is where I've
decided to stop.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
From 2f4a7747025cd3288453fdabd520638e37e3633c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 18 Mar 2024 10:44:08 -0500
Subject: [PATCH v4 1/3] pg_lfind32(): Optimize processing remaining elements.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 42 +++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..bef0e2d5be 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -95,15 +95,16 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
-	 * on a block of four registers.  Testing for SSE2 has showed this is ~40%
-	 * faster than using a block of two registers.
+	 * on a block of registers.  We first do as much processing as possible
+	 * with a block of 4 registers, then we try to process what remains with a
+	 * block of 2 registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
-	const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+	uint32		nelem_per_iteration = 4 * nelem_per_vector;
 
 	/* round down to multiple of elements per iteration */
-	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
+	uint32		tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
 	bool		assert_result = false;
@@ -157,6 +158,39 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	/*
+	 * Try processing the remaining elements using 2 registers instead of 4.
+	 */
+	nelem_per_iteration = 2 * nelem_per_vector;
+	tail_idx = nelem & ~(nelem_per_iteration - 1);
+
+	for (; i < tail_idx; i += nelem_per_iteration)
+	{
+		Vector32	vals1,
+					vals2,
+					result1,
+					result2,
+					result;
+
+		/* load the next block into 2 registers */
+		vector32_load(&vals1, &base[i]);
+		vector32_load(&vals2, &base[i + nelem_per_vector]);
+
+		/* compare each value to the key */
+		result1 = vector32_eq(keys, vals1);
+		result2 = vector32_eq(keys, vals2);
+
+		/* combine the results into a single variable */
+		result = vector32_or(result1, result2);
+
+		/* see if there was a match */
+		if (vector32_is_highbit_set(result))
+		{
+			Assert(assert_result);
+			return true;
+		}
+	}
 #endif							/* ! USE_NO_SIMD */
 
 	/* Process the remaining elements one at a time. */
-- 
2.25.1
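
For readers who don't want to reassemble it from the diff, the tail handling
that 0001 adds amounts to the following (lightly simplified; the helpers are
the existing ones from simd.h):

    /* process what remains in blocks of 2 registers instead of 4 */
    nelem_per_iteration = 2 * nelem_per_vector;
    tail_idx = nelem & ~(nelem_per_iteration - 1);

    for (; i < tail_idx; i += nelem_per_iteration)
    {
        Vector32    vals1, vals2, result;

        /* load the next block into 2 registers */
        vector32_load(&vals1, &base[i]);
        vector32_load(&vals2, &base[i + nelem_per_vector]);

        /* compare each value to the key and combine the results */
        result = vector32_or(vector32_eq(keys, vals1),
                             vector32_eq(keys, vals2));

        /* see if there was a match */
        if (vector32_is_highbit_set(result))
            return true;
    }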

From 68ee8bf34c80a0a3df02c2aae8357f664895b4de Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 18 Mar 2024 10:55:50 -0500
Subject: [PATCH v4 2/3] pg_lfind32(): Further optimize processing remaining
 elements.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/pg_lfind.h | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index bef0e2d5be..83fb8f50d2 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -96,8 +96,8 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	/*
 	 * For better instruction-level parallelism, each loop iteration operates
 	 * on a block of registers.  We first do as much processing as possible
-	 * with a block of 4 registers, then we try to process what remains with a
-	 * block of 2 registers.
+	 * with a block of 4 registers, then we process what remains with a block
+	 * of 2 registers.
 	 */
 	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
 	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
@@ -120,6 +120,15 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
+	/*
+	 * If there aren't enough elements for the SIMD optimizations, jump
+	 * straight to the standard one-by-one linear search code.  Testing has
+	 * shown that the gains of skipping to the standard linear search code are
+	 * worth the extra check.
+	 */
+	if (nelem < nelem_per_vector * 2)
+		goto slow_path;
+
 	for (i = 0; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -165,6 +174,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	nelem_per_iteration = 2 * nelem_per_vector;
 	tail_idx = nelem & ~(nelem_per_iteration - 1);
 
+retry:
 	for (; i < tail_idx; i += nelem_per_iteration)
 	{
 		Vector32	vals1,
@@ -191,8 +201,25 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 			return true;
 		}
 	}
+
+	/*
+	 * Process the remaining elements via the 2-register loop above.  This
+	 * will cause us to process some elements more than once, but that won't
+	 * affect correctness, and testing shows that this approach helps more
+	 * than it harms.
+	 */
+	if (i != nelem)
+	{
+		tail_idx = nelem;
+		i = tail_idx - nelem_per_iteration;
+		goto retry;
+	}
+
+	Assert(!assert_result);
+	return false;
 #endif							/* ! USE_NO_SIMD */
 
+slow_path:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1
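
The overlap trick in 0002 is easier to see outside the diff: once the
4-register and 2-register loops have consumed all full blocks, any leftover
partial block is handled by backing i up so that one final 2-register
iteration ends exactly at nelem (simplified from the patch):

    /*
     * Re-run the 2-register loop over the last nelem_per_iteration
     * elements.  Some elements are inspected twice, but that cannot
     * change the result of a membership test.
     */
    if (i != nelem)
    {
        tail_idx = nelem;
        i = tail_idx - nelem_per_iteration;
        goto retry;
    }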

From 41882bbf78f2d8a1fe817a0cbac70f221a0debf4 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 18 Mar 2024 11:02:05 -0500
Subject: [PATCH v4 3/3] Add support for AVX2 in simd.h.

Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13
---
 src/include/port/simd.h | 61 ++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 597496f2fb..f06b21876b 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,18 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+/*
+ * XXX: Need to add a big comment here.
+ */
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -107,7 +118,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -120,7 +133,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -134,7 +149,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -147,7 +164,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -270,7 +289,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -308,7 +329,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline uint32
 vector8_highbit_mask(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return (uint32) _mm256_movemask_epi8(v);
+#elif defined(USE_SSE2)
 	return (uint32) _mm_movemask_epi8(v);
 #elif defined(USE_NEON)
 	/*
@@ -337,7 +360,9 @@ vector8_highbit_mask(const Vector8 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -350,7 +375,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -368,7 +395,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -384,7 +413,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -396,7 +427,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
@@ -411,7 +444,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_min(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_min_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_min_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vminq_u8(v1, v2);
-- 
2.25.1
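
As a quick sanity check of the intrinsic mapping in 0003, here is a small
standalone program (not part of the patch set) that exercises the same AVX2
intrinsics; it assumes a compiler invoked with -mavx2 so that __AVX2__ is
defined:

    #include <stdint.h>
    #include <stdio.h>
    #include <immintrin.h>

    int
    main(void)
    {
        uint32_t    arr[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        __m256i     keys = _mm256_set1_epi32(6);                  /* vector32_broadcast */
        __m256i     vals = _mm256_loadu_si256((const __m256i *) arr);  /* vector32_load */
        __m256i     cmp = _mm256_cmpeq_epi32(keys, vals);         /* vector32_eq */

        /* vector32_is_highbit_set(): did any lane match the key? */
        printf("match: %d\n", _mm256_movemask_epi8(cmp) != 0);
        return 0;
    }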
