Here's a new version of 0001 with some added #ifdefs that cfbot revealed
were missing.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From cc2bc5ca5b49cd8641af8b2231a34a1aa5091bb9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 20 Mar 2024 14:20:24 -0500
Subject: [PATCH v7 1/1] pg_lfind32(): add "overlap" code for remaining
 elements

---
 src/include/port/pg_lfind.h | 105 ++++++++++++++++++++++++------------
 1 file changed, 72 insertions(+), 33 deletions(-)

diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index b8dfa66eef..5830cc7cb3 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
 	return false;
 }
 
+#ifndef USE_NO_SIMD
+/*
+ * pg_lfind32_helper
+ *
+ * Searches one 4-register-block of integers.  The caller is responsible for
+ * ensuring that there are at least 4-registers-worth of integers remaining.
+ */
+static inline bool
+pg_lfind32_helper(const Vector32 keys, uint32 *base)
+{
+	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+	Vector32	vals1,
+				vals2,
+				vals3,
+				vals4,
+				result1,
+				result2,
+				result3,
+				result4,
+				tmp1,
+				tmp2,
+				result;
+
+	/* load the next block into 4 registers */
+	vector32_load(&vals1, base);
+	vector32_load(&vals2, &base[nelem_per_vector]);
+	vector32_load(&vals3, &base[nelem_per_vector * 2]);
+	vector32_load(&vals4, &base[nelem_per_vector * 3]);
+
+	/* compare each value to the key */
+	result1 = vector32_eq(keys, vals1);
+	result2 = vector32_eq(keys, vals2);
+	result3 = vector32_eq(keys, vals3);
+	result4 = vector32_eq(keys, vals4);
+
+	/* combine the results into a single variable */
+	tmp1 = vector32_or(result1, result2);
+	tmp2 = vector32_or(result3, result4);
+	result = vector32_or(tmp1, tmp2);
+
+	/* return whether there was a match */
+	return vector32_is_highbit_set(result);
+}
+#endif							/* ! USE_NO_SIMD */
+
 /*
  * pg_lfind32
  *
@@ -119,46 +164,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif
 
-	for (i = 0; i < tail_idx; i += nelem_per_iteration)
+	/*
+	 * If there aren't enough elements for the SIMD code, jump to the standard
+	 * one-by-one linear search code.
+	 */
+	if (nelem <= nelem_per_iteration)
+		goto one_by_one;
+
+	/*
+	 * Process as many elements as possible with a block of 4 registers.
+	 */
+	do
 	{
-		Vector32	vals1,
-					vals2,
-					vals3,
-					vals4,
-					result1,
-					result2,
-					result3,
-					result4,
-					tmp1,
-					tmp2,
-					result;
-
-		/* load the next block into 4 registers */
-		vector32_load(&vals1, &base[i]);
-		vector32_load(&vals2, &base[i + nelem_per_vector]);
-		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
-		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
-
-		/* compare each value to the key */
-		result1 = vector32_eq(keys, vals1);
-		result2 = vector32_eq(keys, vals2);
-		result3 = vector32_eq(keys, vals3);
-		result4 = vector32_eq(keys, vals4);
-
-		/* combine the results into a single variable */
-		tmp1 = vector32_or(result1, result2);
-		tmp2 = vector32_or(result3, result4);
-		result = vector32_or(tmp1, tmp2);
-
-		/* see if there was a match */
-		if (vector32_is_highbit_set(result))
+		if (pg_lfind32_helper(keys, &base[i]))
 		{
 			Assert(assert_result == true);
 			return true;
 		}
-	}
+
+		i += nelem_per_iteration;
+
+	} while (i < tail_idx);
+
+	/*
+	 * Process the last 'nelem_per_iteration' elements in the array with a
+	 * 4-register block.  This will cause us to check some of the elements
+	 * more than once, but that won't affect correctness, and testing has
+	 * demonstrated that this helps more cases than it harms.
+	 */
+	Assert(assert_result == pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]));
+	return pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]);
+
 #endif							/* ! USE_NO_SIMD */
 
+one_by_one:
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
-- 
2.25.1

Reply via email to