Here's a new version of 0001 with some added #ifdefs that cfbot revealed were missing.
-- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From cc2bc5ca5b49cd8641af8b2231a34a1aa5091bb9 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 20 Mar 2024 14:20:24 -0500 Subject: [PATCH v7 1/1] pg_lfind32(): add "overlap" code for remaining elements --- src/include/port/pg_lfind.h | 105 ++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 33 deletions(-) diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h index b8dfa66eef..5830cc7cb3 100644 --- a/src/include/port/pg_lfind.h +++ b/src/include/port/pg_lfind.h @@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem) return false; } +#ifndef USE_NO_SIMD +/* + * pg_lfind32_helper + * + * Searches one 4-register-block of integers. The caller is responsible for + * ensuring that there are at least 4-registers-worth of integers remaining. + */ +static inline bool +pg_lfind32_helper(const Vector32 keys, uint32 *base) +{ + const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32); + Vector32 vals1, + vals2, + vals3, + vals4, + result1, + result2, + result3, + result4, + tmp1, + tmp2, + result; + + /* load the next block into 4 registers */ + vector32_load(&vals1, base); + vector32_load(&vals2, &base[nelem_per_vector]); + vector32_load(&vals3, &base[nelem_per_vector * 2]); + vector32_load(&vals4, &base[nelem_per_vector * 3]); + + /* compare each value to the key */ + result1 = vector32_eq(keys, vals1); + result2 = vector32_eq(keys, vals2); + result3 = vector32_eq(keys, vals3); + result4 = vector32_eq(keys, vals4); + + /* combine the results into a single variable */ + tmp1 = vector32_or(result1, result2); + tmp2 = vector32_or(result3, result4); + result = vector32_or(tmp1, tmp2); + + /* return whether there was a match */ + return vector32_is_highbit_set(result); +} +#endif /* ! USE_NO_SIMD */ + /* * pg_lfind32 * @@ -119,46 +164,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem) } #endif - for (i = 0; i < tail_idx; i += nelem_per_iteration) + /* + * If there aren't enough elements for the SIMD code, jump to the standard + * one-by-one linear search code. + */ + if (nelem <= nelem_per_iteration) + goto one_by_one; + + /* + * Process as many elements as possible with a block of 4 registers. + */ + do { - Vector32 vals1, - vals2, - vals3, - vals4, - result1, - result2, - result3, - result4, - tmp1, - tmp2, - result; - - /* load the next block into 4 registers */ - vector32_load(&vals1, &base[i]); - vector32_load(&vals2, &base[i + nelem_per_vector]); - vector32_load(&vals3, &base[i + nelem_per_vector * 2]); - vector32_load(&vals4, &base[i + nelem_per_vector * 3]); - - /* compare each value to the key */ - result1 = vector32_eq(keys, vals1); - result2 = vector32_eq(keys, vals2); - result3 = vector32_eq(keys, vals3); - result4 = vector32_eq(keys, vals4); - - /* combine the results into a single variable */ - tmp1 = vector32_or(result1, result2); - tmp2 = vector32_or(result3, result4); - result = vector32_or(tmp1, tmp2); - - /* see if there was a match */ - if (vector32_is_highbit_set(result)) + if (pg_lfind32_helper(keys, &base[i])) { Assert(assert_result == true); return true; } - } + + i += nelem_per_iteration; + + } while (i < tail_idx); + + /* + * Process the last 'nelem_per_iteration' elements in the array with a + * 4-register block. This will cause us to check some of the elements + * more than once, but that won't affect correctness, and testing has + * demonstrated that this helps more cases than it harms. + */ + Assert(assert_result == pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration])); + return pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]); + #endif /* ! USE_NO_SIMD */ +one_by_one: /* Process the remaining elements one at a time. */ for (; i < nelem; i++) { -- 2.25.1