On Thu, Mar 21, 2024 at 11:30:30AM +0700, John Naylor wrote: > I'm much happier about v5-0001. With a small tweak it would match what > I had in mind: > > + if (nelem < nelem_per_iteration) > + goto one_by_one; > > If this were "<=" then for long arrays we could assume there is > always more than one block, and wouldn't need to check if any elements > remain -- first block, then a single loop and it's done. > > The loop could also then be a "do while" since it doesn't have to > check the exit condition up front.
Good idea. That causes us to re-check all of the tail elements when the number of elements is evenly divisible by nelem_per_iteration, but that might be worth the trade-off. > Yes, that spike is weird, because it seems super-linear. However, the > more interesting question for me is: AVX2 isn't really buying much for > the numbers covered in this test. Between 32 and 48 elements, and > between 64 and 80, it's indistinguishable from SSE2. The jumps to the > next shelf are postponed, but the jumps are just as high. From earlier > system benchmarks, I recall it eventually wins out with hundreds of > elements, right? Is that still true? It does still eventually win, although not nearly to the same extent as before. I extended the benchmark a bit to show this. I wouldn't be devastated if we only got 0001 committed for v17, given these results. > Further, now that the algorithm is more SIMD-appropriate, I wonder > what doing 4 registers at a time is actually buying us for either SSE2 > or AVX2. It might just be a matter of scale, but that would be good to > understand. I'll follow up with these numbers shortly. -- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 5d4d91d169b973838c99e8d4fdadcb09df36a6ea Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 20 Mar 2024 14:20:24 -0500 Subject: [PATCH v6 1/2] pg_lfind32(): add "overlap" code for remaining elements --- src/include/port/pg_lfind.h | 103 ++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 33 deletions(-) diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h index b8dfa66eef..22a3711ab5 100644 --- a/src/include/port/pg_lfind.h +++ b/src/include/port/pg_lfind.h @@ -80,6 +80,49 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem) return false; } +/* + * pg_lfind32_helper + * + * Searches one 4-register-block of integers. The caller is responsible for + * ensuring that there are at least 4-registers-worth of integers remaining. + */ +static inline bool +pg_lfind32_helper(const Vector32 keys, uint32 *base) +{ + const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32); + Vector32 vals1, + vals2, + vals3, + vals4, + result1, + result2, + result3, + result4, + tmp1, + tmp2, + result; + + /* load the next block into 4 registers */ + vector32_load(&vals1, base); + vector32_load(&vals2, &base[nelem_per_vector]); + vector32_load(&vals3, &base[nelem_per_vector * 2]); + vector32_load(&vals4, &base[nelem_per_vector * 3]); + + /* compare each value to the key */ + result1 = vector32_eq(keys, vals1); + result2 = vector32_eq(keys, vals2); + result3 = vector32_eq(keys, vals3); + result4 = vector32_eq(keys, vals4); + + /* combine the results into a single variable */ + tmp1 = vector32_or(result1, result2); + tmp2 = vector32_or(result3, result4); + result = vector32_or(tmp1, tmp2); + + /* return whether there was a match */ + return vector32_is_highbit_set(result); +} + /* * pg_lfind32 * @@ -119,46 +162,40 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem) } #endif - for (i = 0; i < tail_idx; i += nelem_per_iteration) + /* + * If there aren't enough elements for the SIMD code, jump to the standard + * 
one-by-one linear search code. + */ + if (nelem <= nelem_per_iteration) + goto one_by_one; + + /* + * Process as many elements as possible with a block of 4 registers. + */ + do { - Vector32 vals1, - vals2, - vals3, - vals4, - result1, - result2, - result3, - result4, - tmp1, - tmp2, - result; - - /* load the next block into 4 registers */ - vector32_load(&vals1, &base[i]); - vector32_load(&vals2, &base[i + nelem_per_vector]); - vector32_load(&vals3, &base[i + nelem_per_vector * 2]); - vector32_load(&vals4, &base[i + nelem_per_vector * 3]); - - /* compare each value to the key */ - result1 = vector32_eq(keys, vals1); - result2 = vector32_eq(keys, vals2); - result3 = vector32_eq(keys, vals3); - result4 = vector32_eq(keys, vals4); - - /* combine the results into a single variable */ - tmp1 = vector32_or(result1, result2); - tmp2 = vector32_or(result3, result4); - result = vector32_or(tmp1, tmp2); - - /* see if there was a match */ - if (vector32_is_highbit_set(result)) + if (pg_lfind32_helper(keys, &base[i])) { Assert(assert_result == true); return true; } - } + + i += nelem_per_iteration; + + } while (i < tail_idx); + + /* + * Process the last 'nelem_per_iteration' elements in the array with a + * 4-register block. This will cause us to check some of the elements + * more than once, but that won't affect correctness, and testing has + * demonstrated that this helps more cases than it harms. + */ + Assert(assert_result == pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration])); + return pg_lfind32_helper(keys, &base[nelem - nelem_per_iteration]); + #endif /* ! USE_NO_SIMD */ +one_by_one: /* Process the remaining elements one at a time. */ for (; i < nelem; i++) { -- 2.25.1
>From 7e7781454646992218a990cf75f0654c67ce2dab Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Mon, 18 Mar 2024 11:02:05 -0500 Subject: [PATCH v6 2/2] Add support for AVX2 in simd.h. Discussion: https://postgr.es/m/20231129171526.GA857928%40nathanxps13 --- src/include/port/simd.h | 61 ++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/src/include/port/simd.h b/src/include/port/simd.h index 597496f2fb..f06b21876b 100644 --- a/src/include/port/simd.h +++ b/src/include/port/simd.h @@ -18,7 +18,18 @@ #ifndef SIMD_H #define SIMD_H -#if (defined(__x86_64__) || defined(_M_AMD64)) +#if defined(__AVX2__) + +/* + * XXX: Need to add a big comment here. + */ +#include <immintrin.h> +#define USE_AVX2 +typedef __m256i Vector8; +typedef __m256i Vector32; + +#elif (defined(__x86_64__) || defined(_M_AMD64)) + /* * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume * that compilers targeting this architecture understand SSE2 intrinsics. 
@@ -107,7 +118,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2); static inline void vector8_load(Vector8 *v, const uint8 *s) { -#if defined(USE_SSE2) +#if defined(USE_AVX2) + *v = _mm256_loadu_si256((const __m256i *) s); +#elif defined(USE_SSE2) *v = _mm_loadu_si128((const __m128i *) s); #elif defined(USE_NEON) *v = vld1q_u8(s); @@ -120,7 +133,9 @@ vector8_load(Vector8 *v, const uint8 *s) static inline void vector32_load(Vector32 *v, const uint32 *s) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + *v = _mm256_loadu_si256((const __m256i *) s); +#elif defined(USE_SSE2) *v = _mm_loadu_si128((const __m128i *) s); #elif defined(USE_NEON) *v = vld1q_u32(s); @@ -134,7 +149,9 @@ vector32_load(Vector32 *v, const uint32 *s) static inline Vector8 vector8_broadcast(const uint8 c) { -#if defined(USE_SSE2) +#if defined(USE_AVX2) + return _mm256_set1_epi8(c); +#elif defined(USE_SSE2) return _mm_set1_epi8(c); #elif defined(USE_NEON) return vdupq_n_u8(c); @@ -147,7 +164,9 @@ vector8_broadcast(const uint8 c) static inline Vector32 vector32_broadcast(const uint32 c) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_set1_epi32(c); +#elif defined(USE_SSE2) return _mm_set1_epi32(c); #elif defined(USE_NEON) return vdupq_n_u32(c); @@ -270,7 +289,9 @@ vector8_has_le(const Vector8 v, const uint8 c) static inline bool vector8_is_highbit_set(const Vector8 v) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_movemask_epi8(v) != 0; +#elif defined(USE_SSE2) return _mm_movemask_epi8(v) != 0; #elif defined(USE_NEON) return vmaxvq_u8(v) > 0x7F; @@ -308,7 +329,9 @@ vector32_is_highbit_set(const Vector32 v) static inline uint32 vector8_highbit_mask(const Vector8 v) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return (uint32) _mm256_movemask_epi8(v); +#elif defined(USE_SSE2) return (uint32) _mm_movemask_epi8(v); #elif defined(USE_NEON) /* @@ -337,7 +360,9 @@ vector8_highbit_mask(const Vector8 v) static inline Vector8 vector8_or(const Vector8 v1, const Vector8 
v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_or_si256(v1, v2); +#elif defined(USE_SSE2) return _mm_or_si128(v1, v2); #elif defined(USE_NEON) return vorrq_u8(v1, v2); @@ -350,7 +375,9 @@ vector8_or(const Vector8 v1, const Vector8 v2) static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_or_si256(v1, v2); +#elif defined(USE_SSE2) return _mm_or_si128(v1, v2); #elif defined(USE_NEON) return vorrq_u32(v1, v2); @@ -368,7 +395,9 @@ vector32_or(const Vector32 v1, const Vector32 v2) static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_subs_epu8(v1, v2); +#elif defined(USE_SSE2) return _mm_subs_epu8(v1, v2); #elif defined(USE_NEON) return vqsubq_u8(v1, v2); @@ -384,7 +413,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2) static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_cmpeq_epi8(v1, v2); +#elif defined(USE_SSE2) return _mm_cmpeq_epi8(v1, v2); #elif defined(USE_NEON) return vceqq_u8(v1, v2); @@ -396,7 +427,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2) static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_cmpeq_epi32(v1, v2); +#elif defined(USE_SSE2) return _mm_cmpeq_epi32(v1, v2); #elif defined(USE_NEON) return vceqq_u32(v1, v2); @@ -411,7 +444,9 @@ vector32_eq(const Vector32 v1, const Vector32 v2) static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2) { -#ifdef USE_SSE2 +#if defined(USE_AVX2) + return _mm256_min_epu8(v1, v2); +#elif defined(USE_SSE2) return _mm_min_epu8(v1, v2); #elif defined(USE_NEON) return vminq_u8(v1, v2); -- 2.25.1