On Wed, Nov 22, 2023 at 12:49:35PM -0600, Nathan Bossart wrote:
> On Wed, Nov 22, 2023 at 02:54:13PM +0200, Ants Aasma wrote:
>> For reference, executing the page checksum 10M times on an AMD 3900X CPU:
>> 
>> clang-14 -O2                 4.292s (17.8 GiB/s)
>> clang-14 -O2 -msse4.1        2.859s (26.7 GiB/s)
>> clang-14 -O2 -msse4.1 -mavx2 1.378s (55.4 GiB/s)
> 
> Nice.  I've noticed similar improvements with AVX2 intrinsics in simd.h.

I've alluded to this a few times now, so I figured I'd park the patch and
preliminary benchmarks in a new thread while we iron out how to support
newer instructions (see discussion here [0]).

Using the same benchmark as we did for the SSE2 linear searches in
XidInMVCCSnapshot() (commit 37a6e5d) [1] [2], I see the following:

  writers    sse2    avx2   % diff
      256    1195    1188    -1
      512     928    1054   +14
     1024     633     716   +13
     2048     332     420   +27
     4096     162     203   +25
     8192     162     182   +12

It's been a while since I ran these benchmarks, but I vaguely recall also
seeing something like a 50% improvement for a dedicated pg_lfind32()
benchmark on long arrays.
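
For anyone who wants to poke at that, a minimal sketch of such a harness
(array size, loop count, and the scalar stand-in are all made up for
illustration; a real run would call pg_lfind32() from
src/include/port/pg_lfind.h instead):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Scalar stand-in so this sketch compiles outside the tree; substitute
 * pg_lfind32() to measure the SIMD paths.
 */
static int
lfind32_ref(uint32_t key, const uint32_t *base, uint32_t nelem)
{
	for (uint32_t i = 0; i < nelem; i++)
	{
		if (base[i] == key)
			return 1;
	}
	return 0;
}

int
main(void)
{
	const uint32_t nelem = 16384;	/* "long array"; size is made up */
	const int	loops = 100000;
	uint32_t   *arr = malloc(nelem * sizeof(uint32_t));
	struct timespec t0, t1;
	volatile int found = 0;		/* volatile so the loop isn't optimized away */

	for (uint32_t i = 0; i < nelem; i++)
		arr[i] = i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < loops; i++)
		found += lfind32_ref(nelem - 1, arr, nelem);	/* miss until the end */
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%ld ns/search (found=%d)\n",
		   ((long) (t1.tv_sec - t0.tv_sec) * 1000000000L +
			(t1.tv_nsec - t0.tv_nsec)) / loops, found);
	free(arr);
	return 0;
}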

As is, the patch likely won't do anything unless you add -mavx2 or
-march=native to your CFLAGS.  I don't intend for this patch to be
seriously considered until we have better support for detecting/compiling
AVX2 instructions and a buildfarm machine that uses them.
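
FWIW, the runtime check itself is cheap with gcc/clang; a sketch of what
detection could look like (this is *not* what the patch does -- the patch
is compile-time only, keyed on __AVX2__):

#include <stdio.h>

int
main(void)
{
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
	/* gcc and clang read CPUID at run time for us */
	if (__builtin_cpu_supports("avx2"))
		printf("AVX2 available\n");
	else
		printf("AVX2 not available\n");
#else
	printf("no detection support on this compiler/architecture\n");
#endif
	return 0;
}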

I plan to start another thread for AVX2 support for the page checksums.
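
To make the change concrete, here's a toy caller of the simd.h helpers the
patch touches; has_zero_byte() is just an example of mine, and it assumes
an in-tree, SIMD-enabled build (for c.h and port/simd.h):

#include "c.h"
#include "port/simd.h"

/*
 * Return true if buf contains a zero byte.  With the patch and -mavx2,
 * the USE_AVX2 branches process 32 bytes per iteration instead of 16.
 * For brevity, len is assumed to be a multiple of sizeof(Vector8).
 */
static bool
has_zero_byte(const uint8 *buf, size_t len)
{
	Vector8		zeroes = vector8_broadcast(0);

	for (size_t i = 0; i < len; i += sizeof(Vector8))
	{
		Vector8		chunk;

		vector8_load(&chunk, buf + i);
		if (vector8_is_highbit_set(vector8_eq(chunk, zeroes)))
			return true;
	}
	return false;
}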

[0] https://postgr.es/m/20231107024734.GB729644%40nathanxps13
[1] https://postgr.es/m/057a9a95-19d2-05f0-17e2-f46ff20e9...@2ndquadrant.com
[2] https://postgr.es/m/20220713170950.GA3116318%40nathanxps13

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
From 5a90f1597fdc64aa6df6b9d0ffd959af7df41abd Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 29 Nov 2023 10:01:32 -0600
Subject: [PATCH v1 1/1] add avx2 support in simd.h

---
 src/include/port/simd.h | 50 ++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 1fa6c3bc6c..0e698dcfab 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,7 +18,15 @@
 #ifndef SIMD_H
 #define SIMD_H
 
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#define USE_AVX2
+typedef __m256i Vector8;
+typedef __m256i Vector32;
+
+#elif (defined(__x86_64__) || defined(_M_AMD64))
+
 /*
  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
  * that compilers targeting this architecture understand SSE2 intrinsics.
@@ -105,7 +113,9 @@ static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 static inline void
 vector8_load(Vector8 *v, const uint8 *s)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u8(s);
@@ -118,7 +128,9 @@ vector8_load(Vector8 *v, const uint8 *s)
 static inline void
 vector32_load(Vector32 *v, const uint32 *s)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	*v = _mm256_loadu_si256((const __m256i *) s);
+#elif defined(USE_SSE2)
 	*v = _mm_loadu_si128((const __m128i *) s);
 #elif defined(USE_NEON)
 	*v = vld1q_u32(s);
@@ -132,7 +144,9 @@ vector32_load(Vector32 *v, const uint32 *s)
 static inline Vector8
 vector8_broadcast(const uint8 c)
 {
-#if defined(USE_SSE2)
+#if defined(USE_AVX2)
+	return _mm256_set1_epi8(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi8(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u8(c);
@@ -145,7 +159,9 @@ vector8_broadcast(const uint8 c)
 static inline Vector32
 vector32_broadcast(const uint32 c)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_set1_epi32(c);
+#elif defined(USE_SSE2)
 	return _mm_set1_epi32(c);
 #elif defined(USE_NEON)
 	return vdupq_n_u32(c);
@@ -268,7 +284,9 @@ vector8_has_le(const Vector8 v, const uint8 c)
 static inline bool
 vector8_is_highbit_set(const Vector8 v)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_movemask_epi8(v) != 0;
+#elif defined(USE_SSE2)
 	return _mm_movemask_epi8(v) != 0;
 #elif defined(USE_NEON)
 	return vmaxvq_u8(v) > 0x7F;
@@ -305,7 +323,9 @@ vector32_is_highbit_set(const Vector32 v)
 static inline Vector8
 vector8_or(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u8(v1, v2);
@@ -318,7 +338,9 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_or(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_or_si256(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_or_si128(v1, v2);
 #elif defined(USE_NEON)
 	return vorrq_u32(v1, v2);
@@ -336,7 +358,9 @@ vector32_or(const Vector32 v1, const Vector32 v2)
 static inline Vector8
 vector8_ssub(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_subs_epu8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_subs_epu8(v1, v2);
 #elif defined(USE_NEON)
 	return vqsubq_u8(v1, v2);
@@ -352,7 +376,9 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi8(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi8(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u8(v1, v2);
@@ -364,7 +390,9 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
 static inline Vector32
 vector32_eq(const Vector32 v1, const Vector32 v2)
 {
-#ifdef USE_SSE2
+#if defined(USE_AVX2)
+	return _mm256_cmpeq_epi32(v1, v2);
+#elif defined(USE_SSE2)
 	return _mm_cmpeq_epi32(v1, v2);
 #elif defined(USE_NEON)
 	return vceqq_u32(v1, v2);
-- 
2.25.1
