On Wed, Sep 24, 2025 at 10:59:38AM +0700, John Naylor wrote:
> + if (unlikely(!hex_decode_simd_helper(srcv, &dstv1)))
> + break;
>
> But if you really want to do something here, sprinkling "(un)likely"'s
> here seems like solving the wrong problem (even if they make any
> difference), since the early return is optimizing for exceptional
> conditions. In other places (cf. the UTF8 string verifier), we
> accumulate errors, and only if we have them at the end do we restart
> from the beginning with the slow error-checking path that can show the
> user the offending input.
I switched to an accumulator approach in v11.
> +vector8_sssub(const Vector8 v1, const Vector8 v2)
>
> It's hard to parse "sss", so maybe we can borrow an Intel-ism and use
> "iss" for the signed case?
Done.
> +/* vector manipulation */
> +#ifndef USE_NO_SIMD
> +static inline Vector8 vector8_interleave_low(const Vector8 v1, const
> Vector8 v2);
> +static inline Vector8 vector8_interleave_high(const Vector8 v1, const
> Vector8 v2);
> +static inline Vector8 vector8_pack_16(const Vector8 v1, const Vector8 v2);
> +static inline Vector32 vector32_shift_left_nibble(const Vector32 v1);
> +static inline Vector32 vector32_shift_right_nibble(const Vector32 v1);
> +static inline Vector32 vector32_shift_right_byte(const Vector32 v1);
>
> Do we need declarations for these? I recall that the existing
> declarations are there for functions that are also used internally.
Removed.
> The nibble/byte things are rather specific. Wouldn't it be more
> logical to expose the already-generic shift operations and let the
> caller say by how much? Or does the compiler refuse because the
> intrinsic doesn't get an immediate value? Some are like that, but I'm
> not sure about these. If so, that's annoying and I wonder if there's a
> workaround.
Yeah, the compiler refuses unless the value is an integer literal. I
thought of using a switch statement to cover all the values used in-tree,
but I didn't like that, either.
> +vector8_has_ge(const Vector8 v, const uint8 c)
> +{
> +#ifdef USE_SSE2
> + Vector8 umax = _mm_max_epu8(v, vector8_broadcast(c));
> + Vector8 cmpe = _mm_cmpeq_epi8(umax, v);
> +
> + return vector8_is_highbit_set(cmpe);
>
> We take pains to avoid signed comparison on unsigned input for the
> "le" case, and I don't see why it's okay here.
_mm_max_epu8() does unsigned comparisons, I think...
> Do the regression tests have long enough cases that test exceptional
> paths, like invalid bytes and embedded whitespace? If not, we need
> some.
Added.
I've also fixed builds on gcc/arm64, as discussed elsewhere [0]. Here are
the current numbers on my laptop:
arm
buf | HEAD | patch | % diff
-------+-------+-------+--------
2 | 4 | 4 | 0
4 | 6 | 6 | 0
8 | 8 | 8 | 0
16 | 11 | 12 | -9
32 | 18 | 5 | 72
64 | 38 | 6 | 84
256 | 134 | 17 | 87
1024 | 513 | 63 | 88
4096 | 2081 | 262 | 87
16384 | 8524 | 1058 | 88
65536 | 34731 | 4224 | 88
[0] https://postgr.es/m/aNQtN89VW8z-yo3B%40nathan
--
nathan
>From b914ad69199bfbb4af95b97f83568401bc42f319 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 12 Sep 2025 15:51:55 -0500
Subject: [PATCH v11 1/1] Optimize hex_encode() and hex_decode() using SIMD.
---
src/backend/utils/adt/encode.c | 137 +++++++++++++++-
src/include/port/simd.h | 223 +++++++++++++++++++++++++-
src/test/regress/expected/strings.out | 58 +++++++
src/test/regress/sql/strings.sql | 16 ++
4 files changed, 424 insertions(+), 10 deletions(-)
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 9a9c7e8da99..7ba92c2c481 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -16,6 +16,7 @@
#include <ctype.h>
#include "mb/pg_wchar.h"
+#include "port/simd.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "varatt.h"
@@ -177,8 +178,8 @@ static const int8 hexlookup[128] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
-uint64
-hex_encode(const char *src, size_t len, char *dst)
+static inline uint64
+hex_encode_scalar(const char *src, size_t len, char *dst)
{
const char *end = src + len;
@@ -193,6 +194,57 @@ hex_encode(const char *src, size_t len, char *dst)
return (uint64) len * 2;
}
+uint64
+hex_encode(const char *src, size_t len, char *dst)
+{
+#ifdef USE_NO_SIMD
+ return hex_encode_scalar(src, len, dst);
+#else
+ const uint64 tail_idx = len & ~(sizeof(Vector8) - 1);
+ uint64 i;
+
+ /*
+ * This works by splitting the high and low nibbles of each byte into
+ * separate vectors, adding the vectors to a mask that converts the
+ * nibbles to their equivalent ASCII bytes, and interleaving those bytes
+ * back together to form the final hex-encoded string. It might be
+ * possible to squeeze out a little more gain by manually unrolling the
+ * loop, but for now we don't bother.
+ */
+ for (i = 0; i < tail_idx; i += sizeof(Vector8))
+ {
+ Vector8 srcv;
+ Vector8 lo;
+ Vector8 hi;
+ Vector8 mask;
+
+ vector8_load(&srcv, (const uint8 *) &src[i]);
+
+ lo = vector8_and(srcv, vector8_broadcast(0x0f));
+ mask = vector8_gt(lo, vector8_broadcast(0x9));
+ mask = vector8_and(mask, vector8_broadcast('a' - '0' - 10));
+ mask = vector8_add(mask, vector8_broadcast('0'));
+ lo = vector8_add(lo, mask);
+
+ hi = vector8_and(srcv, vector8_broadcast(0xf0));
+ hi = vector8_shift_right_nibble(hi);
+ mask = vector8_gt(hi, vector8_broadcast(0x9));
+ mask = vector8_and(mask, vector8_broadcast('a' - '0' - 10));
+ mask = vector8_add(mask, vector8_broadcast('0'));
+ hi = vector8_add(hi, mask);
+
+ vector8_store((uint8 *) &dst[i * 2],
+ vector8_interleave_low(hi, lo));
+ vector8_store((uint8 *) &dst[i * 2 + sizeof(Vector8)],
+ vector8_interleave_high(hi, lo));
+ }
+
+ (void) hex_encode_scalar(src + i, len - i, dst + i * 2);
+
+ return (uint64) len * 2;
+#endif
+}
+
static inline bool
get_hex(const char *cp, char *out)
{
@@ -213,8 +265,8 @@ hex_decode(const char *src, size_t len, char *dst)
return hex_decode_safe(src, len, dst, NULL);
}
-uint64
-hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
+static inline uint64
+hex_decode_safe_scalar(const char *src, size_t len, char *dst, Node *escontext)
{
const char *s,
*srcend;
@@ -254,6 +306,83 @@ hex_decode_safe(const char *src, size_t len, char *dst,
Node *escontext)
return p - dst;
}
+/*
+ * This helper converts each byte to its binary-equivalent nibble by
+ * subtraction and combines them to form the return bytes (separated by zero
+ * bytes). Returns false if any input bytes are outside the expected ranges of
+ * ASCII values. Otherwise, returns true.
+ */
+#ifndef USE_NO_SIMD
+static inline bool
+hex_decode_simd_helper(const Vector8 src, Vector8 *dst)
+{
+ Vector8 sub;
+ Vector8 msk;
+ bool ret;
+
+ msk = vector8_gt(vector8_broadcast('9' + 1), src);
+ sub = vector8_and(msk, vector8_broadcast('0'));
+
+ msk = vector8_gt(src, vector8_broadcast('A' - 1));
+ msk = vector8_and(msk, vector8_broadcast('A' - 10));
+ sub = vector8_add(sub, msk);
+
+ msk = vector8_gt(src, vector8_broadcast('a' - 1));
+ msk = vector8_and(msk, vector8_broadcast('a' - 'A'));
+ sub = vector8_add(sub, msk);
+
+ *dst = vector8_issub(src, sub);
+ ret = !vector8_has_ge(*dst, 0x10);
+
+ msk = vector8_and(*dst, vector8_broadcast_u32(0xff00ff00));
+ msk = vector8_shift_right_byte(msk);
+ *dst = vector8_and(*dst, vector8_broadcast_u32(0x00ff00ff));
+ *dst = vector8_shift_left_nibble(*dst);
+ *dst = vector8_or(*dst, msk);
+ return ret;
+}
+#endif /* ! USE_NO_SIMD */
+
+uint64
+hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
+{
+#ifdef USE_NO_SIMD
+ return hex_decode_safe_scalar(src, len, dst, escontext);
+#else
+ const uint64 tail_idx = len & ~(sizeof(Vector8) * 2 - 1);
+ uint64 i;
+ bool success = true;
+
+ /*
+ * We must process 2 vectors at a time since the output will be half the
+ * length of the input.
+ */
+ for (i = 0; i < tail_idx; i += sizeof(Vector8) * 2)
+ {
+ Vector8 srcv;
+ Vector8 dstv1;
+ Vector8 dstv2;
+
+ vector8_load(&srcv, (const uint8 *) &src[i]);
+ success &= hex_decode_simd_helper(srcv, &dstv1);
+
+ vector8_load(&srcv, (const uint8 *) &src[i + sizeof(Vector8)]);
+ success &= hex_decode_simd_helper(srcv, &dstv2);
+
+ vector8_store((uint8 *) &dst[i / 2], vector8_pack_16(dstv1,
dstv2));
+ }
+
+ /*
+ * If something didn't look right in the vector path, try again in the
+ * scalar path so that we can handle it correctly.
+ */
+ if (unlikely(!success))
+ i = 0;
+
+ return i / 2 + hex_decode_safe_scalar(src + i, len - i, dst + i / 2,
escontext);
+#endif
+}
+
static uint64
hex_enc_len(const char *src, size_t srclen)
{
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 97c5f353022..531d8b8b6d1 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -86,7 +86,7 @@ static inline uint32 vector8_highbit_mask(const Vector8 v);
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
#ifndef USE_NO_SIMD
static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
-static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_ussub(const Vector8 v1, const Vector8 v2);
#endif
/*
@@ -128,6 +128,21 @@ vector32_load(Vector32 *v, const uint32 *s)
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Store a vector into the given memory address.
+ */
+#ifndef USE_NO_SIMD
+static inline void
+vector8_store(uint8 *s, Vector8 v)
+{
+#ifdef USE_SSE2
+ _mm_storeu_si128((Vector8 *) s, v);
+#elif defined(USE_NEON)
+ vst1q_u8(s, v);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
/*
* Create a vector with all elements set to the same value.
*/
@@ -155,6 +170,24 @@ vector32_broadcast(const uint32 c)
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Some compilers are picky about casts to the same underlying type, and others
+ * are picky about implicit conversions with vector types. This function does
+ * the same thing as vector32_broadcast(), but it returns a Vector8 and is
+ * carefully crafted to avoid compiler indigestion.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_broadcast_u32(const uint32 c)
+{
+#ifdef USE_SSE2
+ return vector32_broadcast(c);
+#elif defined(USE_NEON)
+ return (Vector8) vector32_broadcast(c);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
/*
* Return true if any elements in the vector are equal to the given scalar.
*/
@@ -257,13 +290,32 @@ vector8_has_le(const Vector8 v, const uint8 c)
* NUL bytes. This approach is a workaround for the lack of unsigned
* comparison instructions on some architectures.
*/
- result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
+ result = vector8_has_zero(vector8_ussub(v, vector8_broadcast(c)));
#endif
Assert(assert_result == result);
return result;
}
+/*
+ * Returns true if any elements in the vector are greater than or equal to the
+ * given scalar.
+ */
+#ifndef USE_NO_SIMD
+static inline bool
+vector8_has_ge(const Vector8 v, const uint8 c)
+{
+#ifdef USE_SSE2
+ Vector8 umax = _mm_max_epu8(v, vector8_broadcast(c));
+ Vector8 cmpe = _mm_cmpeq_epi8(umax, v);
+
+ return vector8_is_highbit_set(cmpe);
+#elif defined(USE_NEON)
+ return vmaxvq_u8(v) >= c;
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
/*
* Return true if the high bit of any element is set
*/
@@ -358,15 +410,65 @@ vector32_or(const Vector32 v1, const Vector32 v2)
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Return the bitwise AND of the inputs.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_and(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_and_si128(v1, v2);
+#elif defined(USE_NEON)
+ return vandq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the result of adding the respective elements of the input vectors.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_add(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_add_epi8(v1, v2);
+#elif defined(USE_NEON)
+ return vaddq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using signed saturation (i.e., if the operation would yield a value
+ * less than -128, -128 is returned instead). For more information on
+ * saturation arithmetic, see
+ * https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_issub(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_subs_epi8(v1, v2);
+#elif defined(USE_NEON)
+ return (Vector8) vqsubq_s8((int8x16_t) v1, (int8x16_t) v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
/*
* Return the result of subtracting the respective elements of the input
- * vectors using saturation (i.e., if the operation would yield a value less
- * than zero, zero is returned instead). For more information on saturation
- * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
+ * vectors using unsigned saturation (i.e., if the operation would yield a
+ * value less than zero, zero is returned instead). For more information on
+ * saturation arithmetic, see
+ * https://en.wikipedia.org/wiki/Saturation_arithmetic
*/
#ifndef USE_NO_SIMD
static inline Vector8
-vector8_ssub(const Vector8 v1, const Vector8 v2)
+vector8_ussub(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
return _mm_subs_epu8(v1, v2);
@@ -404,6 +506,23 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Return a vector with all bits set for each lane of v1 that is greater than
+ * the corresponding lane of v2. NB: The comparison treats the elements as
+ * signed.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_gt(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_cmpgt_epi8(v1, v2);
+#elif defined(USE_NEON)
+ return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
/*
* Given two vectors, return a vector with the minimum element of each.
*/
@@ -419,4 +538,96 @@ vector8_min(const Vector8 v1, const Vector8 v2)
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Interleave elements of low halves of given vectors.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_interleave_low(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_unpacklo_epi8(v1, v2);
+#elif defined(USE_NEON)
+ return vzip1q_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Interleave elements of high halves of given vectors.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_interleave_high(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_unpackhi_epi8(v1, v2);
+#elif defined(USE_NEON)
+ return vzip2q_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Pack 16-bit elements in the given vectors into a single vector of 8-bit
+ * elements. NB: The upper 8-bits of each 16-bit element must be zeros, else
+ * this will produce different results on different architectures.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_pack_16(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_packus_epi16(v1, v2);
+#elif defined(USE_NEON)
+ return vuzp1q_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Unsigned shift left of each 32-bit element in the vector by 4 bits.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_shift_left_nibble(const Vector8 v1)
+{
+#ifdef USE_SSE2
+ return _mm_slli_epi32(v1, 4);
+#elif defined(USE_NEON)
+ return (Vector8) vshlq_n_u32((Vector32) v1, 4);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Unsigned shift right of each 32-bit element in the vector by 4 bits.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_shift_right_nibble(const Vector8 v1)
+{
+#ifdef USE_SSE2
+ return _mm_srli_epi32(v1, 4);
+#elif defined(USE_NEON)
+ return (Vector8) vshrq_n_u32((Vector32) v1, 4);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
+/*
+ * Unsigned shift right of each 32-bit element in the vector by 1 byte.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_shift_right_byte(const Vector8 v1)
+{
+#ifdef USE_SSE2
+ return _mm_srli_epi32(v1, 8);
+#elif defined(USE_NEON)
+ return (Vector8) vshrq_n_u32((Vector32) v1, 8);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
#endif /* SIMD_H */
diff --git a/src/test/regress/expected/strings.out
b/src/test/regress/expected/strings.out
index 691e475bce3..3e351cf1cd0 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -260,6 +260,64 @@ SELECT reverse('\xabcd'::bytea);
\xcdab
(1 row)
+SELECT ('\x' || repeat(' ', 32))::bytea;
+ bytea
+-------
+ \x
+(1 row)
+
+SELECT ('\x' || repeat('!', 33))::bytea;
+ERROR: invalid hexadecimal digit: "!"
+SELECT ('\x' || repeat('/', 33))::bytea;
+ERROR: invalid hexadecimal digit: "/"
+SELECT ('\x' || repeat('0', 32))::bytea;
+ bytea
+------------------------------------
+ \x00000000000000000000000000000000
+(1 row)
+
+SELECT ('\x' || repeat('9', 32))::bytea;
+ bytea
+------------------------------------
+ \x99999999999999999999999999999999
+(1 row)
+
+SELECT ('\x' || repeat(':', 33))::bytea;
+ERROR: invalid hexadecimal digit: ":"
+SELECT ('\x' || repeat('@', 33))::bytea;
+ERROR: invalid hexadecimal digit: "@"
+SELECT ('\x' || repeat('A', 32))::bytea;
+ bytea
+------------------------------------
+ \xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+(1 row)
+
+SELECT ('\x' || repeat('F', 32))::bytea;
+ bytea
+------------------------------------
+ \xffffffffffffffffffffffffffffffff
+(1 row)
+
+SELECT ('\x' || repeat('G', 33))::bytea;
+ERROR: invalid hexadecimal digit: "G"
+SELECT ('\x' || repeat('`', 33))::bytea;
+ERROR: invalid hexadecimal digit: "`"
+SELECT ('\x' || repeat('a', 32))::bytea;
+ bytea
+------------------------------------
+ \xaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+(1 row)
+
+SELECT ('\x' || repeat('f', 32))::bytea;
+ bytea
+------------------------------------
+ \xffffffffffffffffffffffffffffffff
+(1 row)
+
+SELECT ('\x' || repeat('g', 33))::bytea;
+ERROR: invalid hexadecimal digit: "g"
+SELECT ('\x' || repeat('~', 33))::bytea;
+ERROR: invalid hexadecimal digit: "~"
SET bytea_output TO escape;
SELECT E'\\xDeAdBeEf'::bytea;
bytea
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index c05f3413699..35910369b6f 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -82,6 +82,22 @@ SELECT reverse(''::bytea);
SELECT reverse('\xaa'::bytea);
SELECT reverse('\xabcd'::bytea);
+SELECT ('\x' || repeat(' ', 32))::bytea;
+SELECT ('\x' || repeat('!', 33))::bytea;
+SELECT ('\x' || repeat('/', 33))::bytea;
+SELECT ('\x' || repeat('0', 32))::bytea;
+SELECT ('\x' || repeat('9', 32))::bytea;
+SELECT ('\x' || repeat(':', 33))::bytea;
+SELECT ('\x' || repeat('@', 33))::bytea;
+SELECT ('\x' || repeat('A', 32))::bytea;
+SELECT ('\x' || repeat('F', 32))::bytea;
+SELECT ('\x' || repeat('G', 33))::bytea;
+SELECT ('\x' || repeat('`', 33))::bytea;
+SELECT ('\x' || repeat('a', 32))::bytea;
+SELECT ('\x' || repeat('f', 32))::bytea;
+SELECT ('\x' || repeat('g', 33))::bytea;
+SELECT ('\x' || repeat('~', 33))::bytea;
+
SET bytea_output TO escape;
SELECT E'\\xDeAdBeEf'::bytea;
SELECT E'\\x De Ad Be Ef '::bytea;
--
2.39.5 (Apple Git-154)