From 2094bc7f60db93693f2c054e9044d8baa128bb8f Mon Sep 17 00:00:00 2001
From: Chiranmoy Bhattacharya <chiranmoy.bhattacharya@fujitsu.com>
Date: Wed, 22 Jan 2025 15:52:40 +0530
Subject: [PATCH v2] SVE support for hex encode and hex decode

---
 config/c-compiler.m4           |  53 ++++++++
 configure                      |  63 +++++++++
 configure.ac                   |   9 ++
 meson.build                    |  47 +++++++
 src/backend/utils/adt/encode.c | 241 +++++++++++++++++++++++++++++++++
 src/include/pg_config.h.in     |   3 +
 6 files changed, 416 insertions(+)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 8534cc54c1..bb22ceed17 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -704,3 +704,56 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_ARM_SVE_HEX_INTRINSICS
+# ------------------------------
+# Check if the compiler supports the ARM SVE intrinsic required for hex coding:
+# svld1, svtbl, svsel, etc.
+#
+# If the intrinsics are supported, sets PGAC_ARM_SVE_HEX_INTRINSICS.
+AC_DEFUN([PGAC_ARM_SVE_HEX_INTRINSICS],
+[
+  AC_CACHE_CHECK([for svtbl, svlsr_z, svand_z, svcreate2, svst2, svsel and svget2 intrinsics],
+                 [pgac_cv_arm_sve_hex_intrinsics],
+  [
+
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+      __attribute__((target("arch=armv8-a+sve")))
+    #endif
+    ],
+    [
+      char input[64] = {0};
+      char output[64] = {0};
+      svbool_t pred = svptrue_b8(), cmp1, cmp2;
+      svuint8_t bytes, hextbl_vec;
+      svuint8x2_t	merged;
+
+      /* intrinsics used in hex_encode_sve */
+      hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF");
+      bytes = svld1(pred, (uint8_t *) input);
+      bytes = svlsr_z(pred, bytes, 4);
+      bytes = svand_z(pred, bytes, 0xF);
+      merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes));
+      svst2(pred, (uint8_t *) output, merged);
+
+      /* intrinsics used in hex_decode_sve */
+      bytes = svget2(svld2(pred, (uint8_t *) output), 0);
+      bytes = svsub_x(pred, bytes, bytes);
+      cmp1 = svcmplt(pred, bytes, 0);
+      cmp2 = svcmpgt(pred, bytes, 0);
+      bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes);
+      svst1(pred, output, bytes);
+
+      /* return computed value, to prevent the above being optimized away */
+      return output[0] == 0;
+    ])],
+    [pgac_cv_arm_sve_hex_intrinsics=yes],
+    [pgac_cv_arm_sve_hex_intrinsics=no])
+
+  ])
+
+  if test x"$pgac_cv_arm_sve_hex_intrinsics" = x"yes"; then
+    PGAC_ARM_SVE_HEX_INTRINSICS=yes
+  fi
+])
diff --git a/configure b/configure
index ceeef9b091..e634feec02 100755
--- a/configure
+++ b/configure
@@ -17168,6 +17168,69 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
   fi
 fi
 
+# Check SVE intrinsics for hex coding
+#
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for SVE intrinsic svtbl, svlsr_z, etc." >&5
+  $as_echo_n "checking for SVE intrinsic svtbl, svlsr_z, etc... " >&6; }
+if ${pgac_cv_arm_sve_hex_intrinsics+:} false; then :
+    $as_echo_n "(cached) " >&6
+else
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+#if defined(__has_attribute) && __has_attribute(target)
+    __attribute__((target("arch=armv8-a+sve")))
+#endif
+int
+main ()
+{
+    char input[64] = {0};
+    char output[64] = {0};
+    svbool_t pred = svptrue_b8(), cmp1, cmp2;
+    svuint8_t bytes, hextbl_vec;
+    svuint8x2_t	merged;
+
+    /* intrinsics used in hex_encode_sve */
+    hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF");
+    bytes = svld1(pred, (uint8_t *) input);
+    bytes = svlsr_z(pred, bytes, 4);
+    bytes = svand_z(pred, bytes, 0xF);
+    merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes));
+    svst2(pred, (uint8_t *) output, merged);
+
+    /* intrinsics used in hex_decode_sve */
+    bytes = svget2(svld2(pred, (uint8_t *) output), 0);
+    bytes = svsub_x(pred, bytes, bytes);
+    cmp1 = svcmplt(pred, bytes, 0);
+    cmp2 = svcmpgt(pred, bytes, 0);
+    bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes);
+    svst1(pred, output, bytes);
+
+    /* return computed value, to prevent the above being optimized away */
+    return output[0] == 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_sve_hex_intrinsics=yes
+else
+  pgac_cv_arm_sve_hex_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_hex_intrinsics" >&5
+$as_echo "$pgac_cv_arm_sve_hex_intrinsics" >&6; }
+
+if test x"$pgac_cv_arm_sve_hex_intrinsics" = x"yes"; then
+  PGAC_ARM_SVE_HEX_INTRINSICS=yes
+fi
+
+if test x"$PGAC_ARM_SVE_HEX_INTRINSICS" = x"yes"; then
+  $as_echo "#define USE_SVE_WITH_RUNTIME_CHECK 1" >>confdefs.h
+fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index d713360f34..cc805667b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2021,6 +2021,15 @@ if test x"$host_cpu" = x"x86_64"; then
   fi
 fi
 
+# Check for ARM SVE intrinsics for hex coding
+#
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_ARM_SVE_HEX_INTRINSICS()
+  if test x"$PGAC_ARM_SVE_HEX_INTRINSICS" = x"yes"; then
+    AC_DEFINE(USE_SVE_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM SVE intrinsic for hex coding.])
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 32fc89f3a4..d9d13b3c55 100644
--- a/meson.build
+++ b/meson.build
@@ -2194,6 +2194,53 @@ int main(void)
 endif
 
 
+###############################################################
+# Check the availability of ARM SVE intrinsics for hex coding.
+###############################################################
+
+if host_cpu == 'aarch64'
+
+  prog = '''
+#include <arm_sve.h>
+#if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("arch=armv8-a+sve")))
+#endif
+int main(void)
+{
+    char input[64] = {0};
+    char output[64] = {0};
+    svbool_t pred = svptrue_b8(), cmp1, cmp2;
+    svuint8_t bytes, hextbl_vec;
+    svuint8x2_t	merged;
+
+    /* intrinsics used in hex_encode_sve */
+    hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8_t *) "0123456789ABCDEF");
+    bytes = svld1(pred, (uint8_t *) input);
+    bytes = svlsr_z(pred, bytes, 4);
+    bytes = svand_z(pred, bytes, 0xF);
+    merged = svcreate2(svtbl(hextbl_vec, bytes), svtbl(hextbl_vec, bytes));
+    svst2(pred, (uint8_t *) output, merged);
+
+    /* intrinsics used in hex_decode_sve */
+    bytes = svget2(svld2(pred, (uint8_t *) output), 0);
+    bytes = svsub_x(pred, bytes, bytes);
+    cmp1 = svcmplt(pred, bytes, 0);
+    cmp2 = svcmpgt(pred, bytes, 0);
+    bytes = svsel(svnot_z(pred, svand_z(pred, cmp1, cmp2)), bytes, bytes);
+    svst1(pred, output, bytes);
+
+    /* return computed value, to prevent the above being optimized away */
+    return output[0] == 0;
+}
+'''
+
+  if cc.links(prog, name: 'ARM SVE hex encoding', args: test_c_args)
+    cdata.set('USE_SVE_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 4ccaed815d..0fe41a8d00 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -20,6 +20,10 @@
 #include "utils/memutils.h"
 #include "varatt.h"
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+#include <sys/auxv.h>
+#include <arm_sve.h>
+#endif
 
 /*
  * Encoding conversion API.
@@ -177,8 +181,106 @@ static const int8 hexlookup[128] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+static uint64 hex_encode_slow(const char *src, size_t len, char *dst);
+static uint64 hex_decode_slow(const char *src, size_t len, char *dst);
+static uint64 hex_decode_safe_slow(const char *src, size_t len, char *dst,
+								   Node *escontext);
+static uint64 hex_encode_sve(const char *src, size_t len, char *dst);
+static uint64 hex_decode_sve(const char *src, size_t len, char *dst);
+static uint64 hex_decode_safe_sve(const char *src, size_t len, char *dst,
+								  Node *escontext);
+static uint64 hex_encode_choose(const char *src, size_t len, char *dst);
+static uint64 hex_decode_choose(const char *src, size_t len, char *dst);
+static uint64 hex_decode_safe_choose(const char *src, size_t len, char *dst,
+									 Node *escontext);
+uint64 (*hex_encode_optimized)
+	   (const char *src, size_t len, char *dst) = hex_encode_choose;
+uint64 (*hex_decode_optimized)
+	   (const char *src, size_t len, char *dst) = hex_decode_choose;
+uint64 (*hex_decode_safe_optimized)
+	   (const char *src, size_t len, char *dst, Node *escontext) =
+		hex_decode_safe_choose;
+
+/*
+ * Returns true if the CPU supports SVE (via Linux getauxval/HWCAP_SVE;
+ * NOTE(review): non-Linux aarch64 lacks these — may need an extra guard).
+ */
+static inline bool
+check_sve_support(void)
+{
+	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+}
+
+static inline void
+choose_hex_functions(void)
+{
+	if (check_sve_support())
+	{
+		hex_encode_optimized = hex_encode_sve;
+		hex_decode_optimized = hex_decode_sve;
+		hex_decode_safe_optimized = hex_decode_safe_sve;
+	}
+	else
+	{
+		hex_encode_optimized = hex_encode_slow;
+		hex_decode_optimized = hex_decode_slow;
+		hex_decode_safe_optimized = hex_decode_safe_slow;
+	}
+}
+
+static uint64
+hex_encode_choose(const char *src, size_t len, char *dst)
+{
+	choose_hex_functions();
+	return hex_encode_optimized(src, len, dst);
+}
+
+static uint64
+hex_decode_choose(const char *src, size_t len, char *dst)
+{
+	choose_hex_functions();
+	return hex_decode_optimized(src, len, dst);
+}
+
+static uint64
+hex_decode_safe_choose(const char *src, size_t len, char *dst, Node *escontext)
+{
+	choose_hex_functions();
+	return hex_decode_safe_optimized(src, len, dst, escontext);
+}
+
+uint64
+hex_encode(const char *src, size_t len, char *dst)
+{
+	if (len < 16)
+		return hex_encode_slow(src, len, dst);
+	return hex_encode_optimized(src, len, dst);
+}
+
+uint64
+hex_decode(const char *src, size_t len, char *dst)
+{
+	if (len < 32)
+		return hex_decode_slow(src, len, dst);
+	return hex_decode_optimized(src, len, dst);
+}
+
+uint64
+hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
+{
+	if (len < 32)
+		return hex_decode_safe_slow(src, len, dst, escontext);
+	return hex_decode_safe_optimized(src, len, dst, escontext);
+}
+#endif							/* USE_SVE_WITH_RUNTIME_CHECK */
+
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+uint64
+hex_encode_slow(const char *src, size_t len, char *dst)
+#else
 uint64
 hex_encode(const char *src, size_t len, char *dst)
+#endif
 {
 	const char *end = src + len;
 
@@ -207,14 +309,24 @@ get_hex(const char *cp, char *out)
 	return (res >= 0);
 }
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+uint64
+hex_decode_slow(const char *src, size_t len, char *dst)
+#else
 uint64
 hex_decode(const char *src, size_t len, char *dst)
+#endif
 {
 	return hex_decode_safe(src, len, dst, NULL);
 }
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+uint64
+hex_decode_safe_slow(const char *src, size_t len, char *dst, Node *escontext)
+#else
 uint64
 hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
+#endif
 {
 	const char *s,
 			   *srcend;
@@ -254,6 +366,135 @@ hex_decode_safe(const char *src, size_t len, char *dst, Node *escontext)
 	return p - dst;
 }
 
+#ifdef USE_SVE_WITH_RUNTIME_CHECK
+/*
+ * SVE implementation of hex_encode and hex_decode.
+ */
+
+pg_attribute_target("arch=armv8-a+sve")
+uint64
+hex_encode_sve(const char *src, size_t len, char *dst)
+{
+	const char	hextbl[] = "0123456789abcdef";
+	svbool_t	pred;
+	svuint8_t	bytes,
+				high,
+				low,
+				hextbl_vec = svld1(svwhilelt_b8(0, 16), (uint8 *) hextbl);
+	svuint8x2_t	merged;
+	uint32 		vec_len = svcntb();
+
+	for (size_t i = 0; i < len; i += vec_len)
+	{
+		pred = svwhilelt_b8(i, len);
+		bytes = svld1(pred, (uint8 *) src);
+		high = svlsr_z(pred, bytes, 4);	/* shift-right to get the high nibble */
+		low = svand_z(pred, bytes, 0xF);   /* mask high to get the low nibble */
+
+		/*
+		 * Convert the nibbles to hex digits by indexing into hextbl_vec,
+		 * for example, a nibble value of 10 indexed into hextbl_vec gives 'a'.
+		 * Finally, interleave the high and low nibbles
+		 */
+		merged = svcreate2(svtbl(hextbl_vec, high), svtbl(hextbl_vec, low));
+		svst2(pred, (uint8 *) dst, merged);
+
+		dst += 2 * vec_len;
+		src += vec_len;
+	}
+
+	return (uint64) len * 2;
+}
+
+pg_attribute_target("arch=armv8-a+sve")
+static inline bool
+get_hex_sve(svbool_t pred, svuint8_t vec, svuint8_t *res)
+{
+	/*
+	 * Convert ASCII values '0'-'9' to integers 0-9 by subtracting 48.
+	 * Similarly, convert letters 'A'-'F' and 'a'-'f' to integers 10-15.
+	 */
+	svuint8_t	dgt_vec = svsub_x(pred, vec, 48),
+				cap_vec = svsub_x(pred, vec, 55),
+				sml_vec = svsub_x(pred, vec, 87),
+				letter_vec;
+	/*
+	 * Identify valid integers in dgt_vec, cap_vec, and sml_vec.
+	 * Values 0-9 are valid in dgt_vec, while values 10-15 are valid
+	 * in cap_vec and sml_vec.
+	 */
+	svbool_t	dgt_bool = svcmplt(pred, dgt_vec, 10),
+				cap_bool = svcmplt(pred, cap_vec, 16),
+				letter_bool;
+	/*
+	 * Combine cap_vec and sml_vec and mark the valid range 10-15.
+	 */
+	letter_vec = svsel(cap_bool, cap_vec, sml_vec);
+	letter_bool = svand_z(pred, svcmpgt(pred, letter_vec, 9),
+								svcmplt(pred, letter_vec, 16));
+	/*
+	 * Check for invalid hexadecimal digits. Each value must fall
+	 * within the range 0-9 (true in dgt_bool) or 10-15 (true in letter_bool).
+	 */
+	if (svptest_any(pred, svnot_z(pred, svorr_z(pred, dgt_bool, letter_bool))))
+		return false;
+
+	/* Finally, combine dgt_vec and letter_vec */
+	*res = svsel(dgt_bool, dgt_vec, letter_vec);
+	return true;
+}
+
+uint64
+hex_decode_sve(const char *src, size_t len, char *dst)
+{
+	return hex_decode_safe_sve(src, len, dst, NULL);
+}
+
+pg_attribute_target("arch=armv8-a+sve")
+uint64
+hex_decode_safe_sve(const char *src, size_t len, char *dst, Node *escontext)
+{
+	svbool_t	pred;
+	svuint8x2_t	bytes;
+	svuint8_t	high,
+				low;
+	uint32		processed;
+	size_t		i = 0,
+				loop_bytes = len & ~1;	/* handles inputs of odd length */
+	const char *p = dst;
+
+	while (i < loop_bytes)
+	{
+		pred = svwhilelt_b8(i / 2, len / 2);
+		bytes = svld2(pred, (uint8 *) src);
+		high = svget2(bytes, 0);	/* hex digit for high nibble */
+		low = svget2(bytes, 1);		/* hex digit for low nibble */
+
+		/* fall back if ASCII less than '0' is found */
+		if (svptest_any(pred, svorr_z(pred, svcmplt(pred, high, '0'),
+											svcmplt(pred, low, '0'))))
+			break;
+
+		/* fall back if invalid hexadecimal digit is found */
+		if (!get_hex_sve(pred, high, &high) || !get_hex_sve(pred, low, &low))
+			break;
+
+		/* left-shift high and perform bitwise OR with low to form the byte */
+		svst1(pred, (uint8 *) dst, svorr_x(pred, svlsl_x(pred, high, 4), low));
+
+		processed = svcntp_b8(pred, pred) * 2;
+		src += processed;
+		i += processed;
+		dst += processed / 2;
+	}
+
+	if (i < len)	/* fall back */
+		return dst - p + hex_decode_safe_slow(src, len - i, dst, escontext);
+
+	return dst - p;
+}
+#endif							/* USE_SVE_WITH_RUNTIME_CHECK */
+
 static uint64
 hex_enc_len(const char *src, size_t srclen)
 {
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798ab..b5096c11f4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -648,6 +648,9 @@
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE instructions for hex coding with a runtime check. */
+#undef USE_SVE_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support. (--with-bonjour) */
 #undef USE_BONJOUR
 
-- 
2.34.1

