From d41ac63a6f4dc71df5894a31a9d0b0b5572816ae Mon Sep 17 00:00:00 2001
From: Yuqi Gu <yuqi.gu@arm.com>
Date: Mon, 8 Jan 2018 03:03:31 +0000
Subject: [PATCH] Optimize Arm64 crc32c implementation in PostgreSQL

Provide a CRC-32C implementation based on the ARMv8 CRC32 instructions to improve performance on ARM64 platforms.

Change-Id: I3af7e7e6a9f36936e7c16c5863a7c3e87e911cbf
Signed-off-by: Yuqi Gu <yuqi.gu@arm.com>
---
 config/c-compiler.m4         | 15 ++++++++
 configure                    | 82 ++++++++++++++++++++++++++++++++++++++------
 configure.in                 | 24 +++++++++----
 src/include/pg_config.h.in   |  3 ++
 src/include/port/pg_crc32c.h | 10 +++++-
 src/port/pg_crc32c_choose.c  | 20 +++++++++++
 src/port/pg_crc32c_sb8.c     | 47 +++++++++++++++++++++++++
 7 files changed, 183 insertions(+), 18 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 076656c..9cd6270 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -595,3 +595,18 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
+
+# PGAC_ARM64CE_CRC32_INTRINSICS: set pgac_arm64ce_crc32_intrinsics=yes if the target is AArch64 and getauxval() links.
+AC_DEFUN([PGAC_ARM64CE_CRC32_INTRINSICS],
+[AC_CACHE_CHECK([for Arm64ce CRC32], [pgac_cv_arm64ce_crc32_intrinsics],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <sys/auxv.h>],
+  [/* Only linked, never run: unsupported targets must fail to build. */
+#if !defined(__aarch64__)
+#error ARMv8 CRC32 instructions require an AArch64 target
+#endif
+   return getauxval(AT_HWCAP) == 0;])],
+  [pgac_cv_arm64ce_crc32_intrinsics="yes"], [pgac_cv_arm64ce_crc32_intrinsics="no"])])
+if test x"$pgac_cv_arm64ce_crc32_intrinsics" = x"yes"; then
+  pgac_arm64ce_crc32_intrinsics=yes
+fi
+])# PGAC_ARM64CE_CRC32_INTRINSICS
diff --git a/configure b/configure
index 45221e1..1c7f0b3 100755
--- a/configure
+++ b/configure
@@ -777,6 +777,7 @@ infodir
 docdir
 oldincludedir
 includedir
+runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -904,6 +905,7 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1156,6 +1158,15 @@ do
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1293,7 +1304,7 @@ fi
 for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
 		datadir sysconfdir sharedstatedir localstatedir includedir \
 		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-		libdir localedir mandir
+		libdir localedir mandir runstatedir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1446,6 +1457,7 @@ Fine tuning of the installation directories:
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -12655,7 +12667,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12701,7 +12713,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12725,7 +12737,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12770,7 +12782,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -12794,7 +12806,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -15449,6 +15461,41 @@ if ac_fn_c_try_compile "$LINENO"; then :
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Arm64ce CRC32" >&5
+$as_echo_n "checking for Arm64ce CRC32... " >&6; }
+if ${pgac_cv_arm64ce_crc32_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/auxv.h>
+int
+main ()
+{
+/* Only linked, never run: unsupported targets must fail to build. */
+#if !defined(__aarch64__)
+#error ARMv8 CRC32 instructions require an AArch64 target
+#endif
+   return getauxval(AT_HWCAP) == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm64ce_crc32_intrinsics="yes"
+else
+  pgac_cv_arm64ce_crc32_intrinsics="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm64ce_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_arm64ce_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_arm64ce_crc32_intrinsics" = x"yes"; then
+  pgac_arm64ce_crc32_intrinsics=yes
+fi
+
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has SSE 4.2 instructions, we can use the
@@ -15468,9 +15515,13 @@ if test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHEC
     if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
       USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
     else
-      # fall back to slicing-by-8 algorithm which doesn't require any special
-      # CPU support.
-      USE_SLICING_BY_8_CRC32C=1
+      if test x"$pgac_arm64ce_crc32_intrinsics" = x"yes"; then
+        USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK=1
+      else
+        # fall back to slicing-by-8 algorithm which doesn't require any special
+        # CPU support.
+        USE_SLICING_BY_8_CRC32C=1
+      fi
     fi
   fi
 fi
@@ -15494,12 +15545,21 @@ $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
+    if test x"$USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+
+$as_echo "#define USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_choose.o"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARM64 CE with runtime check" >&5
+$as_echo "ARM64 CE with runtime check" >&6; }
+    else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+    fi
   fi
 fi
 
diff --git a/configure.in b/configure.in
index 4d26034..84ebf53 100644
--- a/configure.in
+++ b/configure.in
@@ -1900,6 +1900,8 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [
 #endif
 ])], [SSE4_2_TARGETED=1])
 
+PGAC_ARM64CE_CRC32_INTRINSICS
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has SSE 4.2 instructions, we can use the
@@ -1919,9 +1921,13 @@ if test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHEC
     if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
       USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
     else
-      # fall back to slicing-by-8 algorithm which doesn't require any special
-      # CPU support.
-      USE_SLICING_BY_8_CRC32C=1
+      if test x"$pgac_arm64ce_crc32_intrinsics" = x"yes"; then
+        USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK=1
+      else
+        # fall back to slicing-by-8 algorithm which doesn't require any special
+        # CPU support.
+        USE_SLICING_BY_8_CRC32C=1
+      fi
     fi
   fi
 fi
@@ -1938,9 +1944,15 @@ else
     PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
-    AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
-    PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-    AC_MSG_RESULT(slicing-by-8)
+    if test x"$USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+      AC_DEFINE(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM64 CE CRC instructions with a runtime check.])
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_choose.o"
+      AC_MSG_RESULT(ARM64 CE with runtime check)
+    else
+      AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
+      PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+      AC_MSG_RESULT(slicing-by-8)
+    fi
   fi
 fi
 AC_SUBST(PG_CRC32C_OBJS)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index f98f773..ae2cdf1 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -871,6 +871,9 @@
 /* Define to 1 to use Intel SSSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use ARM64 CE CRC instructions with a runtime check. */
+#undef USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to build with systemd support. (--with-systemd) */
 #undef USE_SYSTEMD
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index ae2701e..50405a5 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -49,7 +49,8 @@ typedef uint32 pg_crc32c;
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK)
+
 /*
  * Use SSE4.2 instructions, but perform a runtime check first to check that
  * they are available.
@@ -62,6 +63,13 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 
+/* ARM64 counterpart of pg_comp_crc32c_sb8:
+ * computes CRC-32C (Castagnoli polynomial 0x1EDC6F41) using the ARMv8 CRC32 instructions.
+ */
+#ifdef USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_arm64(pg_crc32c crc, const void *data, size_t len);
+#endif
+
 #else
 /*
  * Use slicing-by-8 algorithm.
diff --git a/src/port/pg_crc32c_choose.c b/src/port/pg_crc32c_choose.c
index 40bee67..d3682ad 100644
--- a/src/port/pg_crc32c_choose.c
+++ b/src/port/pg_crc32c_choose.c
@@ -29,6 +29,20 @@
 
 #include "port/pg_crc32c.h"
 
+#ifdef USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+static bool
+pg_crc32c_arm64ce_available(void) {
+	/* Test the HWCAP_CRC32 bit of the AT_HWCAP value reported by getauxval(). */
+	return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
+}
+
+#else
 static bool
 pg_crc32c_sse42_available(void)
 {
@@ -44,6 +58,7 @@ pg_crc32c_sse42_available(void)
 
 	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
 }
+#endif
 
 /*
  * This gets called on the first call. It replaces the function pointer
@@ -52,8 +67,13 @@ pg_crc32c_sse42_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+#if defined(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK)
+	if (pg_crc32c_arm64ce_available())
+		pg_comp_crc32c = pg_comp_crc32c_arm64;
+#else
 	if (pg_crc32c_sse42_available())
 		pg_comp_crc32c = pg_comp_crc32c_sse42;
+#endif
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
 
diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c
index 5205ba9..fd9dd93 100644
--- a/src/port/pg_crc32c_sb8.c
+++ b/src/port/pg_crc32c_sb8.c
@@ -22,6 +22,53 @@
 
 #include "port/pg_crc32c.h"
 
+#if defined(USE_ARMCE_CRC32C_WITH_RUNTIME_CHECK)
+asm(".arch_extension crc");
+#define LDP(x,y,p) asm("ldp %x[a], %x[b], [%x[c]], #16" : [a]"=r"(x),[b]"=r"(y),[c]"+r"(p) : : "memory")
+/* LDP loads 16 bytes at p into x,y and advances p (memory clobber: the load is not an asm operand).  CRC32C{X,W,H,B} fold a 64/32/16/8-bit value into crc using the Castagnoli polynomial 0x1EDC6F41. */
+#define CRC32CX(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+#define CRC32CW(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+#define CRC32CH(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+#define CRC32CB(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(*&crc) : [v]"r"(+value))
+
+/* Fold the len bytes at data into crc with the ARMv8 crc32c* instructions and return the updated CRC-32C value. */
+pg_crc32c
+pg_comp_crc32c_arm64(pg_crc32c crc, const void* data, size_t len) {
+	uint64 p0, p1;
+	pg_crc32c crc32_c = crc;
+	long length = len;
+	const unsigned char *p_buf = data;
+
+	/* Allow crc instructions in asm */
+	asm(".cpu generic+crc");
+	/* Main loop: 16 bytes per pass.  length ends up negative, but its low four bits still equal len % 16. */
+	while ((length -= 2*sizeof(uint64)) >= 0) {
+		LDP(p0, p1, p_buf);
+		CRC32CX(crc32_c,p0);
+		CRC32CX(crc32_c,p1);
+	}
+	/* Tail: each set bit of the remainder selects one 8/4/2/1-byte step. */
+	if (length & sizeof(uint64)) {
+		CRC32CX(crc32_c, *(const uint64*)p_buf);
+		p_buf += sizeof(uint64);
+	}
+	if (length & sizeof(uint32)) {
+		/* Load exactly four bytes; an eight-byte load here could run past the end of the buffer. */
+		CRC32CW(crc32_c, *(const uint32*)p_buf);
+		p_buf += sizeof(uint32);
+	}
+	if (length & sizeof(uint16)) {
+		CRC32CH(crc32_c, *(const uint16*)p_buf);
+		p_buf += sizeof(uint16);
+	}
+	if (length & sizeof(uint8)) {
+		CRC32CB(crc32_c, *p_buf);
+	}
+
+	return crc32_c;
+}
+#endif
+
 static const uint32 pg_crc32c_table[8][256];
 
 /* Accumulate one input byte */
-- 
2.7.4

