On 02/11/2015 04:20 PM, Abhijit Menon-Sen wrote:
At 2015-02-11 13:20:29 +0200, hlinnakan...@vmware.com wrote:

I don't follow. I didn't change configure at all, compared to your
patch.

OK, I extrapolated a little too much. Your patch didn't actually include
crc_instructions.h;

Oh, I'm sorry. Here's the complete patch with crc_instructions.h
- Heikki

From bd4a90d339e21cd6ac517d077fe3a76abb5ef37d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 10 Feb 2015 14:26:24 +0200
Subject: [PATCH 1/1] Use Intel SSE4.2 CRC instructions where available.

On x86, perform a runtime check to see if we're running on a CPU that
supports SSE 4.2. If we are, we can use the special crc32b and crc32q
instructions for the CRC-32C calculations. That greatly speeds up CRC
calculation.

Abhijit Menon-Sen, reviewed by Andres Freund and me.
---
 configure                           |   2 +-
 configure.in                        |   2 +-
 src/common/pg_crc.c                 | 109 +++++++++++++++++++++++++++++---
 src/include/common/pg_crc.h         |  12 +++-
 src/include/pg_config.h.in          |   3 +
 src/include/port/crc_instructions.h | 121 ++++++++++++++++++++++++++++++++++++
 6 files changed, 235 insertions(+), 14 deletions(-)
 create mode 100644 src/include/port/crc_instructions.h

diff --git a/configure b/configure
index fa271fe..c352128 100755
--- a/configure
+++ b/configure
@@ -9204,7 +9204,7 @@ fi
 done
 
 
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.in b/configure.in
index e6a49d1..588d626 100644
--- a/configure.in
+++ b/configure.in
@@ -1032,7 +1032,7 @@ AC_SUBST(UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
diff --git a/src/common/pg_crc.c b/src/common/pg_crc.c
index eba32d3..b6db749 100644
--- a/src/common/pg_crc.c
+++ b/src/common/pg_crc.c
@@ -21,25 +21,113 @@
 
 #include "common/pg_crc.h"
 
-/* Accumulate one input byte */
-#ifdef WORDS_BIGENDIAN
-#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8)
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
+static pg_crc32 pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len);
+#endif
+
+#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK)
+static pg_crc32 pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len);
+static const uint32 pg_crc32c_table[8][256];
+#endif
+
+#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+/*
+ * When built with support for CRC instructions, but we need to perform a
+ * run-time check to determine whether we can actually use them,
+ * pg_comp_crc32c is a function pointer. It is initialized to
+ * pg_comp_crc32c_choose, which performs the runtime check, and changes the
+ * function pointer so that subsequent calls go directly to the hw-accelerated
+ * version, or the fallback slicing-by-8 version.
+ */
+static pg_crc32
+pg_comp_crc32c_choose(pg_crc32 crc, const void *data, size_t len)
+{
+	if (pg_crc32_instructions_runtime_check())
+		pg_comp_crc32c = pg_comp_crc32c_hw;
+	else
+		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
 #else
-#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
+/*
+ * No need for a runtime check. Compile directly with the hw-accelerated
+ * or the slicing-by-8 version. (We trust that the compiler
+ * is smart enough to inline it here.)
+ */
+pg_crc32
+pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+{
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
+	return pg_comp_crc32c_hw(crc, data, len);
+#else
+	return pg_comp_crc32c_sb8(crc, data, len);
+#endif
+}
 #endif
 
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
 /*
- * This function computes a CRC using the slicing-by-8 algorithm, which
- * uses an 8*256 lookup table to operate on eight bytes in parallel and
- * recombine the results.
+ * This function computes CRC-32C using special-purpose CPU instructions.
+ */
+static pg_crc32
+pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const uint64 *p8;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned 8-byte accesses here. Currently, the only CRC
+	 * instructions supported are the ones on Intel SSE 4.2, and that works
+	 * and performs well with unaligned access. This may need to be changed
+	 * if we get support for more architectures.
+	 */
+	p8 = (const uint64 *) p;
+	while (len >= 8)
+	{
+		crc = pg_asm_crc32q(crc, *p8++);
+		len -= 8;
+	}
+
+	/*
+	 * Handle any remaining bytes one at a time.
+	 */
+	p = (const unsigned char *) p8;
+	while (len > 0)
+	{
+		crc = pg_asm_crc32b(crc, *p++);
+		len--;
+	}
+
+	return crc;
+}
+
+#endif /* PG_HAVE_CRC32C_INSTRUCTIONS */
+
+#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK)
+/*
+ * Compute CRC-32C using slicing-by-8 algorithm.
  *
  * Michael E. Kounavis, Frank L. Berry,
  * "Novel Table Lookup-Based Algorithms for High-Performance CRC
  * Generation", IEEE Transactions on Computers, vol.57, no. 11,
  * pp. 1550-1560, November 2008, doi:10.1109/TC.2008.85
  */
-pg_crc32
-pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+
+/* Accumulate one input byte */
+#ifdef WORDS_BIGENDIAN
+#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8)
+#else
+#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
+#endif
+
+static pg_crc32
+pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const uint32 *p4;
@@ -113,7 +201,7 @@ pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
  * order (IOW, the tables are stored in little-endian order even on big-endian
  * systems).
  */
-const uint32 pg_crc32c_table[8][256] = {
+static const uint32 pg_crc32c_table[8][256] = {
 #ifndef WORDS_BIGENDIAN
 	{
 		0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
@@ -1175,6 +1263,7 @@ const uint32 pg_crc32c_table[8][256] = {
 #endif /* WORDS_BIGENDIAN */
 };
 
+#endif
 
 /*
  * Lookup table for calculating CRC-32 using Sarwate's algorithm.
diff --git a/src/include/common/pg_crc.h b/src/include/common/pg_crc.h
index f496659..6806f6f 100644
--- a/src/include/common/pg_crc.h
+++ b/src/include/common/pg_crc.h
@@ -32,6 +32,8 @@
 #ifndef PG_CRC_H
 #define PG_CRC_H
 
+#include "port/crc_instructions.h"
+
 /* ugly hack to let this be used in frontend and backend code on Cygwin */
 #ifdef FRONTEND
 #define CRCDLLIMPORT
@@ -71,7 +73,11 @@ typedef uint32 pg_crc32;
 	((crc) = pg_comp_crc32c((crc), (data), (len)))
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
 
+#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+extern pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len);
+#else
 extern pg_crc32 pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len);
+#endif
 
 /*
  * CRC-32, the same used e.g. in Ethernet.
@@ -135,8 +141,10 @@ do {															  \
 	} \
 } while (0)
 
-/* Constant tables for CRC-32C and CRC-32 polynomials */
-extern CRCDLLIMPORT const uint32 pg_crc32c_table[8][256];
+/*
+ * Constant table for the CRC-32 polynomial (the tables for CRC-32C are
+ * static in pg_crc.c)
+ */
 extern CRCDLLIMPORT const uint32 pg_crc32_table[256];
 
 #endif   /* PG_CRC_H */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index ece57c8..685ff81 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -96,6 +96,9 @@
 /* Define to 1 if you have the `class' function. */
 #undef HAVE_CLASS
 
+/* Define to 1 if you have the <cpuid.h> header file. */
+#undef HAVE_CPUID_H
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 
diff --git a/src/include/port/crc_instructions.h b/src/include/port/crc_instructions.h
new file mode 100644
index 0000000..85c9347
--- /dev/null
+++ b/src/include/port/crc_instructions.h
@@ -0,0 +1,121 @@
+/*-------------------------------------------------------------------------
+ *
+ * crc_instructions.h
+ *	  Hardware support for calculating CRCs.
+ *
+ * Some CPU architectures have special instructions for speeding up CRC
+ * calculations. This header file provides support for them in a
+ * reasonably platform and compiler independent way.
+ *
+ * This header file may define the following two preprocessor symbols,
+ * depending on the current platform:
+ *
+ * PG_HAVE_CRC32C_INSTRUCTIONS
+ *		Defined if this architecture has accelerated support for CRC32C
+ *		calculation.
+ *
+ * PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+ *		Defined if the architecture has accelerated support for CRC32C
+ *		calculation, but it's not available on all platforms that this
+ *		binary supports. A runtime check must be performed before
+ *		attempting to use the instructions, and a fallback implementation
+ *		is needed.
+ *
+ * If PG_HAVE_CRC32C_INSTRUCTIONS is defined, two inline functions or macros
+ * are also defined:
+ *
+ * uint32 pg_asm_crc32b(uint32 crc, unsigned char data)
+ *		Add one byte to the current crc value.
+ *
+ * uint32 pg_asm_crc32q(uint32 crc, uint64 data)
+ *		Add eight bytes of data to the current crc value.
+ *
+ * If PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK is defined, the following
+ * inline function or macro is also defined:
+ *
+ * bool pg_crc32_instructions_runtime_check(void)
+ *		Returns 'true' if the CRC instructions can be used, 'false'
+ *		otherwise.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/crc_instructions.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CRC_INSTRUCTIONS_H
+#define CRC_INSTRUCTIONS_H
+
+#ifdef HAVE_CPUID_H
+#include <cpuid.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>
+#include <nmmintrin.h>
+#endif
+
+#if (defined(__GNUC__) && defined(__x86_64__) && defined(HAVE_CPUID_H))
+
+#define PG_HAVE_CRC32C_INSTRUCTIONS
+#define PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+
+static inline uint32
+pg_asm_crc32b(uint32 crc, unsigned char data)
+{
+	__asm__ (
+		"crc32b %[data], %[crc]\n"
+:		[crc] "+r" (crc)
+:		[data] "rm" (data));
+	return crc;
+}
+
+static inline uint32
+pg_asm_crc32q(uint32 crc, uint64 data)
+{
+	/*
+	 * For some strange reason, the crc32q instruction accepts and returns
+	 * the current crc value as a 64-bit integer, even though the upper 32
+	 * bits are always zeros.
+	 */
+	uint64 _crc = crc;
+	__asm__ (
+		"crc32q %[data], %[crc]\n"
+:		[crc] "+r" (_crc)
+:		[data] "rm" (data));
+	return (uint32) _crc;
+}
+
+static inline bool
+pg_crc32_instructions_runtime_check(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};	/* EAX, EBX, ECX, EDX */
+
+	/* Query CPUID leaf 1; a zero return means the leaf is not available. */
+	if (!__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]))
+		return false;
+	/* ECX bit 20 is SSE 4.2; test != 0 so a narrow bool cannot truncate it */
+	return (exx[2] & (1 << 20)) != 0;
+}
+
+#elif defined(_MSC_VER)
+
+#define PG_HAVE_CRC32C_INSTRUCTIONS
+#define PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+
+#define pg_asm_crc32b(crc, data) _mm_crc32_u8(crc, data)
+#define pg_asm_crc32q(crc, data) ((uint32) _mm_crc32_u64(crc, data))
+
+static inline bool
+pg_crc32_instructions_runtime_check(void)
+{
+	int			exx[4] = {0, 0, 0, 0};	/* EAX, EBX, ECX, EDX */
+
+	__cpuid(exx, 1);			/* query CPUID leaf 1 */
+	/* ECX bit 20 is SSE 4.2; test != 0 so a narrow bool cannot truncate it */
+	return (exx[2] & (1 << 20)) != 0;
+}
+
+#endif
+
+#endif /* CRC_INSTRUCTIONS_H */
-- 
2.1.4

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to