On 02/12/2015 09:26 PM, Heikki Linnakangas wrote:
On 02/11/2015 04:20 PM, Abhijit Menon-Sen wrote:
At 2015-02-11 13:20:29 +0200, hlinnakan...@vmware.com wrote:

I don't follow. I didn't change configure at all, compared to your
patch.

OK, I extrapolated a little too much. Your patch didn't actually include
crc_instructions.h;

Oh, I'm sorry. Here's the complete patch with crc_instructions.h

I was just about to commit the attached, which is the same as the previous patch with just cosmetic comment changes, but then I realized that this probably doesn't compile with Visual Studio 2005 or older. The code does "#ifdef _MSC_VER", and then uses the _mm_crc32_u64 intrinsic, but that intrinsic was added in Visual Studio 2008. I think we'll need a version check there. Or better yet, a direct configure test to check if the intrinsic exists - that way we get to also use it on Intel compilers, which I believe also have the same intrinsics.

You want to write that or should I? How do you like this latest version of the patch otherwise? You had some criticism earlier, but I had forgotten to include the crc_instructions.h header file in that earlier version.

- Heikki

From f934cb017ad0270ded73feb4d3279e81a58a4149 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 25 Mar 2015 18:44:07 +0200
Subject: [PATCH v2 1/1] Use Intel SSE4.2 CRC instructions where available.

On x86, perform a runtime check to see if we're running on a CPU that
supports SSE 4.2. If we are, we can use the special crc32b and crc32q
instructions for the CRC-32C calculations. That greatly speeds up CRC
calculation.

Abhijit Menon-Sen, reviewed by Andres Freund and me.
---
 configure                           |   2 +-
 configure.in                        |   2 +-
 src/common/pg_crc.c                 | 113 +++++++++++++++++++++++++++----
 src/include/common/pg_crc.h         |  20 ++++--
 src/include/pg_config.h.in          |   3 +
 src/include/port/crc_instructions.h | 128 ++++++++++++++++++++++++++++++++++++
 6 files changed, 248 insertions(+), 20 deletions(-)
 create mode 100644 src/include/port/crc_instructions.h

diff --git a/configure b/configure
index 2c9b3a7..87ceb0b 100755
--- a/configure
+++ b/configure
@@ -9204,7 +9204,7 @@ fi
 done
 
 
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.in b/configure.in
index b2c1ce7..bf604ea 100644
--- a/configure.in
+++ b/configure.in
@@ -1032,7 +1032,7 @@ AC_SUBST(UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
diff --git a/src/common/pg_crc.c b/src/common/pg_crc.c
index eba32d3..6675ae7 100644
--- a/src/common/pg_crc.c
+++ b/src/common/pg_crc.c
@@ -21,25 +21,113 @@
 
 #include "common/pg_crc.h"
 
-/* Accumulate one input byte */
-#ifdef WORDS_BIGENDIAN
-#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8)
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
+static pg_crc32 pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len);
+#endif
+
+#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK)
+static pg_crc32 pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len);
+static const uint32 pg_crc32c_table[8][256];
+#endif
+
+#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+/*
+ * When built with support for CRC instructions, but we need to perform a
+ * run-time check to determine whether we can actually use them,
+ * pg_comp_crc32c is a function pointer. It is initialized to
+ * pg_comp_crc32c_choose, which performs the runtime check, and changes the
+ * function pointer so that subsequent calls go directly to the hw-accelerated
+ * version, or the fallback slicing-by-8 version.
+ */
+static pg_crc32
+pg_comp_crc32c_choose(pg_crc32 crc, const void *data, size_t len)
+{
+	if (pg_crc32_instructions_runtime_check())
+		pg_comp_crc32c = pg_comp_crc32c_hw;
+	else
+		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32	(*pg_comp_crc32c) (pg_crc32 crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
 #else
-#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
+/*
+ * No need for a runtime check. Compile directly with the hw-accelerated
+ * or the slicing-by-8 version. (We trust that the compiler
+ * is smart enough to inline it here.)
+ */
+pg_crc32
+pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+{
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
+	return pg_comp_crc32c_hw(crc, data, len);
+#else
+	return pg_comp_crc32c_sb8(crc, data, len);
+#endif
+}
 #endif
 
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
 /*
- * This function computes a CRC using the slicing-by-8 algorithm, which
- * uses an 8*256 lookup table to operate on eight bytes in parallel and
- * recombine the results.
+ * This function computes CRC-32C using special-purpose CPU instructions.
+ */
+static pg_crc32
+pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const uint64 *p8;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned 8-byte accesses here. Currently, the only CRC
+	 * instructions supported are the ones on Intel SSE 4.2, and those work
+	 * and perform well with unaligned access. This may need to be changed if
+	 * we get support for more architectures.
+	 */
+	p8 = (const uint64 *) p;
+	while (len >= 8)
+	{
+		crc = pg_asm_crc32q(crc, *p8++);
+		len -= 8;
+	}
+
+	/*
+	 * Handle any remaining bytes one at a time.
+	 */
+	p = (const unsigned char *) p8;
+	while (len > 0)
+	{
+		crc = pg_asm_crc32b(crc, *p++);
+		len--;
+	}
+
+	return crc;
+}
+
+#endif   /* PG_HAVE_CRC32C_INSTRUCTIONS */
+
+#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK)
+/*
+ * Compute CRC-32C using slicing-by-8 algorithm.
  *
  * Michael E. Kounavis, Frank L. Berry,
  * "Novel Table Lookup-Based Algorithms for High-Performance CRC
  * Generation", IEEE Transactions on Computers, vol.57, no. 11,
  * pp. 1550-1560, November 2008, doi:10.1109/TC.2008.85
  */
-pg_crc32
-pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+
+/* Accumulate one input byte */
+#ifdef WORDS_BIGENDIAN
+#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8)
+#else
+#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
+#endif
+
+static pg_crc32
+pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len)
 {
 	const unsigned char *p = data;
 	const uint32 *p4;
@@ -113,7 +201,7 @@ pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
  * order (IOW, the tables are stored in little-endian order even on big-endian
  * systems).
  */
-const uint32 pg_crc32c_table[8][256] = {
+static const uint32 pg_crc32c_table[8][256] = {
 #ifndef WORDS_BIGENDIAN
 	{
 		0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
@@ -643,7 +731,7 @@ const uint32 pg_crc32c_table[8][256] = {
 		0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8,
 		0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5
 	}
-#else		/* !WORDS_BIGENDIAN */
+#else							/* !WORDS_BIGENDIAN */
 	{
 		0x00000000, 0x03836BF2, 0xF7703BE1, 0xF4F35013,
 		0x1F979AC7, 0x1C14F135, 0xE8E7A126, 0xEB64CAD4,
@@ -1172,9 +1260,10 @@ const uint32 pg_crc32c_table[8][256] = {
 		0xA1354CE5, 0x864870AC, 0xEFCF3477, 0xC8B2083E,
 		0xCCB751C4, 0xEBCA6D8D, 0x824D2956, 0xA530151F
 	}
-#endif /* WORDS_BIGENDIAN */
+#endif   /* WORDS_BIGENDIAN */
 };
 
+#endif
 
 /*
  * Lookup table for calculating CRC-32 using Sarwate's algorithm.
diff --git a/src/include/common/pg_crc.h b/src/include/common/pg_crc.h
index f496659..de1ebdf 100644
--- a/src/include/common/pg_crc.h
+++ b/src/include/common/pg_crc.h
@@ -32,6 +32,8 @@
 #ifndef PG_CRC_H
 #define PG_CRC_H
 
+#include "port/crc_instructions.h"
+
 /* ugly hack to let this be used in frontend and backend code on Cygwin */
 #ifdef FRONTEND
 #define CRCDLLIMPORT
@@ -63,15 +65,19 @@ typedef uint32 pg_crc32;
  */
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #ifdef WORDS_BIGENDIAN
-#define FIN_CRC32C(crc)	((crc) = BSWAP32(crc) ^ 0xFFFFFFFF)
+#define FIN_CRC32C(crc) ((crc) = BSWAP32(crc) ^ 0xFFFFFFFF)
 #else
-#define FIN_CRC32C(crc)	((crc) ^= 0xFFFFFFFF)
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 #endif
-#define COMP_CRC32C(crc, data, len)	\
+#define COMP_CRC32C(crc, data, len) \
 	((crc) = pg_comp_crc32c((crc), (data), (len)))
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
 
+#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+extern pg_crc32 (*pg_comp_crc32c) (pg_crc32 crc, const void *data, size_t len);
+#else
 extern pg_crc32 pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len);
+#endif
 
 /*
  * CRC-32, the same used e.g. in Ethernet.
@@ -130,13 +136,15 @@ do {															  \
 \
 	while (__len-- > 0) \
 	{ \
-		int		__tab_index = ((int) ((crc) >> 24) ^ *__data++) & 0xFF;	\
+		int		__tab_index = ((int) ((crc) >> 24) ^ *__data++) & 0xFF; \
 		(crc) = table[__tab_index] ^ ((crc) << 8); \
 	} \
 } while (0)
 
-/* Constant tables for CRC-32C and CRC-32 polynomials */
-extern CRCDLLIMPORT const uint32 pg_crc32c_table[8][256];
+/*
+ * Constant table for the CRC-32 polynomial (the tables for CRC-32C are
+ * static in pg_crc.c)
+ */
 extern CRCDLLIMPORT const uint32 pg_crc32_table[256];
 
 #endif   /* PG_CRC_H */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 202c51a..129d9d0 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -96,6 +96,9 @@
 /* Define to 1 if you have the `class' function. */
 #undef HAVE_CLASS
 
+/* Define to 1 if you have the <cpuid.h> header file. */
+#undef HAVE_CPUID_H
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 
diff --git a/src/include/port/crc_instructions.h b/src/include/port/crc_instructions.h
new file mode 100644
index 0000000..357890f
--- /dev/null
+++ b/src/include/port/crc_instructions.h
@@ -0,0 +1,128 @@
+/*-------------------------------------------------------------------------
+ *
+ * crc_instructions.h
+ *	  Hardware support for calculating CRCs.
+ *
+ * Some CPU architectures have special instructions for speeding up CRC
+ * calculations. This header file provides access to them in a reasonably
+ * platform- and compiler-independent way.
+ *
+ * This header file defines two preprocessor symbols, or not, depending on
+ * the current platform:
+ *
+ * PG_HAVE_CRC32C_INSTRUCTIONS
+ *		Defined if this architecture has a special instruction for CRC32C
+ *		calculation.
+ *
+ * PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+ *		Defined if the architecture has a special instruction for CRC32C
+ *		calculation, but it's not available on all platforms that this
+ *		binary supports. A runtime check must be performed before
+ *		attempting to use the instructions, and a fallback implementation
+ *		is needed.
+ *
+ * If PG_HAVE_CRC32C_INSTRUCTIONS is defined, two inline functions or macros
+ * are also defined:
+ *
+ * uint32 pg_asm_crc32b(uint32 crc, unsigned char data)
+ *		Add one byte to the current crc value.
+ *
+ * uint32 pg_asm_crc32q(uint32 crc, uint64 data)
+ *		Add eight bytes of data to the current crc value.
+ *
+ * If PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK is defined, the following
+ * inline function or macro is also defined:
+ *
+ * bool pg_crc32_instructions_runtime_check(void)
+ *		Returns 'true' if the CRC instructions can be used, 'false'
+ *		otherwise.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/crc_instructions.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CRC_INSTRUCTIONS_H
+#define CRC_INSTRUCTIONS_H
+
+#ifdef HAVE_CPUID_H
+#include <cpuid.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>
+#include <nmmintrin.h>
+#endif
+
+
+/*
+ * Intel SSE 4.2 instructions, using GCC-style inline assembly
+ */
+#if (defined(__GNUC__) && defined(__x86_64__) && defined(HAVE_CPUID_H))
+
+#define PG_HAVE_CRC32C_INSTRUCTIONS
+#define PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+
+static inline uint32
+pg_asm_crc32b(uint32 crc, unsigned char data)
+{
+	__asm__(
+			"crc32b %[data], %[crc]\n"
+:			[crc] "+r"(crc)
+:			[data] "rm"(data));
+	return crc;
+}
+
+static inline uint32
+pg_asm_crc32q(uint32 crc, uint64 data)
+{
+	/*
+	 * For some strange reason, the crc32q instruction accepts and returns the
+	 * current crc value as a 64-bit integer, even though the upper 32 bits
+	 * are always zeros.
+	 */
+	uint64		_crc = crc;
+
+	__asm__(
+			"crc32q %[data], %[crc]\n"
+:			[crc] "+r"(_crc)
+:			[data] "rm"(data));
+	return (uint32) _crc;
+}
+
+static inline bool
+pg_crc32_instructions_runtime_check(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+	/* SSE 4.2 is ECX bit 20; test "!= 0" in case bool is narrower than int */
+	return (exx[2] & (1 << 20)) != 0;
+}
+
+/*
+ * Intel SSE 4.2 instructions, using Microsoft intrinsics
+ */
+#elif defined(_MSC_VER)
+
+#define PG_HAVE_CRC32C_INSTRUCTIONS
+#define PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+
+#define pg_asm_crc32b(crc, data) _mm_crc32_u8(crc, data)
+#define pg_asm_crc32q(crc, data) ((uint32) _mm_crc32_u64(crc, data))
+
+static inline bool
+pg_crc32_instructions_runtime_check(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	__cpuid(exx, 1);
+	/* SSE 4.2 is ECX bit 20; test "!= 0" in case bool is narrower than int */
+	return (exx[2] & (1 << 20)) != 0;
+}
+
+#endif   /* _MSC_VER */
+
+#endif   /* CRC_INSTRUCTIONS_H */
-- 
2.1.4

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to