On 01/09/2015 10:32 AM, Abhijit Menon-Sen wrote:
2. The sse4.2 patch has only some minor compile fixes.
I have built and tested both patches individually on little-endian
(amd64) and big-endian (ppc) systems. I verified that the _sse is
chosen at startup on the former, and _sb8 on the latter, and that
both implementations function correctly with respect to HEAD.
I'd like to arrange this somewhat differently, to keep the really
low-level assembler blocks and such separate from the higher level
parts. Especially now that pg_crc.c is in src/common and src/port, it
doesn't seem appropriate to have assembler code in there. Also, some of
the high-level code can be reused if we later add support e.g. for the
ARMv8 CRC instructions.
I propose that we add a new header file in src/port, let's call it
crc_instructions.h. That file contains the very low-level stuff, the
pg_asm_crc32q() and pg_asm_crc32b() inline functions, which just contain
the single assembler instruction. Or the corresponding mnemomic or macro
depending on the compiler; the point of the crc_instructions.h header is
to hide the differences between compilers and architectures.
If the CRC instructions are available, crc_instructions.h defines
PG_HAVE_CRC32C_INSTRUCTIONS, as well as the pg_asm_crc32q() and
pg_asm_crc32b() macros/functions. It also defines
pg_crc32_instructions_runtime_check(), to perform the runtime check to
determine whether the instructions can be used on the current platform
(i.e. if you're running on a CPU with SSE 4.2 support). There's another
symbol PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK, which indicates
whether the runtime check is needed. That's currently always defined
when PG_HAVE_CRC32C_INSTRUCTIONS is, but conceivably you might want to
build a version that skips the runtime check altogether, and doesn't
therefore require the slicing-by-8 fallback implementation at all.
Gentoo users might like that ;-), as well as possible future
architectures that always have CRC32 instructions.
Attached is a patch along those lines. I haven't done much testing, but
it demonstrates the code structure I have in mind.
A couple of remarks on your original patch, which also apply to the
attached version:
+static inline pg_crc32
+pg_asm_crc32b(pg_crc32 crc, unsigned char data)
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+ __asm__ ("crc32b %[data], %[crc]\n" : [crc] "+r" (crc) : [data] "rm"
(data));
+ return crc;
+#elif defined(_MSC_VER)
+ return _mm_crc32_u8(crc, data);
+#else
+ /* Can't generate crc32b, but keep the compiler quiet. */
+ return 0;
+#endif
I believe the CRC instructions are also available in 32-bit mode, so the
check for __x86_64__ is not needed. Right? Any CPU recent enough to have
these instructions certainly supports the x86-64 instruction set, but
you might nevertheless have compiled and be running in i386 mode.
It would be nice to use GCC's builtin intrinsics, __builtin_ia32_crc32*
where available, but I guess those are only available if you build with
-msse4.2, and that would defeat the point of the runtime check.
I believe the _mm_* mnemonics would also work with the Intel compiler.
- Heikki
From 7e16b0d5be39f832769da463f069c51afa04fb57 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 10 Feb 2015 14:26:24 +0200
Subject: [PATCH 1/1] Use Intel SSE4.2 CRC instructions where available.
On x86, perform a runtime check to see if we're running on a CPU that
supports SSE 4.2. If we are, we can use the special crc32b and crc32q
instructions for the CRC-32C calculations. That greatly speeds up CRC
calculation.
Abhijit Menon-Sen, reviewed by Andres Freund and me.
---
configure | 2 +-
configure.in | 2 +-
src/common/pg_crc.c | 109 ++++++++++++++++++++++++++++++++++++++++----
src/include/common/pg_crc.h | 12 ++++-
src/include/pg_config.h.in | 3 ++
5 files changed, 114 insertions(+), 14 deletions(-)
diff --git a/configure b/configure
index fa271fe..c352128 100755
--- a/configure
+++ b/configure
@@ -9204,7 +9204,7 @@ fi
done
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.in b/configure.in
index e6a49d1..588d626 100644
--- a/configure.in
+++ b/configure.in
@@ -1032,7 +1032,7 @@ AC_SUBST(UUID_LIBS)
##
dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h cpuid.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
# On BSD, test for net/if.h will fail unless sys/socket.h
# is included first.
diff --git a/src/common/pg_crc.c b/src/common/pg_crc.c
index eba32d3..b6db749 100644
--- a/src/common/pg_crc.c
+++ b/src/common/pg_crc.c
@@ -21,25 +21,113 @@
#include "common/pg_crc.h"
-/* Accumulate one input byte */
-#ifdef WORDS_BIGENDIAN
-#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8)
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
+static pg_crc32 pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len);
+#endif
+
+#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK)
+static pg_crc32 pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len);
+static const uint32 pg_crc32c_table[8][256];
+#endif
+
+#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+/*
+ * When built with support for CRC instructions, but we need to perform a
+ * run-time check to determine whether we can actually use them,
+ * pg_comp_crc32c is a function pointer. It is initialized to
+ * pg_comp_crc32c_choose, which performs the runtime check, and changes the
+ * function pointer so that subsequent calls go directly to the hw-accelerated
+ * version, or the fallback slicing-by-8 version.
+ */
+static pg_crc32
+pg_comp_crc32c_choose(pg_crc32 crc, const void *data, size_t len)
+{
+ if (pg_crc32_instructions_runtime_check())
+ pg_comp_crc32c = pg_comp_crc32c_hw;
+ else
+ pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+ return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
#else
-#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
+/*
+ * No need for a runtime check. Compile directly with the hw-accelerated
+ * or the slicing-by-8 version. (We trust that the compiler
+ * is smart enough to inline it here.)
+ */
+pg_crc32
+pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+{
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
+ return pg_comp_crc32c_hw(crc, data, len);
+#else
+ return pg_comp_crc32c_sb8(crc, data, len);
+#endif
+}
#endif
+#ifdef PG_HAVE_CRC32C_INSTRUCTIONS
/*
- * This function computes a CRC using the slicing-by-8 algorithm, which
- * uses an 8*256 lookup table to operate on eight bytes in parallel and
- * recombine the results.
+ * This function computes CRC-32C using special-purpose CPU instructions.
+ */
+static pg_crc32
+pg_comp_crc32c_hw(pg_crc32 crc, const void *data, size_t len)
+{
+ const unsigned char *p = data;
+ const uint64 *p8;
+
+ /*
+ * Process eight bytes of data at a time.
+ *
+ * NB: We do unaligned 8-byte accesses here. Currently, the only CRC
+ * instructions supported are the ones on Intel SSE 4.2, and that works
+ * and performs well with unaligned access. This may need to be changed
+ * if we get support for more architectures.
+ */
+ p8 = (const uint64 *) p;
+ while (len >= 8)
+ {
+ crc = pg_asm_crc32q(crc, *p8++);
+ len -= 8;
+ }
+
+ /*
+ * Handle any remaining bytes one at a time.
+ */
+ p = (const unsigned char *) p8;
+ while (len > 0)
+ {
+ crc = pg_asm_crc32b(crc, *p++);
+ len--;
+ }
+
+ return crc;
+}
+
+#endif /* PG_HAVE_CRC32C_INSTRUCTIONS */
+
+#if !defined(PG_HAVE_CRC32C_INSTRUCTIONS) || defined(PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK)
+/*
+ * Compute CRC-32C using slicing-by-8 algorithm.
*
* Michael E. Kounavis, Frank L. Berry,
* "Novel Table Lookup-Based Algorithms for High-Performance CRC
* Generation", IEEE Transactions on Computers, vol.57, no. 11,
* pp. 1550-1560, November 2008, doi:10.1109/TC.2008.85
*/
-pg_crc32
-pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
+
+/* Accumulate one input byte */
+#ifdef WORDS_BIGENDIAN
+#define CRC8(x) pg_crc32c_table[0][((crc >> 24) ^ (x)) & 0xFF] ^ (crc << 8)
+#else
+#define CRC8(x) pg_crc32c_table[0][(crc ^ (x)) & 0xFF] ^ (crc >> 8)
+#endif
+
+static pg_crc32
+pg_comp_crc32c_sb8(pg_crc32 crc, const void *data, size_t len)
{
const unsigned char *p = data;
const uint32 *p4;
@@ -113,7 +201,7 @@ pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len)
* order (IOW, the tables are stored in little-endian order even on big-endian
* systems).
*/
-const uint32 pg_crc32c_table[8][256] = {
+static const uint32 pg_crc32c_table[8][256] = {
#ifndef WORDS_BIGENDIAN
{
0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
@@ -1175,6 +1263,7 @@ const uint32 pg_crc32c_table[8][256] = {
#endif /* WORDS_BIGENDIAN */
};
+#endif
/*
* Lookup table for calculating CRC-32 using Sarwate's algorithm.
diff --git a/src/include/common/pg_crc.h b/src/include/common/pg_crc.h
index f496659..6806f6f 100644
--- a/src/include/common/pg_crc.h
+++ b/src/include/common/pg_crc.h
@@ -32,6 +32,8 @@
#ifndef PG_CRC_H
#define PG_CRC_H
+#include "port/crc_instructions.h"
+
/* ugly hack to let this be used in frontend and backend code on Cygwin */
#ifdef FRONTEND
#define CRCDLLIMPORT
@@ -71,7 +73,11 @@ typedef uint32 pg_crc32;
((crc) = pg_comp_crc32c((crc), (data), (len)))
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
+#ifdef PG_CRC32C_INSTRUCTIONS_NEED_RUNTIME_CHECK
+extern pg_crc32 (*pg_comp_crc32c)(pg_crc32 crc, const void *data, size_t len);
+#else
extern pg_crc32 pg_comp_crc32c(pg_crc32 crc, const void *data, size_t len);
+#endif
/*
* CRC-32, the same used e.g. in Ethernet.
@@ -135,8 +141,10 @@ do { \
} \
} while (0)
-/* Constant tables for CRC-32C and CRC-32 polynomials */
-extern CRCDLLIMPORT const uint32 pg_crc32c_table[8][256];
+/*
+ * Constant table for the CRC-32 polynomial (the tables for CRC-32C are
+ * static in pg_crc.c)
+ */
extern CRCDLLIMPORT const uint32 pg_crc32_table[256];
#endif /* PG_CRC_H */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index ece57c8..685ff81 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -96,6 +96,9 @@
/* Define to 1 if you have the `class' function. */
#undef HAVE_CLASS
+/* Define to 1 if you have the <cpuid.h> header file. */
+#undef HAVE_CPUID_H
+
/* Define to 1 if you have the <crtdefs.h> header file. */
#undef HAVE_CRTDEFS_H
--
2.1.4
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers