On Tue, 15 Dec 2020 at 20:34, Amit Khandekar <amitdkhan...@gmail.com> wrote:
>
> On Sun, 13 Dec 2020 at 9:28 PM, Andrey Borodin <x4...@yandex-team.ru> wrote:
> > +1
> > This will make all INSERTs and UPDATES for tsvector's GiSTs.
>
> Oh, I didn't realize that this code is getting used in GIST index
> insertion and creation too. Will check there.

I ran some insert and update tests; they show only marginal
improvement. So looks like the patch is mainly improving index builds.

> > Meanwhile there are at least 4 incarnation of hemdistsign() functions that 
> > are quite similar. I'd propose to refactor them somehow...
>
> Yes, I hope we get the benefit there also. Before that, I thought I
> should post the first use-case to get some early comments. Thanks for
> your encouraging comments :)

The attached v2 version of 0001 patch extends the hemdistsign()
changes to the other use cases like  intarray, ltree and hstore. I see
the same index build improvement for all these types.

Since for the gist index creation of some of these types the default
value for siglen is small (8-20), I tested with small siglens. For
siglens <= 20, particularly for values that are not multiples of 8
(e.g. 10, 13, etc), I see a  1-7 % reduction in speed of index
creation. It's probably because of
an extra function call for pg_xorcount(); and also might be due to the
extra logic in pg_xorcount() which becomes prominent for shorter
traversals. So for siglen less than 32, I kept the existing method
using byte-by-byte traversal.

--
Thanks,
-Amit Khandekar
Huawei Technologies
From b8905b47f7e0af8d4558fdac73cc2283c263cbcc Mon Sep 17 00:00:00 2001
From: Amit Khandekar <amitdkhan...@gmail.com>
Date: Thu, 10 Dec 2020 21:45:49 +0800
Subject: [PATCH 2/2] Avoid function pointer dereferencing for
 pg_popcount32/64() call.

With USE_POPCNT_ASM set (which is true only on x86), we decide with
the help of __get_cpuid() at runtime whether the CPU supports popcnt
instruction, and hence we dynamically choose the corresponding
function; thus pg_popcount32/64 is a function pointer. But for
platforms with USE_POPCNT_ASM not set, we don't have to use dynamic
assignment of functions, but we were still declaring pg_popcount64 as a
function pointer. So arrange for direct function call so as to get rid
of function pointer dereferencing each time pg_popcount32/64 is
called.

To do this, define pg_popcount64 to another function name
(pg_popcount64_nonasm) rather than a function pointer, whenever
USE_POPCNT_ASM is not defined.  And let pg_popcount64_nonasm() be a
static inline function so that whenever pg_popcount64() is called,
the __builtin_popcount() gets called directly.  For platforms not
supporting __builtin_popcount(), continue using the slow version as is
the current behaviour. pg_popcount64_slow() was actually a misnomer,
because it was actually calling __builtin_popcount() which is not a slow
function. Now with this commit, pg_popcount64_slow() indeed has only
slow code.

Tested this on ARM64, and the gist index creation for tsvectors speeds
up by ~ 6% for default siglen (=124), and by 12% with siglen=700
---
 src/include/port/pg_bitutils.h | 59 ++++++++++++++++++++++++++++++++++
 src/port/pg_bitutils.c         | 47 +++++----------------------
 2 files changed, 67 insertions(+), 39 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 174df28e66..b039f94ee5 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -23,6 +23,19 @@ extern const uint8 pg_rightmost_one_pos[256];
 extern const uint8 pg_number_of_ones[256];
 #endif
 
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to __builtin_popcount if the compiler has that,
+ * or a hand-rolled implementation if not.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define USE_POPCNT_ASM 1
+#endif
+#endif
+
 /*
  * pg_leftmost_one_pos32
  *		Returns the position of the most significant set bit in "word",
@@ -208,8 +221,54 @@ pg_ceil_log2_64(uint64 num)
 }
 
 /* Count the number of one-bits in a uint32 or uint64 */
+
+/*
+ * With USE_POPCNT_ASM set (which is true only on x86), we decide at runtime
+ * whether the CPU supports popcnt instruction, and hence we dynamically choose
+ * the corresponding function; thus pg_popcount32/64 is a function pointer. But
+ * for platforms with USE_POPCNT_ASM not set, we don't have to use dynamic
+ * assignment of functions, so we arrange for direct function call so as to get
+ * rid of function pointer dereferencing each time pg_popcount32/64 is called.
+ */
+#ifdef USE_POPCNT_ASM
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
+#else
+#define pg_popcount32 pg_popcount32_nonasm
+#define pg_popcount64 pg_popcount64_nonasm
+#endif
+
+/* Slow versions of pg_popcount */
+#ifndef HAVE__BUILTIN_POPCOUNT
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
+#endif
+
+static inline int
+pg_popcount64_nonasm(uint64 word)
+{
+#ifdef HAVE__BUILTIN_POPCOUNT
+#if defined(HAVE_LONG_INT_64)
+	return __builtin_popcountl(word);
+#elif defined(HAVE_LONG_LONG_INT_64)
+	return __builtin_popcountll(word);
+#else
+#error must have a working 64-bit integer datatype
+#endif
+#else							/* !HAVE__BUILTIN_POPCOUNT */
+	return pg_popcount64_slow(word);
+#endif							/* HAVE__BUILTIN_POPCOUNT */
+}
+
+static inline int
+pg_popcount32_nonasm(uint32 word)
+{
+#ifdef HAVE__BUILTIN_POPCOUNT
+	return __builtin_popcount(word);
+#else
+	return pg_popcount32_slow(word);
+#endif							/* HAVE__BUILTIN_POPCOUNT */
+}
 
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index cb2f5f5f0b..4a220590bf 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,22 +103,6 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define USE_POPCNT_ASM 1
-#endif
-#endif
-
-static int	pg_popcount32_slow(uint32 word);
-static int	pg_popcount64_slow(uint64 word);
-
 #ifdef USE_POPCNT_ASM
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
@@ -128,9 +112,6 @@ static int	pg_popcount64_asm(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_slow;
 #endif							/* USE_POPCNT_ASM */
 
 #ifdef USE_POPCNT_ASM
@@ -170,8 +151,8 @@ pg_popcount32_choose(uint32 word)
 	}
 	else
 	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
+		pg_popcount32 = pg_popcount32_nonasm;
+		pg_popcount64 = pg_popcount64_nonasm;
 	}
 
 	return pg_popcount32(word);
@@ -187,8 +168,8 @@ pg_popcount64_choose(uint64 word)
 	}
 	else
 	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
+		pg_popcount32 = pg_popcount32_nonasm;
+		pg_popcount64 = pg_popcount64_nonasm;
 	}
 
 	return pg_popcount64(word);
@@ -223,16 +204,14 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif							/* USE_POPCNT_ASM */
 
 
+#ifndef HAVE__BUILTIN_POPCOUNT
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_slow(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-	return __builtin_popcount(word);
-#else							/* !HAVE__BUILTIN_POPCOUNT */
 	int			result = 0;
 
 	while (word != 0)
@@ -242,25 +221,15 @@ pg_popcount32_slow(uint32 word)
 	}
 
 	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
 }
 
 /*
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_slow(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
-	return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
-	return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else							/* !HAVE__BUILTIN_POPCOUNT */
 	int			result = 0;
 
 	while (word != 0)
@@ -270,8 +239,8 @@ pg_popcount64_slow(uint64 word)
 	}
 
 	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
 }
+#endif							/* HAVE__BUILTIN_POPCOUNT */
 
 
 /*
-- 
2.17.1

From 474771cf8c3d90cc0356cb22f38c8d46d7ab0742 Mon Sep 17 00:00:00 2001
From: Amit Khandekar <amitdkhan...@gmail.com>
Date: Wed, 27 Jan 2021 22:34:06 +0800
Subject: [PATCH 1/2] Speed up xor'ing of two gist index signatures for
 tsvectors

In hemdistsign(), rather than using xor operator on char values, use
it in 64-bit chunks. And since the chunks are 64-bit, use popcount64()
on each of the chunks. I have checked that the two bitvector pointer
arguments of hemdistsign() are not always 64-bit aligned. So process
the leading and trailing bits char-by-char, leaving the middle 64-bit
chunks for use of popcount64().

This results in speed-up in Gist index creation for tsvectors. With
default siglen (124), the speed up is 12-20%. With siglen=700, it is
30-50%. So with longer signature lengths, we get higher percentage
speed-up.

Similar results are seen in other types using gist index, such as
intarray, hstore, and ltree that are availale in contrib.

With smaller siglens such as 10, 20 etc, there is a bit of a reduction
in speed by 1-7% if we use this optimization. It's probably because of
an extra function call for pg_xorcount(); and also might be due to
the extra logic in pg_xorcount(). So for siglen less than 32,
keep the existing method using byte-by-byte traversal.
---
 contrib/hstore/hstore_gist.c      | 14 +++------
 contrib/intarray/_intbig_gist.c   | 15 ++--------
 contrib/ltree/_ltree_gist.c       | 15 ++--------
 src/backend/utils/adt/tsgistidx.c | 15 ++--------
 src/include/port/pg_bitutils.h    | 20 +++++++++++++
 src/port/pg_bitutils.c            | 47 +++++++++++++++++++++++++++++++
 6 files changed, 80 insertions(+), 46 deletions(-)

diff --git a/contrib/hstore/hstore_gist.c b/contrib/hstore/hstore_gist.c
index 102c9cea72..ba1395dea3 100644
--- a/contrib/hstore/hstore_gist.c
+++ b/contrib/hstore/hstore_gist.c
@@ -8,6 +8,7 @@
 #include "access/stratnum.h"
 #include "catalog/pg_type.h"
 #include "hstore.h"
+#include "port/pg_bitutils.h"
 #include "utils/pg_crc.h"
 
 /* gist_hstore_ops opclass options */
@@ -256,18 +257,11 @@ sizebitvec(BITVECP sign, int siglen)
 	return size;
 }
 
-static int
+static inline int
 hemdistsign(BITVECP a, BITVECP b, int siglen)
 {
-	int			i,
-				dist = 0;
-
-	LOOPBIT(siglen)
-	{
-		if (GETBIT(a, i) != GETBIT(b, i))
-			dist++;
-	}
-	return dist;
+	return pg_xorcount((const unsigned char *) a, (const unsigned char *) b,
+					   siglen);
 }
 
 static int
diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c
index 18ecd8cda6..c3bad635c3 100644
--- a/contrib/intarray/_intbig_gist.c
+++ b/contrib/intarray/_intbig_gist.c
@@ -211,20 +211,11 @@ sizebitvec(BITVECP sign, int siglen)
 	return pg_popcount(sign, siglen);
 }
 
-static int
+static inline int
 hemdistsign(BITVECP a, BITVECP b, int siglen)
 {
-	int			i,
-				diff,
-				dist = 0;
-
-	LOOPBYTE(siglen)
-	{
-		diff = (unsigned char) (a[i] ^ b[i]);
-		/* Using the popcount functions here isn't likely to win */
-		dist += pg_number_of_ones[diff];
-	}
-	return dist;
+	return pg_xorcount((const unsigned char *) a, (const unsigned char *) b,
+					   siglen);
 }
 
 static int
diff --git a/contrib/ltree/_ltree_gist.c b/contrib/ltree/_ltree_gist.c
index 72516c3b6b..ead08b00ed 100644
--- a/contrib/ltree/_ltree_gist.c
+++ b/contrib/ltree/_ltree_gist.c
@@ -180,20 +180,11 @@ sizebitvec(BITVECP sign, int siglen)
 	return pg_popcount((const char *) sign, siglen);
 }
 
-static int
+static inline int
 hemdistsign(BITVECP a, BITVECP b, int siglen)
 {
-	int			i,
-				diff,
-				dist = 0;
-
-	ALOOPBYTE(siglen)
-	{
-		diff = (unsigned char) (a[i] ^ b[i]);
-		/* Using the popcount functions here isn't likely to win */
-		dist += pg_number_of_ones[diff];
-	}
-	return dist;
+	return pg_xorcount((const unsigned char *) a, (const unsigned char *) b,
+					   siglen);
 }
 
 static int
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index c09eefdda2..9d822da241 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -486,20 +486,11 @@ sizebitvec(BITVECP sign, int siglen)
 	return pg_popcount(sign, siglen);
 }
 
-static int
+static inline int
 hemdistsign(BITVECP a, BITVECP b, int siglen)
 {
-	int			i,
-				diff,
-				dist = 0;
-
-	LOOPBYTE(siglen)
-	{
-		diff = (unsigned char) (a[i] ^ b[i]);
-		/* Using the popcount functions here isn't likely to win */
-		dist += pg_number_of_ones[diff];
-	}
-	return dist;
+	return pg_xorcount((const unsigned char *) a, (const unsigned char *) b,
+					   siglen);
 }
 
 static int
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index f9b77ec278..55ef9a8e9e 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -214,6 +214,26 @@ extern int	(*pg_popcount64) (uint64 word);
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
 
+/* Count the number of 1-bits in the result of xor operation */
+extern uint64 pg_xorcount_long(const unsigned char *a, const unsigned char *b,
+							   int bytes);
+static inline uint64 pg_xorcount(const unsigned char *a, const unsigned char *b,
+								 int bytes)
+{
+	/* For smaller lengths, do simple byte-by-byte traversal */
+	if (bytes <= 32)
+	{
+		int			i;
+		uint64		popcnt = 0;
+
+		for (i = 0; i < bytes; i++)
+			popcnt += pg_number_of_ones[a[i] ^ b[i]];
+		return popcnt;
+	}
+	else
+		return pg_xorcount_long(a, b, bytes);
+}
+
 /*
  * Rotate the bits of "word" to the right by n bits.
  */
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 2252021854..3a3bbc4262 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -319,3 +319,50 @@ pg_popcount(const char *buf, int bytes)
 
 	return popcnt;
 }
+
+/*
+ * pg_xorcount
+ *		Count the number of 1-bits in the result of xor operation.
+ */
+uint64
+pg_xorcount_long(const unsigned char *a, const unsigned char *b, int bytes)
+{
+	uint64		popcnt = 0;
+	int			i = 0;
+
+#if SIZEOF_VOID_P >= 8
+	const unsigned char *a_aligned = (const unsigned char *) TYPEALIGN(8, a);
+	const unsigned char *b_aligned = (const unsigned char *) TYPEALIGN(8, b);
+
+	/*
+	 * We can process 64-bit chunks only if both are mis-aligned by the same
+	 * number of bytes.
+	 */
+	if (b_aligned - b == a_aligned - a)
+	{
+		int			unaligned_bytes = a_aligned - a;
+		uint64	   *aint64 = (uint64*) a_aligned;
+		uint64	   *bint64 = (uint64*) b_aligned;
+		int			nelem;
+
+		/* Process leading bytes upto where aligned bytes start */
+		unaligned_bytes = Min(unaligned_bytes, bytes);
+		for (i = 0; i < unaligned_bytes; i++)
+			popcnt += pg_number_of_ones[a[i] ^ b[i]];
+
+		/* Process 64-bit chunks using popcount function */
+		nelem = (bytes - unaligned_bytes)/sizeof(uint64);
+		for (i = 0; i < nelem; i++)
+			popcnt += pg_popcount64(aint64[i] ^ bint64[i]);
+
+		/* Position i for the trailing bytes */
+		i = unaligned_bytes + nelem*sizeof(uint64);
+	}
+#endif
+
+	/* Process trailing bytes */
+	for (; i < bytes; i++)
+		popcnt += pg_number_of_ones[a[i] ^ b[i]];
+
+	return popcnt;
+}
-- 
2.17.1

Reply via email to