On 13.04.2026 17:06, David Geier wrote:
>> I squashed 0002 and 0004 into one commit, and did some more refactoring:
>> I created a trigram_qsort() helper function that calls the signed or
>> unsigned variant, so that that logic doesn't need to be duplicated in
>> the callers. For symmetry, I also added a trigram_qunique() helper
>> function which just calls qunique() with the new, faster CMPTRGM_EQ
>> comparator. Pushed these as commit 9f3755ea07.
>
> Thanks for committing these patches.
Attached are the remaining patches (previously 0003 and 0005) rebased on
latest master. Currently, there's no radix sort variant for the unsigned
char case. Do we care about this case or is it fine if that case runs
slower?
The following perf profiles show that trigram_qsort() goes from ~34%
down to ~7% with the radix sort optimization. The optimized run also
includes the btint4cmp() optimization. Without that the result would be
even better.
With that change we could move on and tackle optimizing
1. 41.52% generate_trgm_only() by e.g. using an ASCII fast-patch
2. 32.72% ginInsertBAEntries() by no longer using the RB-tree but
e.g. also the radix sort
master
- heapam_index_build_range_scan
- 99.40% ginBuildCallback
- ginHeapTupleBulkInsert
- 66.55% ginExtractEntries
- 65.29% FunctionCall3Coll
- gin_extract_value_trgm
- 62.80% generate_trgm
+ 34.33% trigram_qsort (inlined)
+ 26.20% generate_trgm_only
+ 2.23% trigram_qunique (inlined)
+ 1.74% detoast_attr
+ 1.19% qsort_arg_entries
+ 32.72% ginInsertBAEntries
patched
- heapam_index_build_range_scan
- 99.42% ginBuildCallback
- 95.95% ginHeapTupleBulkInsert
- 59.11% ginExtractEntries
- 56.93% FunctionCall3Coll
- gin_extract_value_trgm
- 52.19% generate_trgm
+ 41.52% generate_trgm_only
+ 7.14% trigram_qsort (inlined)
+ 3.53% trigram_qunique (inlined)
+ 4.08% detoast_attr
+ 2.13% qsort_arg_entries
+ 36.78% ginInsertBAEntries
--
David GeierFrom b20910a814e8c8b64c3d74841fe27a7c12dd927a Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Tue, 11 Nov 2025 13:18:59 +0100
Subject: [PATCH v6 2/2] Optimize generate_trgm() with radix sort
---
contrib/pg_trgm/trgm_op.c | 59 +++++++++++++++++++++++++++++++++------
1 file changed, 51 insertions(+), 8 deletions(-)
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 0aca9b5826f..f2dbe88ece0 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -226,13 +226,56 @@ CMPTRGM_CHOOSE(const void *a, const void *b)
return CMPTRGM(a, b);
}
-#define ST_SORT trigram_qsort_signed
-#define ST_ELEMENT_TYPE_VOID
-#define ST_COMPARE(a, b) CMPTRGM_SIGNED(a, b)
-#define ST_SCOPE static
-#define ST_DEFINE
-#define ST_DECLARE
-#include "lib/sort_template.h"
+/*
+ * Needed to properly handle negative numbers in case char is signed.
+ */
+static inline unsigned char FlipSign(char x)
+{
+ return x^0x80;
+}
+
+static void radix_sort_trigrams_signed(trgm *trg, int count)
+{
+ trgm *buffer = palloc_array(trgm, count);
+ trgm *starts[256];
+ trgm *from = trg;
+ trgm *to = buffer;
+ int freqs[3][256];
+
+ /*
+ * Compute frequencies to partition the buffer.
+ */
+ memset(freqs, 0, sizeof(freqs));
+
+ for (int i=0; i<count; i++)
+ for (int j=0; j<3; j++)
+ freqs[j][FlipSign(trg[i][j])]++;
+
+ /*
+ * Do the sorting. Start with last character because that's the is "LSB"
+ * in a trigram. Avoid unnecessary copies by ping-ponging between the buffers.
+ */
+ for (int i=2; i>=0; i--)
+ {
+ trgm *old_from = from;
+ trgm *next = to;
+
+ for (int j=0; j<256; j++)
+ {
+ starts[j] = next;
+ next += freqs[i][j];
+ }
+
+ for (int j=0; j<count; j++)
+ memcpy(starts[FlipSign(from[j][i])]++, from[j], sizeof(trgm));
+
+ from = to;
+ to = old_from;
+ }
+
+ memcpy(trg, buffer, sizeof(trgm) * count);
+ pfree(buffer);
+}
#define ST_SORT trigram_qsort_unsigned
#define ST_ELEMENT_TYPE_VOID
@@ -247,7 +290,7 @@ static void
trigram_qsort(trgm *array, size_t n)
{
if (GetDefaultCharSignedness())
- trigram_qsort_signed(array, n, sizeof(trgm));
+ radix_sort_trigrams_signed(array, n);
else
trigram_qsort_unsigned(array, n, sizeof(trgm));
}
--
2.51.0
From 0f39f8618c784299b30566a0bf68beef8b2b8aef Mon Sep 17 00:00:00 2001
From: David Geier <[email protected]>
Date: Mon, 10 Nov 2025 15:40:11 +0100
Subject: [PATCH v6 1/2] Make btint4cmp() branchless
---
src/backend/access/nbtree/nbtcompare.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 1d343377e98..ac16e3d993d 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -61,6 +61,7 @@
#include "utils/fmgrprotos.h"
#include "utils/skipsupport.h"
#include "utils/sortsupport.h"
+#include "common/int.h"
#ifdef STRESS_SORT_INT_MIN
#define A_LESS_THAN_B INT_MIN
@@ -203,12 +204,7 @@ btint4cmp(PG_FUNCTION_ARGS)
int32 a = PG_GETARG_INT32(0);
int32 b = PG_GETARG_INT32(1);
- if (a > b)
- PG_RETURN_INT32(A_GREATER_THAN_B);
- else if (a == b)
- PG_RETURN_INT32(0);
- else
- PG_RETURN_INT32(A_LESS_THAN_B);
+ PG_RETURN_INT32(pg_cmp_s32(a, b));
}
Datum
--
2.51.0