From 77b80116117a337b7163e65c54085e517b9d0e17 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Wed, 25 Sep 2024 15:17:02 -0700
Subject: [PATCH v2 5/5] Fix an issue with index scan using pg_trgm due to char
 signedness on different architectures.

GIN and GiST indexes utilizing pg_trgm's opclasses store sorted
trigrams within index tuples. When comparing and sorting each trigram,
pg_trgm treats each character as a 'char[3]' type in C. However, the
char type in C can be interpreted as either signed char or unsigned
char, depending on the platform, if the signedness is not explicitly
specified. Consequently, during replication between different CPU
architectures, there was an issue where index scans on standby servers
could not locate matching index tuples due to the differing treatment
of character signedness.

This change introduces comparison functions for trgm that explicitly
handle signed char and unsigned char. The appropriate comparison
function will be dynamically selected based on the character
signedness stored in the control file. Therefore, upgraded clusters
can utilize the indexes without rebuilding, provided the cluster
upgrade occurs on platforms with the same character signedness as the
original cluster initialization.

The default char signedness information was introduced in XXXX, so no
backpatch.

Reviewed-by:
Discussion: https://postgr.es/m/CB11ADBC-0C3F-4FE0-A678-666EE80CBB07%40amazon.com
---
 contrib/pg_trgm/trgm.h               |  4 +--
 contrib/pg_trgm/trgm_op.c            | 42 ++++++++++++++++++++++++++++
 src/backend/storage/lmgr/predicate.c |  5 ++++
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index afb0adb222..5534abe680 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -42,8 +42,6 @@
 typedef char trgm[3];
 
 #define CMPCHAR(a,b) ( ((a)==(b)) ? 0 : ( ((a)<(b)) ? -1 : 1 ) )
-#define CMPPCHAR(a,b,i)  CMPCHAR( *(((const char*)(a))+i), *(((const char*)(b))+i) )
-#define CMPTRGM(a,b) ( CMPPCHAR(a,b,0) ? CMPPCHAR(a,b,0) : ( CMPPCHAR(a,b,1) ? CMPPCHAR(a,b,1) : CMPPCHAR(a,b,2) ) )
 
 #define CPTRGM(a,b) do {				\
 	*(((char*)(a))+0) = *(((char*)(b))+0);	\
@@ -51,6 +49,8 @@ typedef char trgm[3];
 	*(((char*)(a))+2) = *(((char*)(b))+2);	\
 } while(0)
 
+extern int	(*CMPTRGM) (const void *a, const void *b);
+
 #ifdef KEEPONLYALNUM
 #define ISWORDCHR(c)	(t_isalnum(c))
 #define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index c509d15ee4..75e670edf6 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -40,6 +40,9 @@ PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
 PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
 
+static inline int CMPTRGM_CHOOSE(const void *a, const void *b);
+int			(*CMPTRGM) (const void *a, const void *b) = CMPTRGM_CHOOSE;
+
 /* Trigram with position */
 typedef struct
 {
@@ -105,6 +108,45 @@ _PG_init(void)
 	MarkGUCPrefixReserved("pg_trgm");
 }
 
+/*
+ * Functions for comparing two trgms while treating each char as "signed char" or
+ * "unsigned char".
+ */
+static inline int
+CMPTRGM_SIGNED(const void *a, const void *b)
+{
+#define CMPPCHAR_S(a,b,i)  CMPCHAR( *(((const signed char*)(a))+i), *(((const signed char*)(b))+i) )
+
+	return CMPPCHAR_S(a, b, 0) ? CMPPCHAR_S(a, b, 0)
+		: (CMPPCHAR_S(a, b, 1) ? CMPPCHAR_S(a, b, 1)
+		   : CMPPCHAR_S(a, b, 2));
+}
+
+static inline int
+CMPTRGM_UNSIGNED(const void *a, const void *b)
+{
+#define CMPPCHAR_UNS(a,b,i)  CMPCHAR( *(((const unsigned char*)(a))+i), *(((const unsigned char*)(b))+i) )
+
+	return CMPPCHAR_UNS(a, b, 0) ? CMPPCHAR_UNS(a, b, 0)
+		: (CMPPCHAR_UNS(a, b, 1) ? CMPPCHAR_UNS(a, b, 1)
+		   : CMPPCHAR_UNS(a, b, 2));
+}
+
+/*
+ * This gets called on the first call. It replaces the function pointer so
+ * that subsequent calls are routed directly to the chosen implementation.
+ */
+static inline int
+CMPTRGM_CHOOSE(const void *a, const void *b)
+{
+	if (GetDefaultCharSignedness())
+		CMPTRGM = CMPTRGM_SIGNED;
+	else
+		CMPTRGM = CMPTRGM_UNSIGNED;
+
+	return CMPTRGM(a, b);
+}
+
 /*
  * Deprecated function.
  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index e24a0f2fdb..2648c1e260 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -2588,6 +2588,11 @@ PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
 	if (!SerializationNeededForRead(relation, snapshot))
 		return;
 
+	ereport(LOG, (errmsg("predicate %s blk %u",
+						 RelationGetRelationName(relation),
+						 blkno),
+				  errbacktrace()));
+
 	SET_PREDICATELOCKTARGETTAG_PAGE(tag,
 									relation->rd_locator.dbOid,
 									relation->rd_id,
-- 
2.39.3

