commit e0e14e301c9da9d25dfebfe89039210f46c0efe1
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Wed Aug 7 23:41:58 2019 +0300

    Adjust string comparison in jsonpath
    
    We had implemented jsonpath string comparison using the default database
    locale.  However, the standard requires us to compare Unicode codepoints.
    This commit implements that, but for performance reasons we still use
    per-byte comparison for the "==" operator.  Thus, for consistency, the other
    comparison operators fall back to per-byte comparison when the Unicode
    codepoints turn out to be equal.
    
    In some edge cases, when the same Unicode codepoints have different binary
    representations in the database encoding, we diverge from the standard to
    achieve better performance of the "==" operator.  In the future, to implement
    strict standard conformance, we can normalize input JSON strings.
    
    Original patch was written by Nikita Glukhov, rewritten by me.
    
    Reported-by: Markus Winand
    Discussion: https://postgr.es/m/8B7FA3B4-328D-43D7-95A8-37B8891B8C78%40winand.at
    Author: Nikita Glukhov, Alexander Korotkov
    Backpatch-through: 12

diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c
index 293d6da027c..ced036d112d 100644
--- a/src/backend/utils/adt/jsonpath_exec.c
+++ b/src/backend/utils/adt/jsonpath_exec.c
@@ -1980,6 +1980,103 @@ executeComparison(JsonPathItem *cmp, JsonbValue *lv, JsonbValue *rv, void *p)
 	return compareItems(cmp->type, lv, rv);
 }
 
+/*
+ * Byte-wise comparison of two length-delimited strings; on a common prefix
+ * the shorter string sorts first.
+ */
+static int
+binaryCompareStrings(const char *s1, int len1,
+					 const char *s2, int len2)
+{
+	int			prefixlen = Min(len1, len2);
+	int			diff = memcmp(s1, s2, prefixlen);
+
+	/* A differing byte inside the common prefix settles the order. */
+	if (diff != 0)
+		return diff;
+	/* Otherwise only the lengths can tell the strings apart. */
+	if (len1 < len2)
+		return -1;
+	return (len1 > len2) ? 1 : 0;
+}
+
+/*
+ * Compare two strings in the current server encoding using Unicode codepoint
+ * collation.
+ *
+ * The inputs are length-delimited (mblen1/mblen2) and are not assumed to be
+ * NUL-terminated.  Returns a negative value, zero, or a positive value when
+ * the first string sorts before, equal to, or after the second one.
+ *
+ * Strings whose codepoints are all equal are ordered by their raw bytes, so
+ * that a zero result here agrees with the per-byte "==" operator.
+ */
+static int
+compareStrings(const char *mbstr1, int mblen1,
+			   const char *mbstr2, int mblen2)
+{
+	if (GetDatabaseEncoding() == PG_SQL_ASCII ||
+		GetDatabaseEncoding() == PG_UTF8)
+	{
+		/*
+		 * Per-byte comparison of UTF-8 strings gives the same ordering as
+		 * comparison of their Unicode codepoints, and ASCII can be treated
+		 * as a special case of UTF-8.  So no decoding is needed here, and
+		 * we never read past the given lengths.
+		 */
+		return binaryCompareStrings(mbstr1, mblen1, mbstr2, mblen2);
+	}
+	else
+	{
+		char	   *utf8str1,
+				   *utf8str2;
+		int			cmp,
+					utf8len1,
+					utf8len2;
+
+		/*
+		 * Other encodings have to be converted to UTF-8 first.  Where
+		 * pg_server_to_any() hands back the original pointer, the data is
+		 * only valid for the caller-supplied length, so strlen() is used
+		 * solely on strings that were really converted.
+		 */
+		utf8str1 = pg_server_to_any(mbstr1, mblen1, PG_UTF8);
+		utf8str2 = pg_server_to_any(mbstr2, mblen2, PG_UTF8);
+		utf8len1 = (utf8str1 == mbstr1) ? mblen1 : (int) strlen(utf8str1);
+		utf8len2 = (utf8str2 == mbstr2) ? mblen2 : (int) strlen(utf8str2);
+
+		/* Byte order of the UTF-8 forms is the codepoint order. */
+		cmp = binaryCompareStrings(utf8str1, utf8len1, utf8str2, utf8len2);
+
+		/*
+		 * If no real conversion took place, the original strings were
+		 * compared above and there is nothing to free.
+		 */
+		if (utf8str1 == mbstr1 && utf8str2 == mbstr2)
+			return cmp;
+
+		/* Free allocated strings if any */
+		if (utf8str1 != mbstr1)
+			pfree(utf8str1);
+		if (utf8str2 != mbstr2)
+			pfree(utf8str2);
+
+		/*
+		 * When all Unicode codepoints are equal, return result of binary
+		 * comparison.  In some edge cases, same characters may have different
+		 * representations in encoding.  Then our behavior could diverge from
+		 * standard.  However, that allows us to do simple binary comparison
+		 * for "==" operator, which is performance critical in typical cases.
+		 * In future to implement strict standard conformance, we should do
+		 * normalization of input JSON strings.
+		 */
+		if (cmp == 0)
+			return binaryCompareStrings(mbstr1, mblen1, mbstr2, mblen2);
+		else
+			return cmp;
+	}
+}
+
 /*
  * Compare two SQL/JSON items using comparison operation 'op'.
  */
@@ -2017,14 +2114,13 @@ compareItems(int32 op, JsonbValue *jb1, JsonbValue *jb2)
 			break;
 		case jbvString:
 			if (op == jpiEqual)
-				return jb1->val.string.len != jb2->val.string.len ||
-					memcmp(jb1->val.string.val,
-						   jb2->val.string.val,
-						   jb1->val.string.len) ? jpbFalse : jpbTrue;
-
-			cmp = varstr_cmp(jb1->val.string.val, jb1->val.string.len,
-							 jb2->val.string.val, jb2->val.string.len,
-							 DEFAULT_COLLATION_OID);
+				return binaryCompareStrings(jb1->val.string.val,
+											jb1->val.string.len,
+											jb2->val.string.val,
+											jb2->val.string.len) ? jpbFalse : jpbTrue;
+
+			cmp = compareStrings(jb1->val.string.val, jb1->val.string.len,
+								 jb2->val.string.val, jb2->val.string.len);
 			break;
 
 		case jbvBinary:
diff --git a/src/test/regress/expected/jsonb_jsonpath.out b/src/test/regress/expected/jsonb_jsonpath.out
index 31a871af028..0202667a1f7 100644
--- a/src/test/regress/expected/jsonb_jsonpath.out
+++ b/src/test/regress/expected/jsonb_jsonpath.out
@@ -1833,3 +1833,166 @@ SELECT jsonb_path_match('[{"a": 1}, {"a": 2}]', '$[*].a > 1');
  t
 (1 row)
 
+-- test string comparison (Unicode codepoint collation)
+WITH str(j, num) AS
+(
+	SELECT jsonb_build_object('s', s), num
+	FROM unnest('{"", "a", "ab", "abc", "abcd", "b", "A", "AB", "ABC", "ABc", "ABcD", "B"}'::text[]) WITH ORDINALITY AS a(s, num)
+)
+SELECT
+	s1.j, s2.j,
+	jsonb_path_query_first(s1.j, '$.s < $s', vars => s2.j) lt,
+	jsonb_path_query_first(s1.j, '$.s <= $s', vars => s2.j) le,
+	jsonb_path_query_first(s1.j, '$.s == $s', vars => s2.j) eq,
+	jsonb_path_query_first(s1.j, '$.s >= $s', vars => s2.j) ge,
+	jsonb_path_query_first(s1.j, '$.s > $s', vars => s2.j) gt
+FROM str s1, str s2
+ORDER BY s1.num, s2.num;
+       j       |       j       |  lt   |  le   |  eq   |  ge   |  gt   
+---------------+---------------+-------+-------+-------+-------+-------
+ {"s": ""}     | {"s": ""}     | false | true  | true  | true  | false
+ {"s": ""}     | {"s": "a"}    | true  | true  | false | false | false
+ {"s": ""}     | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": ""}     | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": ""}     | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": ""}     | {"s": "b"}    | true  | true  | false | false | false
+ {"s": ""}     | {"s": "A"}    | true  | true  | false | false | false
+ {"s": ""}     | {"s": "AB"}   | true  | true  | false | false | false
+ {"s": ""}     | {"s": "ABC"}  | true  | true  | false | false | false
+ {"s": ""}     | {"s": "ABc"}  | true  | true  | false | false | false
+ {"s": ""}     | {"s": "ABcD"} | true  | true  | false | false | false
+ {"s": ""}     | {"s": "B"}    | true  | true  | false | false | false
+ {"s": "a"}    | {"s": ""}     | false | false | false | true  | true
+ {"s": "a"}    | {"s": "a"}    | false | true  | true  | true  | false
+ {"s": "a"}    | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "a"}    | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "a"}    | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "a"}    | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "a"}    | {"s": "A"}    | false | false | false | true  | true
+ {"s": "a"}    | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "a"}    | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "a"}    | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "a"}    | {"s": "ABcD"} | false | false | false | true  | true
+ {"s": "a"}    | {"s": "B"}    | false | false | false | true  | true
+ {"s": "ab"}   | {"s": ""}     | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "a"}    | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "ab"}   | false | true  | true  | true  | false
+ {"s": "ab"}   | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "ab"}   | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "ab"}   | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "ab"}   | {"s": "A"}    | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "ABcD"} | false | false | false | true  | true
+ {"s": "ab"}   | {"s": "B"}    | false | false | false | true  | true
+ {"s": "abc"}  | {"s": ""}     | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "a"}    | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "ab"}   | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "abc"}  | false | true  | true  | true  | false
+ {"s": "abc"}  | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "abc"}  | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "abc"}  | {"s": "A"}    | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "ABcD"} | false | false | false | true  | true
+ {"s": "abc"}  | {"s": "B"}    | false | false | false | true  | true
+ {"s": "abcd"} | {"s": ""}     | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "a"}    | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "ab"}   | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "abc"}  | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "abcd"} | false | true  | true  | true  | false
+ {"s": "abcd"} | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "abcd"} | {"s": "A"}    | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "ABcD"} | false | false | false | true  | true
+ {"s": "abcd"} | {"s": "B"}    | false | false | false | true  | true
+ {"s": "b"}    | {"s": ""}     | false | false | false | true  | true
+ {"s": "b"}    | {"s": "a"}    | false | false | false | true  | true
+ {"s": "b"}    | {"s": "ab"}   | false | false | false | true  | true
+ {"s": "b"}    | {"s": "abc"}  | false | false | false | true  | true
+ {"s": "b"}    | {"s": "abcd"} | false | false | false | true  | true
+ {"s": "b"}    | {"s": "b"}    | false | true  | true  | true  | false
+ {"s": "b"}    | {"s": "A"}    | false | false | false | true  | true
+ {"s": "b"}    | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "b"}    | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "b"}    | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "b"}    | {"s": "ABcD"} | false | false | false | true  | true
+ {"s": "b"}    | {"s": "B"}    | false | false | false | true  | true
+ {"s": "A"}    | {"s": ""}     | false | false | false | true  | true
+ {"s": "A"}    | {"s": "a"}    | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "A"}    | false | true  | true  | true  | false
+ {"s": "A"}    | {"s": "AB"}   | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "ABC"}  | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "ABc"}  | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "ABcD"} | true  | true  | false | false | false
+ {"s": "A"}    | {"s": "B"}    | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": ""}     | false | false | false | true  | true
+ {"s": "AB"}   | {"s": "a"}    | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "A"}    | false | false | false | true  | true
+ {"s": "AB"}   | {"s": "AB"}   | false | true  | true  | true  | false
+ {"s": "AB"}   | {"s": "ABC"}  | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "ABc"}  | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "ABcD"} | true  | true  | false | false | false
+ {"s": "AB"}   | {"s": "B"}    | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": ""}     | false | false | false | true  | true
+ {"s": "ABC"}  | {"s": "a"}    | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "A"}    | false | false | false | true  | true
+ {"s": "ABC"}  | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "ABC"}  | {"s": "ABC"}  | false | true  | true  | true  | false
+ {"s": "ABC"}  | {"s": "ABc"}  | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "ABcD"} | true  | true  | false | false | false
+ {"s": "ABC"}  | {"s": "B"}    | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": ""}     | false | false | false | true  | true
+ {"s": "ABc"}  | {"s": "a"}    | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": "A"}    | false | false | false | true  | true
+ {"s": "ABc"}  | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "ABc"}  | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "ABc"}  | {"s": "ABc"}  | false | true  | true  | true  | false
+ {"s": "ABc"}  | {"s": "ABcD"} | true  | true  | false | false | false
+ {"s": "ABc"}  | {"s": "B"}    | true  | true  | false | false | false
+ {"s": "ABcD"} | {"s": ""}     | false | false | false | true  | true
+ {"s": "ABcD"} | {"s": "a"}    | true  | true  | false | false | false
+ {"s": "ABcD"} | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "ABcD"} | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "ABcD"} | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "ABcD"} | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "ABcD"} | {"s": "A"}    | false | false | false | true  | true
+ {"s": "ABcD"} | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "ABcD"} | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "ABcD"} | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "ABcD"} | {"s": "ABcD"} | false | true  | true  | true  | false
+ {"s": "ABcD"} | {"s": "B"}    | true  | true  | false | false | false
+ {"s": "B"}    | {"s": ""}     | false | false | false | true  | true
+ {"s": "B"}    | {"s": "a"}    | true  | true  | false | false | false
+ {"s": "B"}    | {"s": "ab"}   | true  | true  | false | false | false
+ {"s": "B"}    | {"s": "abc"}  | true  | true  | false | false | false
+ {"s": "B"}    | {"s": "abcd"} | true  | true  | false | false | false
+ {"s": "B"}    | {"s": "b"}    | true  | true  | false | false | false
+ {"s": "B"}    | {"s": "A"}    | false | false | false | true  | true
+ {"s": "B"}    | {"s": "AB"}   | false | false | false | true  | true
+ {"s": "B"}    | {"s": "ABC"}  | false | false | false | true  | true
+ {"s": "B"}    | {"s": "ABc"}  | false | false | false | true  | true
+ {"s": "B"}    | {"s": "ABcD"} | false | false | false | true  | true
+ {"s": "B"}    | {"s": "B"}    | false | true  | true  | true  | false
+(144 rows)
+
diff --git a/src/test/regress/sql/jsonb_jsonpath.sql b/src/test/regress/sql/jsonb_jsonpath.sql
index 733fbd4e0d0..e7629fb7f9d 100644
--- a/src/test/regress/sql/jsonb_jsonpath.sql
+++ b/src/test/regress/sql/jsonb_jsonpath.sql
@@ -387,3 +387,19 @@ SELECT jsonb_path_match('[true, true]', '$[*]', silent => false);
 SELECT jsonb '[{"a": 1}, {"a": 2}]' @@ '$[*].a > 1';
 SELECT jsonb '[{"a": 1}, {"a": 2}]' @@ '$[*].a > 2';
 SELECT jsonb_path_match('[{"a": 1}, {"a": 2}]', '$[*].a > 1');
+
+-- test string comparison (Unicode codepoint collation)
+WITH str(j, num) AS
+(
+	SELECT jsonb_build_object('s', s), num
+	FROM unnest('{"", "a", "ab", "abc", "abcd", "b", "A", "AB", "ABC", "ABc", "ABcD", "B"}'::text[]) WITH ORDINALITY AS a(s, num)
+)
+SELECT
+	s1.j, s2.j,
+	jsonb_path_query_first(s1.j, '$.s < $s', vars => s2.j) lt,
+	jsonb_path_query_first(s1.j, '$.s <= $s', vars => s2.j) le,
+	jsonb_path_query_first(s1.j, '$.s == $s', vars => s2.j) eq,
+	jsonb_path_query_first(s1.j, '$.s >= $s', vars => s2.j) ge,
+	jsonb_path_query_first(s1.j, '$.s > $s', vars => s2.j) gt
+FROM str s1, str s2
+ORDER BY s1.num, s2.num;
