Hi,
Jeff pointed out to me that the case conversion functions in ICU have
UTF-8-specific versions, which means we can call those directly if the
database encoding is UTF-8 and skip having to convert to and from UChar.
Since most people today run their databases in UTF-8, I think this
optimization is worth it. When measuring on short to medium-length
strings I got a 15-20% speedup. It is still slower than glibc in my
benchmarks, but the gap is smaller now.
SELECT count(upper) FROM (SELECT upper(('Kålhuvud ' || i) COLLATE
"sv-SE-x-icu") FROM generate_series(1, 1000000) i);
master: ~540 ms
Patched: ~460 ms
glibc: ~410 ms
I have also attached a cleanup patch for the non-UTF-8 code paths. I
thought about doing the same for the new UTF-8 code paths, but it turned
out to be a bit messy due to the different function signatures of
ucasemap_utf8ToUpper() and ucasemap_utf8ToLower() vs ucasemap_utf8ToTitle().
Andreas
From 5a355ef083cc7de92ae1e5dcc0198866a07919eb Mon Sep 17 00:00:00 2001
From: Andreas Karlsson <andr...@proxel.se>
Date: Tue, 17 Dec 2024 22:47:00 +0100
Subject: [PATCH v1 1/2] Use optimized versions of ICU case conversion for
UTF-8
Instead of converting to and from UChar when doing case conversions, we
use the UTF-8 versions of the functions. This can give a significant
speedup, 15-20%, on short to medium-length strings.
---
src/backend/utils/adt/pg_locale_icu.c | 161 ++++++++++++++++++--------
1 file changed, 114 insertions(+), 47 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index f0a77a767e7..eea6f48f6c3 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -12,6 +12,7 @@
#include "postgres.h"
#ifdef USE_ICU
+#include "unicode/ucasemap.h"
#include <unicode/ucnv.h>
#include <unicode/ustring.h>
@@ -100,9 +101,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize,
const UChar *buff_uchar, int32_t len_uchar);
static void icu_set_collation_attributes(UCollator *collator, const char *loc,
UErrorCode *status);
-static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
- UChar **buff_dest, UChar *buff_source,
- int32_t len_source);
+static int32_t icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
+ UChar **buff_dest, UChar *buff_source,
+ int32_t len_source);
static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
@@ -350,60 +351,126 @@ size_t
strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
- size_t result_len;
-
- len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
- len_conv = icu_convert_case(u_strToLower, locale,
- &buff_conv, buff_uchar, len_uchar);
- result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
-
- return result_len;
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ UErrorCode status = U_ZERO_ERROR;
+ UCaseMap *casemap;
+ int32_t needed;
+
+ casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("casemap lookup failed: %s", u_errorName(status))));
+
+ status = U_ZERO_ERROR;
+ needed = ucasemap_utf8ToLower(casemap, dest, destsize, src, srclen, &status);
+ ucasemap_close(casemap);
+ if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("case conversion failed: %s", u_errorName(status))));
+ return needed;
+ }
+ else
+ {
+ int32_t len_uchar;
+ int32_t len_conv;
+ UChar *buff_uchar;
+ UChar *buff_conv;
+ size_t result_len;
+
+ len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+ len_conv = icu_convert_case_uchar(u_strToLower, locale, &buff_conv,
+ buff_uchar, len_uchar);
+ result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+ pfree(buff_uchar);
+ pfree(buff_conv);
+
+ return result_len;
+ }
}
size_t
strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
- size_t result_len;
-
- len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
- len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
- &buff_conv, buff_uchar, len_uchar);
- result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
-
- return result_len;
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ UErrorCode status = U_ZERO_ERROR;
+ UCaseMap *casemap;
+ int32_t needed;
+
+ casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("casemap lookup failed: %s", u_errorName(status))));
+
+ status = U_ZERO_ERROR;
+ needed = ucasemap_utf8ToTitle(casemap, dest, destsize, src, srclen, &status);
+ ucasemap_close(casemap);
+ if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("case conversion failed: %s", u_errorName(status))));
+ return needed;
+ }
+ else
+ {
+ int32_t len_uchar;
+ int32_t len_conv;
+ UChar *buff_uchar;
+ UChar *buff_conv;
+ size_t result_len;
+
+ len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+ len_conv = icu_convert_case_uchar(u_strToTitle_default_BI, locale, &buff_conv,
+ buff_uchar, len_uchar);
+ result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+ pfree(buff_uchar);
+ pfree(buff_conv);
+
+ return result_len;
+ }
}
size_t
strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
- size_t result_len;
-
- len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
- len_conv = icu_convert_case(u_strToUpper, locale,
- &buff_conv, buff_uchar, len_uchar);
- result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
-
- return result_len;
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ UErrorCode status = U_ZERO_ERROR;
+ UCaseMap *casemap;
+ int32_t needed;
+
+ casemap = ucasemap_open(locale->info.icu.locale, U_FOLD_CASE_DEFAULT, &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("casemap lookup failed: %s", u_errorName(status))));
+
+ status = U_ZERO_ERROR;
+ needed = ucasemap_utf8ToUpper(casemap, dest, destsize, src, srclen, &status);
+ ucasemap_close(casemap);
+ if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("case conversion failed: %s", u_errorName(status))));
+ return needed;
+ }
+ else
+ {
+ int32_t len_uchar;
+ int32_t len_conv;
+ UChar *buff_uchar;
+ UChar *buff_conv;
+ size_t result_len;
+
+ len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+ len_conv = icu_convert_case_uchar(u_strToUpper, locale, &buff_conv,
+ buff_uchar, len_uchar);
+ result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+ pfree(buff_uchar);
+ pfree(buff_conv);
+
+ return result_len;
+ }
}
/*
@@ -599,8 +666,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len
}
static int32_t
-icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
- UChar **buff_dest, UChar *buff_source, int32_t len_source)
+icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
+ UChar **buff_dest, UChar *buff_source, int32_t len_source)
{
UErrorCode status;
int32_t len_dest;
--
2.45.2
From a4bfcbd8d9ad9c56995fa4a6736480fc11ce1bd4 Mon Sep 17 00:00:00 2001
From: Andreas Karlsson <andr...@proxel.se>
Date: Fri, 20 Dec 2024 02:00:33 +0100
Subject: [PATCH v1 2/2] Reduce code duplication in ICU case mapping code
---
src/backend/utils/adt/pg_locale_icu.c | 74 ++++++++++-----------------
1 file changed, 26 insertions(+), 48 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index eea6f48f6c3..905b2308fbd 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -101,6 +101,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize,
const UChar *buff_uchar, int32_t len_uchar);
static void icu_set_collation_attributes(UCollator *collator, const char *loc,
UErrorCode *status);
+static int32_t icu_convert_case_no_utf8(ICU_Convert_Func func, char *dest,
+ size_t destsize, const char *src,
+ ssize_t srclen, pg_locale_t locale);
static int32_t icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
UChar **buff_dest, UChar *buff_source,
int32_t len_source);
@@ -371,22 +374,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
return needed;
}
else
- {
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
- size_t result_len;
-
- len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
- len_conv = icu_convert_case_uchar(u_strToLower, locale, &buff_conv,
- buff_uchar, len_uchar);
- result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
-
- return result_len;
- }
+ return icu_convert_case_no_utf8(u_strToLower, dest, destsize, src, srclen, locale);
}
size_t
@@ -413,22 +401,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
return needed;
}
else
- {
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
- size_t result_len;
-
- len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
- len_conv = icu_convert_case_uchar(u_strToTitle_default_BI, locale, &buff_conv,
- buff_uchar, len_uchar);
- result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
-
- return result_len;
- }
+ return icu_convert_case_no_utf8(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
}
size_t
@@ -455,22 +428,7 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
return needed;
}
else
- {
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
- size_t result_len;
-
- len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
- len_conv = icu_convert_case_uchar(u_strToUpper, locale, &buff_conv,
- buff_uchar, len_uchar);
- result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
-
- return result_len;
- }
+ return icu_convert_case_no_utf8(u_strToUpper, dest, destsize, src, srclen, locale);
}
/*
@@ -665,6 +623,26 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len
return len_result;
}
+static int32_t
+icu_convert_case_no_utf8(ICU_Convert_Func func, char *dest, size_t destsize,
+ const char *src, ssize_t srclen, pg_locale_t locale)
+{
+ int32_t len_uchar;
+ int32_t len_conv;
+ UChar *buff_uchar;
+ UChar *buff_conv;
+ size_t result_len;
+
+ len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
+ len_conv = icu_convert_case_uchar(func, locale, &buff_conv,
+ buff_uchar, len_uchar);
+ result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
+ pfree(buff_uchar);
+ pfree(buff_conv);
+
+ return result_len;
+}
+
static int32_t
icu_convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
UChar **buff_dest, UChar *buff_source, int32_t len_source)
--
2.45.2