Hello!
Attached is a patch that optimizes the performance of the wcwidth, uc_width and
u{8,16,32}_width functions.
The optimization consists of caching the result of is_cjk_encoding() and
calling nl_langinfo(CODESET) before the more complex locale_charset() to check
whether the charset has changed.
In addition, wcwidth now uses uc_width for CJK encodings, as designed.
--
Alexander.
diff --git a/lib/uniwidth.in.h b/lib/uniwidth.in.h
index e806744..8b962ef 100644
--- a/lib/uniwidth.in.h
+++ b/lib/uniwidth.in.h
@@ -38,7 +38,7 @@ extern "C" {
/* Determine number of column positions required for UC. */
extern int
- uc_width (ucs4_t uc, const char *encoding)
+ uc_width (ucs4_t uc, int is_cjk_encoding)
#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
__attribute__ ((__pure__))
#endif
diff --git a/lib/uniwidth/u16-width.c b/lib/uniwidth/u16-width.c
index f8008f2..64cb66f 100644
--- a/lib/uniwidth/u16-width.c
+++ b/lib/uniwidth/u16-width.c
@@ -27,6 +27,7 @@ u16_width (const uint16_t *s, size_t n, const char *encoding)
{
const uint16_t *s_end = s + n;
int width = 0;
+ int is_cjk = is_cjk_encoding(encoding);
while (s < s_end)
{
@@ -38,7 +39,7 @@ u16_width (const uint16_t *s, size_t n, const char *encoding)
if (uc == 0)
break; /* end of string reached */
- w = uc_width (uc, encoding);
+ w = uc_width (uc, is_cjk);
if (w >= 0) /* ignore control characters in the string */
width += w;
}
diff --git a/lib/uniwidth/u32-width.c b/lib/uniwidth/u32-width.c
index 60b5a35..55bbd3a 100644
--- a/lib/uniwidth/u32-width.c
+++ b/lib/uniwidth/u32-width.c
@@ -25,6 +25,7 @@ u32_width (const uint32_t *s, size_t n, const char *encoding)
{
const uint32_t *s_end = s + n;
int width = 0;
+ int is_cjk = is_cjk_encoding(encoding);
while (s < s_end)
{
@@ -34,7 +35,7 @@ u32_width (const uint32_t *s, size_t n, const char *encoding)
if (uc == 0)
break; /* end of string reached */
- w = uc_width (uc, encoding);
+ w = uc_width (uc, is_cjk);
if (w >= 0) /* ignore control characters in the string */
width += w;
}
diff --git a/lib/uniwidth/u8-width.c b/lib/uniwidth/u8-width.c
index 96e5ea4..49e063e 100644
--- a/lib/uniwidth/u8-width.c
+++ b/lib/uniwidth/u8-width.c
@@ -27,6 +27,7 @@ u8_width (const uint8_t *s, size_t n, const char *encoding)
{
const uint8_t *s_end = s + n;
int width = 0;
+ int is_cjk = is_cjk_encoding(encoding);
while (s < s_end)
{
@@ -38,7 +39,7 @@ u8_width (const uint8_t *s, size_t n, const char *encoding)
if (uc == 0)
break; /* end of string reached */
- w = uc_width (uc, encoding);
+ w = uc_width (uc, is_cjk);
if (w >= 0) /* ignore control characters in the string */
width += w;
}
diff --git a/lib/uniwidth/width.c b/lib/uniwidth/width.c
index a314e71..8eb2eff 100644
--- a/lib/uniwidth/width.c
+++ b/lib/uniwidth/width.c
@@ -20,8 +20,6 @@
/* Specification. */
#include "uniwidth.h"
-#include "cjk.h"
-
/*
* Non-spacing attribute table.
* Consists of:
@@ -312,7 +310,7 @@ static const signed char nonspacing_table_ind[240] = {
/* Determine number of column positions required for UC. */
int
-uc_width (ucs4_t uc, const char *encoding)
+uc_width (ucs4_t uc, int is_cjk_encoding)
{
/* Test for non-spacing or control character. */
if ((uc >> 9) < 240)
@@ -361,8 +359,7 @@ uc_width (ucs4_t uc, const char *encoding)
return 2;
/* In ancient CJK encodings, Cyrillic and most other characters are
double-width as well. */
- if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
- && is_cjk_encoding (encoding))
+ if (is_cjk_encoding && uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9)
return 2;
return 1;
}
diff --git a/lib/wcwidth.c b/lib/wcwidth.c
index a006ca7..49378bd 100644
--- a/lib/wcwidth.c
+++ b/lib/wcwidth.c
@@ -22,21 +22,53 @@
/* Get iswprint. */
#include <wctype.h>
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
#include "localcharset.h"
#include "streq.h"
#include "uniwidth.h"
+#include "uniwidth/cjk.h"
+
+static char cached_encoding[32];    /* codeset name copied at the last cache_encoding () refresh */
+static int cached_is_cjk_encoding;  /* is_cjk_encoding (locale_charset ()) as of that refresh */
+static int cached_is_utf8_encoding; /* nonzero if locale_charset () was "UTF-8" at that refresh */
+
+static const char *locale_charset_simple (void) /* codeset name used only for change detection */
+{
+#if HAVE_LANGINFO_CODESET
+  /* Ask the C library directly; this avoids the more complex locale_charset ().  */
+  return nl_langinfo (CODESET);
+#else
+  /* nl_langinfo (CODESET) is not available: use locale_charset () itself.  */
+  return locale_charset ();
+#endif
+}
+
+static void cache_encoding (void) /* refresh the cached_* flags on codeset change */
+{
+  const char *encoding = locale_charset_simple ();
+  if (strncmp (encoding, cached_encoding, sizeof (cached_encoding) - 1) == 0)
+    return; /* Codeset name unchanged since the last refresh: keep the flags.  */
+  strncpy (cached_encoding, encoding, sizeof (cached_encoding) - 1); /* final NUL kept */
+  encoding = locale_charset (); /* the flags below are derived from this name */
+  cached_is_utf8_encoding = STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0);
+  cached_is_cjk_encoding = is_cjk_encoding (encoding);
+}
+
int
wcwidth (wchar_t wc)
#undef wcwidth
{
+ cache_encoding ();
/* In UTF-8 locales, use a Unicode aware width function. */
- const char *encoding = locale_charset ();
- if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
+ if (cached_is_utf8_encoding || cached_is_cjk_encoding)
{
/* We assume that in a UTF-8 locale, a wide character is the same as a
Unicode character. */
- return uc_width (wc, encoding);
+ return uc_width (wc, cached_is_cjk_encoding);
}
else
{