uc_width and wcwidth optimization

Alexander V. Lukyanov Fri, 09 Dec 2011 00:07:30 -0800

Hello!

Attached is a patch to optimize the performance of the wcwidth, uc_width and
u{8,16,32}_width functions.


The optimization consists of caching the result of is_cjk_encoding() and
calling nl_langinfo(CODESET) first, before the more complex locale_charset(),
to check whether the charset has changed.

In addition, wcwidth now uses uc_width for CJK encodings, as designed.

-- 
   Alexander.

diff --git a/lib/uniwidth.in.h b/lib/uniwidth.in.h
index e806744..8b962ef 100644
--- a/lib/uniwidth.in.h
+++ b/lib/uniwidth.in.h
@@ -38,7 +38,7 @@ extern "C" {
 
 /* Determine number of column positions required for UC.  */
 extern int
-       uc_width (ucs4_t uc, const char *encoding)
+       uc_width (ucs4_t uc, int is_cjk_encoding)
 #if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
        __attribute__ ((__pure__))
 #endif
diff --git a/lib/uniwidth/u16-width.c b/lib/uniwidth/u16-width.c
index f8008f2..64cb66f 100644
--- a/lib/uniwidth/u16-width.c
+++ b/lib/uniwidth/u16-width.c
@@ -27,6 +27,7 @@ u16_width (const uint16_t *s, size_t n, const char *encoding)
 {
   const uint16_t *s_end = s + n;
   int width = 0;
+  int is_cjk = is_cjk_encoding(encoding);
 
   while (s < s_end)
     {
@@ -38,7 +39,7 @@ u16_width (const uint16_t *s, size_t n, const char *encoding)
       if (uc == 0)
         break; /* end of string reached */
 
-      w = uc_width (uc, encoding);
+      w = uc_width (uc, is_cjk);
       if (w >= 0) /* ignore control characters in the string */
         width += w;
     }
diff --git a/lib/uniwidth/u32-width.c b/lib/uniwidth/u32-width.c
index 60b5a35..55bbd3a 100644
--- a/lib/uniwidth/u32-width.c
+++ b/lib/uniwidth/u32-width.c
@@ -25,6 +25,7 @@ u32_width (const uint32_t *s, size_t n, const char *encoding)
 {
   const uint32_t *s_end = s + n;
   int width = 0;
+  int is_cjk = is_cjk_encoding(encoding);
 
   while (s < s_end)
     {
@@ -34,7 +35,7 @@ u32_width (const uint32_t *s, size_t n, const char *encoding)
       if (uc == 0)
         break; /* end of string reached */
 
-      w = uc_width (uc, encoding);
+      w = uc_width (uc, is_cjk);
       if (w >= 0) /* ignore control characters in the string */
         width += w;
     }
diff --git a/lib/uniwidth/u8-width.c b/lib/uniwidth/u8-width.c
index 96e5ea4..49e063e 100644
--- a/lib/uniwidth/u8-width.c
+++ b/lib/uniwidth/u8-width.c
@@ -27,6 +27,7 @@ u8_width (const uint8_t *s, size_t n, const char *encoding)
 {
   const uint8_t *s_end = s + n;
   int width = 0;
+  int is_cjk = is_cjk_encoding(encoding);
 
   while (s < s_end)
     {
@@ -38,7 +39,7 @@ u8_width (const uint8_t *s, size_t n, const char *encoding)
       if (uc == 0)
         break; /* end of string reached */
 
-      w = uc_width (uc, encoding);
+      w = uc_width (uc, is_cjk);
       if (w >= 0) /* ignore control characters in the string */
         width += w;
     }
diff --git a/lib/uniwidth/width.c b/lib/uniwidth/width.c
index a314e71..8eb2eff 100644
--- a/lib/uniwidth/width.c
+++ b/lib/uniwidth/width.c
@@ -20,8 +20,6 @@
 /* Specification.  */
 #include "uniwidth.h"
 
-#include "cjk.h"
-
 /*
  * Non-spacing attribute table.
  * Consists of:
@@ -312,7 +310,7 @@ static const signed char nonspacing_table_ind[240] = {
 
 /* Determine number of column positions required for UC.  */
 int
-uc_width (ucs4_t uc, const char *encoding)
+uc_width (ucs4_t uc, int is_cjk_encoding)
 {
   /* Test for non-spacing or control character.  */
   if ((uc >> 9) < 240)
@@ -361,8 +359,7 @@ uc_width (ucs4_t uc, const char *encoding)
     return 2;
   /* In ancient CJK encodings, Cyrillic and most other characters are
      double-width as well.  */
-  if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
-      && is_cjk_encoding (encoding))
+  if (is_cjk_encoding && uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9)
     return 2;
   return 1;
 }
diff --git a/lib/wcwidth.c b/lib/wcwidth.c
index a006ca7..49378bd 100644
--- a/lib/wcwidth.c
+++ b/lib/wcwidth.c
@@ -22,21 +22,53 @@
 /* Get iswprint.  */
 #include <wctype.h>
 
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
 #include "localcharset.h"
 #include "streq.h"
 #include "uniwidth.h"
 
+#include "uniwidth/cjk.h"
+
+static char cached_encoding[32];
+static int cached_is_cjk_encoding;
+static int cached_is_utf8_encoding;
+
+static const char *locale_charset_simple ()
+{
+#if HAVE_LANGINFO_CODESET
+  /* Most systems support nl_langinfo (CODESET) nowadays.  */
+  return nl_langinfo (CODESET);
+# else
+  /* Do the complex case */
+  return locale_charset ();
+# endif
+}
+
+static void cache_encoding ()
+{
+  const char *encoding = locale_charset_simple ();
+  if (!strncmp(encoding, cached_encoding, sizeof (cached_encoding)))
+    return;
+  strncpy (cached_encoding, encoding, sizeof (cached_encoding));
+  encoding = locale_charset ();
+  cached_is_utf8_encoding = STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0);
+  cached_is_cjk_encoding = is_cjk_encoding (encoding);
+}
+
 int
 wcwidth (wchar_t wc)
 #undef wcwidth
 {
+  cache_encoding ();
   /* In UTF-8 locales, use a Unicode aware width function.  */
-  const char *encoding = locale_charset ();
-  if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
+  if (cached_is_utf8_encoding || cached_is_cjk_encoding)
     {
       /* We assume that in a UTF-8 locale, a wide character is the same as a
          Unicode character.  */
-      return uc_width (wc, encoding);
+      return uc_width (wc, cached_is_cjk_encoding);
     }
   else
     {

uc_width and wcwidth optimization

Reply via email to