Hi Paul,

In case your bug fix looks safe/small, and is ready, ...

I'm hoping to release 2.18 today, with the attached commits.
Changes since yesterday: comment/log tweaks, and I've hoisted the using_utf8
test in trivial_case_ignore to precede the two memchr tests.
From 07a4f69da701abfdee047f26c603002c20d4c7d4 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:22:24 -0800
Subject: [PATCH 1/2] maint: factor out using_utf8 function for use in main.c

* src/searchutils.c (is_mb_middle): Use using_utf8 rather than
rolling our own.
(using_utf8): New function (copy of the one in dfa.c).
* src/search.h (using_utf8): Declare it.
---
 src/search.h      |  2 ++
 src/searchutils.c | 26 +++++++++++++++++++-------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/search.h b/src/search.h
index 12d0822..167e0e7 100644
--- a/src/search.h
+++ b/src/search.h
@@ -80,4 +80,6 @@ mb_case_map_apply (mb_len_map_t const *map, size_t *off, size_t *len)
     }
 }

+int using_utf8 (void);
+
 #endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index 3478417..51bba59 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -234,13 +234,8 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
-#if HAVE_LANGINFO_CODESET
-  static int is_utf8 = -1;
-
-  if (is_utf8 == -1)
-    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");

-  if (is_utf8 && buf - p > MB_CUR_MAX)
+  if (using_utf8 () && buf - p > MB_CUR_MAX)
     {
       for (p = buf; buf - p > MB_CUR_MAX; p--)
         if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
@@ -249,7 +244,6 @@ is_mb_middle (const char **good, const char *buf, const char *end,
       if (buf - p == MB_CUR_MAX)
         p = buf;
     }
-#endif

   memset (&cur_state, 0, sizeof cur_state);

@@ -283,3 +277,21 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
 }
 #endif /* MBS_SUPPORT */
+
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+   assume in a multibyte encoding.  */
+int
+using_utf8 (void)
+{
+  static int utf8 = -1;
+  if (utf8 == -1)
+    {
+#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
+      utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
+#else
+      utf8 = 0;
+#endif
+    }
+
+  return utf8;
+}
-- 
1.9.0


From 6053c388d4f56fae2b639f566f2bd0f9830f0276 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:31:43 -0800
Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte
 non-UTF8 locales

* src/main.c (trivial_case_ignore): Perform this optimization only
for UTF8 locales.  This rectifies a 200x performance regression in
multi-byte non-UTF8 locales like ja_JP.eucJP.  The regression was
introduced by the 10x UTF8/grep-i speedup, commit v2.16-4-g97318f5.
Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50
* NEWS (Bug fixes): Mention it.
---
 NEWS       | 5 +++++
 src/main.c | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/NEWS b/NEWS
index 6785a96..49a17b0 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS                                    -*- outline -*-

 * Noteworthy changes in release ?.? (????-??-??) [?]

+** Bug fixes
+
+  grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower
+  than in 2.16.  [bug introduced in grep-2.17]
+

 * Noteworthy changes in release 2.17 (2014-02-17) [stable]

diff --git a/src/main.c b/src/main.c
index bd20297..ca7c7b3 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1883,6 +1883,11 @@ static bool
 trivial_case_ignore (size_t len, char const *keys,
                      size_t *new_len, char **new_keys)
 {
+  /* Perform this translation only for UTF-8.  Otherwise, this would induce
+     a 100-200x performance penalty for non-UTF8 multibyte locales.  */
+  if ( ! using_utf8 ())
+    return false;
+
   /* FIXME: consider removing the following restriction:
      Reject if KEYS contain ASCII '\\' or '['.  */
   if (memchr (keys, '\\', len) || memchr (keys, '[', len))
-- 
1.9.0

Reply via email to