bug#16232: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales

Jim Meyering Thu, 20 Feb 2014 12:43:39 -0800

On Thu, Feb 20, 2014 at 9:22 AM,  <[email protected]> wrote:
> Hi Jim.
>
> Why copy the using_utf8() routine out of dfa.c?  Why not just link
> to it instead?  If it's static, make it extern... That way if the
> logic ever changes then it only has to be changed in one place.


Hi Arnold,

That was due to my reflex of avoiding unnecessary change to dfa.c,
but in this case, it is definitely better to do as you suggest, not
just to avoid code duplication, but also for run-time efficiency:
with two copies of the function, there would have been two calls to
nl_langinfo per run; with only that one copy, we save a call, too.

Revised commits attached.

Thanks,
Jim

From 9a5c6c856892fde5df07666d4bb6641a05f33712 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:22:24 -0800
Subject: [PATCH 1/2] maint: give dfa.c's using_utf8 function external scope

* src/dfa.c (using_utf8): Remove "static inline".
* src/dfa.h (using_utf8): Declare it.
* src/searchutils.c (is_mb_middle): Use using_utf8 rather than
rolling our own.
---
 src/dfa.c         | 2 +-
 src/dfa.h         | 2 ++
 src/searchutils.c | 9 ++-------
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index a133e03..ba9e7a2 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c)

 /* UTF-8 encoding allows some optimizations that we can't otherwise
    assume in a multibyte encoding.  */
-static inline int
+int
 using_utf8 (void)
 {
   static int utf8 = -1;
diff --git a/src/dfa.h b/src/dfa.h
index bacd489..7e0674f 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -99,3 +99,5 @@ extern void dfawarn (const char *);
    takes a single argument, a NUL-terminated string describing the error.
    The user must supply a dfaerror.  */
 extern _Noreturn void dfaerror (const char *);
+
+extern int using_utf8 (void);
diff --git a/src/searchutils.c b/src/searchutils.c
index 3478417..7363701 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,6 +19,7 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#include "dfa.h"
 #if HAVE_LANGINFO_CODESET
 # include <langinfo.h>
 #endif
@@ -234,13 +235,8 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
-#if HAVE_LANGINFO_CODESET
-  static int is_utf8 = -1;
-
-  if (is_utf8 == -1)
-    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");

-  if (is_utf8 && buf - p > MB_CUR_MAX)
+  if (using_utf8 () && buf - p > MB_CUR_MAX)
     {
       for (p = buf; buf - p > MB_CUR_MAX; p--)
         if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
@@ -249,7 +245,6 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
       if (buf - p == MB_CUR_MAX)
         p = buf;
     }
-#endif

   memset (&cur_state, 0, sizeof cur_state);

-- 
1.9.0


From 5295d1d528afabba15ed0710211ba24854c0c7ab Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:31:43 -0800
Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte
 non-UTF8 locales

* src/main.c: Include dfa.h.
(trivial_case_ignore): Perform this optimization only for UTF8 locales.
This rectifies a 200x performance regression in multi-byte non-UTF8
locales like ja_JP.eucJP.  The regression was introduced by the 10x
UTF8/grep-i speedup, commit v2.16-4-g97318f5.
* NEWS (Bug fixes): Mention it.
Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50
---
 NEWS       | 5 +++++
 src/main.c | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/NEWS b/NEWS
index 6785a96..49a17b0 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS                                    -*- outline 
-*-

 * Noteworthy changes in release ?.? (????-??-??) [?]

+** Bug fixes
+
+  grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower
+  than in 2.16.  [bug introduced in grep-2.17]
+

 * Noteworthy changes in release 2.17 (2014-02-17) [stable]

diff --git a/src/main.c b/src/main.c
index bd20297..56ec6b3 100644
--- a/src/main.c
+++ b/src/main.c
@@ -34,6 +34,7 @@
 #include "c-ctype.h"
 #include "closeout.h"
 #include "colorize.h"
+#include "dfa.h"
 #include "error.h"
 #include "exclude.h"
 #include "exitfail.h"
@@ -1883,6 +1884,11 @@ static bool
 trivial_case_ignore (size_t len, char const *keys,
                      size_t *new_len, char **new_keys)
 {
+  /* Perform this translation only for UTF-8.  Otherwise, this would induce
+     a 100-200x performance penalty for non-UTF8 multibyte locales.  */
+  if ( ! using_utf8 ())
+    return false;
+
   /* FIXME: consider removing the following restriction:
      Reject if KEYS contain ASCII '\\' or '['.  */
   if (memchr (keys, '\\', len) || memchr (keys, '[', len))
-- 
1.9.0

bug#16232: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales

Reply via email to