On Thu, Feb 20, 2014 at 9:22 AM, <[email protected]> wrote: > Hi Jim. > > Why copy the using_utf8() routine out of dfa.c? Why not just link > to it instead? If it's static, make it extern... That way if the > logic ever changes then it only has to be changed in one place.
Hi Arnold, That was due to my reflex of avoiding unnecessary change to dfa.c, but in this case, it is definitely better to do as you suggest, not just to avoid code duplication, but also for run-time efficiency: with two copies of the function, there would have been two calls to nl_langinfo per run; with only that one copy, we save a call, too. Revised commits attached. Thanks, Jim
From 9a5c6c856892fde5df07666d4bb6641a05f33712 Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Wed, 19 Feb 2014 19:22:24 -0800 Subject: [PATCH 1/2] maint: give dfa.c's using_utf8 function external scope * src/dfa.c (using_utf8): Remove "static inline". * src/dfa.h (using_utf8): Declare it. * src/searchutils.c (is_mb_middle): Use using_utf8 rather than rolling our own. --- src/dfa.c | 2 +- src/dfa.h | 2 ++ src/searchutils.c | 9 ++------- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index a133e03..ba9e7a2 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c) /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ -static inline int +int using_utf8 (void) { static int utf8 = -1; diff --git a/src/dfa.h b/src/dfa.h index bacd489..7e0674f 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -99,3 +99,5 @@ extern void dfawarn (const char *); takes a single argument, a NUL-terminated string describing the error. The user must supply a dfaerror. */ extern _Noreturn void dfaerror (const char *); + +extern int using_utf8 (void); diff --git a/src/searchutils.c b/src/searchutils.c index 3478417..7363701 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -19,6 +19,7 @@ #include <config.h> #include <assert.h> #include "search.h" +#include "dfa.h" #if HAVE_LANGINFO_CODESET # include <langinfo.h> #endif @@ -234,13 +235,8 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; -#if HAVE_LANGINFO_CODESET - static int is_utf8 = -1; - - if (is_utf8 == -1) - is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); - if (is_utf8 && buf - p > MB_CUR_MAX) + if (using_utf8 () && buf - p > MB_CUR_MAX) { for (p = buf; buf - p > MB_CUR_MAX; p--) if (mbclen_cache[to_uchar (*p)] != (size_t) -1) @@ -249,7 +245,6 @@ is_mb_middle (const char **good, const char *buf, const char *end, if (buf - p == MB_CUR_MAX) p = buf; } -#endif memset (&cur_state, 0, sizeof cur_state); -- 1.9.0 From 5295d1d528afabba15ed0710211ba24854c0c7ab Mon Sep 17 00:00:00 2001 From: Jim Meyering <[email protected]> Date: Wed, 19 Feb 2014 19:31:43 -0800 Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte non-UTF8 locales * src/main.c: Include dfa.h. (trivial_case_ignore): Perform this optimization only for UTF8 locales. This rectifies a 200x performance regression in multi-byte non-UTF8 locales like ja_JP.eucJP. The regression was introduced by the 10x UTF8/grep-i speedup, commit v2.16-4-g97318f5. * NEWS (Bug fixes): Mention it. Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50 --- NEWS | 5 +++++ src/main.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/NEWS b/NEWS index 6785a96..49a17b0 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower + than in 2.16. [bug introduced in grep-2.17] + * Noteworthy changes in release 2.17 (2014-02-17) [stable] diff --git a/src/main.c b/src/main.c index bd20297..56ec6b3 100644 --- a/src/main.c +++ b/src/main.c @@ -34,6 +34,7 @@ #include "c-ctype.h" #include "closeout.h" #include "colorize.h" +#include "dfa.h" #include "error.h" #include "exclude.h" #include "exitfail.h" @@ -1883,6 +1884,11 @@ static bool trivial_case_ignore (size_t len, char const *keys, size_t *new_len, char **new_keys) { + /* Perform this translation only for UTF-8. Otherwise, this would induce + a 100-200x performance penalty for non-UTF8 multibyte locales. */ + if ( ! using_utf8 ()) + return false; + /* FIXME: consider removing the following restriction: Reject if KEYS contain ASCII '\\' or '['. */ if (memchr (keys, '\\', len) || memchr (keys, '[', len)) -- 1.9.0
