bug#16232: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales

Jim Meyering Thu, 20 Feb 2014 16:46:35 -0800

On Thu, Feb 20, 2014 at 12:42 PM, Jim Meyering <[email protected]> wrote:
> Revised commits attached.


One more revision.  This is a big enough deal (and subtle enough) that
I thought I really should add a test for it.  Did that, so now it's 3
commits:

I'm going to push these in an hour or so, then see if I have time to
make the release this evening.

From 11ce80861109361570cbedda6a966264367f7c76 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:22:24 -0800
Subject: [PATCH 1/3] maint: give dfa.c's using_utf8 function external scope

* src/dfa.c (using_utf8): Remove "static inline".
* src/dfa.h (using_utf8): Declare it.
* src/searchutils.c (is_mb_middle): Use using_utf8 rather than
rolling our own.
---
 src/dfa.c         | 2 +-
 src/dfa.h         | 2 ++
 src/searchutils.c | 9 ++-------
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 9266f6f..8906ed3 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c)

 /* UTF-8 encoding allows some optimizations that we can't otherwise
    assume in a multibyte encoding.  */
-static inline int
+int
 using_utf8 (void)
 {
   static int utf8 = -1;
diff --git a/src/dfa.h b/src/dfa.h
index bacd489..7e0674f 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -99,3 +99,5 @@ extern void dfawarn (const char *);
    takes a single argument, a NUL-terminated string describing the error.
    The user must supply a dfaerror.  */
 extern _Noreturn void dfaerror (const char *);
+
+extern int using_utf8 (void);
diff --git a/src/searchutils.c b/src/searchutils.c
index 3478417..7363701 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,6 +19,7 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#include "dfa.h"
 #if HAVE_LANGINFO_CODESET
 # include <langinfo.h>
 #endif
@@ -234,13 +235,8 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
-#if HAVE_LANGINFO_CODESET
-  static int is_utf8 = -1;
-
-  if (is_utf8 == -1)
-    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");

-  if (is_utf8 && buf - p > MB_CUR_MAX)
+  if (using_utf8 () && buf - p > MB_CUR_MAX)
     {
       for (p = buf; buf - p > MB_CUR_MAX; p--)
         if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
@@ -249,7 +245,6 @@ is_mb_middle (const char **good, const char *buf, const 
char *end,
       if (buf - p == MB_CUR_MAX)
         p = buf;
     }
-#endif

   memset (&cur_state, 0, sizeof cur_state);

-- 
1.9.0


From c7c8bcdefe7be5f59a242eea63df7f64eacb6a09 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Wed, 19 Feb 2014 19:31:43 -0800
Subject: [PATCH 2/3] grep -i: avoid a performance regression in multibyte
 non-UTF8 locales

* src/main.c: Include dfa.h.
(trivial_case_ignore): Perform this optimization only for UTF8 locales.
This rectifies a 100-200x performance regression in non-UTF8 multi-byte
locales like ja_JP.eucJP.  The regression was introduced by the 10x
UTF8/grep-i speedup, commit v2.16-4-g97318f5.
* NEWS (Bug fixes): Mention it.
Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50
---
 NEWS       | 3 +++
 src/main.c | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/NEWS b/NEWS
index 6cbbc46..df5632b 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,9 @@ GNU grep NEWS                                    -*- outline -*-
   grep no longer mishandles patterns like [^^-~] in unibyte locales.
   [bug introduced in grep-2.8]

+  grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower
+  than in 2.16.  [bug introduced in grep-2.17]
+

 * Noteworthy changes in release 2.17 (2014-02-17) [stable]

diff --git a/src/main.c b/src/main.c
index bd20297..56ec6b3 100644
--- a/src/main.c
+++ b/src/main.c
@@ -34,6 +34,7 @@
 #include "c-ctype.h"
 #include "closeout.h"
 #include "colorize.h"
+#include "dfa.h"
 #include "error.h"
 #include "exclude.h"
 #include "exitfail.h"
@@ -1883,6 +1884,11 @@ static bool
 trivial_case_ignore (size_t len, char const *keys,
                      size_t *new_len, char **new_keys)
 {
+  /* Perform this translation only for UTF-8.  Otherwise, this would induce
+     a 100-200x performance penalty for non-UTF8 multibyte locales.  */
+  if ( ! using_utf8 ())
+    return false;
+
   /* FIXME: consider removing the following restriction:
      Reject if KEYS contain ASCII '\\' or '['.  */
   if (memchr (keys, '\\', len) || memchr (keys, '[', len))
-- 
1.9.0


From d5bfcc2daa123fa0e8660909052f7ca2ec6b7649 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Thu, 20 Feb 2014 16:06:13 -0800
Subject: [PATCH 3/3] tests: test for the non-UTF8 multi-byte performance
 regression

Test for the just-fixed performance regression.
With a 100-200x differential, it is reasonable to expect that
a very slow system will be able to complete the designated
task in a few seconds, while with the bug, even a very fast
system would exceed the timeout.
* tests/mb-non-UTF8-performance: New file.
* tests/Makefile.am (TESTS): Add it.
* tests/init.cfg (require_JP_EUC_locale_): New function.
---
 tests/Makefile.am             |  1 +
 tests/init.cfg                | 16 ++++++++++++++++
 tests/mb-non-UTF8-performance | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)
 create mode 100755 tests/mb-non-UTF8-performance

diff --git a/tests/Makefile.am b/tests/Makefile.am
index 331467a..4ffea85 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -72,6 +72,7 @@ TESTS =                                               \
   khadafy                                      \
   long-line-vs-2GiB-read                       \
   max-count-vs-context                         \
+  mb-non-UTF8-performance                      \
   multibyte-white-space                                \
   empty-line-mb                                        \
   unibyte-bracket-expr                         \
diff --git a/tests/init.cfg b/tests/init.cfg
index ee5d537..5555e2d 100644
--- a/tests/init.cfg
+++ b/tests/init.cfg
@@ -103,6 +103,22 @@ require_unibyte_locale()
   skip_ 'no unibyte locale found'
 }

+require_JP_EUC_locale_()
+{
+  local locale=ja_JP.eucJP
+  path_prepend_ .
+  case $(get-mb-cur-max $locale) in
+    [23])
+        LC_ALL=$locale &&
+        export LC_ALL &&
+        return
+        ;;
+    *) ;;
+  esac
+
+  skip_ "$loc locale not found"
+}
+
 expensive_()
 {
   if test "$RUN_EXPENSIVE_TESTS" != yes; then
diff --git a/tests/mb-non-UTF8-performance b/tests/mb-non-UTF8-performance
new file mode 100755
index 0000000..282f0c4
--- /dev/null
+++ b/tests/mb-non-UTF8-performance
@@ -0,0 +1,32 @@
+#!/bin/sh
+# grep-2.17 would take nearly 200x longer to run the command below.
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_timeout_
+
+fail=0
+
+require_JP_EUC_locale_
+
+yes $(printf '%078d' 0) | head -50000 > in || framework_failure_
+
+# Expect no match, i.e., exit status of 1.  Anything else is an error.
+timeout 4 grep -i foobar in; st=$?
+test $st = 1 || fail=1
+
+Exit $fail
-- 
1.9.0

bug#16232: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales

Reply via email to