bug#16232: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales

Paul Eggert Thu, 20 Feb 2014 10:36:32 -0800

On 02/20/2014 09:13 AM, Jim Meyering wrote:

> In case your bug fix looks safe/small, and is ready, ...

Attached. I have some other fixes too, which I'll try to get out the door today (though I can't promise that).

>From 80250e3fae3a333160014bed7613a5cc9e42413a Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Wed, 19 Feb 2014 18:58:42 -0800
Subject: [PATCH 1/2] tests: test [^^-^] in unibyte locales


This is a bug in the current dfa.c, which was reintroduced by the
recent reversion from RRI.
* tests/unibyte-negated-circumflex: New file.
* tests/Makefile.am (TESTS): Add it.
* tests/init.cfg (require_unibyte_locale): New function.
---
 tests/Makefile.am                |  1 +
 tests/init.cfg                   | 16 ++++++++++++++++
 tests/unibyte-negated-circumflex | 27 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100755 tests/unibyte-negated-circumflex

diff --git a/tests/Makefile.am b/tests/Makefile.am
index e2967fa..331467a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -75,6 +75,7 @@ TESTS =						\
   multibyte-white-space				\
   empty-line-mb					\
   unibyte-bracket-expr				\
+  unibyte-negated-circumflex			\
   high-bit-range				\
   options					\
   pcre						\
diff --git a/tests/init.cfg b/tests/init.cfg
index 2e8330b..ee5d537 100644
--- a/tests/init.cfg
+++ b/tests/init.cfg
@@ -87,6 +87,22 @@ require_compiled_in_MB_support()
     || skip_ this test requires MBS support
 }
 
+require_unibyte_locale()
+{
+  path_prepend_ .
+  for loc in C en_US; do
+    for encoding in '' .iso88591 .iso885915 .ISO8859-1 .ISO8859-15; do
+      locale=$loc$encoding
+      MB_CUR_MAX=$(get-mb-cur-max $locale 2>/dev/null) &&
+        test "$MB_CUR_MAX" -eq 1 &&
+        LC_ALL=$locale &&
+        export LC_ALL &&
+        return
+    done
+  done
+  skip_ 'no unibyte locale found'
+}
+
 expensive_()
 {
   if test "$RUN_EXPENSIVE_TESTS" != yes; then
diff --git a/tests/unibyte-negated-circumflex b/tests/unibyte-negated-circumflex
new file mode 100755
index 0000000..b6d747c
--- /dev/null
+++ b/tests/unibyte-negated-circumflex
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Exercise a bug where [^^-^] was treated as if it were [^-^].
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_unibyte_locale
+
+fail=0
+
+echo a >in || framework_failure_
+grep '[^^-^]' in >out || fail=1
+compare out in || fail=1
+Exit $fail
-- 
1.8.5.3

>From c1fa72bd324aac44a1d99b83bb585dbcf291041d Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 20 Feb 2014 10:17:47 -0800
Subject: [PATCH 2/2] grep: fix bug with patterns like [^^-~] in unibyte
 locales

* NEWS: Document this.
* src/dfa.c (parse_bracket_exp): Escape patterns like [^^-~], or
Awk patterns like [\^-\]], so that they are not misinterpreted by
the system regex library.  Check for system regex failure due to
memory exhaustion.
---
 NEWS      |  5 +++++
 src/dfa.c | 41 ++++++++++++++++++++++-------------------
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/NEWS b/NEWS
index c6d78d0..8639ce1 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Bug fixes
+
+  grep no longer mishandles patterns like [^^-~] in unibyte locales.
+  [bug introduced in grep-2.8]
+
 ** Improvements
 
   grep -i in a multibyte locale may be over 130 times faster than in 2.17
diff --git a/src/dfa.c b/src/dfa.c
index a133e03..9266f6f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1108,28 +1108,31 @@ parse_bracket_exp (void)
             {
               /* Defer to the system regex library about the meaning
                  of range expressions.  */
-              regex_t re;
-              char pattern[6] = { '[', 0, '-', 0, ']', 0 };
-              char subject[2] = { 0, 0 };
-              c1 = c;
-              if (case_fold)
-                {
-                  c1 = tolower (c1);
-                  c2 = tolower (c2);
-                }
-
-              pattern[1] = c1;
-              pattern[3] = c2;
-              regcomp (&re, pattern, REG_NOSUB);
-              for (c = 0; c < NOTCHAR; ++c)
+              struct re_pattern_buffer re = { 0 };
+              char const *compile_msg;
+#if 199901 <= __STDC_VERSION__
+              char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
+#else
+              char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
+              pattern[2] = c;
+              pattern[5] = c2;
+#endif
+              re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
+              compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
+              if (compile_msg)
+                dfaerror (compile_msg);
+              for (c = 0; c < NOTCHAR; c++)
                 {
-                  if ((case_fold && isupper (c)))
-                    continue;
-                  subject[0] = c;
-                  if (regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
-                    setbit_case_fold_c (c, ccl);
+                  char subject = c;
+                  switch (re_match (&re, &subject, 1, 0, NULL))
+                    {
+                    case 1: setbit (c, ccl); break;
+                    case -1: break;
+                    default: xalloc_die ();
+                    }
                 }
               regfree (&re);
+              re_set_syntax (syntax_bits);
             }
 
           colon_warning_state |= 8;
-- 
1.8.5.3

bug#16232: [PATCH] grep: make --ignore-case (-i) faster (sometimes 10x) in multibyte locales

Reply via email to