On 02/20/2014 09:13 AM, Jim Meyering wrote:
> In case your bug fix looks safe/small, and is ready, ...
Attached. I have some other fixes too, which I'll try to get out the
door today (though I can't promise that).
From 80250e3fae3a333160014bed7613a5cc9e42413a Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Wed, 19 Feb 2014 18:58:42 -0800
Subject: [PATCH 1/2] tests: test [^^-^] in unibyte locales
This is a bug in the current dfa.c, which was reintroduced by the
recent reversion from RRI.
* tests/unibyte-negated-circumflex: New file.
* tests/Makefile.am (TESTS): Add it.
* tests/init.cfg (require_unibyte_locale): New function.
---
tests/Makefile.am | 1 +
tests/init.cfg | 16 ++++++++++++++++
tests/unibyte-negated-circumflex | 27 +++++++++++++++++++++++++++
3 files changed, 44 insertions(+)
create mode 100755 tests/unibyte-negated-circumflex
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e2967fa..331467a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -75,6 +75,7 @@ TESTS = \
multibyte-white-space \
empty-line-mb \
unibyte-bracket-expr \
+ unibyte-negated-circumflex \
high-bit-range \
options \
pcre \
diff --git a/tests/init.cfg b/tests/init.cfg
index 2e8330b..ee5d537 100644
--- a/tests/init.cfg
+++ b/tests/init.cfg
@@ -87,6 +87,22 @@ require_compiled_in_MB_support()
|| skip_ this test requires MBS support
}
+require_unibyte_locale()
+{
+ path_prepend_ .
+ for loc in C en_US; do
+ for encoding in '' .iso88591 .iso885915 .ISO8859-1 .ISO8859-15; do
+ locale=$loc$encoding
+ MB_CUR_MAX=$(get-mb-cur-max $locale 2>/dev/null) &&
+ test "$MB_CUR_MAX" -eq 1 &&
+ LC_ALL=$locale &&
+ export LC_ALL &&
+ return
+ done
+ done
+ skip_ 'no unibyte locale found'
+}
+
expensive_()
{
if test "$RUN_EXPENSIVE_TESTS" != yes; then
diff --git a/tests/unibyte-negated-circumflex b/tests/unibyte-negated-circumflex
new file mode 100755
index 0000000..b6d747c
--- /dev/null
+++ b/tests/unibyte-negated-circumflex
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Exercise a bug where [^^-^] was treated as if it were [^-^].
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_unibyte_locale
+
+fail=0
+
+echo a >in || framework_failure_
+grep '[^^-^]' in >out || fail=1
+compare out in || fail=1
+Exit $fail
--
1.8.5.3
From c1fa72bd324aac44a1d99b83bb585dbcf291041d Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 20 Feb 2014 10:17:47 -0800
Subject: [PATCH 2/2] grep: fix bug with patterns like [^^-~] in unibyte
locales
* NEWS: Document this.
* src/dfa.c (parse_bracket_exp): Escape patterns like [^^-~], or
Awk patterns like [\^-\]], so that they are not misinterpreted by
the system regex library. Check for system regex failure due to
memory exhaustion.
---
NEWS | 5 +++++
src/dfa.c | 41 ++++++++++++++++++++++-------------------
2 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/NEWS b/NEWS
index c6d78d0..8639ce1 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Bug fixes
+
+ grep no longer mishandles patterns like [^^-~] in unibyte locales.
+ [bug introduced in grep-2.8]
+
** Improvements
grep -i in a multibyte locale may be over 130 times faster than in 2.17
diff --git a/src/dfa.c b/src/dfa.c
index a133e03..9266f6f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1108,28 +1108,31 @@ parse_bracket_exp (void)
{
/* Defer to the system regex library about the meaning
of range expressions. */
- regex_t re;
- char pattern[6] = { '[', 0, '-', 0, ']', 0 };
- char subject[2] = { 0, 0 };
- c1 = c;
- if (case_fold)
- {
- c1 = tolower (c1);
- c2 = tolower (c2);
- }
-
- pattern[1] = c1;
- pattern[3] = c2;
- regcomp (&re, pattern, REG_NOSUB);
- for (c = 0; c < NOTCHAR; ++c)
+ struct re_pattern_buffer re = { 0 };
+ char const *compile_msg;
+#if 199901 <= __STDC_VERSION__
+ char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
+#else
+ char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
+ pattern[2] = c;
+ pattern[5] = c2;
+#endif
+ re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
+ compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
+ if (compile_msg)
+ dfaerror (compile_msg);
+ for (c = 0; c < NOTCHAR; c++)
{
- if ((case_fold && isupper (c)))
- continue;
- subject[0] = c;
- if (regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
- setbit_case_fold_c (c, ccl);
+ char subject = c;
+ switch (re_match (&re, &subject, 1, 0, NULL))
+ {
+ case 1: setbit (c, ccl); break;
+ case -1: break;
+ default: xalloc_die ();
+ }
}
regfree (&re);
+ re_set_syntax (syntax_bits);
}
colon_warning_state |= 8;
--
1.8.5.3