Norihiro Tanaka wrote:
I'm worried that to re-run for invalid UTF-8 makes slowness for searching of the large number of binary files.
Yes, that could be a problem, but even so it's better for grep to report matches than to give up and fail. Perhaps someone could optimize this better later, but to be honest given how flaky libpcre is we're probably better off spending our scarce development resources elsewhere.
Santiago's latest patch still had some troubles, unfortunately. It could mishandle '^' by having it match just past an encoding error. It was less efficient than it could be, as it checked all valid bytes for UTF-8-edness twice. If I understand PCRE correctly (which quite possibly I don't), it also appeared to mishandle matches that contain nested subexpressions. But the worst part was that the code was too complicated (and this was true even before Santiago's patch was applied). So I rewrote it and installed the attached patch instead. Please give it a try.
From 29855e7bbe47b91680ae0cba5729c5becfaa3216 Mon Sep 17 00:00:00 2001 From: Paul Eggert <egg...@cs.ucla.edu> Date: Tue, 9 Sep 2014 12:41:54 -0700 Subject: [PATCH] grep: -P now treats invalid UTF-8 input as non-matching Problem reported by Santiago Vila in: http://bugs.gnu.org/18266 * NEWS: Mention this. * src/pcresearch.c (Pexecute): Treat UTF-8 encoding errors as non-matching data, instead of exiting 'grep'. * tests/pcre-infloop: grep now exits with status 1, not 2. * tests/pcre-invalid-utf8-input: grep now exits with status 0, not 2. --- NEWS | 3 ++ src/pcresearch.c | 70 +++++++++++++++++-------------------------- tests/pcre-infloop | 2 +- tests/pcre-invalid-utf8-input | 2 +- 4 files changed, 33 insertions(+), 44 deletions(-) diff --git a/NEWS b/NEWS index 550bf4c..ca79525 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,9 @@ GNU grep NEWS -*- outline -*- Performance has improved for very long strings in patterns. + grep -P no longer reports an error and exits when given invalid UTF-8 data. + Instead, it considers the data to be non-matching. + ** Bug fixes grep -E rejected unmatched ')', instead of treating it like '\)'. diff --git a/src/pcresearch.c b/src/pcresearch.c index 820dd00..2a01e6d 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -136,34 +136,41 @@ Pexecute (char const *buf, size_t size, size_t *match_size, #else /* This array must have at least two elements; everything after that is just for performance improvement in pcre_exec. */ - int sub[300]; + enum { nsub = 300 }; + int sub[nsub]; - const char *line_buf, *line_end, *line_next; + char const *p = start_ptr ? start_ptr : buf; + int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL; + char const *line_start = buf; int e = PCRE_ERROR_NOMATCH; - ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0; + char const *line_end; /* PCRE can't limit the matching to single lines, therefore we have to match each line in the buffer separately. */ - for (line_next = buf; - e == PCRE_ERROR_NOMATCH && line_next < buf + size; - start_ofs -= line_next - line_buf) + for (; p < buf + size; p = line_start = line_end + 1) { - line_buf = line_next; - line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); - if (line_end == NULL) - line_next = line_end = buf + size; - else - line_next = line_end + 1; - - if (start_ptr && start_ptr >= line_end) - continue; + line_end = memchr (p, eolbyte, buf + size - p); - if (INT_MAX < line_end - line_buf) + if (INT_MAX < line_end - p) error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); - e = pcre_exec (cre, extra, line_buf, line_end - line_buf, - start_ofs < 0 ? 0 : start_ofs, 0, - sub, sizeof sub / sizeof *sub); + /* Treat encoding-error bytes as data that cannot match. */ + for (;;) + { + e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub); + if (e != PCRE_ERROR_BADUTF8) + break; + e = pcre_exec (cre, extra, p, sub[0], 0, + options | PCRE_NO_UTF8_CHECK, sub, nsub); + if (e != PCRE_ERROR_NOMATCH) + break; + p += sub[0] + 1; + options = PCRE_NOTBOL; + } + + if (e != PCRE_ERROR_NOMATCH) + break; + options = 0; } if (e <= 0) @@ -180,10 +187,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size, error (EXIT_TROUBLE, 0, _("exceeded PCRE's backtracking limit")); - case PCRE_ERROR_BADUTF8: - error (EXIT_TROUBLE, 0, - _("invalid UTF-8 byte sequence in input")); - default: /* For now, we lump all remaining PCRE failures into this basket. If anyone cares to provide sample grep usage that can trigger @@ -197,25 +200,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size, } else { - /* Narrow down to the line we've found. */ - char const *beg = line_buf + sub[0]; - char const *end = line_buf + sub[1]; - char const *buflim = buf + size; - char eol = eolbyte; - if (!start_ptr) - { - /* FIXME: The case when '\n' is not found indicates a bug: - Since grep is line oriented, the match should never contain - a newline, so there _must_ be a newline following. - */ - if (!(end = memchr (end, eol, buflim - end))) - end = buflim; - else - end++; - while (buf < beg && beg[-1] != eol) - --beg; - } - + char const *beg = start_ptr ? p + sub[0] : line_start; + char const *end = start_ptr ? p + sub[1] : line_end + 1; *match_size = end - beg; return beg - buf; } diff --git a/tests/pcre-infloop b/tests/pcre-infloop index 1b33e72..b92f8e1 100755 --- a/tests/pcre-infloop +++ b/tests/pcre-infloop @@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_ fail=0 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in -test $? = 2 || fail_ "libpcre's match function appears to infloop" +test $? = 1 || fail_ "libpcre's match function appears to infloop" Exit $fail diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input index 913e8ee..f42e0dd 100755 --- a/tests/pcre-invalid-utf8-input +++ b/tests/pcre-invalid-utf8-input @@ -16,6 +16,6 @@ fail=0 printf 'j\202\nj\n' > in || framework_failure_ LC_ALL=en_US.UTF-8 grep -P j in -test $? -eq 2 || fail=1 +test $? -eq 0 || fail=1 Exit $fail -- 1.9.3