Shlomi Fish <shlo...@shlomifish.org> write: > `which grep` --color -rP getline grep-test
If -o or --color option is specified, may be line_end < validated in longest exact match. As a result, a negative value is set to `search_bytes'. I improved validation for input buffer in order to fix the bug. However, possibly it may cause slowdown.
From 0643f5c6877a4fa7a0aa1d10670558c5e089e1f3 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Sat, 25 Oct 2014 01:05:57 +0900 Subject: [PATCH] grep: improvement of validation for input buffer in grep -P * src/grep.c src/grep.h src/pcresearch.c (validated_boundary): Remove var. * src/pcresearch.c (prev_valid): New var. (Pexecute): improvement of validation for input buffer. --- src/grep.c | 3 -- src/grep.h | 4 --- src/pcresearch.c | 103 +++++++++++++++++++++++++++++++------------------------ tests/pcre-o | 17 +++++++++ 4 files changed, 75 insertions(+), 52 deletions(-) create mode 100755 tests/pcre-o diff --git a/src/grep.c b/src/grep.c index a0f2620..0a4ac27 100644 --- a/src/grep.c +++ b/src/grep.c @@ -352,7 +352,6 @@ bool match_words; bool match_lines; char eolbyte; enum textbin input_textbin; -char const *validated_boundary; static char const *matcher; @@ -1226,7 +1225,6 @@ grepbuf (char const *beg, char const *lim) intmax_t outleft0 = outleft; char const *p; char const *endp; - validated_boundary = beg; for (p = beg; p < lim; p = endp) { @@ -2516,7 +2514,6 @@ main (int argc, char **argv) /* We need one byte prior and one after. */ char eolbytes[3] = { 0, eolbyte, 0 }; size_t match_size; - validated_boundary = eolbytes + 1; skip_empty_lines = ((execute (eolbytes + 1, 1, &match_size, NULL) == 0) == out_invert); diff --git a/src/grep.h b/src/grep.h index 86259fb..02052b4 100644 --- a/src/grep.h +++ b/src/grep.h @@ -47,8 +47,4 @@ enum textbin /* Input file type. */ extern enum textbin input_textbin; -/* Validation boundary. Earlier bytes have already been validated by - the PCRE matcher, which cares about this sort of thing. */ -extern char const *validated_boundary; - #endif diff --git a/src/pcresearch.c b/src/pcresearch.c index 1fd5bde..85c1acd 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -42,6 +42,10 @@ static pcre_extra *extra; string matches when that flag is used. */ static int empty_match[2]; +/* Previous result of a validation. If it's true, omit the validation in + longest exact match. */ +static bool prev_valid; + /* This must be at least 2; everything after that is for performance in pcre_exec. */ enum { NSUB = 300 }; @@ -156,7 +160,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size, char const *line_start = buf; int e = PCRE_ERROR_NOMATCH; char const *line_end; - char const *validated = validated_boundary; + bool valid = false; /* If the input type is unknown, the caller is still testing the input, which means the current buffer cannot contain encoding @@ -176,11 +180,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size, size_t scan_size = MIN (pcre_size_max + 1, buf + size - p); line_end = memrchr (p, eolbyte, scan_size); too_big = ! line_end; + valid = true; } else { line_end = memchr (p, eolbyte, buf + size - p); too_big = INT_MAX < line_end - p; + valid = (prev_valid && start_ptr != NULL); } if (too_big) @@ -188,69 +194,76 @@ Pexecute (char const *buf, size_t size, size_t *match_size, for (;;) { - /* Skip past bytes that are easily determined to be encoding - errors, treating them as data that cannot match. This is - faster than having pcre_exec check them. */ - while (mbclen_cache[to_uchar (*p)] == (size_t) -1) - { - p++; - bol = false; - } + int valid_bytes; + int options = 0; - /* Check for an empty match; this is faster than letting - pcre_exec do it. */ - int search_bytes = line_end - p; - if (search_bytes == 0) + if (!valid) { - sub[0] = sub[1] = 0; - e = empty_match[bol]; - break; - } + /* Skip past bytes that are easily determined to be encoding + errors, treating them as data that cannot match. This is + faster than having pcre_exec check them. */ + while (mbclen_cache[to_uchar (*p)] == (size_t) -1) + { + p++; + bol = false; + } - int options = 0; - if (!bol) - options |= PCRE_NOTBOL; - if (multiline || p + search_bytes <= validated) - options |= PCRE_NO_UTF8_CHECK; + /* Check for an empty match; this is faster than letting + pcre_exec do it. */ + int search_bytes = line_end - p; + if (search_bytes == 0) + { + sub[0] = sub[1] = 0; + e = empty_match[bol]; + if (p == line_start) + valid = true; + break; + } + + if (!bol) + options |= PCRE_NOTBOL; - int valid_bytes = validated - p; - if (valid_bytes <= 0) - { e = pcre_exec (cre, extra, p, search_bytes, 0, options, sub, NSUB); if (e != PCRE_ERROR_BADUTF8) { - validated = p + search_bytes; - if (0 < e && multiline && sub[1] - sub[0] != 0) - { - char const *nl = memchr (p + sub[0], eolbyte, - sub[1] - sub[0]); - if (nl) - { - /* This match crosses a line boundary; reject it. */ - p += sub[0]; - line_end = nl; - continue; - } - } + if (p == line_start) + valid = true; break; } valid_bytes = sub[0]; - validated = p + valid_bytes; } + else + valid_bytes = line_end - p; /* Try to match the string before the encoding error. Again, handle the empty-match case specially, for speed. */ if (valid_bytes == 0) { - sub[1] = 0; + sub[0] = sub[1] = 0; e = empty_match[bol]; } else - e = pcre_exec (cre, extra, p, valid_bytes, 0, - options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, - sub, NSUB); - if (e != PCRE_ERROR_NOMATCH) + { + options |= PCRE_NO_UTF8_CHECK; + if (valid_bytes < line_end - p) + options |= PCRE_NOTEOL; + e = pcre_exec (cre, extra, p, valid_bytes, 0, options, + sub, NSUB); + if (0 < e && multiline && sub[1] - sub[0] != 0) + { + char const *nl = memchr (p + sub[0], eolbyte, + sub[1] - sub[0]); + if (nl) + { + /* This match crosses a line boundary; reject it. */ + p += sub[0]; + line_end = nl; + continue; + } + } + } + if (e != PCRE_ERROR_NOMATCH || p + valid_bytes >= line_end) break; /* Treat the encoding error as data that cannot match. */ @@ -263,7 +276,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size, bol = true; } - validated_boundary = validated; + prev_valid = valid; if (e <= 0) { diff --git a/tests/pcre-o b/tests/pcre-o new file mode 100755 index 0000000..5f5891e --- /dev/null +++ b/tests/pcre-o @@ -0,0 +1,17 @@ +#! /bin/sh +# Ensure that, grep -oP doesn't cause internal error at match. +# +# Copyright (C) 2014 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +require_pcre_ + +fail=0 + +echo ab | grep -oP 'a' || fail=1 + +Exit $fail -- 2.1.1