I added a rule to the patch to run the new test.
From 8021dc2ac728c527cba648edfa5842c7a80a8a66 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 25 Oct 2014 01:05:57 +0900
Subject: [PATCH] grep: improve validation of the input buffer in grep -P
* src/grep.c src/grep.h src/pcresearch.c (validated_boundary): Remove var.
* src/pcresearch.c (prev_valid): New var.
(Pexecute): Improve validation of the input buffer.
* tests/pcre-o: New test for this change.
* tests/Makefile.am: Add a rule to run new test.
---
src/grep.c | 3 --
src/grep.h | 4 ---
src/pcresearch.c | 103 ++++++++++++++++++++++++++++++------------------------
tests/Makefile.am | 1 +
tests/pcre-o | 17 +++++++++
5 files changed, 76 insertions(+), 52 deletions(-)
create mode 100755 tests/pcre-o
diff --git a/src/grep.c b/src/grep.c
index a0f2620..0a4ac27 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -352,7 +352,6 @@ bool match_words;
bool match_lines;
char eolbyte;
enum textbin input_textbin;
-char const *validated_boundary;
static char const *matcher;
@@ -1226,7 +1225,6 @@ grepbuf (char const *beg, char const *lim)
intmax_t outleft0 = outleft;
char const *p;
char const *endp;
- validated_boundary = beg;
for (p = beg; p < lim; p = endp)
{
@@ -2516,7 +2514,6 @@ main (int argc, char **argv)
/* We need one byte prior and one after. */
char eolbytes[3] = { 0, eolbyte, 0 };
size_t match_size;
- validated_boundary = eolbytes + 1;
skip_empty_lines = ((execute (eolbytes + 1, 1, &match_size, NULL) == 0)
== out_invert);
diff --git a/src/grep.h b/src/grep.h
index 86259fb..02052b4 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -47,8 +47,4 @@ enum textbin
/* Input file type. */
extern enum textbin input_textbin;
-/* Validation boundary. Earlier bytes have already been validated by
- the PCRE matcher, which cares about this sort of thing. */
-extern char const *validated_boundary;
-
#endif
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1fd5bde..85c1acd 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -42,6 +42,10 @@ static pcre_extra *extra;
string matches when that flag is used. */
static int empty_match[2];
+/* Result of the previous validation. If it is true, validation is omitted
+ in the longest exact match. */
+static bool prev_valid;
+
/* This must be at least 2; everything after that is for performance
in pcre_exec. */
enum { NSUB = 300 };
@@ -156,7 +160,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
char const *line_start = buf;
int e = PCRE_ERROR_NOMATCH;
char const *line_end;
- char const *validated = validated_boundary;
+ bool valid = false;
/* If the input type is unknown, the caller is still testing the
input, which means the current buffer cannot contain encoding
@@ -176,11 +180,13 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
line_end = memrchr (p, eolbyte, scan_size);
too_big = ! line_end;
+ valid = true;
}
else
{
line_end = memchr (p, eolbyte, buf + size - p);
too_big = INT_MAX < line_end - p;
+ valid = (prev_valid && start_ptr != NULL);
}
if (too_big)
@@ -188,69 +194,76 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
for (;;)
{
- /* Skip past bytes that are easily determined to be encoding
- errors, treating them as data that cannot match. This is
- faster than having pcre_exec check them. */
- while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
- {
- p++;
- bol = false;
- }
+ int valid_bytes;
+ int options = 0;
- /* Check for an empty match; this is faster than letting
- pcre_exec do it. */
- int search_bytes = line_end - p;
- if (search_bytes == 0)
+ if (!valid)
{
- sub[0] = sub[1] = 0;
- e = empty_match[bol];
- break;
- }
+ /* Skip past bytes that are easily determined to be encoding
+ errors, treating them as data that cannot match. This is
+ faster than having pcre_exec check them. */
+ while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+ {
+ p++;
+ bol = false;
+ }
- int options = 0;
- if (!bol)
- options |= PCRE_NOTBOL;
- if (multiline || p + search_bytes <= validated)
- options |= PCRE_NO_UTF8_CHECK;
+ /* Check for an empty match; this is faster than letting
+ pcre_exec do it. */
+ int search_bytes = line_end - p;
+ if (search_bytes == 0)
+ {
+ sub[0] = sub[1] = 0;
+ e = empty_match[bol];
+ if (p == line_start)
+ valid = true;
+ break;
+ }
+
+ if (!bol)
+ options |= PCRE_NOTBOL;
- int valid_bytes = validated - p;
- if (valid_bytes <= 0)
- {
e = pcre_exec (cre, extra, p, search_bytes, 0,
options, sub, NSUB);
if (e != PCRE_ERROR_BADUTF8)
{
- validated = p + search_bytes;
- if (0 < e && multiline && sub[1] - sub[0] != 0)
- {
- char const *nl = memchr (p + sub[0], eolbyte,
- sub[1] - sub[0]);
- if (nl)
- {
- /* This match crosses a line boundary; reject it. */
- p += sub[0];
- line_end = nl;
- continue;
- }
- }
+ if (p == line_start)
+ valid = true;
break;
}
valid_bytes = sub[0];
- validated = p + valid_bytes;
}
+ else
+ valid_bytes = line_end - p;
/* Try to match the string before the encoding error.
Again, handle the empty-match case specially, for speed. */
if (valid_bytes == 0)
{
- sub[1] = 0;
+ sub[0] = sub[1] = 0;
e = empty_match[bol];
}
else
- e = pcre_exec (cre, extra, p, valid_bytes, 0,
- options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
- sub, NSUB);
- if (e != PCRE_ERROR_NOMATCH)
+ {
+ options |= PCRE_NO_UTF8_CHECK;
+ if (valid_bytes < line_end - p)
+ options |= PCRE_NOTEOL;
+ e = pcre_exec (cre, extra, p, valid_bytes, 0, options,
+ sub, NSUB);
+ if (0 < e && multiline && sub[1] - sub[0] != 0)
+ {
+ char const *nl = memchr (p + sub[0], eolbyte,
+ sub[1] - sub[0]);
+ if (nl)
+ {
+ /* This match crosses a line boundary; reject it. */
+ p += sub[0];
+ line_end = nl;
+ continue;
+ }
+ }
+ }
+ if (e != PCRE_ERROR_NOMATCH || p + valid_bytes >= line_end)
break;
/* Treat the encoding error as data that cannot match. */
@@ -263,7 +276,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
bol = true;
}
- validated_boundary = validated;
+ prev_valid = valid;
if (e <= 0)
{
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c298835..f6f051c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -90,6 +90,7 @@ TESTS = \
pcre-abort \
pcre-infloop \
pcre-invalid-utf8-input \
+ pcre-o \
pcre-utf8 \
pcre-w \
pcre-wx-backref \
diff --git a/tests/pcre-o b/tests/pcre-o
new file mode 100755
index 0000000..5f5891e
--- /dev/null
+++ b/tests/pcre-o
@@ -0,0 +1,17 @@
+#! /bin/sh
+# Ensure that grep -oP doesn't cause an internal error on a match.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_pcre_
+
+fail=0
+
+echo ab | grep -oP 'a' || fail=1
+
+Exit $fail
--
2.1.1