Shlomi Fish <shlo...@shlomifish.org> write:
> `which grep` --color -rP getline grep-test

If -o or --color option is specified, may be line_end < validated in
longest exact match.  As a result, a negative value is set to
`search_bytes'.

I improved validation for input buffer in order to fix the bug.
However, possibly it may cause slowdown.
From 0643f5c6877a4fa7a0aa1d10670558c5e089e1f3 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <nori...@kcn.ne.jp>
Date: Sat, 25 Oct 2014 01:05:57 +0900
Subject: [PATCH] grep: improvement of validation for input buffer in grep -P

* src/grep.c src/grep.h src/pcresearch.c (validated_boundary): Remove var.
* src/pcresearch.c (prev_valid): New var.
(Pexecute): improvement of validation for input buffer.
---
 src/grep.c       |   3 --
 src/grep.h       |   4 ---
 src/pcresearch.c | 103 +++++++++++++++++++++++++++++++------------------------
 tests/pcre-o     |  17 +++++++++
 4 files changed, 75 insertions(+), 52 deletions(-)
 create mode 100755 tests/pcre-o

diff --git a/src/grep.c b/src/grep.c
index a0f2620..0a4ac27 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -352,7 +352,6 @@ bool match_words;
 bool match_lines;
 char eolbyte;
 enum textbin input_textbin;
-char const *validated_boundary;
 
 static char const *matcher;
 
@@ -1226,7 +1225,6 @@ grepbuf (char const *beg, char const *lim)
   intmax_t outleft0 = outleft;
   char const *p;
   char const *endp;
-  validated_boundary = beg;
 
   for (p = beg; p < lim; p = endp)
     {
@@ -2516,7 +2514,6 @@ main (int argc, char **argv)
   /* We need one byte prior and one after.  */
   char eolbytes[3] = { 0, eolbyte, 0 };
   size_t match_size;
-  validated_boundary = eolbytes + 1;
   skip_empty_lines = ((execute (eolbytes + 1, 1, &match_size, NULL) == 0)
                       == out_invert);
 
diff --git a/src/grep.h b/src/grep.h
index 86259fb..02052b4 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -47,8 +47,4 @@ enum textbin
 /* Input file type.  */
 extern enum textbin input_textbin;
 
-/* Validation boundary.  Earlier bytes have already been validated by
-   the PCRE matcher, which cares about this sort of thing.  */
-extern char const *validated_boundary;
-
 #endif
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1fd5bde..85c1acd 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -42,6 +42,10 @@ static pcre_extra *extra;
    string matches when that flag is used.  */
 static int empty_match[2];
 
+/* Previous result of a validation.  If it's true, omit the validation in
+   longest exact match.  */
+static bool prev_valid;
+
 /* This must be at least 2; everything after that is for performance
    in pcre_exec.  */
 enum { NSUB = 300 };
@@ -156,7 +160,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
   char const *line_start = buf;
   int e = PCRE_ERROR_NOMATCH;
   char const *line_end;
-  char const *validated = validated_boundary;
+  bool valid = false;
 
   /* If the input type is unknown, the caller is still testing the
      input, which means the current buffer cannot contain encoding
@@ -176,11 +180,13 @@ Pexecute (char const *buf, size_t size, size_t 
*match_size,
           size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
           line_end = memrchr (p, eolbyte, scan_size);
           too_big = ! line_end;
+          valid = true;
         }
       else
         {
           line_end = memchr (p, eolbyte, buf + size - p);
           too_big = INT_MAX < line_end - p;
+          valid = (prev_valid && start_ptr != NULL);
         }
 
       if (too_big)
@@ -188,69 +194,76 @@ Pexecute (char const *buf, size_t size, size_t 
*match_size,
 
       for (;;)
         {
-          /* Skip past bytes that are easily determined to be encoding
-             errors, treating them as data that cannot match.  This is
-             faster than having pcre_exec check them.  */
-          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
-            {
-              p++;
-              bol = false;
-            }
+          int valid_bytes;
+          int options = 0;
 
-          /* Check for an empty match; this is faster than letting
-             pcre_exec do it.  */
-          int search_bytes = line_end - p;
-          if (search_bytes == 0)
+          if (!valid)
             {
-              sub[0] = sub[1] = 0;
-              e = empty_match[bol];
-              break;
-            }
+              /* Skip past bytes that are easily determined to be encoding
+                 errors, treating them as data that cannot match.  This is
+                 faster than having pcre_exec check them.  */
+              while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
+                {
+                  p++;
+                  bol = false;
+                }
 
-          int options = 0;
-          if (!bol)
-            options |= PCRE_NOTBOL;
-          if (multiline || p + search_bytes <= validated)
-            options |= PCRE_NO_UTF8_CHECK;
+              /* Check for an empty match; this is faster than letting
+                 pcre_exec do it.  */
+              int search_bytes = line_end - p;
+              if (search_bytes == 0)
+                {
+                  sub[0] = sub[1] = 0;
+                  e = empty_match[bol];
+                  if (p == line_start)
+                    valid = true;
+                  break;
+                }
+
+              if (!bol)
+                options |= PCRE_NOTBOL;
 
-          int valid_bytes = validated - p;
-          if (valid_bytes <= 0)
-            {
               e = pcre_exec (cre, extra, p, search_bytes, 0,
                              options, sub, NSUB);
               if (e != PCRE_ERROR_BADUTF8)
                 {
-                  validated = p + search_bytes;
-                  if (0 < e && multiline && sub[1] - sub[0] != 0)
-                    {
-                      char const *nl = memchr (p + sub[0], eolbyte,
-                                               sub[1] - sub[0]);
-                      if (nl)
-                        {
-                          /* This match crosses a line boundary; reject it.  */
-                          p += sub[0];
-                          line_end = nl;
-                          continue;
-                        }
-                    }
+                  if (p == line_start)
+                    valid = true;
                   break;
                 }
               valid_bytes = sub[0];
-              validated = p + valid_bytes;
             }
+          else
+            valid_bytes = line_end - p;
 
           /* Try to match the string before the encoding error.
              Again, handle the empty-match case specially, for speed.  */
           if (valid_bytes == 0)
             {
-              sub[1] = 0;
+              sub[0] = sub[1] = 0;
               e = empty_match[bol];
             }
           else
-            e = pcre_exec (cre, extra, p, valid_bytes, 0,
-                           options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
-                           sub, NSUB);
-          if (e != PCRE_ERROR_NOMATCH)
+            {
+              options |= PCRE_NO_UTF8_CHECK;
+              if (valid_bytes < line_end - p)
+                options |= PCRE_NOTEOL;
+              e = pcre_exec (cre, extra, p, valid_bytes, 0, options,
+                             sub, NSUB);
+              if (0 < e && multiline && sub[1] - sub[0] != 0)
+                {
+                  char const *nl = memchr (p + sub[0], eolbyte,
+                                           sub[1] - sub[0]);
+                  if (nl)
+                    {
+                      /* This match crosses a line boundary; reject it.  */
+                      p += sub[0];
+                      line_end = nl;
+                      continue;
+                    }
+                }
+            }
+          if (e != PCRE_ERROR_NOMATCH || p + valid_bytes >= line_end)
             break;
 
           /* Treat the encoding error as data that cannot match.  */
@@ -263,7 +276,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
       bol = true;
     }
 
-  validated_boundary = validated;
+  prev_valid = valid;
 
   if (e <= 0)
     {
diff --git a/tests/pcre-o b/tests/pcre-o
new file mode 100755
index 0000000..5f5891e
--- /dev/null
+++ b/tests/pcre-o
@@ -0,0 +1,17 @@
+#! /bin/sh
+# Ensure that, grep -oP doesn't cause internal error at match.
+#
+# Copyright (C) 2014 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_pcre_
+
+fail=0
+
+echo ab | grep -oP 'a' || fail=1
+
+Exit $fail
-- 
2.1.1

Reply via email to