From e2f385b9325533aa31eea48d2d4d68058c40d4d7 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <noritnk@kcn.ne.jp>
Date: Mon, 17 Nov 2014 08:26:53 +0900
Subject: [PATCH] dfa: speed-up for long pattern

DFA tries to find a long sequence of characters that must appear in any
line containing the r.e.  However, if a pattern is long, this is very slow,
as it processes the characters one at a time.  This change makes a run of
concatenated normal characters be processed all at once.

* src/dfa.c (allocmust): New parameter SIZE.  All callers changed.
(dfamust): Process a run of concatenated normal characters all at once.
* NEWS (Improvements): Mention it.
---
 NEWS      |  3 +++
 src/dfa.c | 54 +++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/NEWS b/NEWS
index 6138c4e..cda7685 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,9 @@ GNU grep NEWS                                    -*- outline -*-
 
 * Noteworthy changes in release ?.? (????-??-??) [?]
 
+** Improvements
+
+  Performance has improved for very long strings in patterns.
 
 * Noteworthy changes in release 2.21 (2014-11-23) [stable]
 
diff --git a/src/dfa.c b/src/dfa.c
index 65862e8..626d0e2 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3995,13 +3995,13 @@ struct must
 };
 
 static must *
-allocmust (must *mp)
+allocmust (must *mp, size_t size)
 {
   must *new_mp = xmalloc (sizeof *new_mp);
   new_mp->in = xzalloc (sizeof *new_mp->in);
-  new_mp->left = xzalloc (2);
-  new_mp->right = xzalloc (2);
-  new_mp->is = xzalloc (2);
+  new_mp->left = xzalloc (size);
+  new_mp->right = xzalloc (size);
+  new_mp->is = xzalloc (size);
   new_mp->begline = false;
   new_mp->endline = false;
   new_mp->prev = mp;
@@ -4034,7 +4034,7 @@ dfamust (struct dfa *d)
 {
   must *mp = NULL;
   char const *result = "";
-  size_t ri;
+  size_t ri, rj;
   size_t i;
   bool exact = false;
   bool begline = false;
@@ -4047,11 +4047,11 @@ dfamust (struct dfa *d)
       switch (t)
         {
         case BEGLINE:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           mp->begline = true;
           break;
         case ENDLINE:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           mp->endline = true;
           break;
         case LPAREN:
@@ -4066,7 +4066,7 @@ dfamust (struct dfa *d)
         case BACKREF:
         case ANYCHAR:
         case MBCSET:
-          mp = allocmust (mp);
+          mp = allocmust (mp, 2);
           break;
 
         case STAR:
@@ -4183,7 +4183,6 @@ dfamust (struct dfa *d)
           goto done;
 
         default:
-          mp = allocmust (mp);
           if (CSET <= t)
             {
               /* If T is a singleton, or if case-folding in a unibyte
@@ -4196,7 +4195,10 @@ dfamust (struct dfa *d)
                 if (tstbit (j, *ccl))
                   break;
               if (! (j < NOTCHAR))
-                break;
+                {
+                  mp = allocmust (mp, 2);
+                  break;
+                }
               t = j;
               while (++j < NOTCHAR)
                 if (tstbit (j, *ccl)
@@ -4204,12 +4206,38 @@ dfamust (struct dfa *d)
                           && toupper (j) == toupper (t)))
                   break;
               if (j < NOTCHAR)
-                break;
+                {
+                  mp = allocmust (mp, 2);
+                  break;
+                }
             }
+
+          rj = ri + 2;
+          if (d->tokens[ri + 1] == CAT)
+            {
+              for (; rj < d->tindex - 1; rj += 2)
+                if ((rj != ri && (d->tokens[rj] <= 0 || NOTCHAR <= d->tokens[rj]))
+                    || d->tokens[rj + 1] != CAT)
+                  break;
+            }
+          mp = allocmust (mp, ((rj - ri) >> 1) + 1);
           mp->is[0] = mp->left[0] = mp->right[0]
             = case_fold && !d->multibyte ? toupper (t) : t;
-          mp->is[1] = mp->left[1] = mp->right[1] = '\0';
-          mp->in = enlist (mp->in, mp->is, 1);
+          i = 1;
+          if (ri + 2 < rj)
+            {
+              do
+                {
+                  ri += 2;
+                  t = d->tokens[ri];
+                  mp->is[i] = mp->left[i] = mp->right[i]
+                    = case_fold && !d->multibyte ? toupper (t) : t;
+                  ++i;
+                }
+              while (ri + 2 < rj);
+            }
+          mp->is[i] = mp->left[i] = mp->right[i] = '\0';
+          mp->in = enlist (mp->in, mp->is, i);  /* I chars stored above.  */
           break;
         }
     }
-- 
2.1.3

