I rebased this patch because of confliction.
From 10960a2940ff49eb5a7584e9da9ffe342e34d948 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 31 Mar 2014 21:51:48 +0900
Subject: [PATCH] grep: speed-up for exact matching with begline and endline
 constraints.

dfamust turns on the flag when a state exactly matches the proposed one.
However, when the state has begline and/or endline constraints, turns
off it.

This patch enables to match a state exactly, even if the state has
begline and/or endline constraints.  If a exact string has one of their
constrations, the string adding eolbyte to a head and/or foot is pushed
to kwsincr().  In addition, if it has begline constration, start
searching from just before the position of the text.

* src/dfa.c (variable must): New members `begline' and `endline'.
(dfamust): Consideration of begline and endline constrations.
* src/dfa.h (struct dfamust): New members `begline' and `endline'.
* src/dfasearch.c (kwsmusts): If a exact string has begline constration,
start searching from just before the position of the text.
(EGexecute): Same as above.
* src/kwsearch.c (Fexecute): Same as above.
---
 src/dfa.c       | 44 ++++++++++++++++++++++++++++++++++++++------
 src/dfa.h       |  2 ++
 src/dfasearch.c | 29 ++++++++++++++++++++++++-----
 src/kwsearch.c  | 28 ++++++++++++++++------------
 4 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index eeca257..e5bcdb6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3827,6 +3827,8 @@ typedef struct
   char *left;
   char *right;
   char *is;
+  bool begline;
+  bool endline;
 } must;
 
 static void
@@ -3846,6 +3848,8 @@ dfamust (struct dfa *d)
   size_t ri;
   size_t i;
   bool exact = false;
+  bool begline = false;
+  bool endline = false;
   struct dfamust *dm;
 
   for (i = 0; i <= d->tindex; ++i)
@@ -3854,6 +3858,8 @@ dfamust (struct dfa *d)
       mp[i].left = xzalloc (2);
       mp[i].right = xzalloc (2);
       mp[i].is = xzalloc (2);
+      mp[i].begline = false;
+      mp[i].endline = false;
     }
 #ifdef DEBUG
   fprintf (stderr, "dfamust:\n");
@@ -3869,6 +3875,14 @@ dfamust (struct dfa *d)
       token t = d->tokens[ri];
       switch (t)
         {
+        case BEGLINE:
+          resetmust (mp);
+          mp->begline = true;
+          break;
+        case ENDLINE:
+          resetmust (mp);
+          mp->endline = true;
+          break;
         case LPAREN:
         case RPAREN:
           assert (!"neither LPAREN nor RPAREN may appear here");
@@ -3879,8 +3893,6 @@ dfamust (struct dfa *d)
           --mp;
           /* Fall through.  */
         case EMPTY:
-        case BEGLINE:
-        case ENDLINE:
         case BEGWORD:
         case ENDWORD:
         case LIMWORD:
@@ -3904,6 +3916,10 @@ dfamust (struct dfa *d)
             /* Guaranteed to be.  Unlikely, but ...  */
             if (!STREQ (lmp->is, rmp->is))
               lmp->is[0] = '\0';
+            if (lmp->begline)
+              lmp->begline = rmp->begline;
+            if (lmp->endline)
+              lmp->endline = rmp->endline;
             /* Left side--easy */
             i = 0;
             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
@@ -3940,7 +3956,11 @@ dfamust (struct dfa *d)
             if (strlen (musts[0].in[i]) > strlen (result))
               result = musts[0].in[i];
           if (STREQ (result, musts[0].is))
-            exact = true;
+            {
+              exact = true;
+              begline = musts[0].begline;
+              endline = musts[0].endline;
+            }
           goto done;
 
         case CAT:
@@ -3973,10 +3993,20 @@ dfamust (struct dfa *d)
               lmp->right[0] = '\0';
             lmp->right = icatalloc (lmp->right, rmp->right);
             /* Guaranteed to be */
-            if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
-              lmp->is = icatalloc (lmp->is, rmp->is);
+            if ((lmp->is[0] != '\0' || lmp->begline)
+                && (rmp->is[0] != '\0' || rmp->endline))
+              {
+                lmp->is = icatalloc (lmp->is, rmp->is);
+                if (lmp->is)
+                  goto done;
+                lmp->endline = rmp->endline;
+              }
             else
-              lmp->is[0] = '\0';
+              {
+                lmp->is[0] = '\0';
+                lmp->begline = false;
+                lmp->endline = false;
+              }
           }
           break;
 
@@ -4031,6 +4061,8 @@ done:
     {
       dm = xmalloc (sizeof *dm);
       dm->exact = exact;
+      dm->begline = begline;
+      dm->endline = endline;
       dm->must = xstrdup (result);
       dm->next = d->musts;
       d->musts = dm;
diff --git a/src/dfa.h b/src/dfa.h
index db29a62..3829e6a 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -26,6 +26,8 @@
 struct dfamust
 {
   int exact;
+  int begline;
+  int endline;
   char *must;
   struct dfamust *next;
 };
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 1266c80..34c4505 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -51,6 +51,8 @@ static size_t pcount;
    call the regexp matcher at all. */
 static size_t kwset_exact_matches;
 
+static int begline;
+
 void
 dfaerror (char const *mesg)
 {
@@ -85,15 +87,30 @@ kwsmusts (void)
   if (dm)
     {
       kwsinit (&kwset);
+      begline = 0;
       /* First, we compile in the substrings known to be exact
          matches.  The kwset matcher will return the index
          of the matching string that it chooses. */
       for (; dm; dm = dm->next)
         {
+          char *must, *mp;
+          size_t old_len, new_len;
           if (!dm->exact)
             continue;
           ++kwset_exact_matches;
-          kwsincr (kwset, dm->must, strlen (dm->must));
+          old_len = strlen (dm->must);
+          new_len = old_len + dm->begline + dm->endline;
+          must = mp = xmalloc (new_len);
+          if (dm->begline)
+            {
+              (mp++)[0] = eolbyte;
+              begline = 1;
+            }
+          memcpy (mp, dm->must, old_len);
+          if (dm->endline)
+            mp[old_len] = eolbyte;
+          kwsincr (kwset, must, new_len);
+          free (must);
         }
       /* Now, we compile the substrings that will require
          the use of the regexp matcher.  */
@@ -212,7 +229,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
           if (kwset)
             {
               /* Find a possible match using the KWset matcher. */
-              size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+              size_t offset = kwsexec (kwset, beg - begline,
+                                       buflim - beg + begline, &kwsm);
               if (offset == (size_t) -1)
                 goto failure;
               beg += offset;
@@ -225,11 +243,12 @@ EGexecute (char const *buf, size_t size, size_t 
*match_size,
               beg = beg ? beg + 1 : buf;
               if (kwsm.index < kwset_exact_matches)
                 {
+                  if (MB_CUR_MAX == 1)
+                    goto success;
                   if (mb_start < beg)
                     mb_start = beg;
-                  if (MB_CUR_MAX == 1
-                      || !is_mb_middle (&mb_start, match, buflim,
-                                        kwsm.size[0]))
+                  if (!is_mb_middle (&mb_start, match, buflim,
+                                     kwsm.size[0] - begline))
                     goto success;
                   /* The matched line starts in the middle of a multibyte
                      character.  Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5f3f233..9904bfe 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -57,7 +57,16 @@ Fcompile (char const *pattern, size_t size)
           total = 0;
         }
 
-      kwsincr (kwset, p, len);
+      if (match_lines)
+        {
+          char *buf = xmalloc (len + 2);
+          memcpy (&buf[1], p, len);
+          buf[0] = buf[len + 1] = eolbyte;
+          kwsincr (kwset, buf, len + 2);
+          free (buf);
+        }
+      else
+        kwsincr (kwset, p, len);
       p = sep;
     }
   while (p);
@@ -109,11 +118,12 @@ Fexecute (char const *buf, size_t size, size_t 
*match_size,
 
   for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
     {
-      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+      size_t offset = kwsexec (kwset, beg - match_lines,
+                               buf + size - beg + match_lines, &kwsmatch);
       if (offset == (size_t) -1)
         goto failure;
-      len = kwsmatch.size[0];
-      if (MB_CUR_MAX > 1
+      len = kwsmatch.size[0] - match_lines;
+      if (MB_CUR_MAX > 1 && !match_lines
           && is_mb_middle (&mb_start, beg + offset, buf + size, len))
         {
           /* The match was a part of multibyte character, advance at least
@@ -130,14 +140,8 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       if (start_ptr && !match_words)
         goto success_in_beg_and_len;
       if (match_lines)
-        {
-          if (beg > buf && beg[-1] != eol)
-            continue;
-          if (beg + len < buf + size && beg[len] != eol)
-            continue;
-          goto success;
-        }
-      else if (match_words)
+        goto success_in_beg_and_len;
+      if (match_words)
         for (try = beg; ; )
           {
             if (try > buf && WCHAR(to_uchar (try[-1])))
-- 
1.9.2

Reply via email to