dfamust turns on the flag when a state exactly matches the proposed one.
However, when the state has begline and/or endline constraints, turns
off it.

This patch enables to match a state exactly, even if the state has
begline and/or endline constraints.  If a exact string has one of their
constrations, the string adding eolbyte to a head and/or foot is pushed
to kwsincr().  In addition, if it has begline constration, start
searching from just before the position of the text.

Norihiro
From 642d6fed4310b3e4ee9569473b03e10dbee0e523 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 31 Mar 2014 21:51:48 +0900
Subject: [PATCH] grep: speed-up for exact matching with begline and endline
 constraints

dfamust turns on the flag when a state exactly matches the proposed one.
However, when the state has begline and/or endline constraints, turns
off it.

This patch enables to match a state exactly, even if the state has
begline and/or endline constraints.  If a exact string has one of their
constrations, the string adding eolbyte to a head and/or foot is pushed
to kwsincr().  In addition, if it has begline constration, start
searching from just before the position of the text.

* src/dfa.c (variable must): New members `begline' and `endline'.
(dfamust): Consideration of begline and endline constrations.
* src/dfa.h (struct dfamust): New members `begline' and `endline'.
* src/dfasearch.c (kwsmusts): If a exact string has begline constration,
start searching from just before the position of the text.
(EGexecute): Same as above.
* src/kwsearch.c (Fexecute): Same as above.
---
 src/dfa.c       | 40 +++++++++++++++++++++++++++++++++++-----
 src/dfa.h       |  2 ++
 src/dfasearch.c | 29 ++++++++++++++++++++++++-----
 src/kwsearch.c  | 32 +++++++++++++++++++-------------
 4 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index ef5c8a9..c4a3194 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3902,6 +3902,8 @@ typedef struct
   char *left;
   char *right;
   char *is;
+  int begline;
+  int endline;
 } must;
 
 static void
@@ -3920,6 +3922,8 @@ dfamust (struct dfa *d)
   size_t ri;
   size_t i;
   int exact;
+  int begline;
+  int endline;
   token t;
   static must must0;
   struct dfamust *dm;
@@ -3927,6 +3931,8 @@ dfamust (struct dfa *d)
 
   result = empty_string;
   exact = 0;
+  begline = 0;
+  endline = 0;
   MALLOC (musts, d->tindex + 1);
   mp = musts;
   for (i = 0; i <= d->tindex; ++i)
@@ -3939,6 +3945,8 @@ dfamust (struct dfa *d)
       mp[i].is = xmalloc (2);
       mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
       mp[i].in[0] = NULL;
+      mp[i].begline = 0;
+      mp[i].endline = 0;
     }
 #ifdef DEBUG
   fprintf (stderr, "dfamust:\n");
@@ -3953,12 +3961,18 @@ dfamust (struct dfa *d)
     {
       switch (t = d->tokens[ri])
         {
+        case BEGLINE:
+          resetmust (mp);
+          mp->begline = 1;
+          break;
+        case ENDLINE:
+          resetmust (mp);
+          mp->endline = 1;
+          break;
         case LPAREN:
         case RPAREN:
           assert (!"neither LPAREN nor RPAREN may appear here");
         case EMPTY:
-        case BEGLINE:
-        case ENDLINE:
         case BEGWORD:
         case ENDWORD:
         case LIMWORD:
@@ -3985,6 +3999,10 @@ dfamust (struct dfa *d)
             /* Guaranteed to be.  Unlikely, but ...  */
             if (!STREQ (lmp->is, rmp->is))
               lmp->is[0] = '\0';
+            if (lmp->begline)
+              lmp->begline = rmp->begline;
+            if (lmp->endline)
+              lmp->endline = rmp->endline;
             /* Left side--easy */
             i = 0;
             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
@@ -4021,7 +4039,11 @@ dfamust (struct dfa *d)
             if (strlen (musts[0].in[i]) > strlen (result))
               result = musts[0].in[i];
           if (STREQ (result, musts[0].is))
-            exact = 1;
+            {
+              exact = 1;
+              begline = musts[0].begline;
+              endline = musts[0].endline;
+            }
           goto done;
         case CAT:
           assert (&musts[2] <= mp);
@@ -4062,14 +4084,20 @@ dfamust (struct dfa *d)
             if (lmp->right == NULL)
               goto done;
             /* Guaranteed to be */
-            if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
+            if ((lmp->is[0] != '\0' || lmp->begline)
+                && (rmp->is[0] != '\0' || rmp->endline))
               {
                 lmp->is = icatalloc (lmp->is, rmp->is);
                 if (lmp->is == NULL)
                   goto done;
+                lmp->endline = rmp->endline;
               }
             else
-              lmp->is[0] = '\0';
+              {
+                lmp->is[0] = '\0';
+                lmp->begline = 0;
+                lmp->endline = 0;
+              }
           }
           break;
         default:
@@ -4116,6 +4144,8 @@ done:
     {
       MALLOC (dm, 1);
       dm->exact = exact;
+      dm->begline = begline;
+      dm->endline = endline;
       dm->must = xmemdup (result, strlen (result) + 1);
       dm->next = d->musts;
       d->musts = dm;
diff --git a/src/dfa.h b/src/dfa.h
index 24fbcbe..7aa1b9a 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -26,6 +26,8 @@
 struct dfamust
 {
   int exact;
+  int begline;
+  int endline;
   char *must;
   struct dfamust *next;
 };
diff --git a/src/dfasearch.c b/src/dfasearch.c
index ca00505..f0ab4a8 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -51,6 +51,8 @@ static size_t pcount;
    call the regexp matcher at all. */
 static size_t kwset_exact_matches;
 
+static int begline;
+
 void
 dfaerror (char const *mesg)
 {
@@ -92,16 +94,31 @@ kwsmusts (void)
     {
       char const *err;
       kwsinit (&kwset);
+      begline = 0;
       /* First, we compile in the substrings known to be exact
          matches.  The kwset matcher will return the index
          of the matching string that it chooses. */
       for (; dm; dm = dm->next)
         {
+          char *must, *mp;
+          size_t old_len, new_len;
           if (!dm->exact)
             continue;
           ++kwset_exact_matches;
-          if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
+          old_len = strlen (dm->must);
+          new_len = old_len + dm->begline + dm->endline;
+          must = mp = xmalloc (new_len);
+          if (dm->begline)
+            {
+              (mp++)[0] = eolbyte;
+              begline = 1;
+            }
+          memcpy (mp, dm->must, old_len);
+          if (dm->endline)
+            mp[old_len] = eolbyte;
+          if ((err = kwsincr (kwset, must, new_len)) != NULL)
             error (EXIT_TROUBLE, 0, "%s", err);
+          free (must);
         }
       /* Now, we compile the substrings that will require
          the use of the regexp matcher.  */
@@ -223,7 +240,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
           if (kwset)
             {
               /* Find a possible match using the KWset matcher. */
-              size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+              size_t offset = kwsexec (kwset, beg - begline,
+                                       buflim - beg + begline, &kwsm);
               if (offset == (size_t) -1)
                 goto failure;
               beg += offset;
@@ -239,11 +257,12 @@ EGexecute (char const *buf, size_t size, size_t 
*match_size,
               char const *dfa_start = beg;
               if (kwsm.index < kwset_exact_matches)
                 {
+                  if (MB_CUR_MAX == 1)
+                    goto success;
                   if (mb_start < beg)
                     mb_start = beg;
-                  if (MB_CUR_MAX == 1
-                      || !is_mb_middle (&mb_start, match, buflim,
-                                        kwsm.size[0]))
+                  if (!is_mb_middle (&mb_start, match, buflim,
+                                     kwsm.size[0] - begline))
                     goto success;
                   /* The matched line starts in the middle of a multibyte
                      character.  Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index dd01518..1e471d0 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -65,8 +65,20 @@ Fcompile (char const *pattern, size_t size)
 #endif
         }
 
-      if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
-        error (EXIT_TROUBLE, 0, "%s", err);
+      if (match_lines)
+        {
+          char *new_beg = xmalloc (end - beg + 2);
+          new_beg[0] = new_beg[end - beg + 1] = eolbyte;
+          memcpy (&new_beg[1], beg, end - beg);
+          if ((err = kwsincr (kwset, new_beg, end - beg + 2)) != NULL)
+            error (EXIT_TROUBLE, 0, "%s", err);
+          free (new_beg);
+        }
+      else
+        {
+          if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
+            error (EXIT_TROUBLE, 0, "%s", err);
+        }
       beg = lim;
     }
   while (beg < pat + psize);
@@ -119,11 +131,11 @@ Fexecute (char const *buf, size_t size, size_t 
*match_size,
 
   for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
     {
-      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+      size_t offset = kwsexec (kwset, beg - match_lines, buf + size - beg + 
match_lines, &kwsmatch);
       if (offset == (size_t) -1)
         goto failure;
-      len = kwsmatch.size[0];
-      if (MB_CUR_MAX > 1
+      len = kwsmatch.size[0] - match_lines;
+      if (MB_CUR_MAX > 1 && !match_lines
           && is_mb_middle (&mb_start, beg + offset, buf + size, len))
         {
           /* The match was a part of multibyte character, advance at least
@@ -140,14 +152,8 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       if (start_ptr && !match_words)
         goto success_in_beg_and_len;
       if (match_lines)
-        {
-          if (beg > buf && beg[-1] != eol)
-            continue;
-          if (beg + len < buf + size && beg[len] != eol)
-            continue;
-          goto success;
-        }
-      else if (match_words)
+        goto success_in_beg_and_len;
+      if (match_words)
         for (try = beg; ; )
           {
             if (try > buf && WCHAR(to_uchar (try[-1])))
-- 
1.9.1

Reply via email to