I rebased this patch because of confliction.
From 10960a2940ff49eb5a7584e9da9ffe342e34d948 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Mon, 31 Mar 2014 21:51:48 +0900
Subject: [PATCH] grep: speed-up for exact matching with begline and endline
constraints.
dfamust turns on the flag when a state exactly matches the proposed one.
However, when the state has begline and/or endline constraints, turns
off it.
This patch enables to match a state exactly, even if the state has
begline and/or endline constraints. If a exact string has one of their
constrations, the string adding eolbyte to a head and/or foot is pushed
to kwsincr(). In addition, if it has begline constration, start
searching from just before the position of the text.
* src/dfa.c (variable must): New members `begline' and `endline'.
(dfamust): Consideration of begline and endline constrations.
* src/dfa.h (struct dfamust): New members `begline' and `endline'.
* src/dfasearch.c (kwsmusts): If a exact string has begline constration,
start searching from just before the position of the text.
(EGexecute): Same as above.
* src/kwsearch.c (Fexecute): Same as above.
---
src/dfa.c | 44 ++++++++++++++++++++++++++++++++++++++------
src/dfa.h | 2 ++
src/dfasearch.c | 29 ++++++++++++++++++++++++-----
src/kwsearch.c | 28 ++++++++++++++++------------
4 files changed, 80 insertions(+), 23 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index eeca257..e5bcdb6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3827,6 +3827,8 @@ typedef struct
char *left;
char *right;
char *is;
+ bool begline;
+ bool endline;
} must;
static void
@@ -3846,6 +3848,8 @@ dfamust (struct dfa *d)
size_t ri;
size_t i;
bool exact = false;
+ bool begline = false;
+ bool endline = false;
struct dfamust *dm;
for (i = 0; i <= d->tindex; ++i)
@@ -3854,6 +3858,8 @@ dfamust (struct dfa *d)
mp[i].left = xzalloc (2);
mp[i].right = xzalloc (2);
mp[i].is = xzalloc (2);
+ mp[i].begline = false;
+ mp[i].endline = false;
}
#ifdef DEBUG
fprintf (stderr, "dfamust:\n");
@@ -3869,6 +3875,14 @@ dfamust (struct dfa *d)
token t = d->tokens[ri];
switch (t)
{
+ case BEGLINE:
+ resetmust (mp);
+ mp->begline = true;
+ break;
+ case ENDLINE:
+ resetmust (mp);
+ mp->endline = true;
+ break;
case LPAREN:
case RPAREN:
assert (!"neither LPAREN nor RPAREN may appear here");
@@ -3879,8 +3893,6 @@ dfamust (struct dfa *d)
--mp;
/* Fall through. */
case EMPTY:
- case BEGLINE:
- case ENDLINE:
case BEGWORD:
case ENDWORD:
case LIMWORD:
@@ -3904,6 +3916,10 @@ dfamust (struct dfa *d)
/* Guaranteed to be. Unlikely, but ... */
if (!STREQ (lmp->is, rmp->is))
lmp->is[0] = '\0';
+ if (lmp->begline)
+ lmp->begline = rmp->begline;
+ if (lmp->endline)
+ lmp->endline = rmp->endline;
/* Left side--easy */
i = 0;
while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
@@ -3940,7 +3956,11 @@ dfamust (struct dfa *d)
if (strlen (musts[0].in[i]) > strlen (result))
result = musts[0].in[i];
if (STREQ (result, musts[0].is))
- exact = true;
+ {
+ exact = true;
+ begline = musts[0].begline;
+ endline = musts[0].endline;
+ }
goto done;
case CAT:
@@ -3973,10 +3993,20 @@ dfamust (struct dfa *d)
lmp->right[0] = '\0';
lmp->right = icatalloc (lmp->right, rmp->right);
/* Guaranteed to be */
- if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
- lmp->is = icatalloc (lmp->is, rmp->is);
+ if ((lmp->is[0] != '\0' || lmp->begline)
+ && (rmp->is[0] != '\0' || rmp->endline))
+ {
+ lmp->is = icatalloc (lmp->is, rmp->is);
+ if (lmp->is)
+ goto done;
+ lmp->endline = rmp->endline;
+ }
else
- lmp->is[0] = '\0';
+ {
+ lmp->is[0] = '\0';
+ lmp->begline = false;
+ lmp->endline = false;
+ }
}
break;
@@ -4031,6 +4061,8 @@ done:
{
dm = xmalloc (sizeof *dm);
dm->exact = exact;
+ dm->begline = begline;
+ dm->endline = endline;
dm->must = xstrdup (result);
dm->next = d->musts;
d->musts = dm;
diff --git a/src/dfa.h b/src/dfa.h
index db29a62..3829e6a 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -26,6 +26,8 @@
struct dfamust
{
int exact;
+ int begline;
+ int endline;
char *must;
struct dfamust *next;
};
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 1266c80..34c4505 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -51,6 +51,8 @@ static size_t pcount;
call the regexp matcher at all. */
static size_t kwset_exact_matches;
+static int begline;
+
void
dfaerror (char const *mesg)
{
@@ -85,15 +87,30 @@ kwsmusts (void)
if (dm)
{
kwsinit (&kwset);
+ begline = 0;
/* First, we compile in the substrings known to be exact
matches. The kwset matcher will return the index
of the matching string that it chooses. */
for (; dm; dm = dm->next)
{
+ char *must, *mp;
+ size_t old_len, new_len;
if (!dm->exact)
continue;
++kwset_exact_matches;
- kwsincr (kwset, dm->must, strlen (dm->must));
+ old_len = strlen (dm->must);
+ new_len = old_len + dm->begline + dm->endline;
+ must = mp = xmalloc (new_len);
+ if (dm->begline)
+ {
+ (mp++)[0] = eolbyte;
+ begline = 1;
+ }
+ memcpy (mp, dm->must, old_len);
+ if (dm->endline)
+ mp[old_len] = eolbyte;
+ kwsincr (kwset, must, new_len);
+ free (must);
}
/* Now, we compile the substrings that will require
the use of the regexp matcher. */
@@ -212,7 +229,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
if (kwset)
{
/* Find a possible match using the KWset matcher. */
- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+ size_t offset = kwsexec (kwset, beg - begline,
+ buflim - beg + begline, &kwsm);
if (offset == (size_t) -1)
goto failure;
beg += offset;
@@ -225,11 +243,12 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
beg = beg ? beg + 1 : buf;
if (kwsm.index < kwset_exact_matches)
{
+ if (MB_CUR_MAX == 1)
+ goto success;
if (mb_start < beg)
mb_start = beg;
- if (MB_CUR_MAX == 1
- || !is_mb_middle (&mb_start, match, buflim,
- kwsm.size[0]))
+ if (!is_mb_middle (&mb_start, match, buflim,
+ kwsm.size[0] - begline))
goto success;
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5f3f233..9904bfe 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -57,7 +57,16 @@ Fcompile (char const *pattern, size_t size)
total = 0;
}
- kwsincr (kwset, p, len);
+ if (match_lines)
+ {
+ char *buf = xmalloc (len + 2);
+ memcpy (&buf[1], p, len);
+ buf[0] = buf[len + 1] = eolbyte;
+ kwsincr (kwset, buf, len + 2);
+ free (buf);
+ }
+ else
+ kwsincr (kwset, p, len);
p = sep;
}
while (p);
@@ -109,11 +118,12 @@ Fexecute (char const *buf, size_t size, size_t
*match_size,
for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
{
- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+ size_t offset = kwsexec (kwset, beg - match_lines,
+ buf + size - beg + match_lines, &kwsmatch);
if (offset == (size_t) -1)
goto failure;
- len = kwsmatch.size[0];
- if (MB_CUR_MAX > 1
+ len = kwsmatch.size[0] - match_lines;
+ if (MB_CUR_MAX > 1 && !match_lines
&& is_mb_middle (&mb_start, beg + offset, buf + size, len))
{
/* The match was a part of multibyte character, advance at least
@@ -130,14 +140,8 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
if (start_ptr && !match_words)
goto success_in_beg_and_len;
if (match_lines)
- {
- if (beg > buf && beg[-1] != eol)
- continue;
- if (beg + len < buf + size && beg[len] != eol)
- continue;
- goto success;
- }
- else if (match_words)
+ goto success_in_beg_and_len;
+ if (match_words)
for (try = beg; ; )
{
if (try > buf && WCHAR(to_uchar (try[-1])))
--
1.9.2