Norihiro Tanaka wrote:
I rebased this patch, and added four fixes to it.
Thanks. This patch seems like a real performance win for grep -i in the
usual case, so I installed it into the master. If we run into
performance problems in unusual cases I suppose we can look into them as
they arise.
In reviewing the patch I found one memory-access typo:
- if ((end = memchr(beg, eol, buflim - beg)) != NULL)
...
+ if ((end = memchr(next_beg, eol, buflim - beg)) != NULL)
That last "beg" should be "next_beg".
I merged the patch into the current master on Savannah (first attached
file) and then applied a fixup patch (second attached file) that fixes
this bug and makes some other minor related improvements, e.g., using
memrchr rather than looking for eol by hand.
From 37f96cdd238a2e4a8ef5442d26eb4f3d97062446 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sun, 6 Apr 2014 13:42:08 -0700
Subject: [PATCH 1/2] grep: optimization with the superset of DFA
The superset of a DFA is like the DFA, except that for speed
ANYCHAR, MBCSET and BACKREF are replaced by (CSET full bits) STAR,
and mb_cur_max is 1. For example, for 'a\(b\)c\1':
original: a b CAT c CAT BACKREF CAT
superset: a b CAT c CAT CSET STAR CAT (The CSET has all bits set.)
If a string matches a DFA, it matches the DFA's superset.
Using the superset to filter can dramatically improve performance,
over 200x in some cases. See <http://bugs.gnu.org/16966>.
* src/dfa.c (struct dfa): New member 'superset'.
(dfahint, dfasuperset): New functions.
(dfacomp): Create and analyze the superset.
(dfafree): Free only non-NULL items.
(dfaalloc): Initialize superset member.
(dfaoptimize): If succeed in optimization for UTF-8 locale, don't use
the superset.
* src/dfa.h (dfahint): New decl.
* src/dfasearch.c (EGexecute): Use dfahint.
---
src/dfa.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
src/dfa.h | 3 ++
src/dfasearch.c | 63 +++++++++++++++++++++------
3 files changed, 173 insertions(+), 25 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index ef5c8a9..d88d077 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -373,6 +373,9 @@ struct dfa
size_t nmbcsets;
size_t mbcsets_alloc;
+ /* Fields filled by the superset. */
+ struct dfa *superset; /* Hint of the dfa. */
+
/* Fields filled by the state builder. */
dfa_state *states; /* States of the dfa. */
state_num sindex; /* Index for adding new states. */
@@ -3488,6 +3491,21 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
}
+size_t
+dfahint (struct dfa *d, char const *begin, char *end, size_t *count)
+{
+ char const *match;
+
+ if (d->superset == NULL)
+ return (size_t) -2;
+
+ match = dfaexec (d->superset, begin, end, 1, count, NULL);
+ if (match == NULL)
+ return (size_t) -1;
+
+ return match - begin;
+}
+
static void
free_mbdata (struct dfa *d)
{
@@ -3579,6 +3597,81 @@ dfaoptimize (struct dfa *d)
d->mb_cur_max = 1;
}
+static void
+dfasuperset (struct dfa *d)
+{
+ size_t i, j;
+ charclass ccl;
+ bool have_achar = false;
+ bool have_nchar = false;
+
+ dfa = d->superset = dfaalloc ();
+ *d->superset = *d;
+ d->superset->mb_cur_max = 1;
+ d->superset->multibyte_prop = NULL;
+ d->superset->mbcsets = NULL;
+ d->superset->superset = NULL;
+ d->superset->states = NULL;
+ d->superset->follows = NULL;
+ d->superset->trans = NULL;
+ d->superset->realtrans = NULL;
+ d->superset->fails = NULL;
+ d->superset->success = NULL;
+ d->superset->newlines = NULL;
+ d->superset->musts = NULL;
+
+ MALLOC (d->superset->charclasses, d->superset->calloc);
+ for (i = 0; i < d->cindex; i++)
+ copyset (d->charclasses[i], d->superset->charclasses[i]);
+
+ d->superset->talloc = d->tindex * 2;
+ MALLOC (d->superset->tokens, d->superset->talloc);
+
+ for (i = j = 0; i < d->tindex; i++)
+ {
+ switch (d->tokens[i])
+ {
+ case ANYCHAR:
+ case MBCSET:
+ case BACKREF:
+ zeroset (ccl);
+ notset (ccl);
+ d->superset->tokens[j++] = CSET + charclass_index (ccl);
+ d->superset->tokens[j++] = STAR;
+ if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
+ || d->tokens[i + 1] == PLUS)
+ i++;
+ have_achar = true;
+ break;
+ case BEGWORD:
+ case ENDWORD:
+ case LIMWORD:
+ case NOTLIMWORD:
+ if (MB_CUR_MAX > 1)
+ {
+ /* Ignore these constraints. */
+ d->superset->tokens[j++] = EMPTY;
+ break;
+ }
+ default:
+ d->superset->tokens[j++] = d->tokens[i];
+ if ((d->tokens[i] >= 0 && d->tokens[i] < NOTCHAR)
+ || d->tokens[i] >= CSET)
+ have_nchar = true;
+ break;
+ }
+ }
+
+ if ((d->mb_cur_max == 1 && !have_achar) || !have_nchar)
+ {
+ dfafree (d->superset);
+ d->superset = NULL;
+ return;
+ }
+
+ d->superset->tindex = j;
+}
+
/* Parse and analyze a single string of the given length. */
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
@@ -3588,7 +3681,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int
searchflag)
dfaparse (s, len, d);
dfamust (d);
dfaoptimize (d);
+ dfasuperset (d);
dfaanalyze (d, searchflag);
+ if (d->superset != NULL)
+ dfaanalyze (d->superset, searchflag);
}
/* Free the storage held by the components of a dfa. */
@@ -3604,30 +3700,42 @@ dfafree (struct dfa *d)
if (d->mb_cur_max > 1)
free_mbdata (d);
- for (i = 0; i < d->sindex; ++i)
+ if (d->states != NULL)
{
- free (d->states[i].elems.elems);
- free (d->states[i].mbps.elems);
+ for (i = 0; i < d->sindex; ++i)
+ {
+ free (d->states[i].elems.elems);
+ free (d->states[i].mbps.elems);
+ }
+ free (d->states);
}
- free (d->states);
- for (i = 0; i < d->tindex; ++i)
- free (d->follows[i].elems);
- free (d->follows);
- for (i = 0; i < d->tralloc; ++i)
+ if (d->follows != NULL)
{
- free (d->trans[i]);
- free (d->fails[i]);
+ for (i = 0; i < d->tindex; ++i)
+ free (d->follows[i].elems);
+ free (d->follows);
}
+ if (d->trans != NULL)
+ for (i = 0; i < d->tralloc; ++i)
+ {
+ free (d->trans[i]);
+ free (d->fails[i]);
+ }
+
free (d->realtrans);
free (d->fails);
free (d->newlines);
free (d->success);
+
for (dm = d->musts; dm; dm = ndm)
{
ndm = dm->next;
free (dm->must);
free (dm);
}
+
+ if (d->superset != NULL)
+ dfafree (d->superset);
}
/* Having found the postfix representation of the regular expression,
@@ -4135,7 +4243,9 @@ done:
struct dfa *
dfaalloc (void)
{
- return xmalloc (sizeof (struct dfa));
+ struct dfa *d = xmalloc (sizeof (struct dfa));
+ d->superset = NULL;
+ return d;
}
struct dfamust *_GL_ATTRIBUTE_PURE
diff --git a/src/dfa.h b/src/dfa.h
index 24fbcbe..ad97498 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -67,6 +67,9 @@ extern void dfacomp (char const *, size_t, struct dfa *, int);
to decide whether to fall back on a backtracking matcher. */
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
int newline, size_t *count, int *backref);
+extern size_t dfahint (struct dfa *d, char const *begin, char *end,
+ size_t *count);
+
/* Free the storage held by the components of a struct dfa. */
extern void dfafree (struct dfa *);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index ca00505..383c2ad 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -236,7 +236,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
match = beg;
while (beg > buf && beg[-1] != eol)
--beg;
- char const *dfa_start = beg;
if (kwsm.index < kwset_exact_matches)
{
if (mb_start < beg)
@@ -248,29 +247,65 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
beginning of the next character. */
- dfa_start = mb_start;
+ if (dfaexec (dfa, mb_start, (char *) end, 0, NULL,
+ &backref) == NULL)
+ continue;
+ }
+ else
+ {
+ if (dfahint (dfa, beg, (char *) end, NULL) ==
+ (size_t) -1)
+ continue;
+ if (dfaexec (dfa, beg, (char *) end, 0, NULL,
+ &backref) == NULL)
+ continue;
}
- if (dfaexec (dfa, dfa_start, (char *) end, 0, NULL,
- &backref) == NULL)
- continue;
}
else
{
/* No good fixed strings; start with DFA. */
- char const *next_beg = dfaexec (dfa, beg, (char *) buflim,
- 0, NULL, &backref);
- /* If there's no match, or if we've matched the sentinel,
- we're done. */
- if (next_beg == NULL || next_beg == buflim)
- break;
+ size_t offset, count;
+ char const *next_beg;
+ count = 0;
+ offset = dfahint (dfa, beg, (char *) buflim, &count);
+ if (offset == (size_t) -1)
+ goto failure;
+ if (offset == (size_t) -2)
+ {
+ /* No use hint. */
+ next_beg = dfaexec (dfa, beg, (char *) buflim, 0,
+ NULL, &backref);
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == buflim)
+ goto failure;
+ }
+ else
+ next_beg = beg + offset;
/* Narrow down to the line we've found. */
beg = next_beg;
- if ((end = memchr(beg, eol, buflim - beg)) != NULL)
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ if (count > 0)
+ {
+ /* dfahint() may match in multiple lines. If that is
+ the case, try to match in one line. */
+ end = beg;
+ continue;
+ }
+ if ((end = memchr(next_beg, eol, buflim - beg)) != NULL)
end++;
else
end = buflim;
- while (beg > buf && beg[-1] != eol)
- --beg;
+ if (offset != (size_t) -2)
+ {
+ next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL,
+ &backref);
+ /* If there's no match, or if we've matched the sentinel,
+ we're done. */
+ if (next_beg == NULL || next_beg == end)
+ continue;
+ }
}
/* Successful, no backreferences encountered! */
if (!backref)
--
1.9.0
From 9554aac8117d43346402476e5df8d837a3e9a9f8 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Sun, 6 Apr 2014 21:54:59 -0700
Subject: [PATCH 2/2] grep: cleanup DFA superset optimization
* src/dfa.c (dfa_charclass_index): New function, with body of
old dfa_charclass but with an extra parameter D.
(charclass_index): Reimplement in terms of dfa_charclass_index.
(dfahint): Clarify.
(dfasuperset): Do not assign to 'dfa' static variable. Instead,
use a local, and use the new dfa_charclass_index function. This
doesn't fix any bugs, but it's clearer. Initialize a few more
members, to simplify dfafree. Copy the charclasses with
just one memcpy call. Don't assign nonnull to D->superset until
it's known to be valid; that's simpler.
(dfafree, dfaalloc): Simplify based on dfasuperset initializations.
* src/dfa.h (dfahint): Add comment.
* src/dfasearch.c (EGexecute): Simplify use of memchr.
Simplify by using memrchr. Fix typo that could cause a buffer
read overrun.
---
src/dfa.c | 155 +++++++++++++++++++++++++++++---------------------------
src/dfa.h | 10 +++-
src/dfasearch.c | 37 ++++++--------
3 files changed, 104 insertions(+), 98 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index d88d077..b6c1250 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -674,25 +674,31 @@ equal (charclass const s1, charclass const s2)
return memcmp (s1, s2, sizeof (charclass)) == 0;
}
-/* A pointer to the current dfa is kept here during parsing. */
-static struct dfa *dfa;
-
-/* Find the index of charclass s in dfa->charclasses, or allocate a
- new charclass. */
+/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-charclass_index (charclass const s)
+dfa_charclass_index (struct dfa *d, charclass const s)
{
size_t i;
- for (i = 0; i < dfa->cindex; ++i)
- if (equal (s, dfa->charclasses[i]))
+ for (i = 0; i < d->cindex; ++i)
+ if (equal (s, d->charclasses[i]))
return i;
- REALLOC_IF_NECESSARY (dfa->charclasses, dfa->calloc, dfa->cindex + 1);
- ++dfa->cindex;
- copyset (s, dfa->charclasses[i]);
+ REALLOC_IF_NECESSARY (d->charclasses, d->calloc, d->cindex + 1);
+ ++d->cindex;
+ copyset (s, d->charclasses[i]);
return i;
}
+/* A pointer to the current dfa is kept here during parsing. */
+static struct dfa *dfa;
+
+/* Find the index of charclass S in the current DFA, or allocate a new one. */
+static size_t
+charclass_index (charclass const s)
+{
+ return dfa_charclass_index (dfa, s);
+}
+
/* Syntax bits controlling the behavior of the lexical analyzer. */
static reg_syntax_t syntax_bits, syntax_bits_set;
@@ -3491,19 +3497,24 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
}
+/* Search through a buffer looking for a potential match for D.
+ Return the offset of the byte after the first potential match.
+ If there is no match, return (size_t) -1. If D lacks a superset
+ so it's not known whether there is a match, return (size_t) -2.
+ BEGIN points to the beginning of the buffer, and END points to the
+ first byte after its end. Store a sentinel byte (usually newline)
+ in *END, so the actual buffer must be one byte longer. If COUNT is
+ non-NULL, increment *COUNT once for each newline processed. */
size_t
dfahint (struct dfa *d, char const *begin, char *end, size_t *count)
{
- char const *match;
-
- if (d->superset == NULL)
- return (size_t) -2;
-
- match = dfaexec (d->superset, begin, end, 1, count, NULL);
- if (match == NULL)
- return (size_t) -1;
-
- return match - begin;
+ if (! d->superset)
+ return -2;
+ else
+ {
+ char const *match = dfaexec (d->superset, begin, end, 1, count, NULL);
+ return match ? match - begin : -1;
+ }
}
static void
@@ -3604,28 +3615,29 @@ dfasuperset (struct dfa *d)
charclass ccl;
bool have_achar = false;
bool have_nchar = false;
-
- dfa = d->superset = dfaalloc ();
- *d->superset = *d;
- d->superset->mb_cur_max = 1;
- d->superset->multibyte_prop = NULL;
- d->superset->mbcsets = NULL;
- d->superset->superset = NULL;
- d->superset->states = NULL;
- d->superset->follows = NULL;
- d->superset->trans = NULL;
- d->superset->realtrans = NULL;
- d->superset->fails = NULL;
- d->superset->success = NULL;
- d->superset->newlines = NULL;
- d->superset->musts = NULL;
-
- MALLOC (d->superset->charclasses, d->superset->calloc);
- for (i = 0; i < d->cindex; i++)
- copyset (d->charclasses[i], d->superset->charclasses[i]);
-
- d->superset->talloc = d->tindex * 2;
- MALLOC (d->superset->tokens, d->superset->talloc);
+ struct dfa *sup = dfaalloc ();
+
+ *sup = *d;
+ sup->mb_cur_max = 1;
+ sup->multibyte_prop = NULL;
+ sup->mbcsets = NULL;
+ sup->superset = NULL;
+ sup->states = NULL;
+ sup->sindex = 0;
+ sup->follows = NULL;
+ sup->tralloc = 0;
+ sup->realtrans = NULL;
+ sup->fails = NULL;
+ sup->success = NULL;
+ sup->newlines = NULL;
+ sup->musts = NULL;
+
+ MALLOC (sup->charclasses, sup->calloc);
+ memcpy (sup->charclasses, d->charclasses,
+ d->cindex * sizeof *sup->charclasses);
+
+ sup->talloc = d->tindex * 2;
+ MALLOC (sup->tokens, sup->talloc);
for (i = j = 0; i < d->tindex; i++)
{
@@ -3636,8 +3648,8 @@ dfasuperset (struct dfa *d)
case BACKREF:
zeroset (ccl);
notset (ccl);
- d->superset->tokens[j++] = CSET + charclass_index (ccl);
- d->superset->tokens[j++] = STAR;
+ sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = STAR;
if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
|| d->tokens[i + 1] == PLUS)
i++;
@@ -3650,26 +3662,23 @@ dfasuperset (struct dfa *d)
if (MB_CUR_MAX > 1)
{
/* Ignore these constraints. */
- d->superset->tokens[j++] = EMPTY;
+ sup->tokens[j++] = EMPTY;
break;
}
default:
- d->superset->tokens[j++] = d->tokens[i];
- if ((d->tokens[i] >= 0 && d->tokens[i] < NOTCHAR)
- || d->tokens[i] >= CSET)
+ sup->tokens[j++] = d->tokens[i];
+ if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
+ || d->tokens[i] >= CSET)
have_nchar = true;
break;
}
}
+ sup->tindex = j;
if ((d->mb_cur_max == 1 && !have_achar) || !have_nchar)
- {
- dfafree (d->superset);
- d->superset = NULL;
- return;
- }
-
- d->superset->tindex = j;
+ dfafree (sup);
+ else
+ d->superset = sup;
}
/* Parse and analyze a single string of the given length. */
@@ -3683,7 +3692,7 @@ dfacomp (char const *s, size_t len, struct dfa *d, int
searchflag)
dfaoptimize (d);
dfasuperset (d);
dfaanalyze (d, searchflag);
- if (d->superset != NULL)
+ if (d->superset)
dfaanalyze (d->superset, searchflag);
}
@@ -3700,27 +3709,25 @@ dfafree (struct dfa *d)
if (d->mb_cur_max > 1)
free_mbdata (d);
- if (d->states != NULL)
+ for (i = 0; i < d->sindex; ++i)
{
- for (i = 0; i < d->sindex; ++i)
- {
- free (d->states[i].elems.elems);
- free (d->states[i].mbps.elems);
- }
- free (d->states);
+ free (d->states[i].elems.elems);
+ free (d->states[i].mbps.elems);
}
- if (d->follows != NULL)
+ free (d->states);
+
+ if (d->follows)
{
for (i = 0; i < d->tindex; ++i)
free (d->follows[i].elems);
free (d->follows);
}
- if (d->trans != NULL)
- for (i = 0; i < d->tralloc; ++i)
- {
- free (d->trans[i]);
- free (d->fails[i]);
- }
+
+ for (i = 0; i < d->tralloc; ++i)
+ {
+ free (d->trans[i]);
+ free (d->fails[i]);
+ }
free (d->realtrans);
free (d->fails);
@@ -3734,7 +3741,7 @@ dfafree (struct dfa *d)
free (dm);
}
- if (d->superset != NULL)
+ if (d->superset)
dfafree (d->superset);
}
@@ -4243,9 +4250,7 @@ done:
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xmalloc (sizeof (struct dfa));
- d->superset = NULL;
- return d;
+ return xmalloc (sizeof (struct dfa));
}
struct dfamust *_GL_ATTRIBUTE_PURE
diff --git a/src/dfa.h b/src/dfa.h
index ad97498..6ed2231 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -67,10 +67,18 @@ extern void dfacomp (char const *, size_t, struct dfa *,
int);
to decide whether to fall back on a backtracking matcher. */
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
int newline, size_t *count, int *backref);
+
+/* Search through a buffer looking for a potential match for D.
+ Return the offset of the byte after the first potential match.
+ If there is no match, return (size_t) -1. If D lacks a superset
+ so it's not known whether there is a match, return (size_t) -2.
+ BEGIN points to the beginning of the buffer, and END points to the
+ first byte after its end. Store a sentinel byte (usually newline)
+ in *END, so the actual buffer must be one byte longer. If COUNT is
+ non-NULL, increment *COUNT once for each newline processed. */
extern size_t dfahint (struct dfa *d, char const *begin, char *end,
size_t *count);
-
/* Free the storage held by the components of a struct dfa. */
extern void dfafree (struct dfa *);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 383c2ad..adec4e2 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -229,13 +229,11 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
beg += offset;
/* Narrow down to the line containing the candidate, and
run it through DFA. */
- if ((end = memchr(beg, eol, buflim - beg)) != NULL)
- end++;
- else
- end = buflim;
+ end = memchr (beg, eol, buflim - beg);
+ end = end ? end + 1 : buflim;
match = beg;
- while (beg > buf && beg[-1] != eol)
- --beg;
+ beg = memrchr (buf, eol, beg - buf);
+ beg = beg ? beg + 1 : buf;
if (kwsm.index < kwset_exact_matches)
{
if (mb_start < beg)
@@ -247,17 +245,15 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
beginning of the next character. */
- if (dfaexec (dfa, mb_start, (char *) end, 0, NULL,
- &backref) == NULL)
+ if (! dfaexec (dfa, mb_start, (char *) end, 0, NULL,
+ &backref))
continue;
}
else
{
- if (dfahint (dfa, beg, (char *) end, NULL) ==
- (size_t) -1)
+ if (dfahint (dfa, beg, (char *) end, NULL) == (size_t) -1)
continue;
- if (dfaexec (dfa, beg, (char *) end, 0, NULL,
- &backref) == NULL)
+ if (! dfaexec (dfa, beg, (char *) end, 0, NULL, &backref))
continue;
}
}
@@ -283,20 +279,17 @@ EGexecute (char const *buf, size_t size, size_t
*match_size,
else
next_beg = beg + offset;
/* Narrow down to the line we've found. */
- beg = next_beg;
- while (beg > buf && beg[-1] != eol)
- --beg;
- if (count > 0)
+ beg = memrchr (buf, eol, next_beg - buf);
+ beg = beg ? beg + 1 : buf;
+ if (count != 0)
{
- /* dfahint() may match in multiple lines. If that is
- the case, try to match in one line. */
+ /* If dfahint may match in multiple lines, try to
+ match in one line. */
end = beg;
continue;
}
- if ((end = memchr(next_beg, eol, buflim - beg)) != NULL)
- end++;
- else
- end = buflim;
+ end = memchr (next_beg, eol, buflim - next_beg);
+ end = end ? end + 1 : buflim;
if (offset != (size_t) -2)
{
next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL,
--
1.9.0