Sorry, I found and fixed a bug in the previous patch. After the skip, the
context should be determined from the previous wide character, not the next one.
From 2e384f68e0bf0ce29b8c801bc6f0479319c14678 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 11 Oct 2014 11:38:09 +0900
Subject: [PATCH] dfa: treat a multibyte character even with constraints
correctly
* src/dfa.c (struct dfa): Add new members `min_trcount',
`initstate_letter' and `initstate_others'.
(dfaanalyze): Build initial states not only for the newline context but also for the other contexts.
(build_state): Don't release initial states.
(skip_remains_mb): Add an argument `wcp'. If it is non-null, the last
wide character decoded is stored through it.
(dfaexec_main): If there are multiple initial states, transit to the
appropriate one after skipping from the middle of a multibyte character.
* tests/euc-mb: Add a new test.
* NEWS (Bug fixes): Mention it.
---
NEWS | 4 ++++
src/dfa.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++-------------
tests/euc-mb | 1 +
3 files changed, 60 insertions(+), 15 deletions(-)
diff --git a/NEWS b/NEWS
index 07a5d54..f454068 100644
--- a/NEWS
+++ b/NEWS
@@ -38,6 +38,10 @@ GNU grep NEWS -*- outline
-*-
implying that the match, "10" was on line 1.
[bug introduced in grep-2.19]
+ grep would mistakenly match in the middle of a multibyte character when
+ using '^' in a pattern in non-UTF8 multibyte locales, leading it to
+ print lines that did not match.
+
grep -E rejected unmatched ')', instead of treating it like '\)'.
[bug present since "the beginning"]
diff --git a/src/dfa.c b/src/dfa.c
index 58a4b83..9df9736 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -405,6 +405,10 @@ struct dfa
slots so far, not counting trans[-1]. */
int trcount; /* Number of transition tables that have
actually been built. */
+ int min_trcount; /* Minimum number of transition tables.
+ This many tables are always kept, even when
+ transition tables are released. It is also the number of
+ initial states. */
state_num **trans; /* Transition tables for states that can
never accept. If the transitions for a
state have not yet been computed, or the
@@ -423,6 +427,8 @@ struct dfa
newline is stored separately and handled
as a special case. Newline is also used
as a sentinel at the end of the buffer. */
+ state_num initstate_letter; /* Initial state for letter context. */
+ state_num initstate_others; /* Initial state for other contexts. */
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
@@ -2517,9 +2523,16 @@ dfaanalyze (struct dfa *d, int searchflag)
/* Build the initial state. */
separate_contexts = state_separate_contexts (&merged);
- state_index (d, &merged,
- (separate_contexts & CTX_NEWLINE
- ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
+ if (separate_contexts & CTX_NEWLINE)
+ state_index (d, &merged, CTX_NEWLINE);
+ d->initstate_others = d->min_trcount
+ = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+ if (separate_contexts & CTX_LETTER)
+ d->initstate_letter = d->min_trcount
+ = state_index (d, &merged, CTX_LETTER);
+ else
+ d->initstate_letter = d->initstate_others;
+ d->min_trcount++;
free (posalloc);
free (stkalloc);
@@ -2859,13 +2872,13 @@ build_state (state_num s, struct dfa *d)
not clear the initial state, as it's always used. */
if (d->trcount >= 1024)
{
- for (i = 1; i < d->tralloc; ++i)
+ for (i = d->min_trcount; i < d->tralloc; ++i)
{
free (d->trans[i]);
free (d->fails[i]);
d->trans[i] = d->fails[i] = NULL;
}
- d->trcount = 1;
+ d->trcount = d->min_trcount;
}
++d->trcount;
@@ -3249,12 +3262,14 @@ transit_state (struct dfa *d, state_num s, unsigned
char const **pp,
character. */
static unsigned char const *
skip_remains_mb (struct dfa *d, unsigned char const *p,
- unsigned char const *mbp, char const *end)
+ unsigned char const *mbp, char const *end, wint_t *wcp)
{
- wint_t wc;
+ wint_t wc = WEOF;
while (mbp < p)
mbp += mbs_to_wchar (&wc, (char const *) mbp,
end - (char const *) mbp, d);
+ if (wcp != NULL)
+ *wcp = wc;
return mbp;
}
@@ -3316,20 +3331,45 @@ dfaexec_main (struct dfa *d, char const *begin, char
*end,
{
s1 = s;
- if (s == 0)
+ if (s < d->min_trcount)
{
- if (d->states[s].mbps.nelem == 0)
+ if (d->min_trcount == 1)
{
- do
+ if (d->states[s].mbps.nelem == 0)
{
- while (t[*p] == 0)
- p++;
- p = mbp = skip_remains_mb (d, p, mbp, end);
+ do
+ {
+ while (t[*p] == 0)
+ p++;
+ p = mbp = skip_remains_mb (d, p, mbp, end,
+ NULL);
+ }
+ while (t[*p] == 0);
}
- while (t[*p] == 0);
+ else
+ p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
}
else
- p = mbp = skip_remains_mb (d, p, mbp, end);
+ {
+ wint_t wc;
+ mbp = skip_remains_mb (d, p, mbp, end, &wc);
+
+ /* If d->min_trcount is greater than 1, maybe
+ transit to another initial state after skip. */
+ if (p < mbp)
+ {
+ int context = wchar_context (wc);
+ if (context == CTX_LETTER)
+ s = d->initstate_letter;
+ else
+ /* It's CTX_NONE. CTX_NEWLINE cannot happen,
+ as we assume that a newline is always a
+ single byte character. */
+ s = d->initstate_others;
+ p = mbp;
+ s1 = s;
+ }
+ }
}
if (d->states[s].mbps.nelem == 0)
diff --git a/tests/euc-mb b/tests/euc-mb
index 6a9a845..b625046 100755
--- a/tests/euc-mb
+++ b/tests/euc-mb
@@ -39,6 +39,7 @@ make_input BABAAB |euc_grep AB > out || fail=1
make_input BABAAB > exp || framework_failure_
compare exp out || fail=1
make_input BABABA |euc_grep AB; test $? = 1 || fail=1
+make_input BABABA |euc_grep '^x\|AB'; test $? = 1 || fail=1
# -P supports only unibyte and UTF-8 locales.
LC_ALL=$locale grep -P x /dev/null
--
2.1.1