When the pattern contains a constraint (e.g. `^'), grep may produce a wrong
result in non-UTF-8 multibyte locales. The following fails on current master:
$ pattern=$(printf '^x\|\244\263')
$ printf '\263\244\263\244\n' |
env LC_ALL=ja_JP.eucJP src/grep "$pattern" && echo FAIL
skip_remains_mb is run only in state 0, but that is wrong: when a constraint
is set, the DFA may transition to a state other than state 0 after a failure.
From 060bcdbdfde4fb73fb0c90c05c6298cd37be6663 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 11 Oct 2014 11:38:09 +0900
Subject: [PATCH] dfa: treat a multibyte character even with constraints
correctly
* src/dfa.c (struct dfa): Add new members `min_trcount',
`initstate_letter' and `initstate_others'.
(dfaanalyze): Build initial states not only for the newline context but also for the other contexts.
(build_state): Don't release initial states.
(dfaexec_main): If multiple initial states exist, transition from one
to another after skipping the remaining bytes of a multibyte character.
* tests/euc-mb: Add a new test.
---
src/dfa.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------------
tests/euc-mb | 1 +
2 files changed, 56 insertions(+), 13 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 58a4b83..9899749 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -405,6 +405,10 @@ struct dfa
slots so far, not counting trans[-1]. */
int trcount; /* Number of transition tables that have
actually been built. */
+ int min_trcount; /* Minimum of a number of transition tables.
+ This many tables are always kept, even when
+ transition tables are released. It is also
+ the number of initial states. */
state_num **trans; /* Transition tables for states that can
never accept. If the transitions for a
state have not yet been computed, or the
@@ -423,6 +427,8 @@ struct dfa
newline is stored separately and handled
as a special case. Newline is also used
as a sentinel at the end of the buffer. */
+ state_num initstate_letter; /* Initial state for letter context. */
+ state_num initstate_others; /* Initial state for other contexts. */
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
@@ -2517,9 +2523,16 @@ dfaanalyze (struct dfa *d, int searchflag)
/* Build the initial state. */
separate_contexts = state_separate_contexts (&merged);
- state_index (d, &merged,
- (separate_contexts & CTX_NEWLINE
- ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
+ if (separate_contexts & CTX_NEWLINE)
+ state_index (d, &merged, CTX_NEWLINE);
+ d->initstate_others = d->min_trcount
+ = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+ if (separate_contexts & CTX_LETTER)
+ d->initstate_letter = d->min_trcount
+ = state_index (d, &merged, CTX_LETTER);
+ else
+ d->initstate_letter = d->initstate_others;
+ d->min_trcount++;
free (posalloc);
free (stkalloc);
@@ -2859,13 +2872,13 @@ build_state (state_num s, struct dfa *d)
not clear the initial state, as it's always used. */
if (d->trcount >= 1024)
{
- for (i = 1; i < d->tralloc; ++i)
+ for (i = d->min_trcount; i < d->tralloc; ++i)
{
free (d->trans[i]);
free (d->fails[i]);
d->trans[i] = d->fails[i] = NULL;
}
- d->trcount = 1;
+ d->trcount = d->min_trcount;
}
++d->trcount;
@@ -3316,20 +3329,49 @@ dfaexec_main (struct dfa *d, char const *begin, char
*end,
{
s1 = s;
- if (s == 0)
+ if (s < d->min_trcount)
{
- if (d->states[s].mbps.nelem == 0)
+ if (d->min_trcount == 1)
{
- do
+ if (d->states[s].mbps.nelem == 0)
{
- while (t[*p] == 0)
- p++;
- p = mbp = skip_remains_mb (d, p, mbp, end);
+ do
+ {
+ while (t[*p] == 0)
+ p++;
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+ while (t[*p] == 0);
}
- while (t[*p] == 0);
+ else
+ p = mbp = skip_remains_mb (d, p, mbp, end);
}
else
- p = mbp = skip_remains_mb (d, p, mbp, end);
+ {
+ mbp = skip_remains_mb (d, p, mbp, end);
+
+ /* If d->min_trcount is greater than 1, we may need to
+ transition to another initial state after the skip. */
+ if (p < mbp)
+ {
+ if (*p == eol)
+ s = 0;
+ else if (d->initstate_letter == d->initstate_others)
+ s = d->initstate_others;
+ else
+ {
+ wint_t wc;
+ mbs_to_wchar (&wc, (char const *) p,
+ (unsigned char *) end - p, d);
+ if (wchar_context (wc))
+ s = d->initstate_letter;
+ else
+ s = d->initstate_others;
+ }
+ p = mbp;
+ s1 = s;
+ }
+ }
}
if (d->states[s].mbps.nelem == 0)
diff --git a/tests/euc-mb b/tests/euc-mb
index 6a9a845..b625046 100755
--- a/tests/euc-mb
+++ b/tests/euc-mb
@@ -39,6 +39,7 @@ make_input BABAAB |euc_grep AB > out || fail=1
make_input BABAAB > exp || framework_failure_
compare exp out || fail=1
make_input BABABA |euc_grep AB; test $? = 1 || fail=1
+make_input BABABA |euc_grep '^x\|AB'; test $? = 1 || fail=1
# -P supports only unibyte and UTF-8 locales.
LC_ALL=$locale grep -P x /dev/null
--
2.1.1