bug#18685: [PATCH] dfa: treat a multibyte character even with constraints

Norihiro Tanaka Mon, 13 Oct 2014 05:38:14 -0700

Sorry, I found and fixed a bug in the previous patch.  After a skip, the
context should be determined from the preceding wide character, not from
the next one.

From 2e384f68e0bf0ce29b8c801bc6f0479319c14678 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 11 Oct 2014 11:38:09 +0900
Subject: [PATCH] dfa: treat a multibyte character even with constraints
 correctly


* src/dfa.c (struct dfa): Add new members `min_trcount',
`initstate_letter' and `initstate_others'.
(dfaanalyze): Build initial states not only for the newline context
but also for the other contexts.
(build_state): Don't release initial states.
(skip_remains_mb): Add an argument `wcp', through which the last wide
character converted is returned.
(dfaexec_main): If there are multiple initial states, transit to the
appropriate one after skipping from the middle of a multibyte character.
* tests/euc-mb: Add a new test.
* NEWS (Bug fixes): Mention it.
---
 NEWS         |  4 ++++
 src/dfa.c    | 70 +++++++++++++++++++++++++++++++++++++++++++++++-------------
 tests/euc-mb |  1 +
 3 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/NEWS b/NEWS
index 07a5d54..f454068 100644
--- a/NEWS
+++ b/NEWS
@@ -38,6 +38,10 @@ GNU grep NEWS                                    -*- outline -*-
   implying that the match, "10" was on line 1.
   [bug introduced in grep-2.19]
 
+  grep would match in the middle of a multibyte character when
+  using '^' in a pattern in non-UTF8 multibyte locales, leading it to
+  print lines that did not match.
+
   grep -E rejected unmatched ')', instead of treating it like '\)'.
   [bug present since "the beginning"]
 
diff --git a/src/dfa.c b/src/dfa.c
index 58a4b83..9df9736 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -405,6 +405,10 @@ struct dfa
                                    slots so far, not counting trans[-1].  */
   int trcount;                  /* Number of transition tables that have
                                    actually been built.  */
+  int min_trcount;              /* Minimum number of transition tables.
+                                   Always keep this many, even when releasing
+                                   transition tables.  It is also the number
+                                   of initial states.  */
   state_num **trans;            /* Transition tables for states that can
                                    never accept.  If the transitions for a
                                    state have not yet been computed, or the
@@ -423,6 +427,8 @@ struct dfa
                                    newline is stored separately and handled
                                    as a special case.  Newline is also used
                                    as a sentinel at the end of the buffer.  */
+  state_num initstate_letter;   /* Initial state for letter context.  */
+  state_num initstate_others;   /* Initial state for other contexts.  */
   struct dfamust *musts;        /* List of strings, at least one of which
                                    is known to appear in any r.e. matching
                                    the dfa.  */
@@ -2517,9 +2523,16 @@ dfaanalyze (struct dfa *d, int searchflag)
 
   /* Build the initial state.  */
   separate_contexts = state_separate_contexts (&merged);
-  state_index (d, &merged,
-               (separate_contexts & CTX_NEWLINE
-                ? CTX_NEWLINE : separate_contexts ^ CTX_ANY));
+  if (separate_contexts & CTX_NEWLINE)
+    state_index (d, &merged, CTX_NEWLINE);
+  d->initstate_others = d->min_trcount
+    = state_index (d, &merged, separate_contexts ^ CTX_ANY);
+  if (separate_contexts & CTX_LETTER)
+    d->initstate_letter = d->min_trcount
+      = state_index (d, &merged, CTX_LETTER);
+  else
+    d->initstate_letter = d->initstate_others;
+  d->min_trcount++;
 
   free (posalloc);
   free (stkalloc);
@@ -2859,13 +2872,13 @@ build_state (state_num s, struct dfa *d)
      not clear the initial state, as it's always used.  */
   if (d->trcount >= 1024)
     {
-      for (i = 1; i < d->tralloc; ++i)
+      for (i = d->min_trcount; i < d->tralloc; ++i)
         {
           free (d->trans[i]);
           free (d->fails[i]);
           d->trans[i] = d->fails[i] = NULL;
         }
-      d->trcount = 1;
+      d->trcount = d->min_trcount;
     }
 
   ++d->trcount;
@@ -3249,12 +3262,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
    character.  */
 static unsigned char const *
 skip_remains_mb (struct dfa *d, unsigned char const *p,
-                 unsigned char const *mbp, char const *end)
+                 unsigned char const *mbp, char const *end, wint_t *wcp)
 {
-  wint_t wc;
+  wint_t wc = WEOF;
   while (mbp < p)
     mbp += mbs_to_wchar (&wc, (char const *) mbp,
                          end - (char const *) mbp, d);
+  if (wcp != NULL)
+    *wcp = wc;
   return mbp;
 }
 
@@ -3316,20 +3331,45 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
             {
               s1 = s;
 
-              if (s == 0)
+              if (s < d->min_trcount)
                 {
-                  if (d->states[s].mbps.nelem == 0)
+                  if (d->min_trcount == 1)
                     {
-                      do
+                      if (d->states[s].mbps.nelem == 0)
                         {
-                          while (t[*p] == 0)
-                            p++;
-                          p = mbp = skip_remains_mb (d, p, mbp, end);
+                          do
+                            {
+                              while (t[*p] == 0)
+                                p++;
+                              p = mbp = skip_remains_mb (d, p, mbp, end,
+                                                         NULL);
+                            }
+                          while (t[*p] == 0);
                         }
-                      while (t[*p] == 0);
+                      else
+                        p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
                     }
                   else
-                    p = mbp = skip_remains_mb (d, p, mbp, end);
+                    {
+                      wint_t wc;
+                      mbp = skip_remains_mb (d, p, mbp, end, &wc);
+
+                      /* If d->min_trcount is greater than 1, maybe
+                         transit to another initial state after skip.  */
+                      if (p < mbp)
+                        {
+                          int context = wchar_context (wc);
+                          if (context == CTX_LETTER)
+                            s = d->initstate_letter;
+                          else
+                            /* It's CTX_NONE.  CTX_NEWLINE cannot happen,
+                               as we assume that a newline is always a
+                               single byte character.  */
+                            s = d->initstate_others;
+                          p = mbp;
+                          s1 = s;
+                        }
+                    }
                 }
 
               if (d->states[s].mbps.nelem == 0)
diff --git a/tests/euc-mb b/tests/euc-mb
index 6a9a845..b625046 100755
--- a/tests/euc-mb
+++ b/tests/euc-mb
@@ -39,6 +39,7 @@ make_input BABAAB |euc_grep AB > out || fail=1
 make_input BABAAB > exp || framework_failure_
 compare exp out || fail=1
 make_input BABABA |euc_grep AB; test $? = 1 || fail=1
+make_input BABABA |euc_grep '^x\|AB'; test $? = 1 || fail=1
 
 # -P supports only unibyte and UTF-8 locales.
 LC_ALL=$locale grep -P x /dev/null
-- 
2.1.1

bug#18685: [PATCH] dfa: treat a multibyte character even with constraints

Reply via email to