Norihiro Tanaka wrote:
I fixed the bug in the patch.  Added call of resetmust().

Thanks, I installed that, along with the attached fixeup patch, and am marking this as done. The most interesting part of the fixup patch is its replacement of 'for (; j < NOTCHAR; j++)' with 'while (++j < NOTCHAR)', which should work a bit better when analyzing patterns like '[aaa]'.
From 94381e963c78e4d02ed4b8f73bdff0f5efb27510 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Wed, 9 Apr 2014 19:59:12 -0700
Subject: [PATCH] grep: improvements for the open-CSET patch

* src/dfa.c (dfamust): Simplify by removing some duplicate code.
Optimize patterns like [aaa] even when not case-folding.
Avoid an unnecessary copy of the charclass.
---
 src/dfa.c | 101 ++++++++++++++++++++++++++------------------------------------
 1 file changed, 43 insertions(+), 58 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 6b4169d..2b6c5d6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -4036,7 +4036,6 @@ dfamust (struct dfa *d)
   size_t ri;
   size_t i;
   bool exact;
-  token t;
   static must must0;
   struct dfamust *dm;
   static char empty_string[] = "";
@@ -4067,11 +4066,18 @@ dfamust (struct dfa *d)
 #endif
   for (ri = 0; ri < d->tindex; ++ri)
     {
-      switch (t = d->tokens[ri])
+      token t = d->tokens[ri];
+      switch (t)
         {
         case LPAREN:
         case RPAREN:
           assert (!"neither LPAREN nor RPAREN may appear here");
+
+        case STAR:
+        case QMARK:
+          assert (musts < mp);
+          --mp;
+          /* Fall through.  */
         case EMPTY:
         case BEGLINE:
         case ENDLINE:
@@ -4080,14 +4086,11 @@ dfamust (struct dfa *d)
         case LIMWORD:
         case NOTLIMWORD:
         case BACKREF:
+        case ANYCHAR:
+        case MBCSET:
           resetmust (mp);
           break;
-        case STAR:
-        case QMARK:
-          assert (musts < mp);
-          --mp;
-          resetmust (mp);
-          break;
+
         case OR:
           assert (&musts[2] <= mp);
           {
@@ -4126,11 +4129,13 @@ dfamust (struct dfa *d)
             lmp->in = new;
           }
           break;
+
         case PLUS:
           assert (musts < mp);
           --mp;
           mp->is[0] = '\0';
           break;
+
         case END:
           assert (mp == &musts[1]);
           for (i = 0; musts[0].in[i] != NULL; ++i)
@@ -4139,6 +4144,7 @@ dfamust (struct dfa *d)
           if (STREQ (result, musts[0].is))
             exact = true;
           goto done;
+
         case CAT:
           assert (&musts[2] <= mp);
           {
@@ -4188,62 +4194,41 @@ dfamust (struct dfa *d)
               lmp->is[0] = '\0';
           }
           break;
+
+        case '\0':
+          /* Not on *my* shift.  */
+          goto done;
+
         default:
-          if (t < END)
-            {
-              assert (!"oops! t >= END");
-            }
-          else if (t == '\0')
-            {
-              /* not on *my* shift */
-              goto done;
-            }
-          else if (t >= CSET)
+          resetmust (mp);
+          if (CSET <= t)
             {
-              charclass ccl;
+              /* If T is a singleton, or if case-folding in a unibyte
+                 locale and T's members all case-fold to the same char,
+                 convert T to one of its members.  Otherwise, do
+                 nothing further with T.  */
+              charclass *ccl = &d->charclasses[t - CSET];
               int j;
-              copyset (d->charclasses[t - CSET], ccl);
-              for (j = 0; j < NOTCHAR; ++j)
-                if (tstbit (j, ccl))
+              for (j = 0; j < NOTCHAR; j++)
+                if (tstbit (j, *ccl))
+                  break;
+              if (! (j < NOTCHAR))
+                break;
+              t = j;
+              while (++j < NOTCHAR)
+                if (tstbit (j, *ccl)
+                    && ! (case_fold && MB_CUR_MAX == 1
+                          && toupper (j) == toupper (t)))
                   break;
               if (j < NOTCHAR)
-                {
-                  int c = (case_fold && MB_CUR_MAX == 1) ? toupper (j) : j;
-                  for (; j < NOTCHAR; j++)
-                    if (tstbit (j, ccl)
-                        && (!(case_fold && MB_CUR_MAX == 1) || c != toupper 
(j)))
-                      break;
-                  if (j < NOTCHAR)
-                    resetmust (mp);
-                  else
-                    {
-                      resetmust (mp);
-                      mp->is[0] = mp->left[0] = mp->right[0] = c;
-                      mp->is[1] = mp->left[1] = mp->right[1] = '\0';
-                      mp->in = enlist (mp->in, mp->is, (size_t) 1);
-                      if (mp->in == NULL)
-                        goto done;
-                    }
-                }
-              else
-                resetmust (mp);
-            }
-          else if (t == ANYCHAR || t == MBCSET)
-            {
-              /* easy enough */
-              resetmust (mp);
-            }
-          else
-            {
-              /* plain character */
-              resetmust (mp);
-              mp->is[0] = mp->left[0] = mp->right[0] =
-                (case_fold && MB_CUR_MAX == 1) ? toupper (t) : t;
-              mp->is[1] = mp->left[1] = mp->right[1] = '\0';
-              mp->in = enlist (mp->in, mp->is, (size_t) 1);
-              if (mp->in == NULL)
-                goto done;
+                break;
             }
+          mp->is[0] = mp->left[0] = mp->right[0]
+            = case_fold && MB_CUR_MAX == 1 ? toupper (t) : t;
+          mp->is[1] = mp->left[1] = mp->right[1] = '\0';
+          mp->in = enlist (mp->in, mp->is, 1);
+          if (mp->in == NULL)
+            goto done;
           break;
         }
 #ifdef DEBUG
-- 
1.9.0

Reply via email to