Norihiro Tanaka wrote:
> s0: The position set is none.
> s1: The position set is 1:a
> s2: The position set is 1:a 2:CSET
> s3: The position set is 2:CSET 3:b (accepted)
> s4: The position set is 2:CSET

Sorry, it was wrong yet.  It should be as follows.

s0: The position set is none.
s1: The position set is 1:a 2:CSET 3:b
s2: The position set is 1:a 2:CSET 3:b (accepted)

First of all, I noticed that the example is wrong, because if fixed
strings are included in the pattern, kwset is used for the pattern,
and dfahint() is called for each single line.

--
By the way, I compared two cases, which are CSET* and
CSET{1,mb_cur_max}.  I used `\(a\|z\)[x-y]\{10\}\(b\|z\)' as the pattern.
`[x-z]' is MBCSET in UTF-8 locale.

If replace MBCSET to CSET*, the pattern in the superset is
`\(a\|z\)\(CSET*\)\{10\}\(b\|z\)'.
If replace MBCSET to CSET{1,mb_cur_max}, the pattern in the superset is
`\(a\|z\)\(CSET\{1,6\}\)\{10\}\(b\|z\)'.

In order to simplify, use `\(a\|z\)\(.\{1,6\}\)\{10\}\(b\|z\)' in
POSIX locale for later.

Before testing, apply the patch attached on this mail, Set `DDEBUG' to
CPPFLAGS, and re-build.  It outputs more debugging information.

  $ yes `printf '%0100d\n'` | head -320 | \
      sed -e '1s/^0/a/; $s/0$/b/' | \
      env LANG=en_US.UTF-8 src/grep '\(a\|z\)[x-y]\{10\}\(b\|z\)' 2>debug1.log
  $ yes `printf '%0100d\n'` | head -320 | \
      sed -e '1s/^0/a/; $s/0$/b/' | \
      env LANG=C src/grep '\(a\|z\)\(.\{1,6\}\)\{10\}\(b\|z\)' 2>debug2.log

(`320' originates in the default size of a buffer.)

The formar is only built 3 states.
OTOH, the later is built 224 dfastates.
i.e. even if matched multi-lines, many states aren't necessarily built. 

Next, I checked performance for each case in worst text.
It's best in 10 times.

yes `printf '%0100d\n'` | head -2 | sed -e '1s/^0/a/; $s/0$/b/' >t
yes t | head -1000000 | xargs cat >u

time -p env LANG=C src/grep '\(a\|z\)\(.\{1,6\}\)\{10\}\(b\|z\)' u
    real 1.26       user 0.95       sys 0.28
time -p env LANG=en_US.UTF-8 src/grep '\(a\|z\)[x-y]\{10\}\(b\|z\)' u
    real 0.74       user 0.45       sys 0.27

Though, the later is fast, I like the former, because it's very simple.
the later is complicated, also needs to update nleaves and depth.

Norihiro
From df6da5d40a47abbc6e3451cb9fab7a8c9ede12cc Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Fri, 28 Mar 2014 19:27:00 -0700
Subject: [PATCH 1/3] dfa: improve port to freestanding DJGPP

Suggested by Aharon Robbins (Bug#17056).
* src/dfa.c (setlocale) [!LC_ALL]: Return NULL, not "C",
reverting part of a recent change.
(using_simple_locale): Return true if setlocale returns null.
---
 src/dfa.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 4ed2189..b22fe97 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -37,7 +37,7 @@
 /* Gawk doesn't use Gnulib, so don't assume that setlocale and
    static_assert are present.  */
 #ifndef LC_ALL
-# define setlocale(category, locale) "C"
+# define setlocale(category, locale) NULL
 #endif
 #ifndef static_assert
 # define static_assert(cond, diagnostic) \
@@ -850,8 +850,9 @@ using_simple_locale (void)
       if (unibyte_c < 0)
         {
           char const *locale = setlocale (LC_ALL, NULL);
-          unibyte_c = (locale && (STREQ (locale, "C")
-                                  || STREQ (locale, "POSIX")));
+          unibyte_c = (!locale
+                       || STREQ (locale, "C")
+                       || STREQ (locale, "POSIX"));
         }
       return unibyte_c;
     }
-- 
1.9.1


From d80c9a5da2c5e04844ac39f8cdd45e6425b2dde6 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Tue, 1 Apr 2014 11:18:44 +0200
Subject: [PATCH 2/3] dfa: avoid re-building a state built previously

* src/dfa.c (dfaexec): Avoid to re-build a state built previously.
---
 src/dfa.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/dfa.c b/src/dfa.c
index b22fe97..b6fbd58 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3537,7 +3537,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 
       if (s >= 0)
         {
-          build_state (s, d);
+          if (!d->trans[s])
+            build_state (s, d);
           trans = d->trans;
           continue;
         }
-- 
1.9.1


From f18a096be8a0be2941f97111a3870e14a8abb613 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka <[email protected]>
Date: Sat, 22 Mar 2014 17:00:36 +0900
Subject: [PATCH 3/3] grep: print the detail of DFA states

* src/dfa.c (prtok): replace `%c' to `%02x' in format of printf().
(state_index): print detail of new state.
(dfastate) print detail of the process.
---
 src/dfa.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/dfa.c b/src/dfa.c
index b6fbd58..fc4edcb 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -556,7 +556,7 @@ prtok (token t)
   else if (t < NOTCHAR)
     {
       int ch = t;
-      fprintf (stderr, "%c", ch);
+      fprintf (stderr, "0x%02x", ch);
     }
   else
     {
@@ -2155,6 +2155,28 @@ state_index (struct dfa *d, position_set const *s, int 
context)
         return i;
     }
 
+#ifdef DEBUG
+  fprintf (stderr, "new state %d:\n  position:", i);
+  for (j = 0; j < s->nelem; ++j)
+    {
+      fprintf (stderr, " %d:", s->elems[j].index);
+      prtok (d->tokens[s->elems[j].index]);
+    }
+  fprintf (stderr, "\n  context:");
+  if (context & CTX_ANY)
+    fprintf (stderr, " CTX_ANY");
+  else
+    {
+      if (context & CTX_NONE)
+        fprintf (stderr, " CTX_NONE");
+      if (context & CTX_NEWLINE)
+        fprintf (stderr, " CTX_LETTER");
+      if (context & CTX_NEWLINE)
+        fprintf (stderr, " CTX_NEWLINE");
+    }
+  fprintf (stderr, "\n");
+#endif
+
   /* We'll have to create a new state.  */
   REALLOC_IF_NECESSARY (d->states, d->salloc, d->sindex + 1);
   d->states[i].hash = hash;
@@ -2626,6 +2648,10 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
   int next_isnt_1st_byte = 0;   /* Flag if we can't add state0.  */
   size_t i, j, k;
 
+#ifdef DEBUG
+  fprintf (stderr, "*** DFA state %d ***\n", s);
+#endif
+
   MALLOC (grps, NOTCHAR);
   MALLOC (labels, NOTCHAR);
 
@@ -2678,6 +2704,14 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
             continue;
         }
 
+#ifdef DEBUG
+      fprintf (stderr, "position %d:", pos.index);
+      for (j = 0; j < NOTCHAR; j++)
+      if (tstbit (j,  matches))
+        fprintf (stderr, " 0x%02x", j);
+      fprintf (stderr, "\n");
+#endif
+
       for (j = 0; j < ngrps; ++j)
         {
           /* If matches contains a single character only, and the current
@@ -2836,6 +2870,22 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
       else
         state_letter = state;
 
+#ifdef DEBUG
+      fprintf (stderr, "group %d:\n  nextpos :", i);
+      for (j = 0; j < grps[i].nelem; ++j)
+        {
+          fprintf (stderr, " %d:", grps[i].elems[j]);
+          prtok (d->tokens[grps[i].elems[j]]);
+        }
+      fprintf (stderr, "\n  follows :");
+      for (j = 0; j < follows.nelem; ++j)
+        {
+          fprintf (stderr, " %d:", follows.elems[j].index);
+          prtok (d->tokens[follows.elems[j].index]);
+        }
+      fprintf (stderr, "\n  nextstate: %d, %d, %d\n", state, state_newline, 
state_letter);
+#endif
+
       /* Set the transitions for each character in the current label.  */
       for (j = 0; j < CHARCLASS_INTS; ++j)
         for (k = 0; k < INTBITS; ++k)
-- 
1.9.1

Reply via email to