From bcd7930dba2f078da992660f6a14cde9e42b94c6 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@fb.com>
Date: Fri, 11 Sep 2020 01:25:10 -0700
Subject: [PATCH] dfa: speed up epsilon-node removal

Build auxiliary indices before removing epsilon closures.
Previously, to remove an epsilon closure we searched every node
sequentially for those that include it, which could be very slow.
Now, build auxiliary indices first and search only via those.
Reported in: https://bugs.gnu.org/40634

* lib/dfa.c (overwrap): New function.
(epsclosure): Build auxiliary indices before removing any
epsilon closure; use them to speed up that process.
---
 lib/dfa.c | 107 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 73 insertions(+), 34 deletions(-)

diff --git a/lib/dfa.c b/lib/dfa.c
index 1f0587a7a..c4300dfb5 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -2203,6 +2203,22 @@ replace (position_set *dst, idx_t del, position_set *add,
     }
 }

+/* Return true if S and ELEMS (of length NELEM) have an index in common.
+   Presumes both are in ascending index order -- TODO confirm.  */
+static bool _GL_ATTRIBUTE_PURE
+overwrap (position_set const *s, idx_t const *elems, idx_t nelem)
+{
+  for (idx_t si = 0, ei = 0; si < s->nelem && ei < nelem; )
+    if (s->elems[si].index == elems[ei])
+      return true;
+    else if (s->elems[si].index < elems[ei])
+      si++;
+    else
+      ei++;
+
+  return false;
+}
+
 /* Find the index of the state corresponding to the given position set with
    the given preceding context, or create a new state if there is no such
    state.  Context tells whether we got here on a newline or letter.  */
@@ -2300,45 +2316,68 @@ static void
 epsclosure (struct dfa const *d)
 {
   position_set tmp;
+  idx_t *currs, *nexts;
+  idx_t ncurr = 0;
+  idx_t nnext = 0;
+
   alloc_position_set (&tmp, d->nleaves);
+  currs = xnmalloc (d->tindex, sizeof *currs);
+  nexts = xnmalloc (d->tindex, sizeof *nexts);
+
   for (idx_t i = 0; i < d->tindex; i++)
-    if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
-        && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
-        && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
-      {
-        unsigned int constraint;
-        switch (d->tokens[i])
-          {
-          case BEGLINE:
-            constraint = BEGLINE_CONSTRAINT;
-            break;
-          case ENDLINE:
-            constraint = ENDLINE_CONSTRAINT;
-            break;
-          case BEGWORD:
-            constraint = BEGWORD_CONSTRAINT;
-            break;
-          case ENDWORD:
-            constraint = ENDWORD_CONSTRAINT;
-            break;
-          case LIMWORD:
-            constraint = LIMWORD_CONSTRAINT;
-            break;
-          case NOTLIMWORD:
-            constraint = NOTLIMWORD_CONSTRAINT;
-            break;
-          default:
-            constraint = NO_CONSTRAINT;
-            break;
-          }
+    {
+      if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
+          && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
+          && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
+        currs[ncurr++] = i;
+    }

-        delete (i, &d->follows[i]);
+  for (idx_t i = 0; i < d->tindex; i++)
+    {
+      /* Remember each follow set that shares an index with CURRS;
+         only these are handed to replace in the loop below.  */
+      if (overwrap (&d->follows[i], currs, ncurr))
+        nexts[nnext++] = i;
+    }
+
+  for (idx_t i = 0; i < ncurr; i++)
+    {
+      unsigned int constraint;
+      switch (d->tokens[currs[i]])
+        {
+        case BEGLINE:
+          constraint = BEGLINE_CONSTRAINT;
+          break;
+        case ENDLINE:
+          constraint = ENDLINE_CONSTRAINT;
+          break;
+        case BEGWORD:
+          constraint = BEGWORD_CONSTRAINT;
+          break;
+        case ENDWORD:
+          constraint = ENDWORD_CONSTRAINT;
+          break;
+        case LIMWORD:
+          constraint = LIMWORD_CONSTRAINT;
+          break;
+        case NOTLIMWORD:
+          constraint = NOTLIMWORD_CONSTRAINT;
+          break;
+        default:
+          constraint = NO_CONSTRAINT;
+          break;
+        }
+      /* Delete the node's own index, CURRS[I], not the counter I.  */
+      delete (currs[i], &d->follows[currs[i]]);
+
+      for (idx_t j = 0; j < nnext; j++)
+        replace (&d->follows[nexts[j]], currs[i], &d->follows[currs[i]],
+                 constraint, &tmp);
+    }

-        for (idx_t j = 0; j < d->tindex; j++)
-          if (i != j && d->follows[j].nelem > 0)
-            replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
-      }
   free (tmp.elems);
+  free (currs);
+  free (nexts);
 }

 /* Returns the set of contexts for which there is at least one
-- 
2.28.0.rc0

