changeset 18607:db280259d3cc

dfa: performance improvement for removal of epsilon closure * lib/dfa.c (delete): Use binary search to find deleted index. (replace): New function. It replaces a position with the followed set. (epsclosure): Replace it with a new algorithm. Update caller.
author Norihiro Tanaka <noritnk@kcn.ne.jp>
date Wed, 14 Dec 2016 16:31:57 -0800
parents d421bd610f90
children 4e21be41ec70
files ChangeLog lib/dfa.c
diffstat 2 files changed, 93 insertions(+), 59 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Dec 18 14:32:40 2016 -0800
+++ b/ChangeLog	Wed Dec 14 16:31:57 2016 -0800
@@ -1,3 +1,11 @@
+2016-12-18  Norihiro Tanaka  <noritnk@kcn.ne.jp>
+
+	dfa: performance improvement for removal of epsilon closure
+	See Bug#22357#32.
+	* lib/dfa.c (delete): Use binary search to find deleted index.
+	(replace): New function.  It replaces a position with the followed set.
+	(epsclosure): Replace it with a new algorithm.  Update caller.
+
 2016-12-18  Bruno Haible  <bruno@clisp.org>
 
 	Split tests for getopt-posix and getopt-gnu.
--- a/lib/dfa.c	Sun Dec 18 14:32:40 2016 -0800
+++ b/lib/dfa.c	Wed Dec 14 16:31:57 2016 -0800
@@ -2085,17 +2085,62 @@
 }
 
 /* Delete a position from a set.  */
-static void
-delete (position p, position_set *s)
+static position
+delete (size_t del, position_set *s)
 {
-  size_t i;
-
-  for (i = 0; i < s->nelem; ++i)
-    if (p.index == s->elems[i].index)
-      break;
-  if (i < s->nelem)
-    for (--s->nelem; i < s->nelem; ++i)
-      s->elems[i] = s->elems[i + 1];
+  position p;
+  size_t count = s->nelem;
+  size_t lo = 0, hi = count;
+  while (lo < hi)
+    {
+      size_t mid = (lo + hi) >> 1;
+      if (s->elems[mid].index > del)
+        lo = mid + 1;
+      else
+        hi = mid;
+    }
+
+  if (lo < count && del == s->elems[lo].index)
+    {
+      p = s->elems[lo];
+      for (size_t i = lo + 1; i < s->nelem; i++)
+        s->elems[i - 1] = s->elems[i];
+      --s->nelem;
+    }
+  else
+    {
+      p.index = -1;
+      p.constraint = NO_CONSTRAINT;
+    }
+      return p;
+}
+
+/* Replace a position with the followed set.  */
+static void
+replace (position_set *dst, size_t del, position_set *add,
+         unsigned int constraint, position_set *tmp)
+{
+  position pos;
+
+  pos = delete (del, dst);
+
+  if (pos.index == (size_t) -1)
+    return;
+
+  copy (add, tmp);
+
+  pos.constraint &= constraint;
+
+  for (size_t i = 0; i < tmp->nelem; i++)
+    {
+      tmp->elems[i].constraint &= pos.constraint;
+
+      while (i < tmp->nelem && tmp->elems[i].constraint == 0)
+        delete (tmp->elems[i].index, tmp);
+    }
+
+  for (size_t i = 0; i < tmp->nelem; i++)
+    insert (tmp->elems[i], dst);
 }
 
 /* Find the index of the state corresponding to the given position set with
@@ -2187,63 +2232,48 @@
    constraint.  Repeat exhaustively until no funny positions are left.
    S->elems must be large enough to hold the result.  */
 static void
-epsclosure (position_set *s, struct dfa const *d, char *visited)
+epsclosure (position_set *initial, struct dfa const *d)
 {
-  size_t i, j;
-  position p, old;
-  bool initialized = false;
-
-  for (i = 0; i < s->nelem; ++i)
-    if (d->tokens[s->elems[i].index] >= NOTCHAR
-        && d->tokens[s->elems[i].index] != BACKREF
-        && d->tokens[s->elems[i].index] != ANYCHAR
-        && d->tokens[s->elems[i].index] != MBCSET
-        && d->tokens[s->elems[i].index] < CSET)
+  position_set tmp;
+  alloc_position_set (&tmp, d->nleaves);
+  for (size_t i = 0; i < d->tindex; ++i)
+    if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
+        && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
+        && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
       {
-        if (!initialized)
-          {
-            memset (visited, 0, d->tindex * sizeof (*visited));
-            initialized = true;
-          }
-        old = s->elems[i];
-        p.constraint = old.constraint;
-        delete (s->elems[i], s);
-        if (visited[old.index])
-          {
-            --i;
-            continue;
-          }
-        visited[old.index] = 1;
-        switch (d->tokens[old.index])
+        unsigned int constraint;
+        switch (d->tokens[i])
           {
           case BEGLINE:
-            p.constraint &= BEGLINE_CONSTRAINT;
+            constraint = BEGLINE_CONSTRAINT;
             break;
           case ENDLINE:
-            p.constraint &= ENDLINE_CONSTRAINT;
+            constraint = ENDLINE_CONSTRAINT;
             break;
           case BEGWORD:
-            p.constraint &= BEGWORD_CONSTRAINT;
+            constraint = BEGWORD_CONSTRAINT;
             break;
           case ENDWORD:
-            p.constraint &= ENDWORD_CONSTRAINT;
+            constraint = ENDWORD_CONSTRAINT;
             break;
           case LIMWORD:
-            p.constraint &= LIMWORD_CONSTRAINT;
+            constraint = LIMWORD_CONSTRAINT;
             break;
           case NOTLIMWORD:
-            p.constraint &= NOTLIMWORD_CONSTRAINT;
+            constraint = NOTLIMWORD_CONSTRAINT;
             break;
           default:
+            constraint = NO_CONSTRAINT;
             break;
           }
-        for (j = 0; j < d->follows[old.index].nelem; ++j)
-          {
-            p.index = d->follows[old.index].elems[j].index;
-            insert (p, s);
-          }
-        /* Force rescan to start at the beginning.  */
-        i = -1;
+
+        delete (i, &d->follows[i]);
+
+        for (size_t j = 0; j < d->tindex; j++)
+          if (i != j && d->follows[j].nelem > 0)
+            replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
+
+        replace (initial, i, &d->follows[i], constraint, &tmp);
       }
 }
 
@@ -2370,7 +2400,6 @@
   int separate_contexts;        /* Context wanted by some position.  */
   size_t i, j;
   position *pos;
-  char *visited = xnmalloc (d->tindex, sizeof *visited);
 
 #ifdef DEBUG
   fprintf (stderr, "dfaanalyze:\n");
@@ -2511,14 +2540,12 @@
 #endif
     }
 
-  /* For each follow set that is the follow set of a real position, replace
-     it with its epsilon closure.  */
+#ifdef DEBUG
   for (i = 0; i < d->tindex; ++i)
     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
         || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
         || d->tokens[i] >= CSET)
       {
-#ifdef DEBUG
         fprintf (stderr, "follows(%zu:", i);
         prtok (d->tokens[i]);
         fprintf (stderr, "):");
@@ -2528,18 +2555,18 @@
             prtok (d->tokens[d->follows[i].elems[j].index]);
           }
         putc ('\n', stderr);
+      }
 #endif
-        copy (&d->follows[i], &merged);
-        epsclosure (&merged, d, visited);
-        copy (&merged, &d->follows[i]);
-      }
 
   /* Get the epsilon closure of the firstpos of the regexp.  The result will
      be the set of positions of state 0.  */
   merged.nelem = 0;
   for (i = 0; i < stk[-1].nfirstpos; ++i)
     insert (firstpos[i], &merged);
-  epsclosure (&merged, d, visited);
+
+  /* For each follow set that is the follow set of a real position, replace
+     it with its epsilon closure.  */
+  epsclosure (&merged, d);
 
   /* Build the initial state.  */
   separate_contexts = state_separate_contexts (&merged);
@@ -2555,7 +2582,6 @@
   free (posalloc);
   free (stkalloc);
   free (merged.elems);
-  free (visited);
 }