Mercurial > gnulib
changeset 39855:a29036ff511d
dfa: simplify initial state
Simplifying the initial state enables easier optimization of the NFA.
* lib/dfa.c (enum token): Add new element BEG.
(prtok): Adjust due to adding element BEG.
(dfaparse): Put BEG at a head of tokens.
(state_index): Adjust due to adding element BEG.
(dfaanalyze): Concatenate BEG to other tokens, and simplify to
build initial state.
(dfamust): Adjust due to adding element BEG. DFAMUST ignores it.
author | Norihiro Tanaka <noritnk@kcn.ne.jp> |
---|---|
date | Tue, 18 Sep 2018 09:58:22 -0700 |
parents | 02e3e9bb4409 |
children | 469c01483bf1 |
files | ChangeLog lib/dfa.c |
diffstat | 2 files changed, 33 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
--- a/ChangeLog Tue Sep 18 22:58:23 2018 +0200 +++ b/ChangeLog Tue Sep 18 09:58:22 2018 -0700 @@ -1,3 +1,15 @@ +2018-09-18 Norihiro Tanaka <noritnk@kcn.ne.jp> + + dfa: simplify initial state + Simplifying the initial state enables easier optimization of the NFA. + * lib/dfa.c (enum token): Add new element BEG. + (prtok): Adjust due to adding element BEG. + (dfaparse): Put BEG at a head of tokens. + (state_index): Adjust due to adding element BEG. + (dfaanalyze): Concatenate BEG to other tokens, and simplify to + build initial state. + (dfamust): Adjust due to adding element BEG. DFAMUST ignores it. + 2018-09-18 Bruno Haible <bruno@clisp.org> file-has-acl: Fix test failure on Cygwin 2.9.
--- a/lib/dfa.c Tue Sep 18 22:58:23 2018 +0200 +++ b/lib/dfa.c Tue Sep 18 09:58:22 2018 -0700 @@ -223,12 +223,15 @@ /* Predefined token values. */ enum { - END = -1, /* END is a terminal symbol that matches the + END = -2, /* END is a terminal symbol that matches the end of input; any value of END or less in the parse tree is such a symbol. Accepting states of the DFA are those that would have a transition on END. */ + BEG = -1, /* BEG is a beginning symbol that matches the + biginning of input. */ + /* Ordinary character values are terminal symbols that match themselves. */ EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches @@ -630,9 +633,9 @@ static void prtok (token t) { - if (t < 0) + if (t <= END) fprintf (stderr, "END"); - else if (t < NOTCHAR) + else if (0 <= t && t < NOTCHAR) { unsigned int ch = t; fprintf (stderr, "0x%02x", ch); @@ -642,6 +645,9 @@ char const *s; switch (t) { + case BEG: + s = "BEG"; + break; case EMPTY: s = "EMPTY"; break; @@ -1967,6 +1973,9 @@ if (!d->syntax.syntax_bits_set) dfaerror (_("no syntax specified")); + if (!d->nregexps) + addtok (d, BEG); + d->parse.tok = lex (d); d->parse.depth = d->depth; @@ -2180,7 +2189,7 @@ for (state_num j = 0; j < s->nelem; j++) { int c = s->elems[j].constraint; - if (d->tokens[s->elems[j].index] < 0) + if (d->tokens[s->elems[j].index] <= END) { if (succeeds_in_context (c, context, CTX_ANY)) constraint |= c; @@ -2380,6 +2389,8 @@ position_set merged; /* Result of merging sets. */ + addtok (d, CAT); + #ifdef DEBUG fprintf (stderr, "dfaanalyze:\n"); for (size_t i = 0; i < d->tindex; ++i) @@ -2540,26 +2551,20 @@ } #endif - /* Get the epsilon closure of the firstpos of the regexp. The result will - be the set of positions of state 0. */ - merged.nelem = 0; - for (size_t i = 0; i < stk[-1].nfirstpos; ++i) - insert (firstpos[i], &merged); - /* For each follow set that is the follow set of a real position, replace it with its epsilon closure. */ - epsclosure (&merged, d); + epsclosure (&d->follows[0], d); /* Context wanted by some position. */ - int separate_contexts = state_separate_contexts (&merged); + int separate_contexts = state_separate_contexts (&d->follows[0]); /* Build the initial state. */ if (separate_contexts & CTX_NEWLINE) - state_index (d, &merged, CTX_NEWLINE); + state_index (d, &d->follows[0], CTX_NEWLINE); d->initstate_notbol = d->min_trcount - = state_index (d, &merged, separate_contexts ^ CTX_ANY); + = state_index (d, &d->follows[0], separate_contexts ^ CTX_ANY); if (separate_contexts & CTX_LETTER) - d->min_trcount = state_index (d, &merged, CTX_LETTER); + d->min_trcount = state_index (d, &d->follows[0], CTX_LETTER); d->min_trcount++; d->trcount = 0; @@ -3783,7 +3788,7 @@ bool need_endline = false; bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1; - for (size_t ri = 0; ri < d->tindex; ++ri) + for (size_t ri = 1; ri < d->tindex - 1; ++ri) { token t = d->tokens[ri]; switch (t)