# HG changeset patch
# User Norihiro Tanaka
# Date 1480339567 -32400
# Node ID 98c3a76378d6ad705ef56ef8729e7590d60221bf
# Parent 61088a5dbe9afcddca8834ecf493fc32bc496e0b
dfa: do not match middle of multibyte character

* lib/dfa.c (transit_state): If it fails in matching a single byte
character in a state including a period expression in a non-UTF8
multibyte locale, skip the trailing bytes.
(dfa_supported): Revert previous change.

diff -r 61088a5dbe9a -r 98c3a76378d6 ChangeLog
--- a/ChangeLog Sun Nov 27 15:36:51 2016 -0800
+++ b/ChangeLog Mon Nov 28 22:26:07 2016 +0900
@@ -1,3 +1,11 @@
+2016-11-27 Norihiro Tanaka
+
+	dfa: avoid match middle in multibyte character
+	* lib/dfa.c (transit_state): If fails in matching single byte characters
+	on a state including period expression in non-UTF8 multibyte locales,
+	skip trailing bytes.
+	(dfa_supported): Revert previous change.
+
 2016-11-27 Jim Meyering
 
 	dfa: avoid false match in non-UTF8 multibyte locales
diff -r 61088a5dbe9a -r 98c3a76378d6 lib/dfa.c
--- a/lib/dfa.c Sun Nov 27 15:36:51 2016 -0800
+++ b/lib/dfa.c Mon Nov 28 22:26:07 2016 +0900
@@ -2913,7 +2913,7 @@
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer. */
   s1 = s;
-  for (i = 0; i < mbclen && 0 <= s; i++)
+  for (i = 0; i < mbclen && (i == 0 || d->min_trcount <= s); i++)
     s = transit_state_singlebyte (d, s, pp);
   *pp += mbclen - i;
 
@@ -3272,12 +3272,6 @@
 static bool _GL_ATTRIBUTE_PURE
 dfa_supported (struct dfa const *d)
 {
-  /* Declare any non-UTF8 multibyte locale "not supported." Otherwise, a
-     regexp like ".*7" would mistakenly match \uC9, e.g., via this command:
-     (export LC_ALL=zh_CN.gb18030; printf '\uC9\n' | grep '.*7') */
-  if (d->localeinfo.multibyte && !d->localeinfo.using_utf8)
-    return false;
-
   size_t i;
   for (i = 0; i < d->tindex; i++)
     {