changeset 38214:98c3a76378d6

dfa: do not match middle of multibyte character * lib/dfa.c (transit_state): If it fails in matching a single byte character in a state including a period expression in a non-UTF8 multibyte locale, skip the trailing bytes. (dfa_supported): Revert previous change.
author Norihiro Tanaka <noritnk@kcn.ne.jp>
date Mon, 28 Nov 2016 22:26:07 +0900
parents 61088a5dbe9a
children 1bb896d10746
files ChangeLog lib/dfa.c
diffstat 2 files changed, 9 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Nov 27 15:36:51 2016 -0800
+++ b/ChangeLog	Mon Nov 28 22:26:07 2016 +0900
@@ -1,3 +1,11 @@
+2016-11-27  Norihiro Tanaka <noritnk@kcn.ne.jp>
+
+        dfa: avoid matching in the middle of a multibyte character
+        * lib/dfa.c (transit_state): If matching a single-byte character
+        fails in a state that includes a period expression in a non-UTF8
+        multibyte locale, skip the trailing bytes.
+        (dfa_supported): Revert previous change.
+
 2016-11-27  Jim Meyering  <meyering@fb.com>
 
 	dfa: avoid false match in non-UTF8 multibyte locales
--- a/lib/dfa.c	Sun Nov 27 15:36:51 2016 -0800
+++ b/lib/dfa.c	Mon Nov 28 22:26:07 2016 +0900
@@ -2913,7 +2913,7 @@
   /* Calculate the state which can be reached from the state 's' by
      consuming 'mbclen' single bytes from the buffer.  */
   s1 = s;
-  for (i = 0; i < mbclen && 0 <= s; i++)
+  for (i = 0; i < mbclen && (i == 0 || d->min_trcount <= s); i++)
     s = transit_state_singlebyte (d, s, pp);
   *pp += mbclen - i;
 
@@ -3272,12 +3272,6 @@
 static bool _GL_ATTRIBUTE_PURE
 dfa_supported (struct dfa const *d)
 {
-  /* Declare any non-UTF8 multibyte locale "not supported."  Otherwise, a
-     regexp like ".*7" would mistakenly match \uC9, e.g., via this command:
-     (export LC_ALL=zh_CN.gb18030; printf '\uC9\n' | grep '.*7')  */
-  if (d->localeinfo.multibyte && !d->localeinfo.using_utf8)
-    return false;
-
   size_t i;
   for (i = 0; i < d->tindex; i++)
     {