changeset 38213:61088a5dbe9a

dfa: avoid false match in non-UTF8 multibyte locales. * lib/dfa.c (dfa_supported): Treat any non-UTF8 multibyte locale as "not supported" so that callers will resort to using regex-based matcher. This will surely hurt performance, but correctness trumps performance here, and the affected locales are less and less relevant, these days. See grep's bug report https://bugs.gnu.org/24975.
author Jim Meyering <meyering@fb.com>
date Sun, 27 Nov 2016 15:36:51 -0800
parents 0a8727662e3d
children 98c3a76378d6
files ChangeLog lib/dfa.c
diffstat 2 files changed, 15 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sat Nov 26 22:52:00 2016 -0500
+++ b/ChangeLog	Sun Nov 27 15:36:51 2016 -0800
@@ -1,3 +1,12 @@
+2016-11-27  Jim Meyering  <meyering@fb.com>
+
+	dfa: avoid false match in non-UTF8 multibyte locales
+	* lib/dfa.c (dfa_supported): Treat any non-UTF8 multibyte locale
+	as "not supported" so that callers will resort to using regex-based
+	matcher.  This will surely hurt performance, but correctness trumps
+	performance here, and the affected locales are less and less relevant,
+	these days.  See grep's bug report https://bugs.gnu.org/24975.
+
 2016-11-27  Mike Frysinger  <vapier@gentoo.org>
 
 	ptsname_r: leverage AC_HEADER_MAJOR to provide major()
--- a/lib/dfa.c	Sat Nov 26 22:52:00 2016 -0500
+++ b/lib/dfa.c	Sun Nov 27 15:36:51 2016 -0800
@@ -3272,6 +3272,12 @@
 static bool _GL_ATTRIBUTE_PURE
 dfa_supported (struct dfa const *d)
 {
+  /* Declare any non-UTF8 multibyte locale "not supported."  Otherwise, a
+     regexp like ".*7" would mistakenly match \uC9, e.g., via this command:
+     (export LC_ALL=zh_CN.gb18030; printf '\uC9\n' | grep '.*7')  */
+  if (d->localeinfo.multibyte && !d->localeinfo.using_utf8)
+    return false;
+
   size_t i;
   for (i = 0; i < d->tindex; i++)
     {