changeset 37908:3efd6ed98abb

regex: treat [x] as x if x is a unibyte encoding error

Problem reported by Aharon Robbins in:
http://lists.gnu.org/archive/html/bug-gnulib/2016-01/msg00091.html
* lib/regcomp.c (parse_byte) [!_LIBC && RE_ENABLE_I18N]: New function.
(build_range_exp) [!_LIBC && RE_ENABLE_I18N]: Use it.
author Paul Eggert <eggert@cs.ucla.edu>
date Sun, 24 Jan 2016 00:55:44 -0800
parents 9005fb61c868
children 3686845e645c
files ChangeLog lib/regcomp.c
diffstat 2 files changed, 21 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Jan 24 00:28:19 2016 -0800
+++ b/ChangeLog	Sun Jan 24 00:55:44 2016 -0800
@@ -1,5 +1,11 @@
 2016-01-24  Paul Eggert  <eggert@cs.ucla.edu>
 
+	regex: treat [x] as x if x is a unibyte encoding error
+	Problem reported by Aharon Robbins in:
+	http://lists.gnu.org/archive/html/bug-gnulib/2016-01/msg00091.html
+	* lib/regcomp.c (parse_byte) [!_LIBC && RE_ENABLE_I18N]: New function.
+	(build_range_exp) [!_LIBC && RE_ENABLE_I18N]: Use it.
+
 	closedir, dirfd, opendir: port to OpenSolaris 5.10
 	* m4/closedir.m4 (gl_FUNC_CLOSEDIR):
 	* m4/dirfd.m4 (gl_FUNC_DIRFD):
--- a/lib/regcomp.c	Sun Jan 24 00:28:19 2016 -0800
+++ b/lib/regcomp.c	Sun Jan 24 00:55:44 2016 -0800
@@ -2696,6 +2696,19 @@
 #define BRACKET_NAME_BUF_SIZE 32
 
 #ifndef _LIBC
+
+# ifdef RE_ENABLE_I18N
+/* Convert the byte B to the corresponding wide character.  In a
+   unibyte locale, treat B as itself if it is an encoding error.
+   In a multibyte locale, return WEOF if B is an encoding error.  */
+static wint_t
+parse_byte (unsigned char b, re_charset_t *mbcset)
+{
+  wint_t wc = __btowc (b);
+  return wc == WEOF && !mbcset ? b : wc;
+}
+#endif
+
   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
      Build the range expression which starts from START_ELEM, and ends
      at END_ELEM.  The result are written to MBCSET and SBCSET.
@@ -2747,9 +2760,9 @@
 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
 		 : 0));
     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
-		? __btowc (start_ch) : start_elem->opr.wch);
+		? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
-	      ? __btowc (end_ch) : end_elem->opr.wch);
+	      ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
     if (start_wc == WEOF || end_wc == WEOF)
       return REG_ECOLLATE;
     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))