changeset 18292:24fde31e764d

mbrtowc: work around glibc bug#19932 Fix mbrtowc so that it never returns -1 in the C locale, as this conflicts with a future version of POSIX http://austingroupbugs.net/view.php?id=663#c2738 and causes problems with GNU grep: http://bugs.gnu.org/23234 See glibc bug 19932: https://sourceware.org/bugzilla/show_bug.cgi?id=19932 * doc/posix-functions/mbrlen.texi (mbrlen): * doc/posix-functions/mbrtowc.texi (mbrtowc): Document the glibc bug. * lib/mbrtowc.c [C_LOCALE_MAYBE_EILSEQ]: Include hard-locale.h, locale.h. (rpl_mbrtowc): Work around the C_LOCALE_MAYBE_EILSEQ bug, if the bug is possible. * m4/mbrtowc.m4 (gl_MBRTOWC_C_LOCALE): New macro. (gl_FUNC_MBRTOWC): Use it, and define C_LOCALE_MAYBE_EILSEQ as needed. * modules/hard-locale (License): Now LGPLv2+, for mbrtowc. * modules/mbrtowc (Depends-on): Add hard-locale. * modules/mbrtowc-tests (Files, TESTS): Add tests/test-mbrtowc5.sh. * tests/test-mbrtowc.c (main): Test for bug fix if arg is '5'. * tests/test-mbrtowc5.sh: New file.
author Paul Eggert <eggert@penguin.cs.ucla.edu>
date Sat, 09 Apr 2016 01:28:36 -0700
parents efbdbdd32f55
children 294fa0173b5e
files ChangeLog doc/posix-functions/mbrlen.texi doc/posix-functions/mbrtowc.texi lib/mbrtowc.c m4/mbrtowc.m4 modules/hard-locale modules/mbrtowc modules/mbrtowc-tests tests/test-mbrtowc.c tests/test-mbrtowc5.sh
diffstat 10 files changed, 128 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Wed Apr 06 14:02:39 2016 -0700
+++ b/ChangeLog	Sat Apr 09 01:28:36 2016 -0700
@@ -1,3 +1,27 @@
+2016-04-09  Paul Eggert  <eggert@penguin.cs.ucla.edu>
+
+	mbrtowc: work around glibc bug#19932
+	Fix mbrtowc so that it never returns -1 in the C locale,
+	as this conflicts with a future version of POSIX
+	http://austingroupbugs.net/view.php?id=663#c2738
+	and causes problems with GNU grep: http://bugs.gnu.org/23234
+	See glibc bug 19932:
+	https://sourceware.org/bugzilla/show_bug.cgi?id=19932
+	* doc/posix-functions/mbrlen.texi (mbrlen):
+	* doc/posix-functions/mbrtowc.texi (mbrtowc):
+	Document the glibc bug.
+	* lib/mbrtowc.c [C_LOCALE_MAYBE_EILSEQ]:
+	Include hard-locale.h, locale.h.
+	(rpl_mbrtowc): Work around the C_LOCALE_MAYBE_EILSEQ bug,
+	if the bug is possible.
+	* m4/mbrtowc.m4 (gl_MBRTOWC_C_LOCALE): New macro.
+	(gl_FUNC_MBRTOWC): Use it, and define C_LOCALE_MAYBE_EILSEQ as needed.
+	* modules/hard-locale (License): Now LGPLv2+, for mbrtowc.
+	* modules/mbrtowc (Depends-on): Add hard-locale.
+	* modules/mbrtowc-tests (Files, TESTS): Add tests/test-mbrtowc5.sh.
+	* tests/test-mbrtowc.c (main): Test for bug fix if arg is '5'.
+	* tests/test-mbrtowc5.sh: New file.
+
 2016-04-03  Pedro Alves  <palves@redhat.com>
 
 	stdint: detect good enough pre-C++11 stdint.h in C++ mode
--- a/doc/posix-functions/mbrlen.texi	Wed Apr 06 14:02:39 2016 -0700
+++ b/doc/posix-functions/mbrlen.texi	Sat Apr 09 01:28:36 2016 -0700
@@ -12,6 +12,10 @@
 This function is missing on some platforms:
 Minix 3.1.8, HP-UX 11.00, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5.
 @item
+In the C or POSIX locales, this function can return @code{(size_t) -1}
+and set @code{errno} to @code{EILSEQ}:
+glibc 2.23.
+@item
 This function returns 0 instead of @code{(size_t) -2} when the input
 is empty:
 glibc 2.19.
--- a/doc/posix-functions/mbrtowc.texi	Wed Apr 06 14:02:39 2016 -0700
+++ b/doc/posix-functions/mbrtowc.texi	Sat Apr 09 01:28:36 2016 -0700
@@ -12,6 +12,10 @@
 This function is missing on some platforms:
 Minix 3.1.8, HP-UX 11.00, IRIX 6.5, Solaris 2.6, mingw, Interix 3.5.
 @item
+In the C or POSIX locales, this function can return @code{(size_t) -1}
+and set @code{errno} to @code{EILSEQ}:
+glibc 2.23.
+@item
 This function returns 0 instead of @code{(size_t) -2} when the input
 is empty:
 glibc 2.19.
--- a/lib/mbrtowc.c	Wed Apr 06 14:02:39 2016 -0700
+++ b/lib/mbrtowc.c	Sat Apr 09 01:28:36 2016 -0700
@@ -20,6 +20,11 @@
 /* Specification.  */
 #include <wchar.h>
 
+#if C_LOCALE_MAYBE_EILSEQ
+# include "hard-locale.h"
+# include <locale.h>
+#endif
+
 #if GNULIB_defined_mbstate_t
 /* Implement mbrtowc() on top of mbtowc().  */
 
@@ -328,6 +333,9 @@
 size_t
 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 {
+  size_t ret;
+  wchar_t wc;
+
 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
   if (s == NULL)
     {
@@ -342,6 +350,9 @@
     return (size_t) -2;
 # endif
 
+  if (! pwc)
+    pwc = &wc;
+
 # if MBRTOWC_RETVAL_BUG
   {
     static mbstate_t internal_state;
@@ -357,8 +368,7 @@
         size_t count = 0;
         for (; n > 0; s++, n--)
           {
-            wchar_t wc;
-            size_t ret = mbrtowc (&wc, s, 1, ps);
+            ret = mbrtowc (&wc, s, 1, ps);
 
             if (ret == (size_t)(-1))
               return (size_t)(-1);
@@ -366,8 +376,7 @@
             if (ret != (size_t)(-2))
               {
                 /* The multibyte character has been completed.  */
-                if (pwc != NULL)
-                  *pwc = wc;
+                *pwc = wc;
                 return (wc == 0 ? 0 : count);
               }
           }
@@ -376,32 +385,23 @@
   }
 # endif
 
+  ret = mbrtowc (pwc, s, n, ps);
+
 # if MBRTOWC_NUL_RETVAL_BUG
-  {
-    wchar_t wc;
-    size_t ret = mbrtowc (&wc, s, n, ps);
+  if (ret < (size_t) -2 && !*pwc)
+    return 0;
+# endif
 
-    if (ret != (size_t)(-1) && ret != (size_t)(-2))
-      {
-        if (pwc != NULL)
-          *pwc = wc;
-        if (wc == 0)
-          ret = 0;
-      }
-    return ret;
-  }
-# else
-  {
-#   if MBRTOWC_NULL_ARG1_BUG
-    wchar_t dummy;
+# if C_LOCALE_MAYBE_EILSEQ
+  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
+    {
+      unsigned char uc = *s;
+      *pwc = uc;
+      return 1;
+    }
+# endif
 
-    if (pwc == NULL)
-      pwc = &dummy;
-#   endif
-
-    return mbrtowc (pwc, s, n, ps);
-  }
-# endif
+  return ret;
 }
 
 #endif
--- a/m4/mbrtowc.m4	Wed Apr 06 14:02:39 2016 -0700
+++ b/m4/mbrtowc.m4	Sat Apr 09 01:28:36 2016 -0700
@@ -1,4 +1,4 @@
-# mbrtowc.m4 serial 26  -*- coding: utf-8 -*-
+# mbrtowc.m4 serial 27  -*- coding: utf-8 -*-
 dnl Copyright (C) 2001-2002, 2004-2005, 2008-2016 Free Software Foundation,
 dnl Inc.
 dnl This file is free software; the Free Software Foundation
@@ -40,6 +40,7 @@
       gl_MBRTOWC_RETVAL
       gl_MBRTOWC_NUL_RETVAL
       gl_MBRTOWC_EMPTY_INPUT
+      gl_MBRTOWC_C_LOCALE
       case "$gl_cv_func_mbrtowc_null_arg1" in
         *yes) ;;
         *) AC_DEFINE([MBRTOWC_NULL_ARG1_BUG], [1],
@@ -76,6 +77,13 @@
            REPLACE_MBRTOWC=1
            ;;
       esac
+      case $gl_cv_C_locale_sans_EILSEQ in
+        *yes) ;;
+        *) AC_DEFINE([C_LOCALE_MAYBE_EILSEQ], [1],
+             [Define to 1 if the C locale may have encoding errors.])
+           REPLACE_MBRTOWC=1
+           ;;
+      esac
     fi
   fi
 ])
@@ -577,6 +585,46 @@
     ])
 ])
 
+dnl Test whether mbrtowc reports encoding errors in the C locale.
+dnl Although POSIX was never intended to allow this, the GNU C Library
+dnl and other implementations do it.  See:
+dnl https://sourceware.org/bugzilla/show_bug.cgi?id=19932
+
+AC_DEFUN([gl_MBRTOWC_C_LOCALE],
+[
+  AC_CACHE_CHECK([whether the C locale is free of encoding errors],
+    [gl_cv_C_locale_sans_EILSEQ],
+    [
+     dnl Initial guess, used when cross-compiling or when no suitable locale
+     dnl is present.
+     gl_cv_C_locale_sans_EILSEQ="guessing no"
+
+     AC_RUN_IFELSE(
+       [AC_LANG_PROGRAM(
+          [[#include <limits.h>
+            #include <locale.h>
+            #include <wchar.h>
+          ]], [[
+            int i;
+            char *locale = setlocale (LC_ALL, "C");
+            if (! locale)
+              return 1;
+            for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+              {
+                char c = i;
+                wchar_t wc;
+                mbstate_t mbs = { 0, };
+                size_t ss = mbrtowc (&wc, &c, 1, &mbs);
+                if (1 < ss)
+                  return 1;
+              }
+            return 0;
+          ]])],
+      [gl_cv_C_locale_sans_EILSEQ=yes],
+      [gl_cv_C_locale_sans_EILSEQ=no],
+      [:])])
+])
+
 # Prerequisites of lib/mbrtowc.c.
 AC_DEFUN([gl_PREREQ_MBRTOWC], [
   :
--- a/modules/hard-locale	Wed Apr 06 14:02:39 2016 -0700
+++ b/modules/hard-locale	Sat Apr 09 01:28:36 2016 -0700
@@ -20,7 +20,7 @@
 "hard-locale.h"
 
 License:
-GPL
+LGPLv2+
 
 Maintainer:
 Paul Eggert
--- a/modules/mbrtowc	Wed Apr 06 14:02:39 2016 -0700
+++ b/modules/mbrtowc	Sat Apr 09 01:28:36 2016 -0700
@@ -13,6 +13,7 @@
 Depends-on:
 wchar
 extensions
+hard-locale     [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
 mbsinit         [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
 localcharset    [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
 streq           [test $HAVE_MBRTOWC = 0 || test $REPLACE_MBRTOWC = 1]
--- a/modules/mbrtowc-tests	Wed Apr 06 14:02:39 2016 -0700
+++ b/modules/mbrtowc-tests	Sat Apr 09 01:28:36 2016 -0700
@@ -3,6 +3,7 @@
 tests/test-mbrtowc2.sh
 tests/test-mbrtowc3.sh
 tests/test-mbrtowc4.sh
+tests/test-mbrtowc5.sh
 tests/test-mbrtowc.c
 tests/test-mbrtowc-w32-1.sh
 tests/test-mbrtowc-w32-2.sh
@@ -31,6 +32,7 @@
 Makefile.am:
 TESTS += \
   test-mbrtowc1.sh test-mbrtowc2.sh test-mbrtowc3.sh test-mbrtowc4.sh \
+  test-mbrtowc5.sh \
   test-mbrtowc-w32-1.sh test-mbrtowc-w32-2.sh test-mbrtowc-w32-3.sh \
   test-mbrtowc-w32-4.sh test-mbrtowc-w32-5.sh
 TESTS_ENVIRONMENT += \
@@ -39,4 +41,3 @@
   LOCALE_JA='@LOCALE_JA@' \
   LOCALE_ZH_CN='@LOCALE_ZH_CN@'
 check_PROGRAMS += test-mbrtowc test-mbrtowc-w32
-
--- a/tests/test-mbrtowc.c	Wed Apr 06 14:02:39 2016 -0700
+++ b/tests/test-mbrtowc.c	Sat Apr 09 01:28:36 2016 -0700
@@ -72,6 +72,10 @@
     for (c = 0; c < 0x100; c++)
       switch (c)
         {
+        default:
+          if (! (c && 1 < argc && argv[1][0] == '5'))
+            break;
+          /* Fall through.  */
         case '\t': case '\v': case '\f':
         case ' ': case '!': case '"': case '#': case '%':
         case '&': case '\'': case '(': case ')': case '*':
@@ -93,7 +97,8 @@
         case 'p': case 'q': case 'r': case 's': case 't':
         case 'u': case 'v': case 'w': case 'x': case 'y':
         case 'z': case '{': case '|': case '}': case '~':
-          /* c is in the ISO C "basic character set".  */
+          /* c is in the ISO C "basic character set", or argv[1] starts
+             with '5' so we are testing all nonnull bytes.  */
           buf[0] = c;
           wc = (wchar_t) 0xBADFACE;
           ret = mbrtowc (&wc, buf, 1, &state);
@@ -334,6 +339,10 @@
           ASSERT (mbsinit (&state));
         }
         return 0;
+
+      case '5':
+        /* C locale; tested above.  */
+        return 0;
       }
 
   return 1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-mbrtowc5.sh	Sat Apr 09 01:28:36 2016 -0700
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Test whether the C and POSIX locales have encoding errors.
+LC_ALL=C \
+./test-mbrtowc${EXEEXT} 5 || exit
+LC_ALL=POSIX \
+./test-mbrtowc${EXEEXT} 5