changeset 31967:470134b3fc28 stable

Fix converting short char arrays with invalid UTF-8 (bug #63930). It looks like u8_conv_to_encoding checks whether the first input byte looks like the lead byte of a UTF-8 multi-byte sequence. It returns an empty string if the input buffer is too short to hold a sequence of the expected length. Additionally, it looks like it drops the trailing byte if that byte looks like the lead byte of a multi-byte sequence. But we need it to "convert" the invalid byte sequences, e.g., to the replacement character of the respective output encoding. * liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern): Add a new function that pads the input with zeros and zero-terminates the input string. (octave_u8_conv_to_encoding, octave_u8_conv_to_encoding_strict, octave_u8_conv_to_encoding_offsets): Use new function. * scripts/strings/unicode2native.m: Add tests.
author Markus Mützel <markus.muetzel@gmx.de>
date Mon, 03 Apr 2023 17:05:52 +0200
parents 8e71a9fc470f
children 63038dcbd648
files liboctave/wrappers/uniconv-wrappers.c scripts/strings/unicode2native.m
diffstat 2 files changed, 44 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/liboctave/wrappers/uniconv-wrappers.c	Sun Apr 02 19:57:15 2023 -0700
+++ b/liboctave/wrappers/uniconv-wrappers.c	Mon Apr 03 17:05:52 2023 +0200
@@ -48,20 +48,87 @@
                                 src, srclen, NULL, NULL, lengthp);
 }
 
+// Convert the SRCLEN bytes at SRC from UTF-8 to the encoding TOCODE.
+//
+// HANDLER is passed on to u8_conv_to_encoding.  If OFFSETS is not NULL, it
+// must have room for SRCLEN entries; it receives the offsets that
+// u8_conv_to_encoding reports for the first SRCLEN input bytes.
+//
+// Returns the buffer returned by u8_conv_to_encoding and stores the length
+// of the result without the padding added here in *LENGTHP.  Returns NULL
+// if an allocation fails or if u8_conv_to_encoding returns NULL; *LENGTHP
+// is not adjusted in that case.
+static char *
+octave_u8_conv_to_encoding_intern (const char *tocode,
+                                   enum iconv_ilseq_handler handler,
+                                   const uint8_t *src, size_t srclen,
+                                   size_t *offsets, size_t *lengthp)
+{
+  // FIXME: It looks like the input to u8_conv_to_encoding must be at least
+  //        four bytes and zero-terminated to work correctly.  Zero-pad input.
+  //        Should this be fixed in gnulib or iconv instead?
+  size_t minlen = 4;
+  size_t padlen = (srclen > minlen ? srclen : minlen) + 1;
+  size_t npad = padlen - srclen;
+
+  uint8_t *u8_str = (uint8_t *) malloc (padlen);
+  if (! u8_str)
+    return NULL;
+
+  // Guard the copy: SRC might be NULL for empty input.
+  if (srclen > 0)
+    memcpy (u8_str, src, srclen);
+  memset (u8_str + srclen, 0, npad);
+
+  // The offsets array handed to u8_conv_to_encoding needs one entry per input
+  // byte, i.e., PADLEN entries for the padded copy.  OFFSETS only has room
+  // for SRCLEN entries, so use a scratch array and copy back the first SRCLEN.
+  size_t *pad_offsets = NULL;
+  if (offsets)
+    {
+      pad_offsets = (size_t *) calloc (padlen, sizeof (size_t));
+      if (! pad_offsets)
+        {
+          free (u8_str);
+          return NULL;
+        }
+    }
+
+  // Convert from UTF-8 to output encoding
+  char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen,
+                                   pad_offsets, NULL, lengthp);
+  free (u8_str);
+
+  if (ret)
+    {
+      if (offsets && srclen > 0)
+        memcpy (offsets, pad_offsets, srclen * sizeof (size_t));
+
+      // FIXME: This assumes that "\0" is converted to a single byte.  This
+      //        might not be true for some exotic output encodings (like
+      //        UTF-7?).
+      *lengthp = (*lengthp <= npad ? 0 : *lengthp - npad);
+    }
+
+  free (pad_offsets);
+
+  return ret;
+}
+
 char *
 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src,
                             size_t srclen, size_t *lengthp)
 {
-  return u8_conv_to_encoding (tocode, iconveh_question_mark,
-                              src, srclen, NULL, NULL, lengthp);
+  return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark,
+                                            src, srclen, NULL, lengthp);
 }
 
 char *
 octave_u8_conv_to_encoding_strict (const char *tocode, const uint8_t *src,
                                    size_t srclen, size_t *lengthp)
 {
-  return u8_conv_to_encoding (tocode, iconveh_error,
-                              src, srclen, NULL, NULL, lengthp);
+  return octave_u8_conv_to_encoding_intern (tocode, iconveh_error,
+                                            src, srclen, NULL, lengthp);
 }
 
 char *
@@ -86,8 +114,8 @@
   (const char *tocode, const uint8_t *src, size_t srclen,
    size_t *offsets, size_t *lengthp)
 {
-  return u8_conv_to_encoding (tocode, iconveh_question_mark,
-                              src, srclen, offsets, NULL, lengthp);
+  return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark,
+                                            src, srclen, offsets, lengthp);
 }
 
 char *
--- a/scripts/strings/unicode2native.m	Sun Apr 02 19:57:15 2023 -0700
+++ b/scripts/strings/unicode2native.m	Mon Apr 03 17:05:52 2023 +0200
@@ -79,6 +79,16 @@
 %!         uint8 ([164:166 0 167:170]));
 %!assert <*60480> (unicode2native (''), uint8 ([]))
 
+# short character arrays with invalid UTF-8
+%!testif HAVE_ICONV <*63930>
+%! assert (unicode2native (char (230), 'windows-1252'), uint8 (63));
+%! assert (unicode2native (char (249), 'windows-1252'), uint8 (63));
+%! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63]));
+%! assert (unicode2native (char (230:234), 'windows-1252'),
+%!         uint8 ([63, 63, 63, 63, 63]));
+%! assert (unicode2native (char ([230, 10]), 'windows-1252'),
+%!         uint8 ([63, 10]));
+
 %!error <Invalid call> unicode2native ()
 %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test')
 %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])