changeset 32072:f7206b6577c2 stable

unicode2native: Fix conversion to UTF-16 (bug #64139). * liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern): Avoid appending a zero-byte when converting to UTF-* to avoid having to strip a varying number of bytes after the conversion. * scripts/strings/unicode2native.m: Add test for conversion to UTF-16.
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 03 May 2023 20:43:36 +0200
parents bc46d7c2768f
children 24752aa8be11 66ecc0d4d6ce
files liboctave/wrappers/uniconv-wrappers.c scripts/strings/unicode2native.m
diffstat 2 files changed, 37 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/liboctave/wrappers/uniconv-wrappers.c	Tue May 02 11:38:24 2023 +0200
+++ b/liboctave/wrappers/uniconv-wrappers.c	Wed May 03 20:43:36 2023 +0200
@@ -58,21 +58,39 @@
   //        four bytes and zero-terminated to work correctly.  Zero-pad input.
   //        Should this be fixed in gnulib or iconv instead?
   size_t minlen = 4;
-  size_t padlen = (srclen > minlen ? srclen : minlen) + 1;
-  uint8_t *u8_str = (uint8_t *) malloc (padlen);
-  memcpy (u8_str, src, srclen);
-  for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++)
-    u8_str[srclen+i_pad] = 0;
+  size_t padlen = (srclen > minlen ? srclen : minlen);
+
+  // Do not zero-terminate when the output encoding name starts with "UTF":
+  // the encoded terminator may then be wider than a single byte.
+  if ((tocode[0] != 'u' && tocode[0] != 'U')
+      || (tocode[1] != 't' && tocode[1] != 'T')
+      || (tocode[2] != 'f' && tocode[2] != 'F'))
+    padlen++;
+
+  uint8_t *u8_str;
+  const uint8_t *cu8_str;
+  if (srclen < padlen)
+    {
+      u8_str = (uint8_t *) malloc (padlen);
+      memcpy (u8_str, src, srclen);
+      for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++)
+        u8_str[srclen+i_pad] = 0;
+      cu8_str = u8_str;
+    }
+  else
+    cu8_str = src;
 
   // Convert from UTF-8 to output encoding
-  char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen,
+  char *ret = u8_conv_to_encoding (tocode, handler, cu8_str, padlen,
                                    offsets, NULL, lengthp);
-  free ((void *) u8_str);
+
+  if (srclen > padlen)
+    free ((void *) u8_str);
 
   // FIXME: This assumes that "\0" is converted to a single byte.  This might
   //        not be true for some exotic output encodings (like UTF-7?).
   *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen));
-  
+
   return ret;
 }
 
--- a/scripts/strings/unicode2native.m	Tue May 02 11:38:24 2023 +0200
+++ b/scripts/strings/unicode2native.m	Wed May 03 20:43:36 2023 +0200
@@ -82,13 +82,24 @@
 # short character arrays with invalid UTF-8
 %!testif HAVE_ICONV <*63930>
 %! assert (unicode2native (char (230), 'windows-1252'), uint8 (63));
+%!testif HAVE_ICONV <*63930>
 %! assert (unicode2native (char (249), 'windows-1252'), uint8 (63));
+%!testif HAVE_ICONV <*63930>
 %! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63]));
+%!testif HAVE_ICONV <*63930>
 %! assert (unicode2native (char (230:234), 'windows-1252'),
 %!         uint8 ([63, 63, 63, 63, 63]));
+%!testif HAVE_ICONV <*63930>
 %! assert (unicode2native (char ([230, 10]), 'windows-1252'),
 %!         uint8 ([63, 10]));
 
+# target encoding with code units larger than a byte
+%!testif HAVE_ICONV <*64139>
+%! assert (typecast (unicode2native ('abcde',
+%!                                   ['utf-16', nthargout(3, 'computer'), 'e']),
+%!                   'uint16'),
+%!         uint16 (97:101));
+
 %!error <Invalid call> unicode2native ()
 %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test')
 %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])