diff scripts/strings/unicode2native.m @ 31967:470134b3fc28 stable

Fix converting short char arrays with invalid UTF-8 (bug #63930). It looks like u8_conv_to_encoding checks whether the first input byte looks like the initial byte of a UTF-8 multi-byte sequence. It returns an empty string if the input buffer is too short to hold a sequence of the expected length. Additionally, it looks like it drops the trailing byte if that byte looks like the initial byte of a multi-byte sequence. But we need it to "convert" the invalid byte sequences, e.g., to the replacement character of the respective output encoding. * liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern): Add a new function that pads the input with zeros and zero-terminates the input string. (octave_u8_conv_to_encoding, octave_u8_conv_to_encoding_strict, octave_u8_conv_to_encoding_offsets): Use new function. * scripts/strings/unicode2native.m: Add tests.
author Markus Mützel <markus.muetzel@gmx.de>
date Mon, 03 Apr 2023 17:05:52 +0200
parents 597f3ee61a48
children f7206b6577c2
line wrap: on
line diff
--- a/scripts/strings/unicode2native.m	Sun Apr 02 19:57:15 2023 -0700
+++ b/scripts/strings/unicode2native.m	Mon Apr 03 17:05:52 2023 +0200
@@ -79,6 +79,16 @@
 %!         uint8 ([164:166 0 167:170]));
 %!assert <*60480> (unicode2native (''), uint8 ([]))
 
+# Short char arrays with invalid UTF-8: for windows-1252, each invalid byte must convert to 63 ('?') and a valid byte (10) must pass through unchanged.
+%!testif HAVE_ICONV <*63930>
+%! assert (unicode2native (char (230), 'windows-1252'), uint8 (63));
+%! assert (unicode2native (char (249), 'windows-1252'), uint8 (63));
+%! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63]));
+%! assert (unicode2native (char (230:234), 'windows-1252'),
+%!         uint8 ([63, 63, 63, 63, 63]));
+%! assert (unicode2native (char ([230, 10]), 'windows-1252'),
+%!         uint8 ([63, 10]));
+
 %!error <Invalid call> unicode2native ()
 %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test')
 %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])