Mercurial > octave
changeset 31967:470134b3fc28 stable
Fix converting short char arrays with invalid UTF-8 (bug #63930).
It looks like u8_conv_to_encoding checks whether the first input byte looks
like the lead byte of a UTF-8 multi-byte sequence. It returns an empty
string if the input buffer is too short to hold a sequence of the expected
length. Additionally, it looks like it drops the trailing byte if that byte looks
like the lead byte of a multi-byte sequence.
But we need it to "convert" the invalid byte sequences, e.g., to the
replacement character of the respective output encoding.
* liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern):
Add a new function that pads the input with zeros and zero-terminates the
input string.
(octave_u8_conv_to_encoding, octave_u8_conv_to_encoding_strict,
octave_u8_conv_to_encoding_offsets): Use new function.
* scripts/strings/unicode2native.m: Add tests.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Mon, 03 Apr 2023 17:05:52 +0200 |
parents | 8e71a9fc470f |
children | 63038dcbd648 |
files | liboctave/wrappers/uniconv-wrappers.c scripts/strings/unicode2native.m |
diffstat | 2 files changed, 44 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/liboctave/wrappers/uniconv-wrappers.c Sun Apr 02 19:57:15 2023 -0700 +++ b/liboctave/wrappers/uniconv-wrappers.c Mon Apr 03 17:05:52 2023 +0200 @@ -48,20 +48,48 @@ src, srclen, NULL, NULL, lengthp); } +static char * +octave_u8_conv_to_encoding_intern (const char *tocode, + enum iconv_ilseq_handler handler, + const uint8_t *src, size_t srclen, + size_t *offsets, size_t *lengthp) +{ + // FIXME: It looks like the input to u8_conv_to_encoding must be at least + // four bytes and zero-terminated to work correctly. Zero-pad input. + // Should this be fixed in gnulib or iconv instead? + size_t minlen = 4; + size_t padlen = (srclen > minlen ? srclen : minlen) + 1; + uint8_t *u8_str = (uint8_t *) malloc (padlen); + memcpy (u8_str, src, srclen); + for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) + u8_str[srclen+i_pad] = 0; + + // Convert from UTF-8 to output encoding + char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen, + offsets, NULL, lengthp); + free ((void *) u8_str); + + // FIXME: This assumes that "\0" is converted to a single byte. This might + // not be true for some exotic output encodings (like UTF-7?). + *lengthp = (*lengthp <= (padlen-srclen) ? 
0 : *lengthp - (padlen-srclen)); + + return ret; +} + char * octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src, size_t srclen, size_t *lengthp) { - return u8_conv_to_encoding (tocode, iconveh_question_mark, - src, srclen, NULL, NULL, lengthp); + return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark, + src, srclen, NULL, lengthp); } char * octave_u8_conv_to_encoding_strict (const char *tocode, const uint8_t *src, size_t srclen, size_t *lengthp) { - return u8_conv_to_encoding (tocode, iconveh_error, - src, srclen, NULL, NULL, lengthp); + return octave_u8_conv_to_encoding_intern (tocode, iconveh_error, + src, srclen, NULL, lengthp); } char * @@ -86,8 +114,8 @@ (const char *tocode, const uint8_t *src, size_t srclen, size_t *offsets, size_t *lengthp) { - return u8_conv_to_encoding (tocode, iconveh_question_mark, - src, srclen, offsets, NULL, lengthp); + return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark, + src, srclen, offsets, lengthp); } char *
--- a/scripts/strings/unicode2native.m Sun Apr 02 19:57:15 2023 -0700 +++ b/scripts/strings/unicode2native.m Mon Apr 03 17:05:52 2023 +0200 @@ -79,6 +79,16 @@ %! uint8 ([164:166 0 167:170])); %!assert <*60480> (unicode2native (''), uint8 ([])) +# short character arrays with invalid UTF-8 +%!testif HAVE_ICONV <*63930> +%! assert (unicode2native (char (230), 'windows-1252'), uint8 (63)); +%! assert (unicode2native (char (249), 'windows-1252'), uint8 (63)); +%! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63])); +%! assert (unicode2native (char (230:234), 'windows-1252'), +%! uint8 ([63, 63, 63, 63, 63])); +%! assert (unicode2native (char ([230, 10]), 'windows-1252'), +%! uint8 ([63, 10])); + %!error <Invalid call> unicode2native () %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test') %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])