Mercurial > octave
changeset 32073:24752aa8be11
maint: Merge stable to default.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Wed, 03 May 2023 20:45:33 +0200 |
parents | 31a68f3ced07 (current diff) f7206b6577c2 (diff) |
children | 03fe0b635d2e |
files | liboctave/wrappers/uniconv-wrappers.c |
diffstat | 2 files changed, 37 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/liboctave/wrappers/uniconv-wrappers.c Tue May 02 11:38:56 2023 +0200 +++ b/liboctave/wrappers/uniconv-wrappers.c Wed May 03 20:45:33 2023 +0200 @@ -58,21 +58,39 @@ // four bytes and zero-terminated to work correctly. Zero-pad input. // Should this be fixed in gnulib or iconv instead? size_t minlen = 4; - size_t padlen = (srclen > minlen ? srclen : minlen) + 1; - uint8_t *u8_str = (uint8_t *) malloc (padlen); - memcpy (u8_str, src, srclen); - for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) - u8_str[srclen+i_pad] = 0; + size_t padlen = (srclen > minlen ? srclen : minlen); + + // Do not zero-terminate when the output encoding is a UTF encoding, i.e., + // the surrogates are different than a byte. + if ((tocode[0] != 'u' && tocode[0] != 'U') + || (tocode[1] != 't' && tocode[1] != 'T') + || (tocode[2] != 'f' && tocode[2] != 'F')) + padlen++; + + uint8_t *u8_str; + const uint8_t *cu8_str; + if (srclen < padlen) + { + u8_str = (uint8_t *) malloc (padlen); + memcpy (u8_str, src, srclen); + for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) + u8_str[srclen+i_pad] = 0; + cu8_str = u8_str; + } + else + cu8_str = src; // Convert from UTF-8 to output encoding - char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen, + char *ret = u8_conv_to_encoding (tocode, handler, cu8_str, padlen, offsets, NULL, lengthp); - free ((void *) u8_str); + + if (srclen > padlen) + free ((void *) u8_str); // FIXME: This assumes that "\0" is converted to a single byte. This might // not be true for some exotic output encodings (like UTF-7?). *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen)); - + return ret; }
--- a/scripts/strings/unicode2native.m Tue May 02 11:38:56 2023 +0200 +++ b/scripts/strings/unicode2native.m Wed May 03 20:45:33 2023 +0200 @@ -82,13 +82,24 @@ # short character arrays with invalid UTF-8 %!testif HAVE_ICONV <*63930> %! assert (unicode2native (char (230), 'windows-1252'), uint8 (63)); +%!testif HAVE_ICONV <*63930> %! assert (unicode2native (char (249), 'windows-1252'), uint8 (63)); +%!testif HAVE_ICONV <*63930> %! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63])); +%!testif HAVE_ICONV <*63930> %! assert (unicode2native (char (230:234), 'windows-1252'), %! uint8 ([63, 63, 63, 63, 63])); +%!testif HAVE_ICONV <*63930> %! assert (unicode2native (char ([230, 10]), 'windows-1252'), %! uint8 ([63, 10])); +# target encoding with surrogates larger than a byte +%!testif HAVE_ICONV <*64139> +%! assert (typecast (unicode2native ('abcde', +%! ['utf-16', nthargout(3, 'computer'), 'e']), +%! 'uint16'), +%! uint16 (97:101)); + %!error <Invalid call> unicode2native () %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test') %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])