Mercurial > octave
changeset 31969:ab928435dd79
maint: Merge stable to default.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Mon, 03 Apr 2023 18:23:09 +0200 |
parents | 007aeb31ecb1 (current diff) 63038dcbd648 (diff) |
children | 37700c8fba9c |
files | test/io.tst |
diffstat | 4 files changed, 124 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/liboctave/util/oct-string.cc Sun Apr 02 19:57:52 2023 -0700 +++ b/liboctave/util/oct-string.cc Mon Apr 03 18:23:09 2023 +0200 @@ -617,29 +617,70 @@ const InternT* from, const InternT* from_end, const InternT*& from_next, ExternT* to, ExternT* to_end, ExternT*& to_next) const { - if (from_end < from) + if (from_end <= from) return std::codecvt<InternT, ExternT, StateT>::noconv; + // Check if buffer ends in a complete UTF-8 surrogate. + // FIXME: If this is the last call before a stream is closed, we should + // convert trailing bytes even if they look incomplete. + // How can we detect that? + std::size_t pop_end = 0; + if ((*(from_end-1) & 0b10000000) == 0b10000000) + { + // The last byte is part of a surrogate. Check if it is complete. + + // number of bytes of the surrogate in the buffer + std::size_t num_bytes_in_buf = 1; + // Find initial byte of surrogate + while (((*(from_end-num_bytes_in_buf) & 0b11000000) != 0b11000000) + && (num_bytes_in_buf < 4) + && (from_end-num_bytes_in_buf > from)) + num_bytes_in_buf++; + + // If the start of the surrogate is not in the buffer, we need to + // continue with the invalid UTF-8 sequence to avoid an infinite loop. + // Check if we found an initial byte and if there are enough bytes in the + // buffer to complete the surrogate. + if ((((*(from_end-num_bytes_in_buf) & 0b11100000) == 0b11000000) + && (num_bytes_in_buf < 2)) // incomplete 2-byte surrogate + || (((*(from_end-num_bytes_in_buf) & 0b11110000) == 0b11100000) + && (num_bytes_in_buf < 3)) // incomplete 3-byte surrogate + || (((*(from_end-num_bytes_in_buf) & 0b11111000) == 0b11110000) + && (num_bytes_in_buf < 4))) // incomplete 4-byte surrogate + pop_end = num_bytes_in_buf; + } + + std::size_t srclen = (from_end-from-pop_end) * sizeof (InternT); + std::size_t length = (to_end-to) * sizeof (ExternT); + if (srclen < 1 || length < 1) + return std::codecvt<InternT, ExternT, StateT>::partial; + // Convert from UTF-8 to output encoding - std::size_t srclen = (from_end-from) * sizeof (InternT); - std::size_t lengthp = (to_end-to) * sizeof (ExternT); const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (from); char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str, srclen, - &lengthp); + &length); + + if (length < 1) + return std::codecvt<InternT, ExternT, StateT>::partial; - size_t max = to_end - to; - if (lengthp < max) - max = lengthp; + size_t max = (to_end - to) * sizeof (ExternT); + // FIXME: If the output encoding is a multibyte or variable byte encoding, + // we should ensure that we don't cut off a "partial" surrogate from + // the output. + // Can this ever happen? + if (length < max) + max = length; // copy conversion result to output - // FIXME: Handle incomplete UTF-8 characters at end of buffer. std::copy_n (enc_str, max, to); ::free (enc_str); from_next = from + srclen; to_next = to + max; - return std::codecvt<InternT, ExternT, StateT>::ok; + return ((pop_end > 0 || max < length) + ? std::codecvt<InternT, ExternT, StateT>::partial + : std::codecvt<InternT, ExternT, StateT>::ok); } typename std::codecvt<InternT, ExternT, StateT>::result
--- a/liboctave/wrappers/uniconv-wrappers.c Sun Apr 02 19:57:52 2023 -0700 +++ b/liboctave/wrappers/uniconv-wrappers.c Mon Apr 03 18:23:09 2023 +0200 @@ -48,20 +48,48 @@ src, srclen, NULL, NULL, lengthp); } +static char * +octave_u8_conv_to_encoding_intern (const char *tocode, + enum iconv_ilseq_handler handler, + const uint8_t *src, size_t srclen, + size_t *offsets, size_t *lengthp) +{ + // FIXME: It looks like the input to u8_conv_to_encoding must be at least + // four bytes and zero-terminated to work correctly. Zero-pad input. + // Should this be fixed in gnulib or iconv instead? + size_t minlen = 4; + size_t padlen = (srclen > minlen ? srclen : minlen) + 1; + uint8_t *u8_str = (uint8_t *) malloc (padlen); + memcpy (u8_str, src, srclen); + for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) + u8_str[srclen+i_pad] = 0; + + // Convert from UTF-8 to output encoding + char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen, + offsets, NULL, lengthp); + free ((void *) u8_str); + + // FIXME: This assumes that "\0" is converted to a single byte. This might + // not be true for some exotic output encodings (like UTF-7?). + *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen)); + + return ret; +} + char * octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src, size_t srclen, size_t *lengthp) { - return u8_conv_to_encoding (tocode, iconveh_question_mark, - src, srclen, NULL, NULL, lengthp); + return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark, + src, srclen, NULL, lengthp); } char * octave_u8_conv_to_encoding_strict (const char *tocode, const uint8_t *src, size_t srclen, size_t *lengthp) { - return u8_conv_to_encoding (tocode, iconveh_error, - src, srclen, NULL, NULL, lengthp); + return octave_u8_conv_to_encoding_intern (tocode, iconveh_error, + src, srclen, NULL, lengthp); } char * @@ -86,8 +114,8 @@ (const char *tocode, const uint8_t *src, size_t srclen, size_t *offsets, size_t *lengthp) { - return u8_conv_to_encoding (tocode, iconveh_question_mark, - src, srclen, offsets, NULL, lengthp); + return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark, + src, srclen, offsets, lengthp); } char *
--- a/scripts/strings/unicode2native.m Sun Apr 02 19:57:52 2023 -0700 +++ b/scripts/strings/unicode2native.m Mon Apr 03 18:23:09 2023 +0200 @@ -79,6 +79,16 @@ %! uint8 ([164:166 0 167:170])); %!assert <*60480> (unicode2native (''), uint8 ([])) +# short character arrays with invalid UTF-8 +%!testif HAVE_ICONV <*63930> +%! assert (unicode2native (char (230), 'windows-1252'), uint8 (63)); +%! assert (unicode2native (char (249), 'windows-1252'), uint8 (63)); +%! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63])); +%! assert (unicode2native (char (230:234), 'windows-1252'), +%! uint8 ([63, 63, 63, 63, 63])); +%! assert (unicode2native (char ([230, 10]), 'windows-1252'), +%! uint8 ([63, 10])); + %!error <Invalid call> unicode2native () %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test') %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])
--- a/test/io.tst Sun Apr 02 19:57:52 2023 -0700 +++ b/test/io.tst Mon Apr 03 18:23:09 2023 +0200 @@ -1233,3 +1233,33 @@ %! endfor %! endfor %! endfor + +# stream with transcoding +%!test <*63930> +%! w_modes = {"wb", "wt"}; +%! # 64 non-ASCII characters that can be represented in 'windows-1252' +%! f_texts{1} = repmat ('ÀÂÈÊÌàäéèêìîöòùû', 1, 4); +%! # prepend space to misalign surrogate border from a multiple of 2. +%! f_texts{2} = [' ', f_texts{1}]; +%! # byte values of character sequence in 'windows-1252' +%! native_bytes{1} = repmat (... +%! [192 194 200 202 204 224 228 233 232 234 236 238 246 242 249 251], ... +%! 1, 4).'; +%! native_bytes{2} = [double(' '); native_bytes{1}]; +%! for i_mode = 1:numel (w_modes) +%! for i_text = 1:numel (f_texts) +%! fname = tempname (); +%! fid = fopen (fname, w_modes{i_mode}, 'n', 'windows-1252'); +%! unwind_protect +%! fprintf (fid, f_texts{i_text}); +%! fclose (fid); +%! # open without encoding facet and read bytes +%! fid = fopen (fname, 'rb'); +%! buf = fread (fid); +%! assert (buf, native_bytes{i_text}); +%! unwind_protect_cleanup +%! fclose (fid); +%! unlink (fname); +%! end_unwind_protect +%! endfor +%! endfor