Mercurial > octave
changeset 31968:63038dcbd648 stable
Try to gather complete UTF-8 surrogates when converting encoding (bug #63930).
* liboctave/util/oct-string.cc (octave::string::codecvt_u8::do_out): Check if
end of buffer is a complete UTF-8 surrogate.
* test/io.tst: Add test case for transcoding with the buffer being aligned and
misaligned with the UTF-8 surrogate borders.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Mon, 03 Apr 2023 17:09:17 +0200 |
parents | 470134b3fc28 |
children | ab928435dd79 701fbdfb3bc0 |
files | liboctave/util/oct-string.cc test/io.tst |
diffstat | 2 files changed, 80 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/liboctave/util/oct-string.cc Mon Apr 03 17:05:52 2023 +0200
+++ b/liboctave/util/oct-string.cc Mon Apr 03 17:09:17 2023 +0200
@@ -617,29 +617,70 @@
    const InternT* from, const InternT* from_end, const InternT*& from_next,
    ExternT* to, ExternT* to_end, ExternT*& to_next) const
 {
-  if (from_end < from)
+  if (from_end <= from)
     return std::codecvt<InternT, ExternT, StateT>::noconv;

+  // Check if buffer ends in a complete UTF-8 surrogate.
+  // FIXME: If this is the last call before a stream is closed, we should
+  //        convert trailing bytes even if they look incomplete.
+  //        How can we detect that?
+  std::size_t pop_end = 0;
+  if ((*(from_end-1) & 0b10000000) == 0b10000000)
+    {
+      // The last byte is part of a surrogate. Check if it is complete.
+
+      // number of bytes of the surrogate in the buffer
+      std::size_t num_bytes_in_buf = 1;
+      // Find initial byte of surrogate
+      while (((*(from_end-num_bytes_in_buf) & 0b11000000) != 0b11000000)
+             && (num_bytes_in_buf < 4)
+             && (from_end-num_bytes_in_buf > from))
+        num_bytes_in_buf++;
+
+      // If the start of the surrogate is not in the buffer, we need to
+      // continue with the invalid UTF-8 sequence to avoid an infinite loop.
+      // Check if we found an initial byte and if there are enough bytes in the
+      // buffer to complete the surrogate.
+      if ((((*(from_end-num_bytes_in_buf) & 0b11100000) == 0b11000000)
+           && (num_bytes_in_buf < 2)) // incomplete 2-byte surrogate
+          || (((*(from_end-num_bytes_in_buf) & 0b11110000) == 0b11100000)
+              && (num_bytes_in_buf < 3)) // incomplete 3-byte surrogate
+          || (((*(from_end-num_bytes_in_buf) & 0b11111000) == 0b11110000)
+              && (num_bytes_in_buf < 4))) // incomplete 4-byte surrogate
+        pop_end = num_bytes_in_buf;
+    }
+
+  std::size_t srclen = (from_end-from-pop_end) * sizeof (InternT);
+  std::size_t length = (to_end-to) * sizeof (ExternT);
+  if (srclen < 1 || length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;
+
   // Convert from UTF-8 to output encoding
-  std::size_t srclen = (from_end-from) * sizeof (InternT);
-  std::size_t lengthp = (to_end-to) * sizeof (ExternT);
   const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (from);
   char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str, srclen,
-                                              &lengthp);
+                                              &length);
+
+  if (length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;

-  size_t max = to_end - to;
-  if (lengthp < max)
-    max = lengthp;
+  size_t max = (to_end - to) * sizeof (ExternT);
+  // FIXME: If the output encoding is a multibyte or variable byte encoding,
+  //        we should ensure that we don't cut off a "partial" surrogate from
+  //        the output.
+  //        Can this ever happen?
+  if (length < max)
+    max = length;

   // copy conversion result to output
-  // FIXME: Handle incomplete UTF-8 characters at end of buffer.
   std::copy_n (enc_str, max, to);
   ::free (enc_str);

   from_next = from + srclen;
   to_next = to + max;

-  return std::codecvt<InternT, ExternT, StateT>::ok;
+  return ((pop_end > 0 || max < length)
+          ? std::codecvt<InternT, ExternT, StateT>::partial
+          : std::codecvt<InternT, ExternT, StateT>::ok);
 }

 typename std::codecvt<InternT, ExternT, StateT>::result
--- a/test/io.tst Mon Apr 03 17:05:52 2023 +0200
+++ b/test/io.tst Mon Apr 03 17:09:17 2023 +0200
@@ -960,3 +960,33 @@
 %!     endfor
 %!   endfor
 %! endfor
+
+# stream with transcoding
+%!test <*63930>
+%! w_modes = {"wb", "wt"};
+%! # 64 non-ASCII characters that can be represented in 'windows-1252'
+%! f_texts{1} = repmat ('ÀÂÈÊÌàäéèêìîöòùû', 1, 4);
+%! # prepend space to misalign surrogate border from a multiple of 2.
+%! f_texts{2} = [' ', f_texts{1}];
+%! # byte values of character sequence in 'windows-1252'
+%! native_bytes{1} = repmat (...
+%!   [192 194 200 202 204 224 228 233 232 234 236 238 246 242 249 251], ...
+%!   1, 4).';
+%! native_bytes{2} = [double(' '); native_bytes{1}];
+%! for i_mode = 1:numel (w_modes)
+%!   for i_text = 1:numel (f_texts)
+%!     fname = tempname ();
+%!     fid = fopen (fname, w_modes{i_mode}, 'n', 'windows-1252');
+%!     unwind_protect
+%!       fprintf (fid, f_texts{i_text});
+%!       fclose (fid);
+%!       # open without encoding facet and read bytes
+%!       fid = fopen (fname, 'rb');
+%!       buf = fread (fid);
+%!       assert (buf, native_bytes{i_text});
+%!     unwind_protect_cleanup
+%!       fclose (fid);
+%!       unlink (fname);
+%!     end_unwind_protect
+%!   endfor
+%! endfor