changeset 31968:63038dcbd648 stable

Try to gather complete UTF-8 surrogates when converting encoding (bug #63930). * liboctave/util/oct-string.cc (octave::string::codecvt_u8::do_out): Check if end of buffer is a complete UTF-8 surrogate. * test/io.tst: Add test case for transcoding with the buffer being aligned and misaligned with the UTF-8 surrogate borders.
author Markus Mützel <markus.muetzel@gmx.de>
date Mon, 03 Apr 2023 17:09:17 +0200
parents 470134b3fc28
children ab928435dd79 701fbdfb3bc0
files liboctave/util/oct-string.cc test/io.tst
diffstat 2 files changed, 80 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/liboctave/util/oct-string.cc	Mon Apr 03 17:05:52 2023 +0200
+++ b/liboctave/util/oct-string.cc	Mon Apr 03 17:09:17 2023 +0200
@@ -617,29 +617,70 @@
    const InternT* from, const InternT* from_end, const InternT*& from_next,
    ExternT* to, ExternT* to_end, ExternT*& to_next) const
 {
-  if (from_end < from)
+  if (from_end <= from)
     return std::codecvt<InternT, ExternT, StateT>::noconv;
 
+  // Check if buffer ends in a complete UTF-8 surrogate.
+  // FIXME: If this is the last call before a stream is closed, we should
+  //        convert trailing bytes even if they look incomplete.
+  //        How can we detect that?
+  std::size_t pop_end = 0;
+  if ((*(from_end-1) & 0b10000000) == 0b10000000)
+    {
+      // The last byte is part of a surrogate. Check if it is complete.
+
+      // number of bytes of the surrogate in the buffer
+      std::size_t num_bytes_in_buf = 1;
+      // Find initial byte of surrogate
+      while (((*(from_end-num_bytes_in_buf) & 0b11000000) != 0b11000000)
+             && (num_bytes_in_buf < 4)
+             && (from_end-num_bytes_in_buf > from))
+        num_bytes_in_buf++;
+
+      // If the start of the surrogate is not in the buffer, we need to
+      // continue with the invalid UTF-8 sequence to avoid an infinite loop.
+      // Check if we found an initial byte and if there are enough bytes in the
+      // buffer to complete the surrogate.
+      if ((((*(from_end-num_bytes_in_buf) & 0b11100000) == 0b11000000)
+           && (num_bytes_in_buf < 2))  // incomplete 2-byte surrogate
+          || (((*(from_end-num_bytes_in_buf) & 0b11110000) == 0b11100000)
+              && (num_bytes_in_buf < 3))  // incomplete 3-byte surrogate
+          || (((*(from_end-num_bytes_in_buf) & 0b11111000) == 0b11110000)
+              && (num_bytes_in_buf < 4)))  // incomplete 4-byte surrogate
+        pop_end = num_bytes_in_buf;
+    }
+
+  std::size_t srclen = (from_end-from-pop_end) * sizeof (InternT);
+  std::size_t length = (to_end-to) * sizeof (ExternT);
+  if (srclen < 1 || length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;
+
   // Convert from UTF-8 to output encoding
-  std::size_t srclen = (from_end-from) * sizeof (InternT);
-  std::size_t lengthp = (to_end-to) * sizeof (ExternT);
   const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (from);
   char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str, srclen,
-                                              &lengthp);
+                                              &length);
+
+  if (length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;
 
-  size_t max = to_end - to;
-  if (lengthp < max)
-    max = lengthp;
+  size_t max = (to_end - to) * sizeof (ExternT);
+  // FIXME: If the output encoding is a multibyte or variable byte encoding,
+  //        we should ensure that we don't cut off a "partial" surrogate from
+  //        the output.
+  //        Can this ever happen?
+  if (length < max)
+    max = length;
 
   // copy conversion result to output
-  // FIXME: Handle incomplete UTF-8 characters at end of buffer.
   std::copy_n (enc_str, max, to);
   ::free (enc_str);
 
   from_next = from + srclen;
   to_next = to + max;
 
-  return std::codecvt<InternT, ExternT, StateT>::ok;
+  return ((pop_end > 0 || max < length)
+          ? std::codecvt<InternT, ExternT, StateT>::partial
+          : std::codecvt<InternT, ExternT, StateT>::ok);
 }
 
 typename std::codecvt<InternT, ExternT, StateT>::result
--- a/test/io.tst	Mon Apr 03 17:05:52 2023 +0200
+++ b/test/io.tst	Mon Apr 03 17:09:17 2023 +0200
@@ -960,3 +960,33 @@
 %!      endfor
 %!    endfor
 %!  endfor
+
+# stream with transcoding
+%!test <*63930>
+%! w_modes = {"wb", "wt"};
+%! # 64 non-ASCII characters that can be represented in 'windows-1252'
+%! f_texts{1} = repmat ('ÀÂÈÊÌàäéèêìîöòùû', 1, 4);
+%! # prepend space to misalign surrogate border from a multiple of 2.
+%! f_texts{2} = [' ', f_texts{1}];
+%! # byte values of character sequence in 'windows-1252'
+%! native_bytes{1} = repmat (...
+%!   [192 194 200 202 204 224 228 233 232 234 236 238 246 242 249 251], ...
+%!   1, 4).';
+%! native_bytes{2} = [double(' '); native_bytes{1}];
+%! for i_mode = 1:numel (w_modes)
+%!   for i_text = 1:numel (f_texts)
+%!     fname = tempname ();
+%!     fid = fopen (fname, w_modes{i_mode}, 'n', 'windows-1252');
+%!     unwind_protect
+%!       fprintf (fid, f_texts{i_text});
+%!       fclose (fid);
+%!       # open without encoding facet and read bytes
+%!       fid = fopen (fname, 'rb');
+%!       buf = fread (fid);
+%!       assert (buf, native_bytes{i_text});
+%!     unwind_protect_cleanup
+%!       fclose (fid);
+%!       unlink (fname);
+%!     end_unwind_protect
+%!   endfor
+%! endfor