diff liboctave/util/oct-string.cc @ 31968:63038dcbd648 stable

Try to gather complete UTF-8 surrogates when converting encoding (bug #63930).

* liboctave/util/oct-string.cc (octave::string::codecvt_u8::do_out): Check if end of buffer is a complete UTF-8 surrogate.

* test/io.tst: Add test case for transcoding with the buffer being aligned and misaligned with the UTF-8 surrogate borders.
author Markus Mützel <markus.muetzel@gmx.de>
date Mon, 03 Apr 2023 17:09:17 +0200
parents 597f3ee61a48
children 701fbdfb3bc0
line wrap: on
line diff
--- a/liboctave/util/oct-string.cc	Mon Apr 03 17:05:52 2023 +0200
+++ b/liboctave/util/oct-string.cc	Mon Apr 03 17:09:17 2023 +0200
@@ -617,29 +617,70 @@
    const InternT* from, const InternT* from_end, const InternT*& from_next,
    ExternT* to, ExternT* to_end, ExternT*& to_next) const
 {
-  if (from_end < from)
+  if (from_end <= from)
     return std::codecvt<InternT, ExternT, StateT>::noconv;
 
+  // Check if buffer ends in a complete UTF-8 surrogate.
+  // FIXME: If this is the last call before a stream is closed, we should
+  //        convert trailing bytes even if they look incomplete.
+  //        How can we detect that?
+  std::size_t pop_end = 0;
+  if ((*(from_end-1) & 0b10000000) == 0b10000000)
+    {
+      // The last byte is part of a surrogate. Check if it is complete.
+
+      // number of bytes of the surrogate in the buffer
+      std::size_t num_bytes_in_buf = 1;
+      // Find initial byte of surrogate
+      while (((*(from_end-num_bytes_in_buf) & 0b11000000) != 0b11000000)
+             && (num_bytes_in_buf < 4)
+             && (from_end-num_bytes_in_buf > from))
+        num_bytes_in_buf++;
+
+      // If the start of the surrogate is not in the buffer, we need to
+      // continue with the invalid UTF-8 sequence to avoid an infinite loop.
+      // Check if we found an initial byte and if there are enough bytes in the
+      // buffer to complete the surrogate.
+      if ((((*(from_end-num_bytes_in_buf) & 0b11100000) == 0b11000000)
+           && (num_bytes_in_buf < 2))  // incomplete 2-byte surrogate
+          || (((*(from_end-num_bytes_in_buf) & 0b11110000) == 0b11100000)
+              && (num_bytes_in_buf < 3))  // incomplete 3-byte surrogate
+          || (((*(from_end-num_bytes_in_buf) & 0b11111000) == 0b11110000)
+              && (num_bytes_in_buf < 4)))  // incomplete 4-byte surrogate
+        pop_end = num_bytes_in_buf;
+    }
+
+  std::size_t srclen = (from_end-from-pop_end) * sizeof (InternT);
+  std::size_t length = (to_end-to) * sizeof (ExternT);
+  if (srclen < 1 || length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;
+
   // Convert from UTF-8 to output encoding
-  std::size_t srclen = (from_end-from) * sizeof (InternT);
-  std::size_t lengthp = (to_end-to) * sizeof (ExternT);
   const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (from);
   char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str, srclen,
-                                              &lengthp);
+                                              &length);
+
+  if (length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;
 
-  size_t max = to_end - to;
-  if (lengthp < max)
-    max = lengthp;
+  size_t max = (to_end - to) * sizeof (ExternT);
+  // FIXME: If the output encoding is a multibyte or variable byte encoding,
+  //        we should ensure that we don't cut off a "partial" surrogate from
+  //        the output.
+  //        Can this ever happen?
+  if (length < max)
+    max = length;
 
   // copy conversion result to output
-  // FIXME: Handle incomplete UTF-8 characters at end of buffer.
   std::copy_n (enc_str, max, to);
   ::free (enc_str);
 
   from_next = from + srclen;
   to_next = to + max;
 
-  return std::codecvt<InternT, ExternT, StateT>::ok;
+  return ((pop_end > 0 || max < length)
+          ? std::codecvt<InternT, ExternT, StateT>::partial
+          : std::codecvt<InternT, ExternT, StateT>::ok);
 }
 
 typename std::codecvt<InternT, ExternT, StateT>::result