Mercurial > octave

--- a/liboctave/util/oct-string.cc	Sun Apr 02 19:57:52 2023 -0700
+++ b/liboctave/util/oct-string.cc	Mon Apr 03 18:23:09 2023 +0200
@@ -617,29 +617,84 @@
    const InternT* from, const InternT* from_end, const InternT*& from_next,
    ExternT* to, ExternT* to_end, ExternT*& to_next) const
 {
-  if (from_end < from)
+  // The std::codecvt contract expects FROM_NEXT and TO_NEXT to be set on
+  // return.  Nothing is consumed or produced on the early returns below.
+  from_next = from;
+  to_next = to;
+
+  if (from_end <= from)
     return std::codecvt<InternT, ExternT, StateT>::noconv;

+  // Check if buffer ends in a complete UTF-8 multi-byte sequence.
+  // FIXME: If this is the last call before a stream is closed, we should
+  //        convert trailing bytes even if they look incomplete.
+  //        How can we detect that?
+  std::size_t pop_end = 0;
+  if ((*(from_end-1) & 0b10000000) == 0b10000000)
+    {
+      // The last byte is part of a multi-byte sequence.  Check if that
+      // sequence is complete.
+
+      // number of bytes of the sequence in the buffer
+      std::size_t num_bytes_in_buf = 1;
+      // Find initial byte of the sequence
+      while (((*(from_end-num_bytes_in_buf) & 0b11000000) != 0b11000000)
+             && (num_bytes_in_buf < 4)
+             && (from_end-num_bytes_in_buf > from))
+        num_bytes_in_buf++;
+
+      // If the start of the sequence is not in the buffer, we need to
+      // continue with the invalid UTF-8 sequence to avoid an infinite loop.
+      // Check if we found an initial byte and if there are enough bytes in the
+      // buffer to complete the sequence.
+      if ((((*(from_end-num_bytes_in_buf) & 0b11100000) == 0b11000000)
+           && (num_bytes_in_buf < 2))  // incomplete 2-byte sequence
+          || (((*(from_end-num_bytes_in_buf) & 0b11110000) == 0b11100000)
+              && (num_bytes_in_buf < 3))  // incomplete 3-byte sequence
+          || (((*(from_end-num_bytes_in_buf) & 0b11111000) == 0b11110000)
+              && (num_bytes_in_buf < 4)))  // incomplete 4-byte sequence
+        pop_end = num_bytes_in_buf;
+    }
+
+  std::size_t srclen = (from_end-from-pop_end) * sizeof (InternT);
+  std::size_t length = (to_end-to) * sizeof (ExternT);
+  if (srclen < 1 || length < 1)
+    return std::codecvt<InternT, ExternT, StateT>::partial;
+
   // Convert from UTF-8 to output encoding
-  std::size_t srclen = (from_end-from) * sizeof (InternT);
-  std::size_t lengthp = (to_end-to) * sizeof (ExternT);
   const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (from);
   char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str, srclen,
-                                              &lengthp);
+                                              &length);
+
+  // The conversion failed.  Do not read from the (null) result buffer.
+  if (! enc_str)
+    return std::codecvt<InternT, ExternT, StateT>::error;
+
+  if (length < 1)
+    {
+      // The result buffer must be released even if it holds no output.
+      ::free (enc_str);
+      return std::codecvt<InternT, ExternT, StateT>::partial;
+    }

-  size_t max = to_end - to;
-  if (lengthp < max)
-    max = lengthp;
+  size_t max = (to_end - to) * sizeof (ExternT);
+  // FIXME: If the output encoding is a multibyte or variable byte encoding,
+  //        we should ensure that we don't cut off a "partial" multi-byte
+  //        character from the output.
+  //        Can this ever happen?
+  if (length < max)
+    max = length;

   // copy conversion result to output
-  // FIXME: Handle incomplete UTF-8 characters at end of buffer.
   std::copy_n (enc_str, max, to);
   ::free (enc_str);

   from_next = from + srclen;
   to_next = to + max;

-  return std::codecvt<InternT, ExternT, StateT>::ok;
+  return ((pop_end > 0 || max < length)
+          ? std::codecvt<InternT, ExternT, StateT>::partial
+          : std::codecvt<InternT, ExternT, StateT>::ok);
 }

 typename std::codecvt<InternT, ExternT, StateT>::result
--- a/liboctave/wrappers/uniconv-wrappers.c	Sun Apr 02 19:57:52 2023 -0700
+++ b/liboctave/wrappers/uniconv-wrappers.c	Mon Apr 03 18:23:09 2023 +0200
@@ -48,20 +48,83 @@
                                 src, srclen, NULL, NULL, lengthp);
 }

+// Convert the SRCLEN bytes of UTF-8 text at SRC to the encoding TOCODE.
+// HANDLER selects how unconvertible characters are treated.  If OFFSETS is
+// not NULL, it must point to SRCLEN elements that receive the offset in the
+// result for each input byte.  Returns a malloc'ed buffer whose length is
+// stored in *LENGTHP, or NULL on failure.
+static char *
+octave_u8_conv_to_encoding_intern (const char *tocode,
+                                   enum iconv_ilseq_handler handler,
+                                   const uint8_t *src, size_t srclen,
+                                   size_t *offsets, size_t *lengthp)
+{
+  // FIXME: It looks like the input to u8_conv_to_encoding must be at least
+  //        four bytes and zero-terminated to work correctly.  Zero-pad input.
+  //        Should this be fixed in gnulib or iconv instead?
+  size_t minlen = 4;
+  size_t padlen = (srclen > minlen ? srclen : minlen) + 1;
+  size_t npad = padlen - srclen;
+
+  uint8_t *u8_str = (uint8_t *) malloc (padlen);
+  if (! u8_str)
+    return NULL;
+
+  // Passing a null pointer to memcpy is undefined, even for a zero length.
+  if (srclen > 0)
+    memcpy (u8_str, src, srclen);
+  memset (u8_str + srclen, 0, npad);
+
+  // u8_conv_to_encoding stores one offset per input byte.  The caller's
+  // OFFSETS array only has room for SRCLEN elements, so collect the offsets
+  // of the padded input in a scratch array and copy back the relevant part.
+  size_t *pad_offsets = NULL;
+  if (offsets)
+    {
+      pad_offsets = (size_t *) malloc (padlen * sizeof (size_t));
+      if (! pad_offsets)
+        {
+          free (u8_str);
+          return NULL;
+        }
+    }
+
+  // Convert from UTF-8 to output encoding
+  char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen,
+                                   pad_offsets, NULL, lengthp);
+  free (u8_str);
+
+  // On failure, RET is NULL and *LENGTHP must not be interpreted.
+  if (ret)
+    {
+      if (offsets)
+        memcpy (offsets, pad_offsets, srclen * sizeof (size_t));
+
+      // FIXME: This assumes that "\0" is converted to a single byte.  This
+      //        might not be true for some exotic output encodings (like
+      //        UTF-7?).
+      *lengthp = (*lengthp <= npad ? 0 : *lengthp - npad);
+    }
+
+  free (pad_offsets);
+
+  return ret;
+}
+
 char *
 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src,
                             size_t srclen, size_t *lengthp)
 {
-  return u8_conv_to_encoding (tocode, iconveh_question_mark,
-                              src, srclen, NULL, NULL, lengthp);
+  return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark,
+                                            src, srclen, NULL, lengthp);
 }

 char *
 octave_u8_conv_to_encoding_strict (const char *tocode, const uint8_t *src,
                                    size_t srclen, size_t *lengthp)
 {
-  return u8_conv_to_encoding (tocode, iconveh_error,
-                              src, srclen, NULL, NULL, lengthp);
+  return octave_u8_conv_to_encoding_intern (tocode, iconveh_error,
+                                            src, srclen, NULL, lengthp);
 }

 char *
@@ -86,8 +149,8 @@
   (const char *tocode, const uint8_t *src, size_t srclen,
    size_t *offsets, size_t *lengthp)
 {
-  return u8_conv_to_encoding (tocode, iconveh_question_mark,
-                              src, srclen, offsets, NULL, lengthp);
+  return octave_u8_conv_to_encoding_intern (tocode, iconveh_question_mark,
+                                            src, srclen, offsets, lengthp);
 }

 char *
--- a/scripts/strings/unicode2native.m	Sun Apr 02 19:57:52 2023 -0700
+++ b/scripts/strings/unicode2native.m	Mon Apr 03 18:23:09 2023 +0200
@@ -79,6 +79,16 @@
 %!         uint8 ([164:166 0 167:170]));
 %!assert <*60480> (unicode2native (''), uint8 ([]))

+# short invalid UTF-8 input: each invalid byte must convert to "?" (63)
+%!testif HAVE_ICONV <*63930>
+%! assert (unicode2native (char (230), 'windows-1252'), uint8 (63));
+%! assert (unicode2native (char (249), 'windows-1252'), uint8 (63));
+%! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63]));
+%! assert (unicode2native (char (230:234), 'windows-1252'),
+%!         uint8 ([63, 63, 63, 63, 63]));
+%! assert (unicode2native (char ([230, 10]), 'windows-1252'),
+%!         uint8 ([63, 10]));
+
 %!error <Invalid call> unicode2native ()
 %!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test')
 %!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])
--- a/test/io.tst	Sun Apr 02 19:57:52 2023 -0700
+++ b/test/io.tst	Mon Apr 03 18:23:09 2023 +0200
@@ -1233,3 +1233,33 @@
 %!      endfor
 %!    endfor
 %!  endfor
+
+# write to a stream that transcodes its output to 'windows-1252'
+%!test <*63930>
+%! w_modes = {"wb", "wt"};
+%! # 64 non-ASCII characters that can be represented in 'windows-1252'
+%! f_texts{1} = repmat ('ÀÂÈÊÌàäéèêìîöòùû', 1, 4);
+%! # prepend a space so that the multi-byte characters start at odd offsets.
+%! f_texts{2} = [' ', f_texts{1}];
+%! # expected byte values of the character sequences in 'windows-1252'
+%! native_bytes{1} = repmat (...
+%!   [192 194 200 202 204 224 228 233 232 234 236 238 246 242 249 251], ...
+%!   1, 4).';
+%! native_bytes{2} = [double(' '); native_bytes{1}];
+%! for i_mode = 1:numel (w_modes)
+%!   for i_text = 1:numel (f_texts)
+%!     fname = tempname ();
+%!     fid = fopen (fname, w_modes{i_mode}, 'n', 'windows-1252');
+%!     unwind_protect
+%!       fprintf (fid, f_texts{i_text});
+%!       fclose (fid);
+%!       # reopen without encoding facet and read back the raw bytes
+%!       fid = fopen (fname, 'rb');
+%!       buf = fread (fid);
+%!       assert (buf, native_bytes{i_text});
+%!     unwind_protect_cleanup
+%!       fclose (fid);
+%!       unlink (fname);
+%!     end_unwind_protect
+%!   endfor
+%! endfor