changeset 28313:d13ad9dc9348

Store character vectors in .mat files (-v6 or -v7) as UTF-16 (bug #58368). * ls-mat5.cc (read_mat5_binary_element): Convert UTF-16 and UTF-32 vectors to UTF-8. Take UTF-8 as is. Fall back to replacing non-ASCII characters with "?" for non-convertable strings or ND arrays. (maybe_convert_to_u16): Add function that converts a string vector to UTF-16. (save_mat5_element_length): Get correct length if string is converted. (save_mat5_binary_element): Convert character vectors to UTF-16 to be able to correctly load them in Matlab if they contain non-ASCII characters. * load-save.cc: Add tests for saving and loading different types of strings. * unistr-wrappers.h, unistr-wrappers.cc (octave_u16_to_u8_wrapper, octave_u32_to_u8_wrapper, octave_u8_to_u16_wrapper): New wrappers for gnulib functions. * bootstrap.conf: Add gnulib modules to list.
author Markus Mützel <markus.muetzel@gmx.de>
date Fri, 15 May 2020 23:03:24 +0200
parents dddc6c2661bc
children bc904ed5aad4
files bootstrap.conf libinterp/corefcn/load-save.cc libinterp/corefcn/ls-mat5.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h
diffstat 5 files changed, 229 insertions(+), 62 deletions(-) [+]
line wrap: on
line diff
--- a/bootstrap.conf	Mon May 18 16:03:50 2020 +0200
+++ b/bootstrap.conf	Fri May 15 23:03:24 2020 +0200
@@ -117,9 +117,12 @@
   unictype/ctype-upper
   unictype/ctype-xdigit
   unistd
+  unistr/u16-to-u8
+  unistr/u32-to-u8
   unistr/u8-check
   unistr/u8-strmblen
   unistr/u8-strmbtouc
+  unistr/u8-to-u16
   unistr/u8-to-u32
   unlink
   unsetenv
--- a/libinterp/corefcn/load-save.cc	Mon May 18 16:03:50 2020 +0200
+++ b/libinterp/corefcn/load-save.cc	Fri May 15 23:03:24 2020 +0200
@@ -1820,6 +1820,46 @@
   return load_save_sys.save (args, nargout);
 }
 
+/*
+## Save and load strings with "-v6"
+%!test
+%! A = A2 = ["foo"; "bar"];
+%! B = B2 = "foobar";
+%! C = C2 = {"foo", "bar"};
+%! D = D2 = {"Saint Barthélemy", "Saint Kitts and Nevis"};
+%! mat_file = [tempname(), ".mat"];
+%! unwind_protect
+%!   save (mat_file, "A", "B", "C", "D", "-v6");
+%!   clear ("A", "B", "C", "D");
+%!   load (mat_file);
+%! unwind_protect_cleanup
+%!   unlink (mat_file);
+%! end_unwind_protect
+%! assert (A, A2);
+%! assert (B, B2);
+%! assert (C, C2);
+%! assert (D, D2);
+
+## Save and load strings with "-v7"
+%!testif HAVE_ZLIB
+%! A = A2 = ["foo"; "bar"];
+%! B = B2 = "foobar";
+%! C = C2 = {"foo", "bar"};
+%! D = D2 = {"Saint Barthélemy", "Saint Kitts and Nevis"};
+%! mat_file = [tempname(), ".mat"];
+%! unwind_protect
+%!   save (mat_file, "A", "B", "C", "D", "-v7");
+%!   clear ("A", "B", "C", "D");
+%!   load (mat_file);
+%! unwind_protect_cleanup
+%!   unlink (mat_file);
+%! end_unwind_protect
+%! assert (A, A2);
+%! assert (B, B2);
+%! assert (C, C2);
+%! assert (D, D2);
+*/
+
 DEFMETHOD (crash_dumps_octave_core, interp, args, nargout,
            doc: /* -*- texinfo -*-
 @deftypefn  {} {@var{val} =} crash_dumps_octave_core ()
--- a/libinterp/corefcn/ls-mat5.cc	Mon May 18 16:03:50 2020 +0200
+++ b/libinterp/corefcn/ls-mat5.cc	Fri May 15 23:03:24 2020 +0200
@@ -50,6 +50,7 @@
 #include "oct-time.h"
 #include "quit.h"
 #include "str-vec.h"
+#include "unistr-wrappers.h"
 
 #include "Cell.h"
 #include "defaults.h"
@@ -1404,58 +1405,84 @@
 
             tc = ctmp;
           }
-        else
+        else if (arrayclass == MAT_FILE_CHAR_CLASS)
           {
-            if (arrayclass == MAT_FILE_CHAR_CLASS)
+            bool converted = false;
+            if (re.isvector () && (type == miUTF16 || type == miUINT16))
               {
-                if (type == miUTF16 || type == miUTF32)
-                  {
-                    bool found_big_char = false;
-                    for (octave_idx_type i = 0; i < n; i++)
-                      {
-                        if (re(i) > 127)
-                          {
-                            re(i) = '?';
-                            found_big_char = true;
-                          }
-                      }
-
-                    if (found_big_char)
-                      warning_with_id ("Octave:load:unsupported-utf-char",
-                               "load: can not read non-ASCII portions of UTF characters; replacing unreadable characters with '?'");
-                  }
-                else if (type == miUTF8)
+                uint16NDArray u16 = re;
+                const uint16_t *u16_str
+                  = reinterpret_cast<const uint16_t *> (u16.data ());
+
+                // Convert to UTF-8.
+                size_t n8;
+                uint8_t *u8_str = octave_u16_to_u8_wrapper (u16_str,
+                                                            u16.numel (),
+                                                            nullptr, &n8);
+                if (u8_str)
                   {
-                    // Search for multi-byte encoded UTF8 characters and
-                    // replace with 0x3F for '?'...  Give the user a warning
-
-                    bool utf8_multi_byte = false;
-                    for (octave_idx_type i = 0; i < n; i++)
+                    // FIXME: Is there a better way to construct a charMatrix
+                    // from a non zero terminated buffer?
+                    tc = charMatrix (std::string (reinterpret_cast<char *> (u8_str), n8));
+                    free (u8_str);
+                    converted = true;
+                  }
+              }
+            else if (re.isvector () && (type == miUTF32 || type == miUINT32))
+              {
+                uint32NDArray u32 = re;
+                const uint32_t *u32_str
+                  = reinterpret_cast<const uint32_t *> (u32.data ());
+
+                // Convert to UTF-8.
+                size_t n8;
+                uint8_t *u8_str = octave_u32_to_u8_wrapper (u32_str,
+                                                            u32.numel (),
+                                                            nullptr, &n8);
+                if (u8_str)
+                  {
+                    // FIXME: Is there a better way to construct a charMatrix
+                    // from a non zero terminated buffer?
+                    tc = charMatrix (std::string (reinterpret_cast<char *> (u8_str), n8));
+                    free (u8_str);
+                    converted = true;
+                  }
+              }
+            else if (type == miUTF8 || type == miUINT8)
+              {
+                // Octave's internal encoding is UTF-8.  So we should be
+                // able to use this natively.
+                tc = re;
+                tc = tc.convert_to_str (false, true, '\'');
+                converted = true;
+              }
+
+            if (! converted)
+              {
+                // Fall back to manually replacing non-ASCII
+                // characters by "?".
+                bool found_big_char = false;
+                for (octave_idx_type i = 0; i < n; i++)
+                  {
+                    if (re(i) > 127)
                       {
-                        unsigned char a = static_cast<unsigned char> (re(i));
-                        if (a > 0x7f)
-                          utf8_multi_byte = true;
-                      }
-
-                    if (utf8_multi_byte)
-                      {
-                        warning_with_id ("Octave:load:unsupported-utf-char",
-                                         "load: can not read multi-byte encoded UTF8 characters; replacing unreadable characters with '?'");
-                        for (octave_idx_type i = 0; i < n; i++)
-                          {
-                            unsigned char a
-                              = static_cast<unsigned char> (re(i));
-                            if (a > 0x7f)
-                              re(i) = '?';
-                          }
+                        re(i) = '?';
+                        found_big_char = true;
                       }
                   }
+
+                if (found_big_char)
+                  warning_with_id ("Octave:load:unsupported-utf-char",
+                    "load: failed to convert from input to UTF-8; "
+                    "replacing non-ASCII characters with '?'");
+
                 tc = re;
                 tc = tc.convert_to_str (false, true, '\'');
               }
-            else
-              tc = re;
+
           }
+        else
+          tc = re;
       }
     }
 
@@ -2057,6 +2084,24 @@
   return ret;
 }
 
+static uint16_t *
+maybe_convert_to_u16 (const charNDArray& chm, size_t& n16_str)
+{
+  uint16_t *u16_str;
+  dim_vector dv = chm.dims ();
+
+  if (chm.ndims () == 2 && dv(0) == 1)
+    {
+      const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (chm.data ());
+      u16_str = octave_u8_to_u16_wrapper (u8_str, chm.numel (),
+                                          nullptr, &n16_str);
+    }
+  else
+    u16_str = nullptr;
+
+  return u16_str;
+}
+
 int
 save_mat5_element_length (const octave_value& tc, const std::string& name,
                           bool save_as_floats, bool mat7_format)
@@ -2074,9 +2119,23 @@
   if (tc.is_string ())
     {
       charNDArray chm = tc.char_array_value ();
+      // convert to UTF-16
+      size_t n16_str;
+      uint16_t *u16_str = maybe_convert_to_u16 (chm, n16_str);
       ret += 8;
-      if (chm.numel () > 2)
-        ret += PAD (2 * chm.numel ());
+
+      octave_idx_type str_len;
+      if (u16_str)
+        {
+          // Count number of elements in converted string
+          str_len = 2 * n16_str;
+          free (u16_str);
+        }
+      else
+        str_len = chm.numel ();
+
+      if (str_len > 2)
+        ret += PAD (str_len);
     }
   else if (tc.issparse ())
     {
@@ -2401,15 +2460,36 @@
 
   write_mat5_tag (os, miINT32, dim_len);
 
-  for (int i = 0; i < nd; i++)
+  // Strings need to be converted here (or dim-vector will be off).
+  charNDArray chm;
+  uint16_t *u16_str;
+  size_t n16_str;
+  bool conv_u16 = false;
+  if (tc.is_string ())
     {
-      int32_t n = dv(i);
+      chm = tc.char_array_value ();
+      u16_str = maybe_convert_to_u16 (chm, n16_str);
+
+      if (u16_str)
+        conv_u16 = true;
+    }
+
+  if (conv_u16)
+    {
+      int32_t n = 1;
       os.write (reinterpret_cast<char *> (&n), 4);
+      os.write (reinterpret_cast<char *> (&n16_str), 4);
     }
+  else
+    for (int i = 0; i < nd; i++)
+      {
+        int32_t n = dv(i);
+        os.write (reinterpret_cast<char *> (&n), 4);
+      }
 
   if (PAD (dim_len) > dim_len)
     {
-      static char buf[9]="\x00\x00\x00\x00\x00\x00\x00\x00";
+      static char buf[9] = "\x00\x00\x00\x00\x00\x00\x00\x00";
       os.write (buf, PAD (dim_len) - dim_len);
     }
 
@@ -2432,24 +2512,35 @@
   // data element
   if (tc.is_string ())
     {
-      charNDArray chm = tc.char_array_value ();
-      octave_idx_type nel = chm.numel ();
-      octave_idx_type len = nel*2;
-      octave_idx_type paddedlength = PAD (len);
-
-      OCTAVE_LOCAL_BUFFER (int16_t, buf, nel+3);
-      write_mat5_tag (os, miUINT16, len);
-
-      const char *s = chm.data ();
-
-      for (octave_idx_type i = 0; i < nel; i++)
-        buf[i] = *s++ & 0x00FF;
-
-      os.write (reinterpret_cast<char *> (buf), len);
+      octave_idx_type len;
+      octave_idx_type paddedlength;
+
+      if (conv_u16)
+        {
+          // converted UTF-16
+          len = n16_str*2;
+          paddedlength = PAD (len);
+
+          write_mat5_tag (os, miUTF16, len);
+
+          os.write (reinterpret_cast<char *> (u16_str), len);
+
+          free (u16_str);
+        }
+      else
+        {
+          // write as UTF-8
+          len = chm.numel ();
+          paddedlength = PAD (len);
+
+          write_mat5_tag (os, miUTF8, len);
+
+          os.write (chm.data (), len);
+        }
 
       if (paddedlength > len)
         {
-          static char padbuf[9]="\x00\x00\x00\x00\x00\x00\x00\x00";
+          static char padbuf[9] = "\x00\x00\x00\x00\x00\x00\x00\x00";
           os.write (padbuf, paddedlength - len);
         }
     }
--- a/liboctave/wrappers/unistr-wrappers.c	Mon May 18 16:03:50 2020 +0200
+++ b/liboctave/wrappers/unistr-wrappers.c	Fri May 15 23:03:24 2020 +0200
@@ -49,6 +49,27 @@
   return u8_strmbtouc (puc, src);
 }
 
+uint8_t *
+octave_u16_to_u8_wrapper (const uint16_t *src, size_t src_len,
+                          uint8_t *result_buf, size_t *lengthp)
+{
+  return u16_to_u8 (src, src_len, result_buf, lengthp);
+}
+
+uint8_t *
+octave_u32_to_u8_wrapper (const uint32_t *src, size_t src_len,
+                          uint8_t *result_buf, size_t *lengthp)
+{
+  return u32_to_u8 (src, src_len, result_buf, lengthp);
+}
+
+uint16_t *
+octave_u8_to_u16_wrapper (const uint8_t *src, size_t src_len,
+                          uint16_t *result_buf, size_t *lengthp)
+{
+  return u8_to_u16 (src, src_len, result_buf, lengthp);
+}
+
 uint32_t *
 octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len,
                           uint32_t *result_buf, size_t *lengthp)
--- a/liboctave/wrappers/unistr-wrappers.h	Mon May 18 16:03:50 2020 +0200
+++ b/liboctave/wrappers/unistr-wrappers.h	Fri May 15 23:03:24 2020 +0200
@@ -39,6 +39,18 @@
 extern int
 octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src);
 
+extern uint8_t *
+octave_u16_to_u8_wrapper (const uint16_t *src, size_t src_len,
+                          uint8_t *result_buf, size_t *lengthp);
+
+extern uint8_t *
+octave_u32_to_u8_wrapper (const uint32_t *src, size_t src_len,
+                          uint8_t *result_buf, size_t *lengthp);
+
+extern uint16_t *
+octave_u8_to_u16_wrapper (const uint8_t *src, size_t src_len,
+                          uint16_t *result_buf, size_t *lengthp);
+
 extern uint32_t *
 octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len,
                           uint32_t *result_buf, size_t *lengthp);