Mercurial > octave
changeset 28313:d13ad9dc9348
Store character vectors in .mat files (-v6 or -v7) as UTF-16 (bug #58368).
* ls-mat5.cc (read_mat5_binary_element): Convert UTF-16 and UTF-32 vectors to
UTF-8. Take UTF-8 as is. Fall back to replacing non-ASCII characters with "?"
for non-convertable strings or ND arrays.
(maybe_convert_to_u16): Add function that converts a string vector to UTF-16.
(save_mat5_element_length): Get correct length if string is converted.
(save_mat5_binary_element): Convert character vectors to UTF-16 to be able to
correctly load them in Matlab if they contain non-ASCII characters.
* load-save.cc: Add tests for saving and loading different types of strings.
* unistr-wrappers.h, unistr-wrappers.cc (octave_u16_to_u8_wrapper,
octave_u32_to_u8_wrapper, octave_u8_to_u16_wrapper): New wrappers for gnulib
functions.
* bootstrap.conf: Add gnulib modules to list.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Fri, 15 May 2020 23:03:24 +0200 |
parents | dddc6c2661bc |
children | bc904ed5aad4 |
files | bootstrap.conf libinterp/corefcn/load-save.cc libinterp/corefcn/ls-mat5.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h |
diffstat | 5 files changed, 229 insertions(+), 62 deletions(-) [+] |
line wrap: on
line diff
--- a/bootstrap.conf Mon May 18 16:03:50 2020 +0200 +++ b/bootstrap.conf Fri May 15 23:03:24 2020 +0200 @@ -117,9 +117,12 @@ unictype/ctype-upper unictype/ctype-xdigit unistd + unistr/u16-to-u8 + unistr/u32-to-u8 unistr/u8-check unistr/u8-strmblen unistr/u8-strmbtouc + unistr/u8-to-u16 unistr/u8-to-u32 unlink unsetenv
--- a/libinterp/corefcn/load-save.cc Mon May 18 16:03:50 2020 +0200 +++ b/libinterp/corefcn/load-save.cc Fri May 15 23:03:24 2020 +0200 @@ -1820,6 +1820,46 @@ return load_save_sys.save (args, nargout); } +/* +## Save and load strings with "-v6" +%!test +%! A = A2 = ["foo"; "bar"]; +%! B = B2 = "foobar"; +%! C = C2 = {"foo", "bar"}; +%! D = D2 = {"Saint Barthélemy", "Saint Kitts and Nevis"}; +%! mat_file = [tempname(), ".mat"]; +%! unwind_protect +%! save (mat_file, "A", "B", "C", "D", "-v6"); +%! clear ("A", "B", "C", "D"); +%! load (mat_file); +%! unwind_protect_cleanup +%! unlink (mat_file); +%! end_unwind_protect +%! assert (A, A2); +%! assert (B, B2); +%! assert (C, C2); +%! assert (D, D2); + +## Save and load strings with "-v7" +%!testif HAVE_ZLIB +%! A = A2 = ["foo"; "bar"]; +%! B = B2 = "foobar"; +%! C = C2 = {"foo", "bar"}; +%! D = D2 = {"Saint Barthélemy", "Saint Kitts and Nevis"}; +%! mat_file = [tempname(), ".mat"]; +%! unwind_protect +%! save (mat_file, "A", "B", "C", "D", "-v7"); +%! clear ("A", "B", "C", "D"); +%! load (mat_file); +%! unwind_protect_cleanup +%! unlink (mat_file); +%! end_unwind_protect +%! assert (A, A2); +%! assert (B, B2); +%! assert (C, C2); +%! assert (D, D2); +*/ + DEFMETHOD (crash_dumps_octave_core, interp, args, nargout, doc: /* -*- texinfo -*- @deftypefn {} {@var{val} =} crash_dumps_octave_core ()
--- a/libinterp/corefcn/ls-mat5.cc Mon May 18 16:03:50 2020 +0200 +++ b/libinterp/corefcn/ls-mat5.cc Fri May 15 23:03:24 2020 +0200 @@ -50,6 +50,7 @@ #include "oct-time.h" #include "quit.h" #include "str-vec.h" +#include "unistr-wrappers.h" #include "Cell.h" #include "defaults.h" @@ -1404,58 +1405,84 @@ tc = ctmp; } - else + else if (arrayclass == MAT_FILE_CHAR_CLASS) { - if (arrayclass == MAT_FILE_CHAR_CLASS) + bool converted = false; + if (re.isvector () && (type == miUTF16 || type == miUINT16)) { - if (type == miUTF16 || type == miUTF32) - { - bool found_big_char = false; - for (octave_idx_type i = 0; i < n; i++) - { - if (re(i) > 127) - { - re(i) = '?'; - found_big_char = true; - } - } - - if (found_big_char) - warning_with_id ("Octave:load:unsupported-utf-char", - "load: can not read non-ASCII portions of UTF characters; replacing unreadable characters with '?'"); - } - else if (type == miUTF8) + uint16NDArray u16 = re; + const uint16_t *u16_str + = reinterpret_cast<const uint16_t *> (u16.data ()); + + // Convert to UTF-8. + size_t n8; + uint8_t *u8_str = octave_u16_to_u8_wrapper (u16_str, + u16.numel (), + nullptr, &n8); + if (u8_str) { - // Search for multi-byte encoded UTF8 characters and - // replace with 0x3F for '?'... Give the user a warning - - bool utf8_multi_byte = false; - for (octave_idx_type i = 0; i < n; i++) + // FIXME: Is there a better way to construct a charMatrix + // from a non zero terminated buffer? + tc = charMatrix (std::string (reinterpret_cast<char *> (u8_str), n8)); + free (u8_str); + converted = true; + } + } + else if (re.isvector () && (type == miUTF32 || type == miUINT32)) + { + uint32NDArray u32 = re; + const uint32_t *u32_str + = reinterpret_cast<const uint32_t *> (u32.data ()); + + // Convert to UTF-8. + size_t n8; + uint8_t *u8_str = octave_u32_to_u8_wrapper (u32_str, + u32.numel (), + nullptr, &n8); + if (u8_str) + { + // FIXME: Is there a better way to construct a charMatrix + // from a non zero terminated buffer? + tc = charMatrix (std::string (reinterpret_cast<char *> (u8_str), n8)); + free (u8_str); + converted = true; + } + } + else if (type == miUTF8 || type == miUINT8) + { + // Octave's internal encoding is UTF-8. So we should be + // able to use this natively. + tc = re; + tc = tc.convert_to_str (false, true, '\''); + converted = true; + } + + if (! converted) + { + // Fall back to manually replacing non-ASCII + // characters by "?". + bool found_big_char = false; + for (octave_idx_type i = 0; i < n; i++) + { + if (re(i) > 127) { - unsigned char a = static_cast<unsigned char> (re(i)); - if (a > 0x7f) - utf8_multi_byte = true; - } - - if (utf8_multi_byte) - { - warning_with_id ("Octave:load:unsupported-utf-char", - "load: can not read multi-byte encoded UTF8 characters; replacing unreadable characters with '?'"); - for (octave_idx_type i = 0; i < n; i++) - { - unsigned char a - = static_cast<unsigned char> (re(i)); - if (a > 0x7f) - re(i) = '?'; - } + re(i) = '?'; + found_big_char = true; } } + + if (found_big_char) + warning_with_id ("Octave:load:unsupported-utf-char", + "load: failed to convert from input to UTF-8; " + "replacing non-ASCII characters with '?'"); + tc = re; tc = tc.convert_to_str (false, true, '\''); } - else - tc = re; + } + else + tc = re; } } @@ -2057,6 +2084,24 @@ return ret; } +static uint16_t * +maybe_convert_to_u16 (const charNDArray& chm, size_t& n16_str) +{ + uint16_t *u16_str; + dim_vector dv = chm.dims (); + + if (chm.ndims () == 2 && dv(0) == 1) + { + const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (chm.data ()); + u16_str = octave_u8_to_u16_wrapper (u8_str, chm.numel (), + nullptr, &n16_str); + } + else + u16_str = nullptr; + + return u16_str; +} + int save_mat5_element_length (const octave_value& tc, const std::string& name, bool save_as_floats, bool mat7_format) @@ -2074,9 +2119,23 @@ if (tc.is_string ()) { charNDArray chm = tc.char_array_value (); + // convert to UTF-16 + size_t n16_str; + uint16_t *u16_str = maybe_convert_to_u16 (chm, n16_str); ret += 8; - if (chm.numel () > 2) - ret += PAD (2 * chm.numel ()); + + octave_idx_type str_len; + if (u16_str) + { + // Count number of elements in converted string + str_len = 2 * n16_str; + free (u16_str); + } + else + str_len = chm.numel (); + + if (str_len > 2) + ret += PAD (str_len); } else if (tc.issparse ()) { @@ -2401,15 +2460,36 @@ write_mat5_tag (os, miINT32, dim_len); - for (int i = 0; i < nd; i++) + // Strings need to be converted here (or dim-vector will be off). + charNDArray chm; + uint16_t *u16_str; + size_t n16_str; + bool conv_u16 = false; + if (tc.is_string ()) { - int32_t n = dv(i); + chm = tc.char_array_value (); + u16_str = maybe_convert_to_u16 (chm, n16_str); + + if (u16_str) + conv_u16 = true; + } + + if (conv_u16) + { + int32_t n = 1; os.write (reinterpret_cast<char *> (&n), 4); + os.write (reinterpret_cast<char *> (&n16_str), 4); } + else + for (int i = 0; i < nd; i++) + { + int32_t n = dv(i); + os.write (reinterpret_cast<char *> (&n), 4); + } if (PAD (dim_len) > dim_len) { - static char buf[9]="\x00\x00\x00\x00\x00\x00\x00\x00"; + static char buf[9] = "\x00\x00\x00\x00\x00\x00\x00\x00"; os.write (buf, PAD (dim_len) - dim_len); } @@ -2432,24 +2512,35 @@ // data element if (tc.is_string ()) { - charNDArray chm = tc.char_array_value (); - octave_idx_type nel = chm.numel (); - octave_idx_type len = nel*2; - octave_idx_type paddedlength = PAD (len); - - OCTAVE_LOCAL_BUFFER (int16_t, buf, nel+3); - write_mat5_tag (os, miUINT16, len); - - const char *s = chm.data (); - - for (octave_idx_type i = 0; i < nel; i++) - buf[i] = *s++ & 0x00FF; - - os.write (reinterpret_cast<char *> (buf), len); + octave_idx_type len; + octave_idx_type paddedlength; + + if (conv_u16) + { + // converted UTF-16 + len = n16_str*2; + paddedlength = PAD (len); + + write_mat5_tag (os, miUTF16, len); + + os.write (reinterpret_cast<char *> (u16_str), len); + + free (u16_str); + } + else + { + // write as UTF-8 + len = chm.numel (); + paddedlength = PAD (len); + + write_mat5_tag (os, miUTF8, len); + + os.write (chm.data (), len); + } if (paddedlength > len) { - static char padbuf[9]="\x00\x00\x00\x00\x00\x00\x00\x00"; + static char padbuf[9] = "\x00\x00\x00\x00\x00\x00\x00\x00"; os.write (padbuf, paddedlength - len); } }
--- a/liboctave/wrappers/unistr-wrappers.c Mon May 18 16:03:50 2020 +0200 +++ b/liboctave/wrappers/unistr-wrappers.c Fri May 15 23:03:24 2020 +0200 @@ -49,6 +49,27 @@ return u8_strmbtouc (puc, src); } +uint8_t * +octave_u16_to_u8_wrapper (const uint16_t *src, size_t src_len, + uint8_t *result_buf, size_t *lengthp) +{ + return u16_to_u8 (src, src_len, result_buf, lengthp); +} + +uint8_t * +octave_u32_to_u8_wrapper (const uint32_t *src, size_t src_len, + uint8_t *result_buf, size_t *lengthp) +{ + return u32_to_u8 (src, src_len, result_buf, lengthp); +} + +uint16_t * +octave_u8_to_u16_wrapper (const uint8_t *src, size_t src_len, + uint16_t *result_buf, size_t *lengthp) +{ + return u8_to_u16 (src, src_len, result_buf, lengthp); +} + uint32_t * octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len, uint32_t *result_buf, size_t *lengthp)
--- a/liboctave/wrappers/unistr-wrappers.h Mon May 18 16:03:50 2020 +0200 +++ b/liboctave/wrappers/unistr-wrappers.h Fri May 15 23:03:24 2020 +0200 @@ -39,6 +39,18 @@ extern int octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src); +extern uint8_t * +octave_u16_to_u8_wrapper (const uint16_t *src, size_t src_len, + uint8_t *result_buf, size_t *lengthp); + +extern uint8_t * +octave_u32_to_u8_wrapper (const uint32_t *src, size_t src_len, + uint8_t *result_buf, size_t *lengthp); + +extern uint16_t * +octave_u8_to_u16_wrapper (const uint8_t *src, size_t src_len, + uint16_t *result_buf, size_t *lengthp); + extern uint32_t * octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len, uint32_t *result_buf, size_t *lengthp);