Mercurial > octave
changeset 27785:3f5026fd8da8
Create valid UTF-8 from .m file content (bug #57341).
* oct-string.[cc,h] (octave::string::u8_validate): New function to validate
UTF-8 encoded strings. This is done using the generic replacement character or
by mapping invalid UTF-8 bytes to their corresponding Unicode code points.
* strfns.cc (F__u8_validate__): New function.
* input.cc (file_reader::get_input): Validate UTF-8 input.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Wed, 04 Dec 2019 22:42:47 +0100 |
parents | 873ef98668d1 |
children | 188fb5415ab5 |
files | libinterp/corefcn/input.cc libinterp/corefcn/strfns.cc liboctave/util/oct-string.cc liboctave/util/oct-string.h |
diffstat | 4 files changed, 114 insertions(+), 20 deletions(-) [+] |
line wrap: on
line diff
--- a/libinterp/corefcn/input.cc Sun Dec 08 14:42:04 2019 +0100 +++ b/libinterp/corefcn/input.cc Wed Dec 04 22:42:47 2019 +0100 @@ -40,6 +40,7 @@ #include "file-ops.h" #include "iconv-wrappers.h" #include "localcharset-wrapper.h" +#include "oct-string.h" #include "quit.h" #include "str-vec.h" #include "uniconv-wrappers.h" @@ -874,32 +875,39 @@ encoding = mfile_encoding; if (encoding.compare ("utf-8") == 0) - { - // Check for BOM and strip it - if (src_str.compare (0, 3, "\xef\xbb\xbf") == 0) - src_str.erase (0, 3); - } + { + // Check for BOM and strip it + if (src_str.compare (0, 3, "\xef\xbb\xbf") == 0) + src_str.erase (0, 3); + + // replace invalid portions of the string + // FIXME: Include file name that corresponds to m_file. + if (string::u8_validate ("get_input", src_str) > 0) + warning_with_id ("octave:get_input:invalid_utf8", + "Invalid UTF-8 byte sequences have been replaced."); + } else - { - // convert encoding to UTF-8 before returning string - const char *src = src_str.c_str (); - size_t srclen = src_str.length (); + { + // convert encoding to UTF-8 before returning string + const char *src = src_str.c_str (); + size_t srclen = src_str.length (); - size_t length; - uint8_t *utf8_str; + size_t length; + uint8_t *utf8_str; - utf8_str = octave_u8_conv_from_encoding (encoding.c_str (), src, srclen, - &length); + utf8_str = octave_u8_conv_from_encoding (encoding.c_str (), src, srclen, + &length); - if (! utf8_str) - error ("file_reader::get_input: converting from codepage '%s' to UTF-8: %s", - encoding.c_str (), std::strerror (errno)); + if (! utf8_str) + error ("file_reader::get_input: " + "converting from codepage '%s' to UTF-8: %s", + encoding.c_str (), std::strerror (errno)); - unwind_protect frame; - frame.add_fcn (::free, static_cast<void *> (utf8_str)); + unwind_protect frame; + frame.add_fcn (::free, static_cast<void *> (utf8_str)); - src_str = std::string (reinterpret_cast<char *> (utf8_str), length); - } + src_str = std::string (reinterpret_cast<char *> (utf8_str), length); + } return src_str; }
--- a/libinterp/corefcn/strfns.cc Sun Dec 08 14:42:04 2019 +0100 +++ b/libinterp/corefcn/strfns.cc Wed Dec 04 22:42:47 2019 +0100 @@ -1041,6 +1041,29 @@ %!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6]); */ +DEFUN (__u8_validate__, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{out_str} =} __u8_validate__ (in_str) +Return string with valid UTF-8. + +On encountering invalid UTF-8, the bytes are interpreted as the Unicode code +points U+0080–U+00FF with the same value as the byte, thus interpreting the +bytes according to ISO-8859-1. + +@end deftypefn */) +{ + if (args.length () != 1) + print_usage (); + + // Input check + std::string in_str = + args(0).xstring_value ("__u8_validate__: Not a string."); + + octave::string::u8_validate ("__u8_validate__", in_str); + + return ovl (in_str); +} + DEFUN (newline, args, , doc: /* -*- texinfo -*- @deftypefn {} {} newline
--- a/liboctave/util/oct-string.cc Sun Dec 08 14:42:04 2019 +0100 +++ b/liboctave/util/oct-string.cc Wed Dec 04 22:42:47 2019 +0100 @@ -35,6 +35,7 @@ #include "lo-ieee.h" #include "lo-mappers.h" #include "uniconv-wrappers.h" +#include "unistr-wrappers.h" #include "unwind-prot.h" template <typename T> @@ -550,6 +551,58 @@ return retval; } +unsigned int +octave::string::u8_validate (const std::string& who, + std::string& in_str, + const octave::string::u8_fallback_type type) +{ + std::string out_str; + + unsigned int num_replacements = 0; + const char *in_chr = in_str.c_str (); + const char *inv_utf8 = in_chr; + const char * const in_end = in_chr + in_str.length (); + while (inv_utf8 && in_chr < in_end) + { + inv_utf8 = reinterpret_cast<const char *> + (octave_u8_check_wrapper (reinterpret_cast<const uint8_t *> (in_chr), + in_end - in_chr)); + + if (inv_utf8 == nullptr) + out_str.append (in_chr, in_end - in_chr); + else + { + num_replacements++; + out_str.append (in_chr, inv_utf8 - in_chr); + in_chr = inv_utf8 + 1; + + if (type == U8_REPLACEMENT_CHAR) + out_str.append ("\xef\xbf\xbd"); + else if (type == U8_ISO_8859_1) + { + std::string fallback = "iso-8859-1"; + size_t lengthp; + uint8_t *val_utf8 = octave_u8_conv_from_encoding + (fallback.c_str (), inv_utf8, 1, &lengthp); + + if (! val_utf8) + (*current_liboctave_error_handler) + ("%s: converting from codepage '%s' to UTF-8 failed: %s", + who.c_str (), fallback.c_str (), std::strerror (errno)); + + octave::unwind_protect frame; + frame.add_fcn (::free, static_cast<void *> (val_utf8)); + + out_str.append (reinterpret_cast<const char *> (val_utf8), + lengthp); + } + } + } + + in_str = out_str; + return num_replacements; +} + template <typename T> std::string rational_approx (T val, int len)
--- a/liboctave/util/oct-string.h Sun Dec 08 14:42:04 2019 +0100 +++ b/liboctave/util/oct-string.h Wed Dec 04 22:42:47 2019 +0100 @@ -133,6 +133,16 @@ extern OCTAVE_API std::string u8_from_encoding (const std::string& who, const std::string& native_string, const std::string& encoding); + + enum u8_fallback_type + { + U8_REPLACEMENT_CHAR, + U8_ISO_8859_1 + }; + + extern OCTAVE_API unsigned int + u8_validate (const std::string& who, std::string& in_string, + const u8_fallback_type type = U8_REPLACEMENT_CHAR); } }