Mercurial > octave
diff liboctave/util/oct-string.cc @ 27785:3f5026fd8da8
Create valid UTF-8 from .m file content (bug #57341).
* oct-string.[cc,h] (octave::string::u8_validate): New function to validate
UTF-8 encoded strings. This is done using the generic replacement character or
by mapping invalid UTF-8 bytes to their corresponding Unicode code points.
* strfns.cc (F__u8_validate__): New function.
* input.cc (file_reader::get_input): Validate UTF-8 input.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Wed, 04 Dec 2019 22:42:47 +0100 |
parents | d503426130bf |
children | b442ec6dda5c |
line wrap: on
line diff
--- a/liboctave/util/oct-string.cc Sun Dec 08 14:42:04 2019 +0100
+++ b/liboctave/util/oct-string.cc Wed Dec 04 22:42:47 2019 +0100
@@ -35,6 +35,7 @@
 #include "lo-ieee.h"
 #include "lo-mappers.h"
 #include "uniconv-wrappers.h"
+#include "unistr-wrappers.h"
 #include "unwind-prot.h"
 
 template <typename T>
@@ -550,6 +551,81 @@
   return retval;
 }
 
+// Validate the UTF-8 encoding of IN_STR and repair it in place.
+//
+// Each byte at which validation fails is counted and handled according
+// to TYPE: U8_REPLACEMENT_CHAR substitutes U+FFFD for it, U8_ISO_8859_1
+// converts it from ISO-8859-1 to UTF-8.  For any other TYPE the byte is
+// dropped.  WHO prefixes the error raised if that conversion fails.
+//
+// Returns the number of replaced bytes.  IN_STR is left untouched if it
+// is already valid.
+
+unsigned int
+octave::string::u8_validate (const std::string& who,
+                             std::string& in_str,
+                             const octave::string::u8_fallback_type type)
+{
+  // Source encoding used to map stray bytes for U8_ISO_8859_1.
+  const char * const fallback = "iso-8859-1";
+
+  std::string out_str;
+  unsigned int num_replacements = 0;
+
+  const char *in_chr = in_str.c_str ();
+  const char * const in_end = in_chr + in_str.length ();
+
+  while (in_chr < in_end)
+    {
+      // Null if the rest of the input is valid, otherwise the position
+      // of the first offending byte.
+      const char *inv_utf8 = reinterpret_cast<const char *>
+        (octave_u8_check_wrapper (reinterpret_cast<const uint8_t *> (in_chr),
+                                  in_end - in_chr));
+
+      if (! inv_utf8)
+        {
+          // Common case: nothing was replaced, so there is nothing to
+          // rebuild or copy back.
+          if (num_replacements == 0)
+            return 0;
+
+          out_str.append (in_chr, in_end - in_chr);
+          break;
+        }
+
+      // Keep the valid prefix and step over the single invalid byte.
+      num_replacements++;
+      out_str.append (in_chr, inv_utf8 - in_chr);
+      in_chr = inv_utf8 + 1;
+
+      if (type == U8_REPLACEMENT_CHAR)
+        out_str.append ("\xef\xbf\xbd");
+      else if (type == U8_ISO_8859_1)
+        {
+          size_t lengthp;
+          uint8_t *val_utf8
+            = octave_u8_conv_from_encoding (fallback, inv_utf8, 1, &lengthp);
+
+          // NOTE(review): relies on the error handler not returning.
+          if (! val_utf8)
+            (*current_liboctave_error_handler)
+              ("%s: converting from codepage '%s' to UTF-8 failed: %s",
+               who.c_str (), fallback, std::strerror (errno));
+
+          // Register ::free for the converted buffer; out_str takes a copy.
+          octave::unwind_protect frame;
+          frame.add_fcn (::free, static_cast<void *> (val_utf8));
+
+          out_str.append (reinterpret_cast<const char *> (val_utf8),
+                          lengthp);
+        }
+    }
+
+  in_str.swap (out_str);
+  return num_replacements;
+}
+
 template <typename T>
 std::string
 rational_approx (T val, int len)