changeset 27785:3f5026fd8da8

Create valid UTF-8 from .m file content (bug #57341). * oct-string.[cc,h] (octave::string::u8_validate): New function to validate UTF-8 encoded strings. This is done using the generic replacement character or by mapping invalid UTF-8 bytes to their corresponding Unicode code points. * strfns.cc (F__u8_validate__): New function. * input.cc (file_reader::get_input): Validate UTF-8 input.
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 04 Dec 2019 22:42:47 +0100
parents 873ef98668d1
children 188fb5415ab5
files libinterp/corefcn/input.cc libinterp/corefcn/strfns.cc liboctave/util/oct-string.cc liboctave/util/oct-string.h
diffstat 4 files changed, 114 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/input.cc	Sun Dec 08 14:42:04 2019 +0100
+++ b/libinterp/corefcn/input.cc	Wed Dec 04 22:42:47 2019 +0100
@@ -40,6 +40,7 @@
 #include "file-ops.h"
 #include "iconv-wrappers.h"
 #include "localcharset-wrapper.h"
+#include "oct-string.h"
 #include "quit.h"
 #include "str-vec.h"
 #include "uniconv-wrappers.h"
@@ -874,32 +875,39 @@
       encoding = mfile_encoding;
 
     if (encoding.compare ("utf-8") == 0)
-    {
-      // Check for BOM and strip it
-      if (src_str.compare (0, 3, "\xef\xbb\xbf") == 0)
-        src_str.erase (0, 3);
-    }
+      {
+        // Check for BOM and strip it
+        if (src_str.compare (0, 3, "\xef\xbb\xbf") == 0)
+          src_str.erase (0, 3);
+
+        // replace invalid portions of the string
+        // FIXME: Include file name that corresponds to m_file.
+        if (string::u8_validate ("get_input", src_str) > 0)
+          warning_with_id ("octave:get_input:invalid_utf8",
+                           "Invalid UTF-8 byte sequences have been replaced.");
+      }
     else
-    {
-      // convert encoding to UTF-8 before returning string
-      const char *src = src_str.c_str ();
-      size_t srclen = src_str.length ();
+      {
+        // convert encoding to UTF-8 before returning string
+        const char *src = src_str.c_str ();
+        size_t srclen = src_str.length ();
 
-      size_t length;
-      uint8_t *utf8_str;
+        size_t length;
+        uint8_t *utf8_str;
 
-      utf8_str = octave_u8_conv_from_encoding (encoding.c_str (), src, srclen,
-                                               &length);
+        utf8_str = octave_u8_conv_from_encoding (encoding.c_str (), src, srclen,
+                                                 &length);
 
-      if (! utf8_str)
-        error ("file_reader::get_input: converting from codepage '%s' to UTF-8: %s",
-               encoding.c_str (), std::strerror (errno));
+        if (! utf8_str)
+          error ("file_reader::get_input: "
+                 "converting from codepage '%s' to UTF-8: %s",
+                 encoding.c_str (), std::strerror (errno));
 
-      unwind_protect frame;
-      frame.add_fcn (::free, static_cast<void *> (utf8_str));
+        unwind_protect frame;
+        frame.add_fcn (::free, static_cast<void *> (utf8_str));
 
-      src_str = std::string (reinterpret_cast<char *> (utf8_str), length);
-    }
+        src_str = std::string (reinterpret_cast<char *> (utf8_str), length);
+      }
 
     return src_str;
   }
--- a/libinterp/corefcn/strfns.cc	Sun Dec 08 14:42:04 2019 +0100
+++ b/libinterp/corefcn/strfns.cc	Wed Dec 04 22:42:47 2019 +0100
@@ -1041,6 +1041,29 @@
 %!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6]);
 */
 
+DEFUN (__u8_validate__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{out_str} =} __u8_validate__ (@var{in_str})
+Return string with valid UTF-8.
+
+On encountering invalid UTF-8 in @var{in_str}, each offending byte is
+replaced by the Unicode replacement character U+FFFD (the UTF-8 byte
+sequence EF BF BD).
+
+@end deftypefn */)
+{
+  if (args.length () != 1)
+    print_usage ();
+
+  // Input check
+  std::string in_str =
+      args(0).xstring_value ("__u8_validate__: Not a string.");
+  // No fallback type is passed, so the default U8_REPLACEMENT_CHAR applies.
+  octave::string::u8_validate ("__u8_validate__", in_str);
+
+  return ovl (in_str);
+}
+
 DEFUN (newline, args, ,
        doc: /* -*- texinfo -*-
 @deftypefn {} {} newline
--- a/liboctave/util/oct-string.cc	Sun Dec 08 14:42:04 2019 +0100
+++ b/liboctave/util/oct-string.cc	Wed Dec 04 22:42:47 2019 +0100
@@ -35,6 +35,7 @@
 #include "lo-ieee.h"
 #include "lo-mappers.h"
 #include "uniconv-wrappers.h"
+#include "unistr-wrappers.h"
 #include "unwind-prot.h"
 
 template <typename T>
@@ -550,6 +551,58 @@
   return retval;
 }
 
+// Make IN_STR valid UTF-8 in place according to TYPE; WHO prefixes errors.
+// Returns the number of invalid bytes that were replaced.
+unsigned int
+octave::string::u8_validate (const std::string& who,
+                             std::string& in_str,
+                             const octave::string::u8_fallback_type type)
+{
+  std::string out_str;
+
+  unsigned int num_replacements = 0;
+  const char *in_chr = in_str.c_str ();
+  const char * const in_end = in_chr + in_str.length ();
+  while (in_chr < in_end)
+    {
+      const char *inv_utf8 = reinterpret_cast<const char *>
+          (octave_u8_check_wrapper (reinterpret_cast<const uint8_t *> (in_chr),
+                                    in_end - in_chr));
+      if (inv_utf8 == nullptr)
+        break;   // the rest of the input is valid
+
+      num_replacements++;
+      out_str.append (in_chr, inv_utf8 - in_chr);
+      in_chr = inv_utf8 + 1;   // skip only the one offending byte
+
+      if (type == U8_REPLACEMENT_CHAR)
+        out_str.append ("\xef\xbf\xbd");   // U+FFFD encoded as UTF-8
+      else if (type == U8_ISO_8859_1)
+        {
+          std::string fallback = "iso-8859-1";
+          size_t lengthp;
+          uint8_t *val_utf8 = octave_u8_conv_from_encoding
+                                (fallback.c_str (), inv_utf8, 1, &lengthp);
+
+          if (! val_utf8)
+            (*current_liboctave_error_handler)
+              ("%s: converting from codepage '%s' to UTF-8 failed: %s",
+               who.c_str (), fallback.c_str (), std::strerror (errno));
+
+          octave::unwind_protect frame;
+          frame.add_fcn (::free, static_cast<void *> (val_utf8));
+          out_str.append (reinterpret_cast<const char *> (val_utf8), lengthp);
+        }
+    }
+
+  // Input without invalid bytes is left untouched instead of being copied.
+  if (num_replacements == 0)
+    return 0;
+  out_str.append (in_chr, in_end - in_chr);   // valid tail, possibly empty
+  in_str.swap (out_str);
+  return num_replacements;
+}
+
 template <typename T>
 std::string
 rational_approx (T val, int len)
--- a/liboctave/util/oct-string.h	Sun Dec 08 14:42:04 2019 +0100
+++ b/liboctave/util/oct-string.h	Wed Dec 04 22:42:47 2019 +0100
@@ -133,6 +133,16 @@
     extern OCTAVE_API std::string
     u8_from_encoding (const std::string& who, const std::string& native_string,
                       const std::string& encoding);
+
+    enum u8_fallback_type
+    {
+      U8_REPLACEMENT_CHAR,  // each invalid byte becomes U+FFFD
+      U8_ISO_8859_1         // each invalid byte is read as ISO-8859-1
+    };
+
+    extern OCTAVE_API unsigned int   // number of invalid bytes replaced
+    u8_validate (const std::string& who, std::string& in_string,
+                 const u8_fallback_type type = U8_REPLACEMENT_CHAR);
   }
 }