diff liboctave/util/oct-string.cc @ 27785:3f5026fd8da8

Create valid UTF-8 from .m file content (bug #57341). * oct-string.[cc,h] (octave::string::u8_validate): New function to validate UTF-8 encoded strings. This is done using the generic replacement character or by mapping invalid UTF-8 bytes to their corresponding Unicode code points. * strfns.cc (F__u8_validate__): New function. * input.cc (file_reader::get_input): Validate UTF-8 input.
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 04 Dec 2019 22:42:47 +0100
parents d503426130bf
children b442ec6dda5c
line wrap: on
line diff
--- a/liboctave/util/oct-string.cc	Sun Dec 08 14:42:04 2019 +0100
+++ b/liboctave/util/oct-string.cc	Wed Dec 04 22:42:47 2019 +0100
@@ -35,6 +35,7 @@
 #include "lo-ieee.h"
 #include "lo-mappers.h"
 #include "uniconv-wrappers.h"
+#include "unistr-wrappers.h"
 #include "unwind-prot.h"
 
 template <typename T>
@@ -550,6 +551,58 @@
   return retval;
 }
 
+// Replace every byte of IN_STR that fails the UTF-8 check and store the
+// result back in IN_STR.  Returns the number of offending bytes found.
+unsigned int
+octave::string::u8_validate (const std::string& who,
+                             std::string& in_str,
+                             const octave::string::u8_fallback_type type)
+{
+  const char *cursor = in_str.c_str ();
+  const char * const stop = cursor + in_str.length ();
+  std::string validated;
+  unsigned int n_fixed = 0;
+
+  while (cursor < stop)
+    {
+      const char *bad = reinterpret_cast<const char *>
+        (octave_u8_check_wrapper (reinterpret_cast<const uint8_t *> (cursor),
+                                  stop - cursor));
+      if (! bad)
+        {
+          // Nothing invalid remains: copy the tail and stop scanning.
+          validated.append (cursor, stop - cursor);
+          break;
+        }
+      // Keep the valid run before the offending byte, then step past it.
+      validated.append (cursor, bad - cursor);
+      cursor = bad + 1;
+      n_fixed++;
+      // Any TYPE not handled below just drops the offending byte.
+      if (type == U8_REPLACEMENT_CHAR)
+        validated.append ("\xef\xbf\xbd");  // UTF-8 encoding of U+FFFD
+      else if (type == U8_ISO_8859_1)
+        {
+          const std::string codepage = "iso-8859-1";
+          size_t n_bytes;
+          uint8_t *converted = octave_u8_conv_from_encoding
+                                 (codepage.c_str (), bad, 1, &n_bytes);
+          if (! converted)
+            (*current_liboctave_error_handler)
+              ("%s: converting from codepage '%s' to UTF-8 failed: %s",
+               who.c_str (), codepage.c_str (), std::strerror (errno));
+          // Hand the converted buffer to FRAME together with ::free.
+          octave::unwind_protect frame;
+          frame.add_fcn (::free, static_cast<void *> (converted));
+          validated.append (reinterpret_cast<const char *> (converted),
+                            n_bytes);
+        }
+    }
+
+  in_str = validated;
+  return n_fixed;
+}
+
 template <typename T>
 std::string
 rational_approx (T val, int len)