diff libinterp/corefcn/strfns.cc @ 27785:3f5026fd8da8

Create valid UTF-8 from .m file content (bug #57341). * oct-string.[cc,h] (octave::string::u8_validate): New function to validate UTF-8 encoded strings. This is done using the generic replacement character or by mapping invalid UTF-8 bytes to their corresponding Unicode code points. * strfns.cc (F__u8_validate__): New function. * input.cc (file_reader::get_input): Validate UTF-8 input.
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 04 Dec 2019 22:42:47 +0100
parents 4fef3ab19046
children afbaad39d25c
line wrap: on
line diff
--- a/libinterp/corefcn/strfns.cc	Sun Dec 08 14:42:04 2019 +0100
+++ b/libinterp/corefcn/strfns.cc	Wed Dec 04 22:42:47 2019 +0100
@@ -1041,6 +1041,29 @@
 %!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6]);
 */
 
+DEFUN (__u8_validate__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{out_str} =} __u8_validate__ (@var{in_str})
+Return the string @var{in_str} with any invalid UTF-8 made valid.
+
+On encountering invalid UTF-8 in @var{in_str}, the bytes are interpreted as the
+Unicode code points U+0080–U+00FF with the same value as the byte, thus
+interpreting the bytes according to ISO-8859-1.
+
+@end deftypefn */)
+{
+  if (args.length () != 1)
+    print_usage ();
+
+  // Input check: get args(0) as a string; the message is for the failure case.
+  std::string in_str =
+      args(0).xstring_value ("__u8_validate__: Not a string.");
+  // NOTE(review): in_str is presumably repaired in place by u8_validate; confirm.
+  octave::string::u8_validate ("__u8_validate__", in_str);
+
+  return ovl (in_str);
+}
+
 DEFUN (newline, args, ,
        doc: /* -*- texinfo -*-
 @deftypefn {} {} newline