diff libinterp/corefcn/input.cc @ 25361:82445187633e

Add support for arbitrary character encodings in m-files (bug #53842). * input.cc: Add new variable "Vmfile_encoding". Convert from local encoding to UTF-8 in file_reader::get_input. Add new function "__mfile_encoding__". * iconv-wrappers.[c/h]: Add new wrapper files for iconv_open and iconv_close. * wrappers/module.mk: Add new files.
author Markus Mützel <markus.muetzel@gmx.de>
date Sat, 05 May 2018 20:13:18 +0200
parents 6652d3823428
children 9cc1ca6538e3
line wrap: on
line diff
--- a/libinterp/corefcn/input.cc	Tue May 08 11:53:34 2018 -0400
+++ b/libinterp/corefcn/input.cc	Sat May 05 20:13:18 2018 +0200
@@ -37,8 +37,11 @@
 
 #include "cmd-edit.h"
 #include "file-ops.h"
+#include "iconv-wrappers.h"
+#include "localcharset-wrapper.h"
 #include "quit.h"
 #include "str-vec.h"
+#include "uniconv-wrappers.h"
 
 #include "bp-table.h"
 #include "builtin-defun-decls.h"
@@ -110,6 +113,13 @@
 
 static hook_function_list input_event_hook_functions;
 
+// Encoding used to read .m files ("system": octave_locale_charset_wrapper).
+#if defined (OCTAVE_USE_WINDOWS_API)
+static std::string Vmfile_encoding = "system";
+#else
+static std::string Vmfile_encoding = "utf-8";
+#endif
+
 // For octave_quit.
 void
 remove_input_event_hook_functions (void)
@@ -777,7 +787,34 @@
 
     eof = false;
 
-    return octave_fgets (m_file, eof);
+    std::string src_str = octave_fgets (m_file, eof);
+    std::string encoding = Vmfile_encoding.compare ("system") == 0
+                           ? octave_locale_charset_wrapper ()
+                           : Vmfile_encoding;
+
+    if (encoding.compare ("utf-8") != 0)
+    {
+      // convert encoding to UTF-8 before returning string
+      const char *src = src_str.c_str ();
+      size_t srclen = src_str.length ();
+
+      size_t length;
+      uint8_t *utf8_str = nullptr;
+
+      utf8_str = octave_u8_conv_from_encoding (encoding.c_str (), src, srclen,
+                                               &length);
+
+      if (! utf8_str)
+        error ("file_reader::get_input: converting from codepage '%s' to UTF-8: %s",
+               encoding.c_str (), std::strerror (errno));
+
+      octave::unwind_protect frame;
+      frame.add_fcn (::free, static_cast<void *> (utf8_str));
+
+      src_str = std::string (reinterpret_cast<char *> (utf8_str), length);
+    }
+
+    return src_str;
   }
 
   const std::string eval_string_reader::s_in_src ("eval_string");
@@ -1386,3 +1423,46 @@
 
   return retval;
 }
+
+DEFUN (__mfile_encoding__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{current_encoding} =} __mfile_encoding__ (@var{new_encoding})
+Set and query the codepage that is used for reading .m files.
+@end deftypefn */)
+{
+  int nargin = args.length ();
+
+  if (nargin > 1)
+    print_usage ();
+
+  if (nargin > 0)
+    {
+      std::string str = args(0).xstring_value (
+        "__mfile_encoding__: NEW_ENCODING must be a string designating a valid codepage.");
+      if (str.empty ())
+#if defined (OCTAVE_USE_WINDOWS_API)
+        Vmfile_encoding = "system";
+#else
+        Vmfile_encoding = "utf-8";
+#endif
+      else
+        {
+          std::transform (str.begin (), str.end (), str.begin (), ::tolower);
+
+          std::string codepage = (str.compare ("system") == 0)
+                                 ? octave_locale_charset_wrapper () : str;
+
+          // Check validity by opening a converter.  Failure is signalled by the
+          // (iconv_t) -1 handle (assumed forwarded by the wrapper); errno alone
+          // may be stale, and a failed handle must not be passed to close.
+          void *codec = octave_iconv_open_wrapper (codepage.c_str (), "utf-8");
+          if (codec == reinterpret_cast<void *> (-1))
+            error ("__mfile_encoding__: Conversion from codepage '%s' not supported",
+                   codepage.c_str ());
+
+          octave_iconv_close_wrapper (codec);
+          Vmfile_encoding = str;
+        }
+    }
+  return ovl (Vmfile_encoding);
+}
\ No newline at end of file