changeset 25361:82445187633e

Add support for arbitrary character encodings in m-files (bug #53842). * input.cc: Add new variable "Vmfile_encoding". Convert from local encoding to UTF-8 in file_reader::get_input. Add new function "__mfile_encoding__". * iconv-wrappers.[c/h]: Add new wrapper files for iconv_open and iconv_close. * wrappers/module.mk: Add new files.
author Markus Mützel <markus.muetzel@gmx.de>
date Sat, 05 May 2018 20:13:18 +0200
parents bc5f225bc578
children def1b446ba64
files libinterp/corefcn/input.cc liboctave/wrappers/iconv-wrappers.c liboctave/wrappers/iconv-wrappers.h liboctave/wrappers/module.mk
diffstat 4 files changed, 164 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/input.cc	Tue May 08 11:53:34 2018 -0400
+++ b/libinterp/corefcn/input.cc	Sat May 05 20:13:18 2018 +0200
@@ -37,8 +37,11 @@
 
 #include "cmd-edit.h"
 #include "file-ops.h"
+#include "iconv-wrappers.h"
+#include "localcharset-wrapper.h"
 #include "quit.h"
 #include "str-vec.h"
+#include "uniconv-wrappers.h"
 
 #include "bp-table.h"
 #include "builtin-defun-decls.h"
@@ -110,6 +113,13 @@
 
 static hook_function_list input_event_hook_functions;
 
+// Encoding used to read .m files ("system" selects the locale charset).
+#if defined (OCTAVE_USE_WINDOWS_API)
+static std::string Vmfile_encoding = "system";
+#else
+static std::string Vmfile_encoding = "utf-8";
+#endif
+
 // For octave_quit.
 void
 remove_input_event_hook_functions (void)
@@ -777,7 +787,34 @@
 
     eof = false;
 
-    return octave_fgets (m_file, eof);
+    std::string src_str = octave_fgets (m_file, eof);
+    std::string encoding = Vmfile_encoding.compare ("system") == 0
+                           ? octave_locale_charset_wrapper ()
+                           : Vmfile_encoding;
+
+    if (encoding.compare ("utf-8") != 0)
+    {
+      // convert encoding to UTF-8 before returning string
+      const char *src = src_str.c_str ();
+      size_t srclen = src_str.length ();
+
+      size_t length;
+      uint8_t *utf8_str = nullptr;
+
+      utf8_str = octave_u8_conv_from_encoding (encoding.c_str (), src, srclen,
+                                               &length);
+
+      if (! utf8_str)
+        error ("file_reader::get_input: converting from codepage '%s' to UTF-8: %s",
+               encoding.c_str (), std::strerror (errno));
+
+      octave::unwind_protect frame;
+      frame.add_fcn (::free, static_cast<void *> (utf8_str));
+
+      src_str = std::string (reinterpret_cast<char *> (utf8_str), length);
+    }
+
+    return src_str;
   }
 
   const std::string eval_string_reader::s_in_src ("eval_string");
@@ -1386,3 +1423,46 @@
 
   return retval;
 }
+
+DEFUN (__mfile_encoding__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{current_encoding} =} __mfile_encoding__ (@var{new_encoding})
+Set and query the codepage that is used for reading .m files.
+@end deftypefn */)
+{
+  int nargin = args.length ();
+
+  if (nargin > 1)
+    print_usage ();
+
+  if (nargin > 0)
+    {
+      std::string str = args(0).xstring_value (
+        "__mfile_encoding__: NEW_ENCODING must be a string designating a valid codepage.");
+      if (str.empty ())  // empty string restores the platform default
+#if defined (OCTAVE_USE_WINDOWS_API)
+        Vmfile_encoding = "system";
+#else
+        Vmfile_encoding = "utf-8";
+#endif
+      else
+        {
+          std::transform (str.begin (), str.end (), str.begin (), ::tolower);
+          // "system" is stored as given but validated as the locale charset.
+          std::string codepage = (str.compare ("system") == 0)
+                                 ? octave_locale_charset_wrapper () : str;
+          // Validate by opening the conversion used when reading .m files
+          // (codepage -> UTF-8); iconv_open takes (tocode, fromcode).
+          void *codec = octave_iconv_open_wrapper ("utf-8", codepage.c_str ());
+          // Test the handle: errno may hold a stale EINVAL from earlier calls.
+          if (codec == reinterpret_cast<void *> (-1))
+            error ("__mfile_encoding__: Conversion from codepage '%s' not supported",
+                   codepage.c_str ());
+
+          octave_iconv_close_wrapper (codec);
+
+          Vmfile_encoding = str;
+        }
+    }
+  return ovl (Vmfile_encoding);
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/liboctave/wrappers/iconv-wrappers.c	Sat May 05 20:13:18 2018 +0200
@@ -0,0 +1,41 @@
+/*
+
+Copyright (C) 2018 Markus Mützel
+
+This file is part of Octave.
+
+Octave is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Octave is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Octave; see the file COPYING.  If not, see
+<https://www.gnu.org/licenses/>.
+
+*/
+
+#if defined (HAVE_CONFIG_H)
+#  include "config.h"
+#endif
+
+#include "iconv.h"
+
+#include "iconv-wrappers.h"
+
+void * /* opaque iconv_t handle; declared as void * in iconv-wrappers.h */
+octave_iconv_open_wrapper (const char *tocode, const char *fromcode)
+{
+  return (void *) iconv_open (tocode, fromcode);
+}
+
+int
+octave_iconv_close_wrapper (void *cd) /* void * as in iconv-wrappers.h */
+{
+  return iconv_close ((iconv_t) cd);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/liboctave/wrappers/iconv-wrappers.h	Sat May 05 20:13:18 2018 +0200
@@ -0,0 +1,40 @@
+/*
+
+Copyright (C) 2018 Markus Mützel
+
+This file is part of Octave.
+
+Octave is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Octave is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Octave; see the file COPYING.  If not, see
+<https://www.gnu.org/licenses/>.
+
+*/
+
+#if ! defined (octave_iconv_wrappers_h)
+#define octave_iconv_wrappers_h 1
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+extern void * /* result of iconv_open, exposed as an untyped handle */
+octave_iconv_open_wrapper (const char *tocode, const char *fromcode);
+
+extern int /* forwards cd to iconv_close and returns its result */
+octave_iconv_close_wrapper (void *cd);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif
--- a/liboctave/wrappers/module.mk	Tue May 08 11:53:34 2018 -0400
+++ b/liboctave/wrappers/module.mk	Sat May 05 20:13:18 2018 +0200
@@ -11,6 +11,7 @@
   %reldir%/getopt-wrapper.h \
   %reldir%/glob-wrappers.h \
   %reldir%/hash-wrappers.h \
+  %reldir%/iconv-wrappers.h \
   %reldir%/localcharset-wrapper.h \
   %reldir%/math-wrappers.h \
   %reldir%/mkostemp-wrapper.h \
@@ -48,6 +49,7 @@
   %reldir%/getopt-wrapper.c \
   %reldir%/glob-wrappers.c \
   %reldir%/hash-wrappers.c \
+  %reldir%/iconv-wrappers.c \
   %reldir%/localcharset-wrapper.c \
   %reldir%/math-wrappers.c \
   %reldir%/mkostemp-wrapper.c \