diff libinterp/corefcn/input.cc @ 28953:dff830c84726

Add function "dir_encoding" to set a file encoding per directory (bug #49685). * corefcn/load-path.h (load_path::read_dir_config): Add new function. * corefcn/load-path.cc (load_path::read_dir_config): Add new function. (load_path::add): Call read_dir_config on adding a new directory to the load path. * corefcn/input.h (input_system::dir_encoding, input_system::set_dir_encoding): Add new functions. Add private unordered_map "m_dir_encoding". * corefcn/input.cc (input_system::dir_encoding, input_system::set_dir_encoding): Add new functions. (base_reader, file_reader, input_reader): Add new constructors that accept an encoding (use mfile_encoding by default). Add private property "m_encoding". (load_path_dir): Static function to get the part of the directory that would be added to the load path. (Fdir_encoding): New function. * corefcn/interpreter.cc (interpreter::chdir): Call load_path::read_dir_config. * parse-tree/lex.h (lexer): Add new constructor that accepts an encoding. * parse-tree/parse.h (parser): Add new constructor that accepts an encoding. * parse-tree/oct-parse.yy (parse_fcn_file): Pass dir_encoding to parser. * doc/interpreter/func.txi: Add docstring of function "dir_encoding" to manual.
author Markus Mützel <markus.muetzel@gmx.de>
date Fri, 18 Sep 2020 17:15:32 +0200
parents 89a425f2c202
children 02b97abbc6fc
line wrap: on
line diff
--- a/libinterp/corefcn/input.cc	Sat Oct 17 02:13:39 2020 -0700
+++ b/libinterp/corefcn/input.cc	Fri Sep 18 17:15:32 2020 +0200
@@ -541,6 +541,92 @@
     return retval;
   }
 
+  // Reduce DIR to the directory that would be added to the load path.
+  static std::string load_path_dir (const std::string& dir)
+  {
+    std::string path = dir;
+
+    // Index of the last directory separator in PATH (npos if none).
+    auto last_sep = [&path] ()
+      { return path.find_last_of (sys::file_ops::dir_sep_chars ()); };
+
+    // Remove trailing file separators, if any.
+    size_t pos = path.find_last_not_of (sys::file_ops::dir_sep_chars ());
+    if (pos != std::string::npos)
+      path.erase (pos+1);
+
+    pos = last_sep ();
+
+    // Cut off the final path component and locate the next separator.
+    auto drop_last_component = [&path, &pos, &last_sep] ()
+      { path.erase (pos); pos = last_sep (); };
+
+    // A trailing "private" folder is not part of the load path entry.
+    if (pos != std::string::npos
+        && path.compare (pos+1, std::string::npos, "private") == 0)
+      drop_last_component ();
+
+    // Neither is a trailing @class folder ...
+    if (pos != std::string::npos && path[pos+1] == '@')
+      drop_last_component ();
+
+    // ... nor any number of (nested) +namespace folders.
+    while (pos != std::string::npos && path[pos+1] == '+')
+      drop_last_component ();
+
+    return path;
+  }
+
+  // Return the encoding used for m-files in DIR: the directory-specific
+  // entry if one is registered, otherwise the global m-file encoding.
+  std::string input_system::dir_encoding (const std::string& dir)
+  {
+    const std::string key = load_path_dir (dir);
+    auto entry = m_dir_encoding.find (key);
+
+    return (entry != m_dir_encoding.end ()
+            ? entry->second : m_mfile_encoding);
+  }
+
+  // Set (or, for "delete", remove) the encoding for m-files in DIR.  ENC is
+  // lower-cased in place; any name other than "utf-8" is validated by
+  // opening a converter for it, and an error is raised if that fails.
+  void input_system::set_dir_encoding (const std::string& dir,
+                                       std::string& enc)
+  {
+    // Use lower case.  Go through unsigned char: calling tolower with a
+    // negative char value is undefined behavior.
+    std::transform (enc.begin (), enc.end (), enc.begin (),
+                    [] (unsigned char c) -> char { return ::tolower (c); });
+
+    const std::string key = load_path_dir (dir);
+    if (enc.compare ("delete") == 0)
+      {
+        m_dir_encoding.erase (key);
+        return;
+      }
+
+    if (enc.compare ("utf-8") != 0)
+      {
+        // Check for valid encoding name.
+        // FIXME: Should working encoding identifiers be cached?
+        void *codec = octave_iconv_open_wrapper (enc.c_str (), "utf-8");
+        if (codec == reinterpret_cast<void *> (-1))
+          {
+            if (errno == EINVAL)
+              error ("dir_encoding: conversion from encoding '%s' "
+                     "not supported", enc.c_str ());
+            else
+              error ("dir_encoding: error %d opening encoding '%s'.",
+                     errno, enc.c_str ());
+          }
+        else
+          octave_iconv_close_wrapper (codec);
+      }
+
+    m_dir_encoding[key] = enc;
+  }
+
   bool input_system::yes_or_no (const std::string& prompt)
   {
     std::string prompt_string = prompt + "(yes or no) ";
@@ -808,7 +894,14 @@
   public:
 
     file_reader (interpreter& interp, FILE *f_arg)
-      : base_reader (interp), m_file (f_arg) { }
+      // No encoding given: use the m-file encoding that is set in the
+      // interpreter's input system at construction time.
+      : base_reader (interp), m_file (f_arg),
+        m_encoding (interp.get_input_system ().mfile_encoding ())
+    { }
+    // Read from F_ARG, using ENC as the encoding of the input.
+    file_reader (interpreter& interp, FILE *f_arg, const std::string& enc)
+      : base_reader (interp), m_file (f_arg), m_encoding (enc) { }
 
     std::string get_input (const std::string& prompt, bool& eof);
 
@@ -820,6 +913,8 @@
 
     FILE *m_file;
 
+    std::string m_encoding;
+
     static const std::string s_in_src;
   };
 
@@ -853,6 +948,10 @@
     : m_rep (new file_reader (interp, file))
   { }
 
+  input_reader::input_reader (interpreter& interp, FILE *file,
+                              const std::string& enc)
+    : m_rep (new file_reader (interp, file, enc)) { }
+
   input_reader::input_reader (interpreter& interp, const std::string& str)
     : m_rep (new eval_string_reader (interp, str))
   { }
@@ -882,9 +981,15 @@
 
     std::string src_str = octave_fgets (m_file, eof);
 
-    input_system& input_sys = m_interpreter.get_input_system ();
+    std::string mfile_encoding;
 
-    std::string mfile_encoding = input_sys.mfile_encoding ();
+    if (m_encoding.empty ())
+      {
+        input_system& input_sys = m_interpreter.get_input_system ();
+        mfile_encoding = input_sys.mfile_encoding ();
+      }
+    else
+      mfile_encoding = m_encoding;
 
     std::string encoding;
     if (mfile_encoding.compare ("system") == 0)
@@ -1402,3 +1507,70 @@
 
   return input_sys.mfile_encoding (args, nargout);
 }
+
+DEFMETHOD (dir_encoding, interp, args, nargout,
+           doc: /* -*- texinfo -*-
+@deftypefn {}  {@var{current_encoding} =} dir_encoding (@var{dir})
+@deftypefnx {} {@var{prev_encoding} =} dir_encoding (@var{dir}, @var{encoding})
+@deftypefnx {} {} dir_encoding (@dots{})
+Set and query the @var{encoding} that is used for reading m-files in @var{dir}.
+
+That encoding overrides the (globally set) m-file encoding.
+
+The string @var{dir} must match the form in which the directory would appear
+in the load path.
+
+The @var{encoding} must be a valid encoding identifier or @code{"delete"}.  In
+the latter case, the (globally set) m-file encoding will be used for the given
+@var{dir}.
+
+The currently or previously used encoding is returned in @var{current_encoding}
+or @var{prev_encoding}, respectively.  The output argument must be explicitly
+requested.
+
+The directory encoding is automatically read from the file @file{.oct_config}
+when a new path is added to the load path (for example with @code{addpath}).
+To set the encoding for all files in the same folder, that file must contain
+a line starting with @code{"encoding="} followed by the encoding identifier.
+
+For example, to set the file encoding for all files in the same folder to
+ISO 8859-1 (Latin-1), create a file @file{.oct_config} with the following
+content:
+
+@example
+encoding=iso8859-1
+@end example
+
+If the file encoding is changed after the files have already been parsed, the
+files have to be parsed again for that change to take effect.  That can be done
+with the command @code{clear all}.
+
+@seealso{addpath, path}
+@end deftypefn */)
+{
+  int nargin = args.length ();
+
+  if (nargin < 1 || nargin > 2)
+    print_usage ();
+
+  std::string dir
+    = args(0).xstring_value ("dir_encoding: DIR must be a string");
+
+  octave_value retval;
+
+  octave::input_system& input_sys = interp.get_input_system ();
+  // Query before setting, so that the previous encoding is what is returned.
+  if (nargout > 0)
+    retval = input_sys.dir_encoding (dir);
+
+  if (nargin > 1)
+    {
+      std::string encoding
+        = args(1).xstring_value ("dir_encoding: ENCODING must be a string");
+
+      input_sys.set_dir_encoding (dir, encoding);
+    }
+
+  return ovl (retval);
+
+}