changeset 29091:b924b916dc91

matlab.lang.makeValidName: reimplement in C++ (patch #9998) Rationale: `matlab.lang.makeValidName` was pure Octave code and only available from the interpreter via `octave::feval`. This is too slow for an Octave function ensuring a valid variable name, especially, when called many times from C++. For example in jsondecode. * libinterp/corefcn/utils.h: new function `octave::make_valid_name` and new helper class `octave::make_valid_name_options`. * libinterp/corefcn/utils.cc: new functions `F__make_valid_name__` and `octave::make_valid_name` and new helper class `octave::make_valid_name_options`. * scripts/+matlab/+lang/makeValidName.m: replace Octave code implementation by `F__make_valid_name__` call.
author Kai T. Ohlhus <k.ohlhus@gmail.com>
date Mon, 23 Nov 2020 16:11:46 +0900
parents f61d1faacfca
children 38e22065d9ec
files libinterp/corefcn/utils.cc libinterp/corefcn/utils.h scripts/+matlab/+lang/makeValidName.m
diffstat 3 files changed, 223 insertions(+), 99 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/utils.cc	Mon Nov 23 12:41:34 2020 +0900
+++ b/libinterp/corefcn/utils.cc	Mon Nov 23 16:11:46 2020 +0900
@@ -133,6 +133,166 @@
 
 namespace octave
 {
+  bool
+  make_valid_name (std::string& str, const make_valid_name_options& options)
+  {
+    // If `isvarname (str)`, no modifications necessary.
+    if (valid_identifier (str) && ! iskeyword (str))
+      return false;
+
+    // Change whitespace followed by lowercase letter to uppercase, except
+    // for the first
+    bool previous = false;
+    bool any_non_space = false;
+    for (char& c : str)
+    {
+      c = ((any_non_space && previous && std::isalpha (c)) ? std::toupper (c)
+                                                           : c);
+      previous = std::isspace (c);
+      any_non_space |= (! previous);  // once true, always true
+    }
+
+    // Remove any whitespace.
+    str.erase (std::remove_if (str.begin(), str.end(),
+                               [] (unsigned char x)
+                                  { return std::isspace(x); }),
+               str.end());
+    if (str.empty ())
+      str = options.get_prefix ();
+
+    // Add prefix and capitalize first character, if `str` is a reserved
+    // keyword.
+    if (iskeyword (str))
+    {
+      str[0] = std::toupper (str[0]);
+      str = options.get_prefix () + str;
+    }
+
+    // Add prefix if first character is not a letter or underscore.
+    if (! std::isalpha (str[0]) && str[0] != '_')
+      str = options.get_prefix () + str;
+
+    // Replace non alphanumerics or underscores
+    if (options.get_replacement_style () == "underscore")
+      for (char& c : str)
+        c = (std::isalnum (c) ? c : '_');
+    else if (options.get_replacement_style () == "delete")
+      str.erase (std::remove_if (str.begin(), str.end(),
+                                 [] (unsigned char x)
+                                    { return ! std::isalnum (x) && x != '_'; }),
+                 str.end());
+    else if (options.get_replacement_style () == "hex")
+    {
+      const std::string permitted_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                          "abcdefghijklmnopqrstuvwxyz"
+                                          "_0123456789";
+      // Get the first non-permitted char.
+      size_t pos = str.find_first_not_of (permitted_chars);
+      // Buffer for hex string "0xFF" (+1 for null terminator).
+      char hex_str[5];
+      // Repeat until end of string.
+      while (pos != std::string::npos)
+      {
+        // Replace non-permitted char by it's hex value.
+        std::snprintf (hex_str, sizeof (hex_str), "0x%02X", str[pos]);
+        str.replace (pos, 1, hex_str);
+        // Get the next occurrence from the last position.
+        // (-1 for null terminator)
+        pos = str.find_first_not_of (permitted_chars,
+                                     pos + sizeof (hex_str) - 1);
+      }
+    }
+
+    return true;
+  }
+
+  make_valid_name_options::make_valid_name_options (const octave_value_list& args)
+  {
+    auto nargs = args.length ();
+    if (nargs == 0)
+      return;
+
+    // nargs = 2, 4, 6, ... permitted
+    if (nargs % 2)
+      error ("makeValidName: property/value options must occur in pairs");
+
+    auto str_to_lower = [] (std::string& s)
+                           {
+                             std::transform (s.begin(), s.end(), s.begin(),
+                                             [] (unsigned char c)
+                                                { return std::tolower(c); });
+                           };
+
+    for (auto i = 0; i < nargs; i = i + 2)
+    {
+      std::string parameter = args(i).xstring_value ("makeValidName: "
+        "option argument must be a string");
+      str_to_lower (parameter);
+      if (parameter == "replacementstyle")
+      {
+        m_replacement_style = args(i + 1).xstring_value ("makeValidName: "
+          "'ReplacementStyle' value must be a string");
+        str_to_lower (m_replacement_style);
+        if ((m_replacement_style != "underscore")
+            && (m_replacement_style != "delete")
+            && (m_replacement_style != "hex"))
+          error ("makeValidName: invalid 'ReplacementStyle' value '%s'",
+                 m_replacement_style.c_str ());
+      }
+      else if (parameter == "prefix")
+      {
+        m_prefix = args(i + 1).xstring_value ("makeValidName: "
+          "'Prefix' value must be a string");
+        if (! octave::valid_identifier (m_prefix)
+            || octave::iskeyword (m_prefix))
+          error ("makeValidName: invalid 'Prefix' value '%s'",
+                 m_prefix.c_str ());
+      }
+      else
+        error ("makeValidName: unknown property '%s'", parameter.c_str ());
+    }
+  }
+}
+
+DEFUN (__make_valid_name__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn  {} {@var{varname} =} __make_valid_name__ (@var{str})
+@deftypefnx {} {@var{varname} =} __make_valid_name__ (@var{str}, @qcode{"ReplacementStyle"})
+@deftypefnx {} {@var{varname} =} __make_valid_name__ (@var{str}, @qcode{"ReplacementStyle"}, @qcode{"Prefix"})
+@deftypefnx {} {[@var{varname}, @var{ismodified}] =} __make_valid_name__ (@dots{})
+Return a valid variable name @var{varname} from input @var{str}.
+
+For more documentation, see @code{matlab.lang.makeValidName}.
+
+@seealso{isvarname, matlab.lang.makeValidName}
+@end deftypefn */)
+{
+  auto nargin = args.length ();
+  if (nargin < 1)
+    print_usage ();
+
+  octave::make_valid_name_options options (args.slice (1, nargin - 1));
+
+  if (args(0).is_string ())
+  {
+    std::string varname = args(0).string_value ();
+    bool is_modified = octave::make_valid_name (varname, options);
+    return ovl (varname, is_modified);
+  }
+  else if (args(0).iscellstr ())
+  {
+    Array<std::string> varnames = args(0).cellstr_value ();
+    Array<bool> is_modified (varnames.dims ());
+    for (auto i = 0; i < varnames.numel (); i++)
+      is_modified(i) = octave::make_valid_name (varnames(i), options);
+    return ovl (varnames, is_modified);
+  }
+  else
+    error ("makeValidName: STR must be a string or cellstr");
+}
+
+namespace octave
+{
   // Return TRUE if F and G are both names for the same file.
 
   bool same_file (const std::string& f, const std::string& g)
--- a/libinterp/corefcn/utils.h	Mon Nov 23 12:41:34 2020 +0900
+++ b/libinterp/corefcn/utils.h	Mon Nov 23 16:11:46 2020 +0900
@@ -46,6 +46,63 @@
   extern OCTINTERP_API bool valid_identifier (const char *s);
   extern OCTINTERP_API bool valid_identifier (const std::string& s);
 
+  //! Helper class for `make_valid_name` function calls.
+  //!
+  //! Extracting options separately for multiple (e.g. 1000+) function calls
+  //! avoids expensive repetitive parsing of the very same options.
+
+  class
+  OCTINTERP_API
+  make_valid_name_options
+  {
+  public:
+
+    //! Default options for `make_valid_name` function calls.
+    //!
+    //! Calling the constructor without arguments is equivalent to:
+    //!
+    //! @code{.cc}
+    //! make_valid_name_options (ovl ("ReplacementStyle", "underscore",
+    //!                               "Prefix", "x"));
+    //! @endcode
+
+    make_valid_name_options () = default;
+
+    //! Extract attribute-value-pairs from an octave_value_list of strings.
+    //!
+    //! If attributes occur multiple times, the rightmost pair is chosen.
+    //!
+    //! @code{.cc}
+    //! make_valid_name_options (ovl ("ReplacementStyle", "hex", ...));
+    //! @endcode
+
+    make_valid_name_options (const octave_value_list& args);
+
+    //! @return ReplacementStyle, see `help matlab.lang.makeValidName`.
+
+    const std::string&
+    get_replacement_style () const { return m_replacement_style; }
+
+    //! @return Prefix, see `help matlab.lang.makeValidName`.
+
+    const std::string& get_prefix () const { return m_prefix; }
+
+  private:
+
+    std::string m_replacement_style{"underscore"};
+    std::string m_prefix{"x"};
+  };
+
+  //! Modify @p str to be a valid variable name.
+  //!
+  //! @param str input string
+  //! @param options see also `help matlab.lang.makeValidName`.
+  //!
+  //! @return true, if @p str was modified.
+
+  extern OCTINTERP_API bool
+  make_valid_name (std::string& str, const make_valid_name_options& options);
+
   extern OCTINTERP_API bool
   same_file (const std::string& f, const std::string& g);
 
--- a/scripts/+matlab/+lang/makeValidName.m	Mon Nov 23 12:41:34 2020 +0900
+++ b/scripts/+matlab/+lang/makeValidName.m	Mon Nov 23 16:11:46 2020 +0900
@@ -65,102 +65,9 @@
 ## @seealso{iskeyword, isvarname, matlab.lang.makeUniqueStrings}
 ## @end deftypefn
 
-function [varname, ismodified] = makeValidName (str, varargin)
-
-  if (nargin == 0)
-    print_usage ();
-  endif
-
-  if (! ischar (str) && ! iscellstr (str))
-    error ("makeValidName: STR must be a string or cellstr");
-  endif
-
-  if (mod (nargin - 1, 2) != 0)
-    error ("makeValidName: property/value options must occur in pairs");
-  endif
-
-  varname = cellstr (str);
-  ismodified = false (size (varname));
-  convert2char = ischar (str);
-  opts = struct ("replacementstyle", "underscore", "prefix", "x");
-
-  for i = 1:2:numel (varargin)
-    if (! ischar (varargin{i}))
-      error ("makeValidName: option argument must be a string");
-    endif
-    parameter = tolower (varargin{i});
-    value = varargin{i+1};
-    switch (parameter)
-      case "replacementstyle"
-        if (! ischar (value))
-          error ('makeValidName: "ReplacementStyle" value must be a string');
-        endif
-        value = tolower (value);
-        if (! any (strcmp (value, {"underscore", "delete", "hex"})))
-          error ('makeValidName: invalid "ReplacementStyle" value "%s"', value);
-        endif
-        opts.replacementstyle = value;
-
-      case "prefix"
-        if (! isvarname (value))
-          error ('makeValidName: invalid "Prefix" value "%s"', value);
-        endif
-        opts.prefix = value;
-
-      otherwise
-        error ('makeValidName: unknown property "%s"', parameter);
-    endswitch
-  endfor
+function [varname, ismodified] = makeValidName (varargin)
 
-  for i = 1:numel (varname)
-    if (! isvarname (varname{i}))
-      ismodified(i) = true;
-
-      ## Remove leading and trailing whitespace
-      varname{i} = strtrim (varname{i});
-      if (isempty (varname{i}))
-        varname{i} = opts.prefix;
-      endif
-
-      ## Add prefix if input is a reserved keyword
-      if (iskeyword (varname{i}))
-        varname{i} = [opts.prefix, toupper(varname{i}(1)), varname{i}(2:end)];
-      endif
-
-      ## Change whitespace followed by lowercase letter to uppercase
-      idx = regexp (varname{i}, '\s[a-z]');
-      varname{i}(idx+1) = toupper (varname{i}(idx+1));
-
-      ## Remove any whitespace character
-      varname{i}(isspace (varname{i})) = "";
-
-      ## Add prefix if first character is not a letter or underscore
-      char1 = varname{i}(1);
-      if (! isalpha (char1) && char1 != "_")
-        varname{i} = [opts.prefix varname{i}];
-      endif
-
-      ## Replace non alphanumerics or underscores
-      idx = regexp (varname{i}, '[^0-9a-zA-Z_]');
-      switch (opts.replacementstyle)
-        case "underscore"
-          varname{i}(idx) = "_";
-
-        case "delete"
-          varname{i}(idx) = "";
-
-        case "hex"
-          for j = numel (idx):-1:1
-            varname{i} = strrep (varname{i}, varname{i}(idx(j)),
-                                 sprintf ("0x%02X",varname{i}(idx(j))));
-          endfor
-      endswitch
-    endif
-  endfor
-
-  if (convert2char)
-    varname = char (varname);
-  endif
+  [varname, ismodified] = __make_valid_name__ (varargin{:});
 
 endfunction
 
@@ -223,11 +130,11 @@
 %!error <options must occur in pairs> matlab.lang.makeValidName ("a", "opt1")
 %!error <option argument must be a string>
 %! matlab.lang.makeValidName ("a", 1, 2)
-%!error <"ReplacementStyle" value must be a string>
+%!error <'ReplacementStyle' value must be a string>
 %! matlab.lang.makeValidName ("a", "ReplacementStyle", 1);
-%!error <invalid "ReplacementStyle" value "foobar">
+%!error <invalid 'ReplacementStyle' value 'foobar'>
 %! matlab.lang.makeValidName ("a", "ReplacementStyle", "foobar");
-%!error <invalid "Prefix" value "1_">
+%!error <invalid 'Prefix' value '1_'>
 %! matlab.lang.makeValidName ("a", "Prefix", "1_");
-%!error <unknown property "foobar">
+%!error <unknown property 'foobar'>
 %! matlab.lang.makeValidName ("a", "foobar", 1);