Mercurial > octave

diff src/DLD-FUNCTIONS/regexp.cc @ 14024:fc9f204faea0
refactor regexp (bug #34440)

* liboctave/regexp.h, liboctave/regexp.cc: New files. Provide classes and functions for regular expressions. Adapted from src/DLD-FUNCTIONS/regexp.cc.
* regex-match.h, regex-match.cc: Delete
* liboctave/Makefile.am (INCS, LIBOCTAVE_CXX_SOURCES): Update.
* variables.cc (name_matches_any_pattern): Use new regexp class.
* symtab.h (symbol_table::regexp_global_variables, symbol_table::do_clear_variable_regexp, symbol_table::do_regexp): Likewise.
* DLD-FUNCTIONS/regexp.cc (parse_options): New function. (octregexp, octcellregexp, octregexprep): Extract matching code for use in new regexp class. Use new regexp class to provide required functionality.
author: John W. Eaton <jwe@octave.org>
date: Sun, 11 Dec 2011 22:19:57 -0500
parents: 9cae456085c2
children: 72c96de7a403
--- a/src/DLD-FUNCTIONS/regexp.cc	Sun Dec 11 18:28:35 2011 -0500
+++ b/src/DLD-FUNCTIONS/regexp.cc	Sun Dec 11 22:19:57 2011 -0500
@@ -25,501 +25,106 @@
 #include <config.h>
 #endif
 
-#include <algorithm>
+#include <list>
 #include <sstream>
 
-#include "defun-dld.h"
-#include "error.h"
-#include "gripes.h"
-#include "oct-obj.h"
-#include "utils.h"
-
-#include "Cell.h"
-#include "oct-map.h"
-#include "str-vec.h"
-#include "quit.h"
-#include "parse.h"
-#include "oct-locbuf.h"
-
 #include <pcre.h>
 
-// Define the maximum number of retries for a pattern that
-// possibly results in an infinite recursion.
-#define PCRE_MATCHLIMIT_MAX 10
-
-// The regexp is constructed as a linked list to avoid resizing the
-// return values in arrays at each new match.
-
-// FIXME don't bother collecting and composing return values the user
-// doesn't want.
-
-class regexp_elem
-{
-public:
-  regexp_elem (const string_vector& _named_token, const Cell& _t,
-               const std::string& _m, const Matrix& _te, double _s,
-               double _e) :
-    named_token (_named_token), t (_t), m (_m), te (_te), s (_s), e (_e) { }
-
-  regexp_elem (const regexp_elem &a) : named_token (a.named_token), t (a.t),
-                                       m (a.m), te (a.te), s (a.s), e (a.e)
-                                       { }
-
-  string_vector named_token;
-  Cell t;
-  std::string m;
-  Matrix te;
-  double s;
-  double e;
-};
-
-typedef std::list<regexp_elem>::const_iterator const_iterator;
-
-#define MAXLOOKBEHIND 10
-static bool lookbehind_warned = false;
+#include "base-list.h"
+#include "oct-locbuf.h"
+#include "quit.h"
+#include "regexp.h"
+#include "str-vec.h"
 
-static int
-octregexp_list (const octave_value_list &args, const std::string &nm,
-                bool case_insensitive, std::list<regexp_elem> &lst,
-                string_vector &named, int &nopts, bool &once)
-{
-  int sz = 0;
-
-  int nargin = args.length ();
-  bool lineanchors = false;
-  bool dotexceptnewline = false;
-  bool freespacing = false;
-
-  nopts = nargin - 2;
-  once = false;
+#include "defun-dld.h"
+#include "Cell.h"
+#include "error.h"
+#include "gripes.h"
+#include "oct-map.h"
+#include "oct-obj.h"
+#include "utils.h"
 
-  std::string buffer = args(0).string_value ();
-  size_t max_length = (buffer.length () > MAXLOOKBEHIND ?
-                       MAXLOOKBEHIND: buffer.length ());
-
-  if (error_state)
-    {
-      gripe_wrong_type_arg (nm.c_str (), args(0));
-      return 0;
-    }
+static void
+parse_options (regexp::opts& options, const octave_value_list& args,
+               const std::string& who, int skip, bool& extra_args)
+{
+  int nargin = args.length ();
 
-  std::string pattern = args(1).string_value ();
+  extra_args = false;
 
-  if (error_state)
-    {
-      gripe_wrong_type_arg (nm.c_str (), args(1));
-      return 0;
-    }
-
-  for (int i = 2; i < nargin; i++)
+  for (int i = skip; i < nargin; i++)
     {
       std::string str = args(i).string_value ();
 
       if (error_state)
         {
-          error ("%s: optional arguments must be strings", nm.c_str ());
+          error ("%s: optional arguments must be character strings",
+                 who.c_str ());
           break;
         }
 
       std::transform (str.begin (), str.end (), str.begin (), tolower);
 
       if (str.find ("once", 0) == 0)
-        {
-          once = true;
-          nopts--;
-        }
+        options.once (true);
       else if (str.find ("matchcase", 0) == 0)
-        {
-          case_insensitive = false;
-          nopts--;
-        }
+        options.case_insensitive (false);
       else if (str.find ("ignorecase", 0) == 0)
-        {
-          case_insensitive = true;
-          nopts--;
-        }
+        options.case_insensitive (true);
       else if (str.find ("dotall", 0) == 0)
-        {
-          dotexceptnewline = false;
-          nopts--;
-        }
+        options.dotexceptnewline (false);
       else if (str.find ("stringanchors", 0) == 0)
-        {
-          lineanchors = false;
-          nopts--;
-        }
+        options.lineanchors (false);
       else if (str.find ("literalspacing", 0) == 0)
-        {
-          freespacing = false;
-          nopts--;
-        }
+        options.freespacing (false);
       else if (str.find ("dotexceptnewline", 0) == 0)
-        {
-          dotexceptnewline = true;
-          nopts--;
-        }
+        options.dotexceptnewline (true);
       else if (str.find ("lineanchors", 0) == 0)
-        {
-          lineanchors = true;
-          nopts--;
-        }
+        options.lineanchors (true);
       else if (str.find ("freespacing", 0) == 0)
-        {
-          freespacing = true;
-          nopts--;
-        }
-      else if (str.find ("start", 0) && str.find ("end", 0)
-               && str.find ("tokenextents", 0) && str.find ("match", 0)
-               && str.find ("tokens", 0) && str.find ("names", 0)
-               && str.find ("split", 0))
-        error ("%s: unrecognized option", nm.c_str ());
+        options.freespacing (true);
+      else if (str.find ("start", 0) == 0
+               || str.find ("end", 0) == 0
+               || str.find ("tokenextents", 0) == 0
+               || str.find ("match", 0) == 0
+               || str.find ("tokens", 0) == 0
+               || str.find ("names", 0) == 0
+               || str.find ("split", 0) == 0)
+        extra_args = true;
+      else
+        error ("%s: unrecognized option", who.c_str ());
     }
-
-  if (!error_state)
-    {
-      Cell t;
-      std::string m;
-      double s, e;
-
-      // named tokens "(?<name>...)" are only treated with PCRE not regex.
-
-      size_t pos = 0;
-      size_t new_pos;
-      int nnames = 0;
-      int inames = 0;
-      std::ostringstream buf;
-      Array<int> named_idx;
-
-      while ((new_pos = pattern.find ("(?", pos)) != std::string::npos)
-        {
-          if (pattern.at (new_pos + 2) == '<'
-              && !(pattern.at (new_pos + 3) == '='
-                   || pattern.at (new_pos + 3) == '!'))
-            {
-              // The syntax of named tokens in pcre is "(?P<name>...)" while
-              // we need a syntax "(?<name>...)", so fix that here. Also an
-              // expression like
-              // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
-              // should be perfectly legal, while pcre does not allow the same
-              // named token name on both sides of the alternative. Also fix
-              // that here by replacing name tokens by dummy names, and dealing
-              // with the dummy names later.
-
-              size_t tmp_pos = pattern.find_first_of ('>', new_pos);
-
-              if (tmp_pos == std::string::npos)
-                {
-                  error ("regexp: syntax error in pattern");
-                  break;
-                }
-
-              std::string tmp_name =
-                pattern.substr (new_pos+3, tmp_pos-new_pos-3);
-
-              bool found = false;
-
-              for (int i = 0; i < nnames; i++)
-                {
-                  if (named(i) == tmp_name)
-                    {
-                      named_idx.resize (dim_vector (inames+1, 1));
-                      named_idx(inames) = i;
-                      found = true;
-                      break;
-                    }
-                }
-
-              if (! found)
-                {
-                  named_idx.resize (dim_vector (inames+1, 1));
-                  named_idx(inames) = nnames;
-                  named.append (tmp_name);
-                  nnames++;
-                }
-
-              if (new_pos - pos > 0)
-                buf << pattern.substr (pos, new_pos-pos);
-              if (inames < 10)
-                buf << "(?P<n00" << inames++;
-              else if (inames < 100)
-                buf << "(?P<n0" << inames++;
-              else
-                buf << "(?P<n" << inames++;
-
-              pos = tmp_pos;
-            }
-          else if (pattern.at (new_pos + 2) == '<')
-            {
-              // Find lookbehind operators of arbitrary length (ie like
-              // "(?<=[a-z]*)") and replace with a maximum length operator
-              // as PCRE can not yet handle arbitrary length lookahead
-              // operators. Use the string length as the maximum length to
-              // avoid issues.
-
-              int brackets = 1;
-              size_t tmp_pos1 = new_pos + 2;
-              size_t tmp_pos2 = tmp_pos1;
-
-              while (tmp_pos1 <= pattern.length () && brackets > 0)
-                {
-                  char ch = pattern.at (tmp_pos1);
-
-                  if (ch == '(')
-                    brackets++;
-                  else if (ch == ')')
-                    {
-                      if (brackets > 1)
-                        tmp_pos2 = tmp_pos1;
-
-                      brackets--;
-                    }
-
-                  tmp_pos1++;
-                }
-
-              if (brackets != 0)
-                {
-                  buf << pattern.substr (pos, new_pos - pos) << "(?";
-                  pos = new_pos + 2;
-                }
-              else
-                {
-                  size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
-
-                  if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
-                    {
-                      if (!lookbehind_warned)
-                        {
-                          lookbehind_warned = true;
-                          warning ("%s: arbitrary length lookbehind patterns are only supported up to length %d",
-                                   nm.c_str (), MAXLOOKBEHIND);
-                        }
-
-                      buf << pattern.substr (pos, new_pos - pos) << "(";
-
-                      size_t i;
-
-                      if (pattern.at (tmp_pos3) == '*')
-                        i = 0;
-                      else
-                        i = 1;
-
-                      for (; i < max_length + 1; i++)
-                        {
-                          buf << pattern.substr (new_pos, tmp_pos3 - new_pos)
-                              << "{" << i << "}";
-                          buf << pattern.substr (tmp_pos3 + 1,
-                                                 tmp_pos1 - tmp_pos3 - 1);
-                          if (i != max_length)
-                            buf << "|";
-                        }
-                      buf << ")";
-                    }
-                  else
-                    buf << pattern.substr (pos, tmp_pos1 - pos);
-
-                  pos = tmp_pos1;
-                }
-            }
-          else
-            {
-              buf << pattern.substr (pos, new_pos - pos) << "(?";
-              pos = new_pos + 2;
-            }
-
-        }
-
-      buf << pattern.substr (pos);
-
-      if (error_state)
-        return 0;
-
-      // Compile expression
-      const char *err;
-      int erroffset;
-      std::string buf_str = buf.str ();
-
-      pcre *re = pcre_compile (buf_str.c_str (),
-                               ((case_insensitive ? PCRE_CASELESS : 0)
-                                | (dotexceptnewline ? 0 : PCRE_DOTALL)
-                                | (lineanchors ? PCRE_MULTILINE : 0)
-                                | (freespacing ? PCRE_EXTENDED : 0)),
-                               &err, &erroffset, 0);
-
-      if (re == 0)
-        {
-          error ("%s: %s at position %d of expression", nm.c_str (),
-                 err, erroffset);
-          return 0;
-        }
-
-      int subpatterns;
-      int namecount;
-      int nameentrysize;
-      char *nametable;
-      int idx = 0;
-
-      pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT,  &subpatterns);
-      pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount);
-      pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
-      pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable);
-
-      OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
-      OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
-
-      for (int i = 0; i < namecount; i++)
-        {
-          // Index of subpattern in first two bytes MSB first of name.
-          // Extract index.
-          nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
-            | static_cast<int> (nametable[i*nameentrysize+1]);
-        }
-
-      while (true)
-        {
-          OCTAVE_QUIT;
-
-          int matches = pcre_exec (re, 0, buffer.c_str (),
-                                   buffer.length (), idx,
-                                   (idx ? PCRE_NOTBOL : 0),
-                                   ovector, (subpatterns+1)*3);
-
-          if (matches == PCRE_ERROR_MATCHLIMIT)
-            {
-              // Try harder; start with default value for MATCH_LIMIT
-              // and increase it.
-              warning ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");
-
-              pcre_extra pe;
-
-              pcre_config (PCRE_CONFIG_MATCH_LIMIT,
-                           static_cast <void *> (&pe.match_limit));
-
-              pe.flags = PCRE_EXTRA_MATCH_LIMIT;
-
-              int i = 0;
-              while (matches == PCRE_ERROR_MATCHLIMIT
-                     && i++ < PCRE_MATCHLIMIT_MAX)
-                {
-                  OCTAVE_QUIT;
-
-                  pe.match_limit *= 10;
-                  matches = pcre_exec (re, &pe, buffer.c_str (),
-                                       buffer.length (), idx,
-                                       (idx ? PCRE_NOTBOL : 0),
-                                       ovector, (subpatterns+1)*3);
-                }
-            }
-
-          if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
-            {
-              error ("%s: internal error calling pcre_exec; error code from pcre_exec is %i",
-                     nm.c_str (), matches);
-              pcre_free (re);
-              return 0;
-            }
-          else if (matches == PCRE_ERROR_NOMATCH)
-            break;
-          else if (ovector[1] <= ovector[0])
-            {
-              // Zero sized match.  Skip to next char.
-              idx = ovector[0] + 1;
-              if (idx < buffer.length ())
-                continue;
-              else
-                break;
-            }
-          else
-            {
-              int pos_match = 0;
-              Matrix te (matches-1, 2);
-
-              for (int i = 1; i < matches; i++)
-                {
-                  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
-                      && (i == 1 || ovector[2*i] != ovector[2*i-2]
-                          || ovector[2*i-1] != ovector[2*i+1])
-                      && ovector[2*i] >= 0 && ovector[2*i+1] > 0)
-                    {
-                      te(pos_match,0) = double (ovector[2*i]+1);
-                      te(pos_match++,1) = double (ovector[2*i+1]);
-                    }
-                }
-
-              te.resize (pos_match, 2);
-
-              s = double (ovector[0]+1);
-              e = double (ovector[1]);
-
-              const char **listptr;
-              int status = pcre_get_substring_list (buffer.c_str (), ovector,
-                                                   matches, &listptr);
-
-              if (status == PCRE_ERROR_NOMEMORY)
-                {
-                  error ("%s: cannot allocate memory in pcre_get_substring_list",
-                        nm.c_str ());
-                  pcre_free (re);
-                  return 0;
-                }
-
-              Cell cell_t (dim_vector (1, pos_match));
-              string_vector named_tokens (nnames);
-              int pos_offset = 0;
-              pos_match = 0;
-
-              for (int i = 1; i < matches; i++)
-                {
-                  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
-                    {
-                      if (i == 1 || ovector[2*i] != ovector[2*i-2]
-                          || ovector[2*i-1] != ovector[2*i+1])
-                        {
-                          if (namecount > 0)
-                            named_tokens(named_idx(i-pos_offset-1)) =
-                              std::string (*(listptr+nidx[i-pos_offset-1]));
-                          cell_t(pos_match++) =
-                            std::string (*(listptr+i));
-                        }
-                      else
-                        pos_offset++;
-                    }
-                }
-
-              m =  std::string (*listptr);
-              t = cell_t;
-
-              pcre_free_substring_list (listptr);
-
-              regexp_elem new_elem (named_tokens, t, m, te, s, e);
-              lst.push_back (new_elem);
-              idx = ovector[1];
-              sz++;
-
-              if (once || idx >= buffer.length ())
-                break;
-
-            }
-        }
-
-      pcre_free (re);
-    }
-
-  return sz;
 }
 
 static octave_value_list
-octregexp (const octave_value_list &args, int nargout, const std::string &nm,
-           bool case_insensitive)
+octregexp (const octave_value_list &args, int nargout,
+           const std::string &who, bool case_insensitive = false)
 {
   octave_value_list retval;
+
   int nargin = args.length ();
-  std::list<regexp_elem> lst;
-  string_vector named;
-  int nopts;
-  bool once;
+
+  // Make sure we have string, pattern
+  const std::string buffer = args(0).string_value ();
+  if (error_state)
+    return retval;
+
+  const std::string pattern = args(1).string_value ();
+  if (error_state)
+    return retval;
 
-  int sz = octregexp_list (args, nm, case_insensitive, lst, named, nopts, once);
+  regexp::opts options;
+  options.case_insensitive (case_insensitive);
+  bool extra_options = false;
+  parse_options (options, args, who, 2, extra_options);
+  if (error_state)
+    return retval;
+
+  regexp::match_data rx_lst = regexp_match (pattern, buffer, options, who);
+
+  string_vector named_pats = rx_lst.named_patterns ();
+
+  size_t sz = rx_lst.size ();
 
   if (! error_state)
     {
@@ -532,47 +137,54 @@
 
       if (sz == 1)
         {
-          for (int j = 0; j < named.length (); j++)
-            nmap.assign (named(j), lst.begin()->named_token (j));
+          string_vector named_tokens = rx_lst.begin()->named_tokens ();
+
+          for (int j = 0; j < named_pats.length (); j++)
+            nmap.assign (named_pats(j), named_tokens(j));
 
           retval(5) = nmap;
         }
       else
         {
-          for (int j = 0; j < named.length (); j++)
+          for (int j = 0; j < named_pats.length (); j++)
             {
               Cell tmp (dim_vector (1, sz));
 
               i = 0;
-              for (const_iterator p = lst.begin (); p != lst.end (); p++)
-                tmp(i++) = p->named_token (j);
+              for (regexp::match_data::const_iterator p = rx_lst.begin ();
+                   p != rx_lst.end (); p++)
+                {
+                  string_vector named_tokens = p->named_tokens ();
 
-              nmap.assign (named(j), octave_value (tmp));
+                  tmp(i++) = named_tokens(j);
+                }
+
+              nmap.assign (named_pats(j), octave_value (tmp));
             }
 
           retval(5) = nmap;
         }
 
-      std::string buffer = args(0).string_value ();
-
-      if (once)
+      if (options.once ())
         {
-          retval(4) = sz ? lst.front ().t : Cell ();
-          retval(3) = sz ? lst.front ().m : std::string ();
-          retval(2) = sz ? lst.front ().te : Matrix ();
+          regexp::match_data::const_iterator p = rx_lst.begin ();
+
+          retval(4) = sz ? p->tokens () : Cell ();
+          retval(3) = sz ? p->match_string () : std::string ();
+          retval(2) = sz ? p->token_extents () : Matrix ();
 
           if (sz)
             {
-              double e = lst.front ().e;
-              double s = lst.front ().s;
+              double start = p->start ();
+              double end = p->end ();
 
-              Cell sp (dim_vector (1, 2));
-              sp(0) = buffer.substr (0, s-1);
-              sp(1) = buffer.substr (e);
+              Cell split (dim_vector (1, 2));
+              split(0) = buffer.substr (0, start-1);
+              split(1) = buffer.substr (end);
 
-              retval(6) = sp;
-              retval(1) = e;
-              retval(0) = s;
+              retval(6) = split;
+              retval(1) = end;
+              retval(0) = start;
             }
           else
             {
@@ -583,39 +195,45 @@
         }
       else
         {
-          Cell t (dim_vector (1, sz));
-          Cell m (dim_vector (1, sz));
-          Cell te (dim_vector (1, sz));
-          NDArray e (dim_vector (1, sz));
-          NDArray s (dim_vector (1, sz));
-          Cell sp (dim_vector (1, sz+1));
+          Cell tokens (dim_vector (1, sz));
+          Cell match_string (dim_vector (1, sz));
+          Cell token_extents (dim_vector (1, sz));
+          NDArray end (dim_vector (1, sz));
+          NDArray start (dim_vector (1, sz));
+          Cell split (dim_vector (1, sz+1));
           size_t sp_start = 0;
 
           i = 0;
-          for (const_iterator p = lst.begin (); p != lst.end (); p++)
+          for (regexp::match_data::const_iterator p = rx_lst.begin ();
+               p != rx_lst.end (); p++)
             {
-              t(i) = p->t;
-              m(i) = p->m;
-              te(i) = p->te;
-              e(i) = p->e;
-              s(i) = p->s;
-              sp(i) = buffer.substr (sp_start, p->s-sp_start-1);
-              sp_start = p->e;
+              double s = p->start ();
+              double e = p->end ();
+
+              string_vector tmp = p->tokens ();
+              tokens(i) = Cell (dim_vector (1, tmp.length ()), tmp);
+              match_string(i) = p->match_string ();
+              token_extents(i) = p->token_extents ();
+              end(i) = e;
+              start(i) = s;
+              split(i) = buffer.substr (sp_start, s-sp_start-1);
+              sp_start = e;
               i++;
             }
 
-          sp(i) = buffer.substr (sp_start);
+          split(i) = buffer.substr (sp_start);
 
-          retval(6) = sp;
-          retval(4) = t;
-          retval(3) = m;
-          retval(2) = te;
-          retval(1) = e;
-          retval(0) = s;
+          retval(6) = split;
+          retval(4) = tokens;
+          retval(3) = match_string;
+          retval(2) = token_extents;
+          retval(1) = end;
+          retval(0) = start;
         }
 
       // Alter the order of the output arguments
-      if (nopts > 0)
+
+      if (extra_options)
         {
           int n = 0;
           octave_value_list new_retval;
@@ -682,7 +300,7 @@
 
 static octave_value_list
 octcellregexp (const octave_value_list &args, int nargout,
-               const std::string &nm, bool case_insensitive)
+               const std::string &who, bool case_insensitive = false)
 {
   octave_value_list retval;
 
@@ -705,7 +323,7 @@
               for (octave_idx_type i = 0; i < cellstr.numel (); i++)
                 {
                   new_args(0) = cellstr(i);
-                  octave_value_list tmp = octregexp (new_args, nargout, nm,
+                  octave_value_list tmp = octregexp (new_args, nargout, who,
                                                      case_insensitive);
 
                   if (error_state)
@@ -725,7 +343,7 @@
               for (octave_idx_type i = 0; i < cellpat.numel (); i++)
                 {
                   new_args(1) = cellpat(i);
-                  octave_value_list tmp = octregexp (new_args, nargout, nm,
+                  octave_value_list tmp = octregexp (new_args, nargout, who,
                                                      case_insensitive);
 
                   if (error_state)
@@ -739,7 +357,7 @@
             {
 
               if (cellstr.dims () != cellpat.dims ())
-                error ("%s: Inconsistent cell array dimensions", nm.c_str ());
+                error ("%s: inconsistent cell array dimensions", who.c_str ());
               else
                 {
                   for (int j = 0; j < nargout; j++)
@@ -750,7 +368,7 @@
                       new_args(0) = cellstr(i);
                       new_args(1) = cellpat(i);
 
-                      octave_value_list tmp = octregexp (new_args, nargout, nm,
+                      octave_value_list tmp = octregexp (new_args, nargout, who,
                                                          case_insensitive);
 
                       if (error_state)
@@ -772,7 +390,7 @@
           for (octave_idx_type i = 0; i < cellstr.numel (); i++)
             {
               new_args(0) = cellstr(i);
-              octave_value_list tmp = octregexp (new_args, nargout, nm,
+              octave_value_list tmp = octregexp (new_args, nargout, who,
                                                  case_insensitive);
 
               if (error_state)
@@ -799,7 +417,7 @@
       for (octave_idx_type i = 0; i < cellpat.numel (); i++)
         {
           new_args(1) = cellpat(i);
-          octave_value_list tmp = octregexp (new_args, nargout, nm,
+          octave_value_list tmp = octregexp (new_args, nargout, who,
                                              case_insensitive);
 
           if (error_state)
@@ -816,7 +434,7 @@
         }
     }
   else
-    retval = octregexp (args, nargout, nm, case_insensitive);
+    retval = octregexp (args, nargout, who, case_insensitive);
 
   return retval;
 
@@ -1022,9 +640,9 @@
   if (nargin < 2)
     print_usage ();
   else if (args(0).is_cell () || args(1).is_cell ())
-    retval = octcellregexp (args, nargout, "regexp", false);
+    retval = octcellregexp (args, nargout, "regexp");
   else
-    retval = octregexp (args, nargout, "regexp", false);
+    retval = octregexp (args, nargout, "regexp");
 
   return retval;
 }
@@ -1402,7 +1020,7 @@
 
 
 static octave_value
-octregexprep (const octave_value_list &args, const std::string &nm)
+octregexprep (const octave_value_list &args, const std::string &who)
 {
   octave_value retval;
 
@@ -1423,12 +1041,9 @@
 
   // Pack options excluding 'tokenize' and various output
   // reordering strings into regexp arg list
-  octave_value_list regexpargs (nargin-1, octave_value ());
+  octave_value_list regexpargs (nargin-3, octave_value ());
 
-  regexpargs(0) = args (0);
-  regexpargs(1) = args (1);
-
-  int len = 2;
+  int len = 0;
   for (int i = 3; i < nargin; i++)
     {
       const std::string opt = args(i).string_value ();
@@ -1441,165 +1056,13 @@
     }
   regexpargs.resize (len);
 
-  // Identify replacement tokens; build a vector of group numbers in
-  // the replacement string so that we can quickly calculate the size
-  // of the replacement.
-  int tokens = 0;
-  for (size_t i=1; i < replacement.size (); i++)
-    {
-      if (replacement[i-1]=='$' && isdigit (replacement[i]))
-        {
-          tokens++;
-          i++;
-        }
-    }
-  std::vector<int> token (tokens);
-
-  int kk = 0;
-  for (size_t i = 1; i < replacement.size (); i++)
-    {
-      if (replacement[i-1]=='$' && isdigit (replacement[i]))
-        {
-          token[kk++] = replacement[i]-'0';
-          i++;
-        }
-    }
-
-  // Perform replacement
-  std::string rep;
-
-  if (tokens > 0)
-    {
-      std::list<regexp_elem> lst;
-      string_vector named;
-      int nopts;
-      bool once;
-      int sz = octregexp_list (regexpargs, nm , false, lst, named, nopts, once);
-
-      if (error_state)
-        return retval;
-      if (sz == 0)
-        {
-          retval = args(0);
-          return retval;
-        }
-
-      // Determine replacement length
-      const size_t replen = replacement.size () - 2*tokens;
-      int delta = 0;
-      const_iterator p = lst.begin ();
-      for (int i = 0; i < sz; i++)
-        {
-          OCTAVE_QUIT;
-
-          const Matrix pairs (p->te);
-          size_t pairlen = 0;
-          for (int j = 0; j < tokens; j++)
-            {
-              if (token[j] == 0)
-                pairlen += static_cast<size_t> (p->e - p->s) + 1;
-              else if (token[j] <= pairs.rows ())
-                pairlen += static_cast<size_t> (pairs(token[j]-1,1)
-                                                - pairs(token[j]-1,0)) + 1;
-            }
-          delta += static_cast<int> (replen + pairlen)
-            - static_cast<int> (p->e - p->s + 1);
-          p++;
-        }
-
-      // Build replacement string
-      rep.reserve (buffer.size () + delta);
-      size_t from = 0;
-      p = lst.begin ();
-      for (int i = 0; i < sz; i++)
-        {
-          OCTAVE_QUIT;
+  regexp::opts options;
+  bool extra_args = false;
+  parse_options (options, regexpargs, who, 0, extra_args);
+  if (error_state)
+    return retval;
 
-          const Matrix pairs (p->te);
-          rep.append (&buffer[from], static_cast<size_t> (p->s - 1) - from);
-          from = static_cast<size_t> (p->e - 1) + 1;
-          for (size_t j = 1; j < replacement.size (); j++)
-            {
-              if (replacement[j-1]=='$' && isdigit (replacement[j]))
-                {
-                  int k = replacement[j]-'0';
-                  if (k == 0)
-                    {
-                      // replace with entire match
-                      rep.append (&buffer[static_cast<size_t> (p->e - 1)],
-                                  static_cast<size_t> (p->e - p->s) + 1);
-                    }
-                  else if (k <= pairs.rows ())
-                    {
-                      // replace with group capture
-                      rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
-                                  static_cast<size_t> (pairs(k-1,1)
-                                                       - pairs(k-1,0)) + 1);
-                    }
-                  else
-                    {
-                      // replace with nothing
-                    }
-                  j++;
-                }
-              else
-                {
-                  rep.append (1, replacement[j-1]);
-                }
-              if (j+1 == replacement.size ())
-                {
-                  rep.append (1, replacement[j]);
-                }
-            }
-          p++;
-        }
-      rep.append (&buffer[from], buffer.size () - from);
-    }
-  else
-    {
-      std::list<regexp_elem> lst;
-      string_vector named;
-      int nopts;
-      bool once;
-      int sz = octregexp_list (regexpargs, nm, false, lst, named, nopts, once);
-
-      if (error_state)
-        return retval;
-      if (sz == 0)
-        {
-          retval = args (0);
-          return retval;
-        }
-
-      // Determine replacement length
-      const size_t replen = replacement.size ();
-      int delta = 0;
-      const_iterator p = lst.begin ();
-      for (int i = 0; i < sz; i++)
-        {
-          OCTAVE_QUIT;
-          delta += static_cast<int> (replen)
-            - static_cast<int> (p->e - p->s + 1);
-          p++;
-        }
-
-      // Build replacement string
-      rep.reserve (buffer.size () + delta);
-      size_t from = 0;
-      p = lst.begin ();
-      for (int i = 0; i < sz; i++)
-        {
-          OCTAVE_QUIT;
-          rep.append (&buffer[from], static_cast<size_t> (p->s - 1) - from);
-          from = static_cast<size_t> (p->e - 1) + 1;
-          rep.append (replacement);
-          p++;
-        }
-      rep.append (&buffer[from], buffer.size () - from);
-    }
-
-  retval = rep;
-  return retval;
+  return regexp_replace (pattern, buffer, replacement, options, who);
 }
 
 DEFUN_DLD (regexprep, args, ,
@@ -1672,7 +1135,7 @@
         {
           dv1 = pat.dims ();
           if (rep.numel () != 1 && dv1 != rep.dims ())
-            error ("regexprep: Inconsistent cell array dimensions");
+            error ("regexprep: inconsistent cell array dimensions");
         }
       else if (rep.numel () != 1)
         dv1 = rep.dims ();
author	John W. Eaton <jwe@octave.org>
date	Sun, 11 Dec 2011 22:19:57 -0500
parents	9cae456085c2
children	72c96de7a403