# HG changeset patch # User John W. Eaton # Date 1323659997 18000 # Node ID fc9f204faea0b520a8ea9d455e4f5c865b1b09a0 # Parent d51b321b5fef9470eae473989d83756ed8e73fec refactor regexp (bug #34440) * liboctave/regexp.h, liboctave/regexp.cc: New files. Provide classes and functions for regular expressions. Adapted from src/DLD-FUNCTIONS/regexp.cc. * regex-match.h, regex-match.cc: Delete * liboctave/Makefile.am (INCS, LIBOCTAVE_CXX_SOURCES): Update. * variables.cc (name_matches_any_pattern): Use new regexp class. * symtab.h (symbol_table::regexp_global_variables, symbol_table::do_clear_variable_regexp, symbol_table::do_regexp): Likewise. * DLD-FUNCTIONS/regexp.cc (parse_options): New function. (octregexp, octcellregexp, octregexprep): Extract matching code for use in new regexp class. Use new regexp class to provide required functionality. diff -r d51b321b5fef -r fc9f204faea0 liboctave/Makefile.am --- a/liboctave/Makefile.am Sun Dec 11 18:28:35 2011 -0500 +++ b/liboctave/Makefile.am Sun Dec 11 22:19:57 2011 -0500 @@ -244,7 +244,7 @@ randgamma.h \ randmtzig.h \ randpoisson.h \ - regex-match.h \ + regexp.h \ singleton-cleanup.h \ sparse-sort.h \ sparse-util.h \ @@ -453,7 +453,7 @@ oct-time.cc \ oct-uname.cc \ pathsearch.cc \ - regex-match.cc \ + regexp.cc \ singleton-cleanup.cc \ sparse-sort.cc \ sparse-util.cc \ diff -r d51b321b5fef -r fc9f204faea0 liboctave/regex-match.cc --- a/liboctave/regex-match.cc Sun Dec 11 18:28:35 2011 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,149 +0,0 @@ -/* - -Copyright (C) 2008-2011 David Bateman - -This file is part of Octave. - -Octave is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -Octave is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with Octave; see the file COPYING. If not, see -. - -*/ - -#ifdef HAVE_CONFIG_H -#include -#endif - -#include -#include -#include - -#include "regex-match.h" -#include "str-vec.h" -#include "oct-locbuf.h" - -regex_match& -regex_match::operator = (const regex_match& gm) -{ - if (this != &gm) - { -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif - pat = gm.pat; - case_insen = gm.case_insen; - init (); - } - return *this; -} - -regex_match::~regex_match (void) -{ -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif -} - - -void -regex_match::set_pattern (const std::string& p) -{ -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif - pat = p; - init (); -} - -void -regex_match::set_pattern (const string_vector& p) -{ -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif - pat = p; - init (); -} - -void -regex_match::init (void) -{ -#ifdef HAVE_REGEX - int npat = pat.length (); - int err = 0; - int i; - - compiled = new regex_t [npat]; - - for (i = 0; i < npat; i++) - { - err = regcomp (compiled + i, pat(i).c_str (), - (REG_NOSUB | REG_EXTENDED | - (case_insen ? REG_ICASE : 0))); - if (err) - break; - } - - if (err) - { - int len = regerror (err, compiled + i, 0, 0); - OCTAVE_LOCAL_BUFFER (char, errmsg, len); - regerror(err, compiled + i, errmsg, len); - (*current_liboctave_error_handler) ("%s in pattern (%s)", errmsg, - pat(i).c_str()); - - for (int j = 0; j < i + 1; j++) - regfree (compiled + j); - } -#else - (*current_liboctave_error_handler) - ("regex not available in this version of Octave"); -#endif -} - -bool -regex_match::match (const std::string& s) -{ -#if HAVE_REGEX - int npat = pat.length (); - - const char *str = s.c_str (); - - for (int i = 0; i < npat; i++) - if (regexec (compiled + i, str, 0, 0, 0) == 0) - return true; -#endif - - return false; -} - -Array -regex_match::match (const string_vector& s) -{ - int n = s.length (); - - Array retval (dim_vector (n, 1)); - - for (int i = 0; i < n; i++) - retval(i) = match (s[i]); - - return retval; -} diff -r d51b321b5fef -r fc9f204faea0 liboctave/regex-match.h --- a/liboctave/regex-match.h Sun Dec 11 18:28:35 2011 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -/* - -Copyright (C) 2008-2011 David Bateman - -This file is part of Octave. - -Octave is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -Octave is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with Octave; see the file COPYING. If not, see -. - -*/ - -#if !defined (octave_regex_match_h) -#define octave_regex_match_h 1 - -#include - -#if defined (HAVE_REGEX) -#if defined (__MINGW32__) -#define __restrict -#endif -#include -#include -#endif - -#include "Array.h" -#include "str-vec.h" - -class -OCTAVE_API -regex_match -{ -public: - - regex_match (const std::string& p, bool insen = false) - : pat (p), case_insen (insen) -#if HAVE_REGEX - , compiled (0) -#endif - { - init (); - } - - regex_match (const string_vector& p = string_vector (), bool insen = false) - : pat (p), case_insen (insen) -#if HAVE_REGEX - , compiled (0) -#endif - { - init (); - } - - regex_match (const regex_match& gm) - : pat (gm.pat), case_insen (gm.case_insen) -#if HAVE_REGEX - , compiled (0) -#endif - { - init (); - } - - regex_match& operator = (const regex_match& gm); - - ~regex_match (void); - - void set_pattern (const std::string& p); - - void set_pattern (const string_vector& p); - - bool match (const std::string&); - - Array match (const string_vector&); - -private: - - void init (void); - - // Regex pattern(s). - string_vector pat; - - // Should match be case insensitive - bool case_insen; - -#if HAVE_REGEX - regex_t *compiled; -#endif -}; - -#endif diff -r d51b321b5fef -r fc9f204faea0 liboctave/regexp.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/liboctave/regexp.cc Sun Dec 11 22:19:57 2011 -0500 @@ -0,0 +1,575 @@ +/* + +Copyright (C) 2011 John W. Eaton +Copyright (C) 2005-2011 David Bateman +Copyright (C) 2002-2005 Paul Kienzle + +This file is part of Octave. + +Octave is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +Octave is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Octave; see the file COPYING. If not, see +. + +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include + +#include + +#include "Matrix.h" +#include "base-list.h" +#include "lo-error.h" +#include "oct-locbuf.h" +#include "quit.h" +#include "regexp.h" +#include "str-vec.h" + +// Define the maximum number of retries for a pattern that possibly +// results in an infinite recursion. +#define PCRE_MATCHLIMIT_MAX 10 + +// FIXME -- should this be configurable? +#define MAXLOOKBEHIND 10 + +static bool lookbehind_warned = false; + +// FIXME -- don't bother collecting and composing return values the user +// doesn't want. + +void +regexp::free (void) +{ + if (data) + pcre_free (static_cast (data)); +} + +void +regexp::compile_internal (void) +{ + // If we had a previously compiled pattern, release it. + free (); + + size_t max_length = MAXLOOKBEHIND; + + size_t pos = 0; + size_t new_pos; + int inames = 0; + std::ostringstream buf; + + while ((new_pos = pattern.find ("(?", pos)) != std::string::npos) + { + if (pattern.at (new_pos + 2) == '<' + && !(pattern.at (new_pos + 3) == '=' + || pattern.at (new_pos + 3) == '!')) + { + // The syntax of named tokens in pcre is "(?P...)" while + // we need a syntax "(?...)", so fix that here. Also an + // expression like + // "(?\w+)\s+(?\w+)|(?\w+),\s+(?\w+)" + // should be perfectly legal, while pcre does not allow the same + // named token name on both sides of the alternative. Also fix + // that here by replacing name tokens by dummy names, and dealing + // with the dummy names later. + + size_t tmp_pos = pattern.find_first_of ('>', new_pos); + + if (tmp_pos == std::string::npos) + { + (*current_liboctave_error_handler) + ("regexp: syntax error in pattern"); + return; + } + + std::string tmp_name = + pattern.substr (new_pos+3, tmp_pos-new_pos-3); + + bool found = false; + + for (int i = 0; i < nnames; i++) + { + if (named_pats(i) == tmp_name) + { + named_idx.resize (dim_vector (inames+1, 1)); + named_idx(inames) = i; + found = true; + break; + } + } + + if (! found) + { + named_idx.resize (dim_vector (inames+1, 1)); + named_idx(inames) = nnames; + named_pats.append (tmp_name); + nnames++; + } + + if (new_pos - pos > 0) + buf << pattern.substr (pos, new_pos-pos); + if (inames < 10) + buf << "(?P 0) + { + char ch = pattern.at (tmp_pos1); + + if (ch == '(') + brackets++; + else if (ch == ')') + { + if (brackets > 1) + tmp_pos2 = tmp_pos1; + + brackets--; + } + + tmp_pos1++; + } + + if (brackets != 0) + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + else + { + size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); + + if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) + { + if (!lookbehind_warned) + { + lookbehind_warned = true; + (*current_liboctave_warning_handler) + ("%s: arbitrary length lookbehind patterns are only supported up to length %d", + who.c_str (), MAXLOOKBEHIND); + } + + buf << pattern.substr (pos, new_pos - pos) << "("; + + size_t i; + + if (pattern.at (tmp_pos3) == '*') + i = 0; + else + i = 1; + + for (; i < max_length + 1; i++) + { + buf << pattern.substr (new_pos, tmp_pos3 - new_pos) + << "{" << i << "}"; + buf << pattern.substr (tmp_pos3 + 1, + tmp_pos1 - tmp_pos3 - 1); + if (i != max_length) + buf << "|"; + } + buf << ")"; + } + else + buf << pattern.substr (pos, tmp_pos1 - pos); + + pos = tmp_pos1; + } + } + else + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + + } + + buf << pattern.substr (pos); + + const char *err; + int erroffset; + std::string buf_str = buf.str (); + + int pcre_options + = ((options.case_insensitive () ? PCRE_CASELESS : 0) + | (options.dotexceptnewline () ? 0 : PCRE_DOTALL) + | (options.lineanchors () ? PCRE_MULTILINE : 0) + | (options.freespacing () ? PCRE_EXTENDED : 0)); + + data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0); + + if (! data) + (*current_liboctave_error_handler) + ("%s: %s at position %d of expression", who.c_str (), + err, erroffset); +} + +regexp::match_data +regexp::match (const std::string& buffer) +{ + regexp::match_data retval; + + std::list lst; + + int subpatterns; + int namecount; + int nameentrysize; + char *nametable; + size_t idx = 0; + + pcre *re = static_cast (data); + + pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); + pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount); + pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); + pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable); + + OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); + OCTAVE_LOCAL_BUFFER (int, nidx, namecount); + + for (int i = 0; i < namecount; i++) + { + // Index of subpattern in first two bytes MSB first of name. + // Extract index. + nidx[i] = (static_cast (nametable[i*nameentrysize])) << 8 + | static_cast (nametable[i*nameentrysize+1]); + } + + while (true) + { + OCTAVE_QUIT; + + int matches = pcre_exec (re, 0, buffer.c_str (), + buffer.length (), idx, + (idx ? PCRE_NOTBOL : 0), + ovector, (subpatterns+1)*3); + + if (matches == PCRE_ERROR_MATCHLIMIT) + { + // Try harder; start with default value for MATCH_LIMIT + // and increase it. + (*current_liboctave_warning_handler) + ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow"); + + pcre_extra pe; + + pcre_config (PCRE_CONFIG_MATCH_LIMIT, + static_cast (&pe.match_limit)); + + pe.flags = PCRE_EXTRA_MATCH_LIMIT; + + int i = 0; + while (matches == PCRE_ERROR_MATCHLIMIT + && i++ < PCRE_MATCHLIMIT_MAX) + { + OCTAVE_QUIT; + + pe.match_limit *= 10; + matches = pcre_exec (re, &pe, buffer.c_str (), + buffer.length (), idx, + (idx ? PCRE_NOTBOL : 0), + ovector, (subpatterns+1)*3); + } + } + + if (matches < 0 && matches != PCRE_ERROR_NOMATCH) + { + (*current_liboctave_error_handler) + ("%s: internal error calling pcre_exec; error code from pcre_exec is %i", + who.c_str (), matches); + return retval; + } + else if (matches == PCRE_ERROR_NOMATCH) + break; + else if (ovector[1] <= ovector[0]) + { + // Zero sized match. Skip to next char. + idx = ovector[0] + 1; + if (idx < buffer.length ()) + continue; + else + break; + } + else + { + int pos_match = 0; + Matrix token_extents (matches-1, 2); + + for (int i = 1; i < matches; i++) + { + if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 + && (i == 1 || ovector[2*i] != ovector[2*i-2] + || ovector[2*i-1] != ovector[2*i+1]) + && ovector[2*i] >= 0 && ovector[2*i+1] > 0) + { + token_extents(pos_match,0) = double (ovector[2*i]+1); + token_extents(pos_match++,1) = double (ovector[2*i+1]); + } + } + + token_extents.resize (pos_match, 2); + + double start = double (ovector[0]+1); + double end = double (ovector[1]); + + const char **listptr; + int status = pcre_get_substring_list (buffer.c_str (), ovector, + matches, &listptr); + + if (status == PCRE_ERROR_NOMEMORY) + { + (*current_liboctave_error_handler) + ("%s: cannot allocate memory in pcre_get_substring_list", + who.c_str ()); + return retval; + } + + string_vector tokens (pos_match); + string_vector named_tokens (nnames); + int pos_offset = 0; + pos_match = 0; + + for (int i = 1; i < matches; i++) + { + if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) + { + if (i == 1 || ovector[2*i] != ovector[2*i-2] + || ovector[2*i-1] != ovector[2*i+1]) + { + if (namecount > 0) + named_tokens(named_idx(i-pos_offset-1)) = + std::string (*(listptr+nidx[i-pos_offset-1])); + + tokens(pos_match++) = std::string (*(listptr+i)); + } + else + pos_offset++; + } + } + + std::string match_string = std::string (*listptr); + + pcre_free_substring_list (listptr); + + regexp::match_element new_elem (named_tokens, tokens, match_string, + token_extents, start, end); + lst.push_back (new_elem); + idx = ovector[1]; + + if (options.once () || idx >= buffer.length ()) + break; + } + } + + retval = regexp::match_data (lst, named_pats); + + return retval; +} + +bool +regexp::is_match (const std::string& buffer) +{ + regexp::match_data rx_lst = match (buffer); + + regexp::match_data::const_iterator p = rx_lst.begin (); + + std::string match_string = p->match_string (); + + return ! match_string.empty (); +} + +Array +regexp::is_match (const string_vector& buffer) +{ + octave_idx_type len = buffer.length (); + + Array retval (len, 1); + + for (octave_idx_type i = 0; i < buffer.length (); i++) + retval(i) = is_match (buffer(i)); + + return retval; +} + +std::string +regexp::replace (const std::string& buffer, const std::string& replacement) +{ + std::string retval; + + // Identify replacement tokens; build a vector of group numbers in + // the replacement string so that we can quickly calculate the size + // of the replacement. + + int tokens = 0; + for (size_t i=1; i < replacement.size (); i++) + { + if (replacement[i-1]=='$' && isdigit (replacement[i])) + { + tokens++; + i++; + } + } + std::vector token (tokens); + + int kk = 0; + for (size_t i = 1; i < replacement.size (); i++) + { + if (replacement[i-1]=='$' && isdigit (replacement[i])) + { + token[kk++] = replacement[i]-'0'; + i++; + } + } + + regexp::match_data rx_lst = match (buffer); + + size_t sz = rx_lst.size (); + + if (sz == 0) + { + retval = buffer; + return retval; + } + + std::string rep; + + if (tokens > 0) + { + // Determine replacement length + const size_t replen = replacement.size () - 2*tokens; + int delta = 0; + regexp::match_data::const_iterator p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + + double start = p->start (); + double end = p->end (); + + const Matrix pairs (p->token_extents ()); + size_t pairlen = 0; + for (int j = 0; j < tokens; j++) + { + if (token[j] == 0) + pairlen += static_cast (end - start) + 1; + else if (token[j] <= pairs.rows ()) + pairlen += static_cast (pairs(token[j]-1,1) + - pairs(token[j]-1,0)) + 1; + } + delta += (static_cast (replen + pairlen) + - static_cast (end - start + 1)); + p++; + } + + // Build replacement string + rep.reserve (buffer.size () + delta); + size_t from = 0; + p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + + double start = p->start (); + double end = p->end (); + + const Matrix pairs (p->token_extents ()); + rep.append (&buffer[from], static_cast (start - 1) - from); + from = static_cast (end - 1) + 1; + + for (size_t j = 1; j < replacement.size (); j++) + { + if (replacement[j-1]=='$' && isdigit (replacement[j])) + { + int k = replacement[j]-'0'; + if (k == 0) + { + // replace with entire match + rep.append (&buffer[static_cast (end - 1)], + static_cast (end - start) + 1); + } + else if (k <= pairs.rows ()) + { + // replace with group capture + rep.append (&buffer[static_cast (pairs(k-1,0)-1)], + static_cast (pairs(k-1,1) + - pairs(k-1,0)) + 1); + } + else + { + // replace with nothing + } + j++; + } + else + rep.append (1, replacement[j-1]); + + if (j+1 == replacement.size ()) + rep.append (1, replacement[j]); + } + p++; + } + rep.append (&buffer[from], buffer.size () - from); + } + else + { + // Determine replacement length + const size_t replen = replacement.size (); + int delta = 0; + regexp::match_data::const_iterator p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + delta += static_cast (replen) + - static_cast (p->end () - p->start () + 1); + p++; + } + + // Build replacement string + rep.reserve (buffer.size () + delta); + size_t from = 0; + p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + rep.append (&buffer[from], + static_cast (p->start () - 1) - from); + from = static_cast (p->end () - 1) + 1; + rep.append (replacement); + p++; + } + rep.append (&buffer[from], buffer.size () - from); + } + + retval = rep; + return retval; +} diff -r d51b321b5fef -r fc9f204faea0 liboctave/regexp.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/liboctave/regexp.h Sun Dec 11 22:19:57 2011 -0500 @@ -0,0 +1,281 @@ +/* + +Copyright (C) 2011 John W. Eaton +Copyright (C) 2005-2011 David Bateman + +This file is part of Octave. + +Octave is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +Octave is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Octave; see the file COPYING. If not, see +. + +*/ + +#if !defined (octave_regexp_match_h) +#define octave_regexp_match_h 1 + +#include +#include +#include + +#include "Array.h" +#include "Matrix.h" +#include "base-list.h" +#include "str-vec.h" + +class regexp +{ +public: + + class opts; + class match_data; + + regexp (const std::string& pat = "", + const regexp::opts& opt = regexp::opts (), + const std::string& w = "regexp") + : pattern (pat), options (opt), data (0), named_pats (), + nnames (0), named_idx (), who (w) + { + compile_internal (); + } + + regexp (const regexp& rx) + : pattern (rx.pattern), data (rx.data), named_pats (rx.named_pats), + nnames (rx.nnames), named_idx (rx.named_idx) + { } + + regexp& operator = (const regexp& rx) + { + if (this != &rx) + { + pattern = rx.pattern; + data = rx.data; + named_pats = rx.named_pats; + nnames = rx.nnames; + named_idx = rx.named_idx; + } + + return *this; + } + + ~regexp (void) { free (); } + + void compile (const std::string& pat, + const regexp::opts& opt = regexp::opts ()) + { + pattern = pat; + options = opt; + compile_internal (); + } + + match_data match (const std::string& buffer); + + bool is_match (const std::string& buffer); + + Array is_match (const string_vector& buffer); + + std::string replace (const std::string& buffer, + const std::string& replacement); + + struct opts + { + public: + + opts (void) + : x_case_insensitive (false), x_dotexceptnewline (false), + x_freespacing (false), x_lineanchors (false), x_once (false) { } + + opts (const opts& o) + : x_case_insensitive (o.x_case_insensitive), + x_dotexceptnewline (o.x_dotexceptnewline), + x_freespacing (o.x_freespacing), + x_lineanchors (o.x_lineanchors), + x_once (o.x_once) + { } + + opts& operator = (const opts& o) + { + if (this != &o) + { + x_case_insensitive = o.x_case_insensitive; + x_dotexceptnewline = o.x_dotexceptnewline; + x_freespacing = o.x_freespacing; + x_lineanchors = o.x_lineanchors; + x_once = o.x_once; + } + + return *this; + } + + ~opts (void) { } + + void case_insensitive (bool val) { x_case_insensitive = val; } + void dotexceptnewline (bool val) { x_dotexceptnewline = val; } + void freespacing (bool val) { x_freespacing = val; } + void lineanchors (bool val) { x_lineanchors = val; } + void once (bool val) { x_once = val; } + + bool case_insensitive (void) const { return x_case_insensitive; } + bool dotexceptnewline (void) const { return x_dotexceptnewline; } + bool freespacing (void) const { return x_freespacing; } + bool lineanchors (void) const { return x_lineanchors; } + bool once (void) const { return x_once; } + + private: + + bool x_case_insensitive; + bool x_dotexceptnewline; + bool x_freespacing; + bool x_lineanchors; + bool x_once; + }; + + class match_element + { + public: + + match_element (const string_vector& nt, const string_vector& t, + const std::string& ms, const Matrix& te, + double s, double e) + : x_match_string (ms), x_named_tokens (nt), x_tokens (t), + x_token_extents (te), x_start (s), x_end (e) + { } + + match_element (const match_element &a) + : x_match_string (a.x_match_string), + x_named_tokens (a.x_named_tokens), x_tokens (a.x_tokens), + x_token_extents (a.x_token_extents), + x_start (a.x_start), x_end (a.x_end) + { } + + std::string match_string (void) const { return x_match_string; } + string_vector named_tokens (void) const { return x_named_tokens; } + string_vector tokens (void) const { return x_tokens; } + Matrix token_extents (void) const { return x_token_extents; } + double start (void) const { return x_start; } + double end (void) const { return x_end; } + + private: + + std::string x_match_string; + string_vector x_named_tokens; + string_vector x_tokens; + Matrix x_token_extents; + double x_start; + double x_end; + }; + + class match_data : public octave_base_list + { + public: + + match_data (void) + : octave_base_list (), named_pats () + { } + + match_data (const std::list& l, const string_vector& np) + : octave_base_list (l), named_pats (np) + { } + + match_data (const match_data& rx_lst) + : octave_base_list (rx_lst), + named_pats (rx_lst.named_pats) + { } + + match_data& operator = (const match_data& rx_lst) + { + if (this != &rx_lst) + { + octave_base_list::operator = (rx_lst); + named_pats = rx_lst.named_pats; + } + + return *this; + } + + ~match_data (void) { } + + string_vector named_patterns (void) { return named_pats; } + + private: + + string_vector named_pats; + }; + +private: + + // The pattern we've been asked to match. + std::string pattern; + + opts options; + + // Internal data describing the regular expression. + void *data; + + std::string m; + string_vector named_pats; + int nnames; + Array named_idx; + std::string who; + + void free (void); + + void compile_internal (void); +}; + +inline regexp::match_data +regexp_match (const std::string& pat, + const std::string& buffer, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.match (buffer); +} + +inline bool +is_regexp_match (const std::string& pat, + const std::string& buffer, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.is_match (buffer); +} + +inline Array +is_regexp_match (const std::string& pat, + const string_vector& buffer, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.is_match (buffer); +} + +inline std::string +regexp_replace (const std::string& pat, + const std::string& buffer, + const std::string& replacement, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.replace (buffer, replacement); +} + +#endif diff -r d51b321b5fef -r fc9f204faea0 src/DLD-FUNCTIONS/regexp.cc --- a/src/DLD-FUNCTIONS/regexp.cc Sun Dec 11 18:28:35 2011 -0500 +++ b/src/DLD-FUNCTIONS/regexp.cc Sun Dec 11 22:19:57 2011 -0500 @@ -25,501 +25,106 @@ #include #endif -#include +#include #include -#include "defun-dld.h" -#include "error.h" -#include "gripes.h" -#include "oct-obj.h" -#include "utils.h" - -#include "Cell.h" -#include "oct-map.h" -#include "str-vec.h" -#include "quit.h" -#include "parse.h" -#include "oct-locbuf.h" - #include -// Define the maximum number of retries for a pattern that -// possibly results in an infinite recursion. -#define PCRE_MATCHLIMIT_MAX 10 - -// The regexp is constructed as a linked list to avoid resizing the -// return values in arrays at each new match. - -// FIXME don't bother collecting and composing return values the user -// doesn't want. - -class regexp_elem -{ -public: - regexp_elem (const string_vector& _named_token, const Cell& _t, - const std::string& _m, const Matrix& _te, double _s, - double _e) : - named_token (_named_token), t (_t), m (_m), te (_te), s (_s), e (_e) { } - - regexp_elem (const regexp_elem &a) : named_token (a.named_token), t (a.t), - m (a.m), te (a.te), s (a.s), e (a.e) - { } - - string_vector named_token; - Cell t; - std::string m; - Matrix te; - double s; - double e; -}; - -typedef std::list::const_iterator const_iterator; - -#define MAXLOOKBEHIND 10 -static bool lookbehind_warned = false; +#include "base-list.h" +#include "oct-locbuf.h" +#include "quit.h" +#include "regexp.h" +#include "str-vec.h" -static int -octregexp_list (const octave_value_list &args, const std::string &nm, - bool case_insensitive, std::list &lst, - string_vector &named, int &nopts, bool &once) -{ - int sz = 0; - - int nargin = args.length (); - bool lineanchors = false; - bool dotexceptnewline = false; - bool freespacing = false; - - nopts = nargin - 2; - once = false; +#include "defun-dld.h" +#include "Cell.h" +#include "error.h" +#include "gripes.h" +#include "oct-map.h" +#include "oct-obj.h" +#include "utils.h" - std::string buffer = args(0).string_value (); - size_t max_length = (buffer.length () > MAXLOOKBEHIND ? - MAXLOOKBEHIND: buffer.length ()); - - if (error_state) - { - gripe_wrong_type_arg (nm.c_str (), args(0)); - return 0; - } +static void +parse_options (regexp::opts& options, const octave_value_list& args, + const std::string& who, int skip, bool& extra_args) +{ + int nargin = args.length (); - std::string pattern = args(1).string_value (); + extra_args = false; - if (error_state) - { - gripe_wrong_type_arg (nm.c_str (), args(1)); - return 0; - } - - for (int i = 2; i < nargin; i++) + for (int i = skip; i < nargin; i++) { std::string str = args(i).string_value (); if (error_state) { - error ("%s: optional arguments must be strings", nm.c_str ()); + error ("%s: optional arguments must be character strings", + who.c_str ()); break; } std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find ("once", 0) == 0) - { - once = true; - nopts--; - } + options.once (true); else if (str.find ("matchcase", 0) == 0) - { - case_insensitive = false; - nopts--; - } + options.case_insensitive (false); else if (str.find ("ignorecase", 0) == 0) - { - case_insensitive = true; - nopts--; - } + options.case_insensitive (true); else if (str.find ("dotall", 0) == 0) - { - dotexceptnewline = false; - nopts--; - } + options.dotexceptnewline (false); else if (str.find ("stringanchors", 0) == 0) - { - lineanchors = false; - nopts--; - } + options.lineanchors (false); else if (str.find ("literalspacing", 0) == 0) - { - freespacing = false; - nopts--; - } + options.freespacing (false); else if (str.find ("dotexceptnewline", 0) == 0) - { - dotexceptnewline = true; - nopts--; - } + options.dotexceptnewline (true); else if (str.find ("lineanchors", 0) == 0) - { - lineanchors = true; - nopts--; - } + options.lineanchors (true); else if (str.find ("freespacing", 0) == 0) - { - freespacing = true; - nopts--; - } - else if (str.find ("start", 0) && str.find ("end", 0) - && str.find ("tokenextents", 0) && str.find ("match", 0) - && str.find ("tokens", 0) && str.find ("names", 0) - && str.find ("split", 0)) - error ("%s: unrecognized option", nm.c_str ()); + options.freespacing (true); + else if (str.find ("start", 0) == 0 + || str.find ("end", 0) == 0 + || str.find ("tokenextents", 0) == 0 + || str.find ("match", 0) == 0 + || str.find ("tokens", 0) == 0 + || str.find ("names", 0) == 0 + || str.find ("split", 0) == 0) + extra_args = true; + else + error ("%s: unrecognized option", who.c_str ()); } - - if (!error_state) - { - Cell t; - std::string m; - double s, e; - - // named tokens "(?...)" are only treated with PCRE not regex. - - size_t pos = 0; - size_t new_pos; - int nnames = 0; - int inames = 0; - std::ostringstream buf; - Array named_idx; - - while ((new_pos = pattern.find ("(?", pos)) != std::string::npos) - { - if (pattern.at (new_pos + 2) == '<' - && !(pattern.at (new_pos + 3) == '=' - || pattern.at (new_pos + 3) == '!')) - { - // The syntax of named tokens in pcre is "(?P...)" while - // we need a syntax "(?...)", so fix that here. Also an - // expression like - // "(?\w+)\s+(?\w+)|(?\w+),\s+(?\w+)" - // should be perfectly legal, while pcre does not allow the same - // named token name on both sides of the alternative. Also fix - // that here by replacing name tokens by dummy names, and dealing - // with the dummy names later. - - size_t tmp_pos = pattern.find_first_of ('>', new_pos); - - if (tmp_pos == std::string::npos) - { - error ("regexp: syntax error in pattern"); - break; - } - - std::string tmp_name = - pattern.substr (new_pos+3, tmp_pos-new_pos-3); - - bool found = false; - - for (int i = 0; i < nnames; i++) - { - if (named(i) == tmp_name) - { - named_idx.resize (dim_vector (inames+1, 1)); - named_idx(inames) = i; - found = true; - break; - } - } - - if (! found) - { - named_idx.resize (dim_vector (inames+1, 1)); - named_idx(inames) = nnames; - named.append (tmp_name); - nnames++; - } - - if (new_pos - pos > 0) - buf << pattern.substr (pos, new_pos-pos); - if (inames < 10) - buf << "(?P 0) - { - char ch = pattern.at (tmp_pos1); - - if (ch == '(') - brackets++; - else if (ch == ')') - { - if (brackets > 1) - tmp_pos2 = tmp_pos1; - - brackets--; - } - - tmp_pos1++; - } - - if (brackets != 0) - { - buf << pattern.substr (pos, new_pos - pos) << "(?"; - pos = new_pos + 2; - } - else - { - size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); - - if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) - { - if (!lookbehind_warned) - { - lookbehind_warned = true; - warning ("%s: arbitrary length lookbehind patterns are only supported up to length %d", - nm.c_str (), MAXLOOKBEHIND); - } - - buf << pattern.substr (pos, new_pos - pos) << "("; - - size_t i; - - if (pattern.at (tmp_pos3) == '*') - i = 0; - else - i = 1; - - for (; i < max_length + 1; i++) - { - buf << pattern.substr (new_pos, tmp_pos3 - new_pos) - << "{" << i << "}"; - buf << pattern.substr (tmp_pos3 + 1, - tmp_pos1 - tmp_pos3 - 1); - if (i != max_length) - buf << "|"; - } - buf << ")"; - } - else - buf << pattern.substr (pos, tmp_pos1 - pos); - - pos = tmp_pos1; - } - } - else - { - buf << pattern.substr (pos, new_pos - pos) << "(?"; - pos = new_pos + 2; - } - - } - - buf << pattern.substr (pos); - - if (error_state) - return 0; - - // Compile expression - const char *err; - int erroffset; - std::string buf_str = buf.str (); - - pcre *re = pcre_compile (buf_str.c_str (), - ((case_insensitive ? PCRE_CASELESS : 0) - | (dotexceptnewline ? 0 : PCRE_DOTALL) - | (lineanchors ? PCRE_MULTILINE : 0) - | (freespacing ? PCRE_EXTENDED : 0)), - &err, &erroffset, 0); - - if (re == 0) - { - error ("%s: %s at position %d of expression", nm.c_str (), - err, erroffset); - return 0; - } - - int subpatterns; - int namecount; - int nameentrysize; - char *nametable; - int idx = 0; - - pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); - pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount); - pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); - pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable); - - OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); - OCTAVE_LOCAL_BUFFER (int, nidx, namecount); - - for (int i = 0; i < namecount; i++) - { - // Index of subpattern in first two bytes MSB first of name. - // Extract index. - nidx[i] = (static_cast (nametable[i*nameentrysize])) << 8 - | static_cast (nametable[i*nameentrysize+1]); - } - - while (true) - { - OCTAVE_QUIT; - - int matches = pcre_exec (re, 0, buffer.c_str (), - buffer.length (), idx, - (idx ? PCRE_NOTBOL : 0), - ovector, (subpatterns+1)*3); - - if (matches == PCRE_ERROR_MATCHLIMIT) - { - // Try harder; start with default value for MATCH_LIMIT - // and increase it. - warning ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow"); - - pcre_extra pe; - - pcre_config (PCRE_CONFIG_MATCH_LIMIT, - static_cast (&pe.match_limit)); - - pe.flags = PCRE_EXTRA_MATCH_LIMIT; - - int i = 0; - while (matches == PCRE_ERROR_MATCHLIMIT - && i++ < PCRE_MATCHLIMIT_MAX) - { - OCTAVE_QUIT; - - pe.match_limit *= 10; - matches = pcre_exec (re, &pe, buffer.c_str (), - buffer.length (), idx, - (idx ? PCRE_NOTBOL : 0), - ovector, (subpatterns+1)*3); - } - } - - if (matches < 0 && matches != PCRE_ERROR_NOMATCH) - { - error ("%s: internal error calling pcre_exec; error code from pcre_exec is %i", - nm.c_str (), matches); - pcre_free (re); - return 0; - } - else if (matches == PCRE_ERROR_NOMATCH) - break; - else if (ovector[1] <= ovector[0]) - { - // Zero sized match. Skip to next char. - idx = ovector[0] + 1; - if (idx < buffer.length ()) - continue; - else - break; - } - else - { - int pos_match = 0; - Matrix te (matches-1, 2); - - for (int i = 1; i < matches; i++) - { - if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 - && (i == 1 || ovector[2*i] != ovector[2*i-2] - || ovector[2*i-1] != ovector[2*i+1]) - && ovector[2*i] >= 0 && ovector[2*i+1] > 0) - { - te(pos_match,0) = double (ovector[2*i]+1); - te(pos_match++,1) = double (ovector[2*i+1]); - } - } - - te.resize (pos_match, 2); - - s = double (ovector[0]+1); - e = double (ovector[1]); - - const char **listptr; - int status = pcre_get_substring_list (buffer.c_str (), ovector, - matches, &listptr); - - if (status == PCRE_ERROR_NOMEMORY) - { - error ("%s: cannot allocate memory in pcre_get_substring_list", - nm.c_str ()); - pcre_free (re); - return 0; - } - - Cell cell_t (dim_vector (1, pos_match)); - string_vector named_tokens (nnames); - int pos_offset = 0; - pos_match = 0; - - for (int i = 1; i < matches; i++) - { - if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) - { - if (i == 1 || ovector[2*i] != ovector[2*i-2] - || ovector[2*i-1] != ovector[2*i+1]) - { - if (namecount > 0) - named_tokens(named_idx(i-pos_offset-1)) = - std::string (*(listptr+nidx[i-pos_offset-1])); - cell_t(pos_match++) = - std::string (*(listptr+i)); - } - else - pos_offset++; - } - } - - m = std::string (*listptr); - t = cell_t; - - pcre_free_substring_list (listptr); - - regexp_elem new_elem (named_tokens, t, m, te, s, e); - lst.push_back (new_elem); - idx = ovector[1]; - sz++; - - if (once || idx >= buffer.length ()) - break; - - } - } - - pcre_free (re); - } - - return sz; } static octave_value_list -octregexp (const octave_value_list &args, int nargout, const std::string &nm, - bool case_insensitive) +octregexp (const octave_value_list &args, int nargout, + const std::string &who, bool case_insensitive = false) { octave_value_list retval; + int nargin = args.length (); - std::list lst; - string_vector named; - int nopts; - bool once; + + // Make sure we have string, pattern + const std::string buffer = args(0).string_value (); + if (error_state) + return retval; + + const std::string pattern = args(1).string_value (); + if (error_state) + return retval; - int sz = octregexp_list (args, nm, case_insensitive, lst, named, nopts, once); + regexp::opts options; + options.case_insensitive (case_insensitive); + bool extra_options = false; + parse_options (options, args, who, 2, extra_options); + if (error_state) + return retval; + + regexp::match_data rx_lst = regexp_match (pattern, buffer, options, who); + + string_vector named_pats = rx_lst.named_patterns (); + + size_t sz = rx_lst.size (); if (! error_state) { @@ -532,47 +137,54 @@ if (sz == 1) { - for (int j = 0; j < named.length (); j++) - nmap.assign (named(j), lst.begin()->named_token (j)); + string_vector named_tokens = rx_lst.begin()->named_tokens (); + + for (int j = 0; j < named_pats.length (); j++) + nmap.assign (named_pats(j), named_tokens(j)); retval(5) = nmap; } else { - for (int j = 0; j < named.length (); j++) + for (int j = 0; j < named_pats.length (); j++) { Cell tmp (dim_vector (1, sz)); i = 0; - for (const_iterator p = lst.begin (); p != lst.end (); p++) - tmp(i++) = p->named_token (j); + for (regexp::match_data::const_iterator p = rx_lst.begin (); + p != rx_lst.end (); p++) + { + string_vector named_tokens = p->named_tokens (); - nmap.assign (named(j), octave_value (tmp)); + tmp(i++) = named_tokens(j); + } + + nmap.assign (named_pats(j), octave_value (tmp)); } retval(5) = nmap; } - std::string buffer = args(0).string_value (); - - if (once) + if (options.once ()) { - retval(4) = sz ? lst.front ().t : Cell (); - retval(3) = sz ? lst.front ().m : std::string (); - retval(2) = sz ? lst.front ().te : Matrix (); + regexp::match_data::const_iterator p = rx_lst.begin (); + + retval(4) = sz ? p->tokens () : Cell (); + retval(3) = sz ? p->match_string () : std::string (); + retval(2) = sz ? p->token_extents () : Matrix (); if (sz) { - double e = lst.front ().e; - double s = lst.front ().s; + double start = p->start (); + double end = p->end (); - Cell sp (dim_vector (1, 2)); - sp(0) = buffer.substr (0, s-1); - sp(1) = buffer.substr (e); + Cell split (dim_vector (1, 2)); + split(0) = buffer.substr (0, start-1); + split(1) = buffer.substr (end); - retval(6) = sp; - retval(1) = e; - retval(0) = s; + retval(6) = split; + retval(1) = end; + retval(0) = start; } else { @@ -583,39 +195,45 @@ } else { - Cell t (dim_vector (1, sz)); - Cell m (dim_vector (1, sz)); - Cell te (dim_vector (1, sz)); - NDArray e (dim_vector (1, sz)); - NDArray s (dim_vector (1, sz)); - Cell sp (dim_vector (1, sz+1)); + Cell tokens (dim_vector (1, sz)); + Cell match_string (dim_vector (1, sz)); + Cell token_extents (dim_vector (1, sz)); + NDArray end (dim_vector (1, sz)); + NDArray start (dim_vector (1, sz)); + Cell split (dim_vector (1, sz+1)); size_t sp_start = 0; i = 0; - for (const_iterator p = lst.begin (); p != lst.end (); p++) + for (regexp::match_data::const_iterator p = rx_lst.begin (); + p != rx_lst.end (); p++) { - t(i) = p->t; - m(i) = p->m; - te(i) = p->te; - e(i) = p->e; - s(i) = p->s; - sp(i) = buffer.substr (sp_start, p->s-sp_start-1); - sp_start = p->e; + double s = p->start (); + double e = p->end (); + + string_vector tmp = p->tokens (); + tokens(i) = Cell (dim_vector (1, tmp.length ()), tmp); + match_string(i) = p->match_string (); + token_extents(i) = p->token_extents (); + end(i) = e; + start(i) = s; + split(i) = buffer.substr (sp_start, s-sp_start-1); + sp_start = e; i++; } - sp(i) = buffer.substr (sp_start); + split(i) = buffer.substr (sp_start); - retval(6) = sp; - retval(4) = t; - retval(3) = m; - retval(2) = te; - retval(1) = e; - retval(0) = s; + retval(6) = split; + retval(4) = tokens; + retval(3) = match_string; + retval(2) = token_extents; + retval(1) = end; + retval(0) = start; } // Alter the order of the output arguments - if (nopts > 0) + + if (extra_options) { int n = 0; octave_value_list new_retval; @@ -682,7 +300,7 @@ static octave_value_list octcellregexp (const octave_value_list &args, int nargout, - const std::string &nm, bool case_insensitive) + const std::string &who, bool case_insensitive = false) { octave_value_list retval; @@ -705,7 +323,7 @@ for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); - octave_value_list tmp = octregexp (new_args, nargout, nm, + octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); if (error_state) @@ -725,7 +343,7 @@ for (octave_idx_type i = 0; i < cellpat.numel (); i++) { new_args(1) = cellpat(i); - octave_value_list tmp = octregexp (new_args, nargout, nm, + octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); if (error_state) @@ -739,7 +357,7 @@ { if (cellstr.dims () != cellpat.dims ()) - error ("%s: Inconsistent cell array dimensions", nm.c_str ()); + error ("%s: inconsistent cell array dimensions", who.c_str ()); else { for (int j = 0; j < nargout; j++) @@ -750,7 +368,7 @@ new_args(0) = cellstr(i); new_args(1) = cellpat(i); - octave_value_list tmp = octregexp (new_args, nargout, nm, + octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); if (error_state) @@ -772,7 +390,7 @@ for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); - octave_value_list tmp = octregexp (new_args, nargout, nm, + octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); if (error_state) @@ -799,7 +417,7 @@ for (octave_idx_type i = 0; i < cellpat.numel (); i++) { new_args(1) = cellpat(i); - octave_value_list tmp = octregexp (new_args, nargout, nm, + octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); if (error_state) @@ -816,7 +434,7 @@ } } else - retval = octregexp (args, nargout, nm, case_insensitive); + retval = octregexp (args, nargout, who, case_insensitive); return retval; @@ -1022,9 +640,9 @@ if (nargin < 2) print_usage (); else if (args(0).is_cell () || args(1).is_cell ()) - retval = octcellregexp (args, nargout, "regexp", false); + retval = octcellregexp (args, nargout, "regexp"); else - retval = octregexp (args, nargout, "regexp", false); + retval = octregexp (args, nargout, "regexp"); return retval; } @@ -1402,7 +1020,7 @@ static octave_value -octregexprep (const octave_value_list &args, const std::string &nm) +octregexprep (const octave_value_list &args, const std::string &who) { octave_value retval; @@ -1423,12 +1041,9 @@ // Pack options excluding 'tokenize' and various output // reordering strings into regexp arg list - octave_value_list regexpargs (nargin-1, octave_value ()); + octave_value_list regexpargs (nargin-3, octave_value ()); - regexpargs(0) = args (0); - regexpargs(1) = args (1); - - int len = 2; + int len = 0; for (int i = 3; i < nargin; i++) { const std::string opt = args(i).string_value (); @@ -1441,165 +1056,13 @@ } regexpargs.resize (len); - // Identify replacement tokens; build a vector of group numbers in - // the replacement string so that we can quickly calculate the size - // of the replacement. - int tokens = 0; - for (size_t i=1; i < replacement.size (); i++) - { - if (replacement[i-1]=='$' && isdigit (replacement[i])) - { - tokens++; - i++; - } - } - std::vector token (tokens); - - int kk = 0; - for (size_t i = 1; i < replacement.size (); i++) - { - if (replacement[i-1]=='$' && isdigit (replacement[i])) - { - token[kk++] = replacement[i]-'0'; - i++; - } - } - - // Perform replacement - std::string rep; - - if (tokens > 0) - { - std::list lst; - string_vector named; - int nopts; - bool once; - int sz = octregexp_list (regexpargs, nm , false, lst, named, nopts, once); - - if (error_state) - return retval; - if (sz == 0) - { - retval = args(0); - return retval; - } - - // Determine replacement length - const size_t replen = replacement.size () - 2*tokens; - int delta = 0; - const_iterator p = lst.begin (); - for (int i = 0; i < sz; i++) - { - OCTAVE_QUIT; - - const Matrix pairs (p->te); - size_t pairlen = 0; - for (int j = 0; j < tokens; j++) - { - if (token[j] == 0) - pairlen += static_cast (p->e - p->s) + 1; - else if (token[j] <= pairs.rows ()) - pairlen += static_cast (pairs(token[j]-1,1) - - pairs(token[j]-1,0)) + 1; - } - delta += static_cast (replen + pairlen) - - static_cast (p->e - p->s + 1); - p++; - } - - // Build replacement string - rep.reserve (buffer.size () + delta); - size_t from = 0; - p = lst.begin (); - for (int i = 0; i < sz; i++) - { - OCTAVE_QUIT; + regexp::opts options; + bool extra_args = false; + parse_options (options, regexpargs, who, 0, extra_args); + if (error_state) + return retval; - const Matrix pairs (p->te); - rep.append (&buffer[from], static_cast (p->s - 1) - from); - from = static_cast (p->e - 1) + 1; - for (size_t j = 1; j < replacement.size (); j++) - { - if (replacement[j-1]=='$' && isdigit (replacement[j])) - { - int k = replacement[j]-'0'; - if (k == 0) - { - // replace with entire match - rep.append (&buffer[static_cast (p->e - 1)], - static_cast (p->e - p->s) + 1); - } - else if (k <= pairs.rows ()) - { - // replace with group capture - rep.append (&buffer[static_cast (pairs(k-1,0)-1)], - static_cast (pairs(k-1,1) - - pairs(k-1,0)) + 1); - } - else - { - // replace with nothing - } - j++; - } - else - { - rep.append (1, replacement[j-1]); - } - if (j+1 == replacement.size ()) - { - rep.append (1, replacement[j]); - } - } - p++; - } - rep.append (&buffer[from], buffer.size () - from); - } - else - { - std::list lst; - string_vector named; - int nopts; - bool once; - int sz = octregexp_list (regexpargs, nm, false, lst, named, nopts, once); - - if (error_state) - return retval; - if (sz == 0) - { - retval = args (0); - return retval; - } - - // Determine replacement length - const size_t replen = replacement.size (); - int delta = 0; - const_iterator p = lst.begin (); - for (int i = 0; i < sz; i++) - { - OCTAVE_QUIT; - delta += static_cast (replen) - - static_cast (p->e - p->s + 1); - p++; - } - - // Build replacement string - rep.reserve (buffer.size () + delta); - size_t from = 0; - p = lst.begin (); - for (int i = 0; i < sz; i++) - { - OCTAVE_QUIT; - rep.append (&buffer[from], static_cast (p->s - 1) - from); - from = static_cast (p->e - 1) + 1; - rep.append (replacement); - p++; - } - rep.append (&buffer[from], buffer.size () - from); - } - - retval = rep; - return retval; + return regexp_replace (pattern, buffer, replacement, options, who); } DEFUN_DLD (regexprep, args, , @@ -1672,7 +1135,7 @@ { dv1 = pat.dims (); if (rep.numel () != 1 && dv1 != rep.dims ()) - error ("regexprep: Inconsistent cell array dimensions"); + error ("regexprep: inconsistent cell array dimensions"); } else if (rep.numel () != 1) dv1 = rep.dims (); diff -r d51b321b5fef -r fc9f204faea0 src/symtab.h --- a/src/symtab.h Sun Dec 11 18:28:35 2011 -0500 +++ b/src/symtab.h Sun Dec 11 22:19:57 2011 -0500 @@ -31,7 +31,7 @@ #include #include "glob-match.h" -#include "regex-match.h" +#include "regexp.h" class tree_argument_list; class octave_user_function; @@ -1684,7 +1684,7 @@ { std::list retval; - regex_match pat (pattern); + ::regexp pat (pattern); for (global_table_const_iterator p = global_table.begin (); p != global_table.end (); p++) @@ -1693,7 +1693,7 @@ // the results from regexp_variables and regexp_global_variables // may be handled the same way. - if (pat.match (p->first)) + if (pat.is_match (p->first)) retval.push_back (symbol_record (p->first, p->second, symbol_record::global)); } @@ -2315,7 +2315,7 @@ void do_clear_variable_regexp (const std::string& pat) { - regex_match pattern (pat); + ::regexp pattern (pat); for (table_iterator p = table.begin (); p != table.end (); p++) { @@ -2323,7 +2323,7 @@ if (sr.is_defined () || sr.is_global ()) { - if (pattern.match (sr.name ())) + if (pattern.is_match (sr.name ())) sr.clear (); } } @@ -2390,11 +2390,11 @@ { std::list retval; - regex_match pat (pattern); + ::regexp pat (pattern); for (table_const_iterator p = table.begin (); p != table.end (); p++) { - if (pat.match (p->first)) + if (pat.is_match (p->first)) { const symbol_record& sr = p->second; diff -r d51b321b5fef -r fc9f204faea0 src/variables.cc --- a/src/variables.cc Sun Dec 11 18:28:35 2011 -0500 +++ b/src/variables.cc Sun Dec 11 22:19:57 2011 -0500 @@ -36,7 +36,7 @@ #include "oct-env.h" #include "file-ops.h" #include "glob-match.h" -#include "regex-match.h" +#include "regexp.h" #include "str-vec.h" #include @@ -2050,9 +2050,7 @@ { if (have_regexp) { - regex_match pattern (patstr); - - if (pattern.match (nm)) + if (is_regexp_match (patstr, nm)) { retval = true; break;