# HG changeset patch # User Rik # Date 1373413677 25200 # Node ID 99122191d3dde944b90afe6de921535ef58e2573 # Parent 06897f865f0b56fa7a14242e36e0d821dd9b399c maint: Rename regexp.h to lo-regexp.h, regexp.cc to lo-regexp.cc in liboctave. Remove namespace hit with regexp.cc in libinterp/corefcn. * liboctave/util/lo-regexp.cc: Renamed from regexp.cc. * liboctave/util/lo-regexp.h: Renamed from regexp.h. * liboctave/util/module.mk: Add files to build system. * libinterp/corefcn/regexp.cc, libinterp/corefcn/symtab.h, libinterp/corefcn/variables.cc: Update #include stmts to new name. diff -r 06897f865f0b -r 99122191d3dd libinterp/corefcn/regexp.cc --- a/libinterp/corefcn/regexp.cc Tue Jul 09 16:08:16 2013 -0700 +++ b/libinterp/corefcn/regexp.cc Tue Jul 09 16:47:57 2013 -0700 @@ -33,7 +33,7 @@ #include "base-list.h" #include "oct-locbuf.h" #include "quit.h" -#include "regexp.h" +#include "lo-regexp.h" #include "str-vec.h" #include "defun.h" diff -r 06897f865f0b -r 99122191d3dd libinterp/corefcn/symtab.h --- a/libinterp/corefcn/symtab.h Tue Jul 09 16:08:16 2013 -0700 +++ b/libinterp/corefcn/symtab.h Tue Jul 09 16:47:57 2013 -0700 @@ -31,7 +31,7 @@ #include #include "glob-match.h" -#include "regexp.h" +#include "lo-regexp.h" class tree_argument_list; class octave_user_function; diff -r 06897f865f0b -r 99122191d3dd libinterp/corefcn/variables.cc --- a/libinterp/corefcn/variables.cc Tue Jul 09 16:08:16 2013 -0700 +++ b/libinterp/corefcn/variables.cc Tue Jul 09 16:47:57 2013 -0700 @@ -36,7 +36,7 @@ #include "oct-env.h" #include "file-ops.h" #include "glob-match.h" -#include "regexp.h" +#include "lo-regexp.h" #include "str-vec.h" #include diff -r 06897f865f0b -r 99122191d3dd liboctave/util/lo-regexp.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/liboctave/util/lo-regexp.cc Tue Jul 09 16:47:57 2013 -0700 @@ -0,0 +1,620 @@ +/* + +Copyright (C) 2012 John W. Eaton +Copyright (C) 2005-2012 David Bateman +Copyright (C) 2002-2005 Paul Kienzle + +This file is part of Octave. + +Octave is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +Octave is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Octave; see the file COPYING. If not, see +. + +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include + +#if defined (HAVE_PCRE_H) +#include +#elif defined (HAVE_PCRE_PCRE_H) +#include +#endif + +#include "Matrix.h" +#include "base-list.h" +#include "lo-error.h" +#include "oct-locbuf.h" +#include "quit.h" +#include "lo-regexp.h" +#include "str-vec.h" + +// Define the maximum number of retries for a pattern that possibly +// results in an infinite recursion. +#define PCRE_MATCHLIMIT_MAX 10 + +// FIXME -- should this be configurable? +#define MAXLOOKBEHIND 10 + +static bool lookbehind_warned = false; + +// FIXME -- don't bother collecting and composing return values the user +// doesn't want. + +void +regexp::free (void) +{ + if (data) + pcre_free (static_cast (data)); +} + +void +regexp::compile_internal (void) +{ + // If we had a previously compiled pattern, release it. + free (); + + size_t max_length = MAXLOOKBEHIND; + + size_t pos = 0; + size_t new_pos; + int inames = 0; + std::ostringstream buf; + + while ((new_pos = pattern.find ("(?", pos)) != std::string::npos) + { + if (pattern.at (new_pos + 2) == '<' + && !(pattern.at (new_pos + 3) == '=' + || pattern.at (new_pos + 3) == '!')) + { + // The syntax of named tokens in pcre is "(?P...)" while + // we need a syntax "(?...)", so fix that here. Also an + // expression like + // "(?\w+)\s+(?\w+)|(?\w+),\s+(?\w+)" + // should be perfectly legal, while pcre does not allow the same + // named token name on both sides of the alternative. Also fix + // that here by replacing name tokens by dummy names, and dealing + // with the dummy names later. + + size_t tmp_pos = pattern.find_first_of ('>', new_pos); + + if (tmp_pos == std::string::npos) + { + (*current_liboctave_error_handler) + ("regexp: syntax error in pattern"); + return; + } + + std::string tmp_name = + pattern.substr (new_pos+3, tmp_pos-new_pos-3); + + bool found = false; + + for (int i = 0; i < nnames; i++) + { + if (named_pats(i) == tmp_name) + { + named_idx.resize (dim_vector (inames+1, 1)); + named_idx(inames) = i; + found = true; + break; + } + } + + if (! found) + { + named_idx.resize (dim_vector (inames+1, 1)); + named_idx(inames) = nnames; + named_pats.append (tmp_name); + nnames++; + } + + if (new_pos - pos > 0) + buf << pattern.substr (pos, new_pos-pos); + if (inames < 10) + buf << "(?P 0) + { + char ch = pattern.at (tmp_pos1); + + if (ch == '(') + brackets++; + else if (ch == ')') + { + if (brackets > 1) + tmp_pos2 = tmp_pos1; + + brackets--; + } + + tmp_pos1++; + } + + if (brackets != 0) + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + else + { + size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); + + if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) + { + if (!lookbehind_warned) + { + lookbehind_warned = true; + (*current_liboctave_warning_handler) + ("%s: arbitrary length lookbehind patterns are only supported up to length %d", + who.c_str (), MAXLOOKBEHIND); + } + + buf << pattern.substr (pos, new_pos - pos) << "("; + + size_t i; + + if (pattern.at (tmp_pos3) == '*') + i = 0; + else + i = 1; + + for (; i < max_length + 1; i++) + { + buf << pattern.substr (new_pos, tmp_pos3 - new_pos) + << "{" << i << "}"; + buf << pattern.substr (tmp_pos3 + 1, + tmp_pos1 - tmp_pos3 - 1); + if (i != max_length) + buf << "|"; + } + buf << ")"; + } + else + buf << pattern.substr (pos, tmp_pos1 - pos); + + pos = tmp_pos1; + } + } + else + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + + } + + buf << pattern.substr (pos); + + const char *err; + int erroffset; + std::string buf_str = buf.str (); + + int pcre_options + = ((options.case_insensitive () ? PCRE_CASELESS : 0) + | (options.dotexceptnewline () ? 0 : PCRE_DOTALL) + | (options.lineanchors () ? PCRE_MULTILINE : 0) + | (options.freespacing () ? PCRE_EXTENDED : 0)); + + data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0); + + if (! data) + (*current_liboctave_error_handler) + ("%s: %s at position %d of expression", who.c_str (), + err, erroffset); +} + +regexp::match_data +regexp::match (const std::string& buffer) +{ + regexp::match_data retval; + + std::list lst; + + int subpatterns; + int namecount; + int nameentrysize; + char *nametable; + size_t idx = 0; + + pcre *re = static_cast (data); + + pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); + pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount); + pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); + pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable); + + OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); + OCTAVE_LOCAL_BUFFER (int, nidx, namecount); + + for (int i = 0; i < namecount; i++) + { + // Index of subpattern in first two bytes MSB first of name. + // Extract index. + nidx[i] = (static_cast (nametable[i*nameentrysize])) << 8 + | static_cast (nametable[i*nameentrysize+1]); + } + + while (true) + { + OCTAVE_QUIT; + + int matches = pcre_exec (re, 0, buffer.c_str (), + buffer.length (), idx, + (idx ? PCRE_NOTBOL : 0), + ovector, (subpatterns+1)*3); + + if (matches == PCRE_ERROR_MATCHLIMIT) + { + // Try harder; start with default value for MATCH_LIMIT + // and increase it. + (*current_liboctave_warning_handler) + ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow"); + + pcre_extra pe; + + pcre_config (PCRE_CONFIG_MATCH_LIMIT, + static_cast (&pe.match_limit)); + + pe.flags = PCRE_EXTRA_MATCH_LIMIT; + + int i = 0; + while (matches == PCRE_ERROR_MATCHLIMIT + && i++ < PCRE_MATCHLIMIT_MAX) + { + OCTAVE_QUIT; + + pe.match_limit *= 10; + matches = pcre_exec (re, &pe, buffer.c_str (), + buffer.length (), idx, + (idx ? PCRE_NOTBOL : 0), + ovector, (subpatterns+1)*3); + } + } + + if (matches < 0 && matches != PCRE_ERROR_NOMATCH) + { + (*current_liboctave_error_handler) + ("%s: internal error calling pcre_exec; error code from pcre_exec is %i", + who.c_str (), matches); + return retval; + } + else if (matches == PCRE_ERROR_NOMATCH) + break; + else if (ovector[1] <= ovector[0] && ! options.emptymatch ()) + { + // Zero length match. Skip to next char. + idx = ovector[0] + 1; + if (idx < buffer.length ()) + continue; + else + break; + } + else + { + int pos_match = 0; + Matrix token_extents (matches-1, 2); + + for (int i = 1; i < matches; i++) + { + if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 + && (i == 1 || ovector[2*i] != ovector[2*i-2] + || ovector[2*i-1] != ovector[2*i+1])) + { + token_extents(pos_match,0) = double (ovector[2*i]+1); + token_extents(pos_match++,1) = double (ovector[2*i+1]); + } + } + + token_extents.resize (pos_match, 2); + + double start = double (ovector[0]+1); + double end = double (ovector[1]); + + const char **listptr; + int status = pcre_get_substring_list (buffer.c_str (), ovector, + matches, &listptr); + + if (status == PCRE_ERROR_NOMEMORY) + { + (*current_liboctave_error_handler) + ("%s: cannot allocate memory in pcre_get_substring_list", + who.c_str ()); + return retval; + } + + string_vector tokens (pos_match); + string_vector named_tokens (nnames); + int pos_offset = 0; + pos_match = 0; + + for (int i = 1; i < matches; i++) + { + if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) + { + if (i == 1 || ovector[2*i] != ovector[2*i-2] + || ovector[2*i-1] != ovector[2*i+1]) + { + if (namecount > 0) + { + // FIXME: Should probably do this with a map() + // rather than a linear search. However, + // the number of captured, named expressions + // is usually pretty small (< 4) + for (int j = 0; j < namecount; j++) + { + if (nidx[j] == i) + { + named_tokens(named_idx(j)) = + std::string (*(listptr+i-pos_offset)); + break; + } + } + } + + tokens(pos_match++) = std::string (*(listptr+i)); + } + else + pos_offset++; + } + } + + std::string match_string = std::string (*listptr); + + pcre_free_substring_list (listptr); + + regexp::match_element new_elem (named_tokens, tokens, match_string, + token_extents, start, end); + lst.push_back (new_elem); + + if (ovector[1] <= ovector[0]) + { + // Zero length match. Skip to next char. + idx = ovector[0] + 1; + if (idx <= buffer.length ()) + continue; + } + else + idx = ovector[1]; + + if (options.once () || idx >= buffer.length ()) + break; + } + } + + retval = regexp::match_data (lst, named_pats); + + return retval; +} + +bool +regexp::is_match (const std::string& buffer) +{ + regexp::match_data rx_lst = match (buffer); + + return rx_lst.size () > 0; +} + +Array +regexp::is_match (const string_vector& buffer) +{ + octave_idx_type len = buffer.length (); + + Array retval (dim_vector (len, 1)); + + for (octave_idx_type i = 0; i < buffer.length (); i++) + retval(i) = is_match (buffer(i)); + + return retval; +} + +// Declare rep_token_t used in processing replacement string +typedef struct + { + size_t pos; + int num; + } rep_token_t; + + +std::string +regexp::replace (const std::string& buffer, const std::string& replacement) +{ + std::string retval; + + regexp::match_data rx_lst = match (buffer); + + size_t num_matches = rx_lst.size (); + + if (num_matches == 0) + { + retval = buffer; + return retval; + } + + // Identify replacement tokens; build a vector of group numbers in + // the replacement string so that we can quickly calculate the size + // of the replacement. + + // FIXME: All code assumes that only 10 tokens ($0-$9) exist. + // $11 represents $1 followed by the character '1' rather than + // the eleventh capture buffer. + + std::string repstr = replacement; + std::vector tokens; + tokens.reserve (5); // Reserve memory for 5 pattern replacements + + for (size_t i=0; i < repstr.size (); i++) + { + if (repstr[i] == '\\') + { + if (i < repstr.size () - 1 && repstr[i+1] == '$') + { + repstr.erase (i,1); // erase backslash + i++; // skip over '$' + continue; + } + if (i < repstr.size () - 1 && repstr[i+1] == '\\') + { + repstr.erase (i,1); // erase 1st backslash + continue; + } + } + else if (repstr[i] == '$') + { + if (i < repstr.size () - 1 && isdigit (repstr[i+1])) + { + rep_token_t tmp_token; + + tmp_token.pos = i; + tmp_token.num = repstr[i+1]-'0'; + tokens.push_back (tmp_token); + } + } + } + + std::string rep; + int num_tokens = tokens.size (); + + if (num_tokens > 0) + { + // Determine replacement length + const size_t replen = repstr.size () - 2*num_tokens; + int delta = 0; + regexp::match_data::const_iterator p = rx_lst.begin (); + for (size_t i = 0; i < num_matches; i++) + { + OCTAVE_QUIT; + + double start = p->start (); + double end = p->end (); + + const Matrix pairs (p->token_extents ()); + size_t pairlen = 0; + for (int j = 0; j < num_tokens; j++) + { + if (tokens[j].num == 0) + pairlen += static_cast (end - start) + 1; + else if (tokens[j].num <= pairs.rows ()) + pairlen += static_cast (pairs(tokens[j].num-1,1) + - pairs(tokens[j].num-1,0)) + 1; + } + delta += (static_cast (replen + pairlen) + - static_cast (end - start + 1)); + p++; + } + + // Build replacement string + rep.reserve (buffer.size () + delta); + size_t from = 0; + p = rx_lst.begin (); + for (size_t i = 0; i < num_matches; i++) + { + OCTAVE_QUIT; + + double start = p->start (); + double end = p->end (); + + const Matrix pairs (p->token_extents ()); + rep.append (&buffer[from], static_cast (start - 1) - from); + from = static_cast (end - 1) + 1; + + size_t cur_pos = 0; + + for (int j = 0; j < num_tokens; j++) + { + rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos); + cur_pos = tokens[j].pos+2; + + int k = tokens[j].num; + if (k == 0) + { + // replace with entire match + rep.append (&buffer[static_cast (end - 1)], + static_cast (end - start) + 1); + } + else if (k <= pairs.rows ()) + { + // replace with group capture + rep.append (&buffer[static_cast (pairs(k-1,0)-1)], + static_cast (pairs(k-1,1) + - pairs(k-1,0)) + 1); + } + else + { + // replace with nothing + } + } + if (cur_pos < repstr.size ()) + rep.append (&repstr[cur_pos], repstr.size () - cur_pos); + + p++; + } + rep.append (&buffer[from], buffer.size () - from); + } + else + { + // Determine repstr length + const size_t replen = repstr.size (); + int delta = 0; + regexp::match_data::const_iterator p = rx_lst.begin (); + for (size_t i = 0; i < num_matches; i++) + { + OCTAVE_QUIT; + delta += static_cast (replen) + - static_cast (p->end () - p->start () + 1); + p++; + } + + // Build replacement string + rep.reserve (buffer.size () + delta); + size_t from = 0; + p = rx_lst.begin (); + for (size_t i = 0; i < num_matches; i++) + { + OCTAVE_QUIT; + rep.append (&buffer[from], + static_cast (p->start () - 1) - from); + from = static_cast (p->end () - 1) + 1; + rep.append (repstr); + p++; + } + rep.append (&buffer[from], buffer.size () - from); + } + + retval = rep; + return retval; +} diff -r 06897f865f0b -r 99122191d3dd liboctave/util/lo-regexp.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/liboctave/util/lo-regexp.h Tue Jul 09 16:47:57 2013 -0700 @@ -0,0 +1,289 @@ +/* + +Copyright (C) 2012 John W. Eaton +Copyright (C) 2005-2012 David Bateman + +This file is part of Octave. + +Octave is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3 of the License, or (at your +option) any later version. + +Octave is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Octave; see the file COPYING. If not, see +. + +*/ + +#if !defined (octave_liboctave_regexp_match_h) +#define octave_liboctave_regexp_match_h 1 + +#include +#include +#include + +#include "Array.h" +#include "Matrix.h" +#include "base-list.h" +#include "str-vec.h" + +class +OCTAVE_API +regexp +{ +public: + + class opts; + class match_data; + + regexp (const std::string& pat = "", + const regexp::opts& opt = regexp::opts (), + const std::string& w = "regexp") + : pattern (pat), options (opt), data (0), named_pats (), + nnames (0), named_idx (), who (w) + { + compile_internal (); + } + + regexp (const regexp& rx) + : pattern (rx.pattern), data (rx.data), named_pats (rx.named_pats), + nnames (rx.nnames), named_idx (rx.named_idx) + { } + + regexp& operator = (const regexp& rx) + { + if (this != &rx) + { + pattern = rx.pattern; + data = rx.data; + named_pats = rx.named_pats; + nnames = rx.nnames; + named_idx = rx.named_idx; + } + + return *this; + } + + ~regexp (void) { free (); } + + void compile (const std::string& pat, + const regexp::opts& opt = regexp::opts ()) + { + pattern = pat; + options = opt; + compile_internal (); + } + + match_data match (const std::string& buffer); + + bool is_match (const std::string& buffer); + + Array is_match (const string_vector& buffer); + + std::string replace (const std::string& buffer, + const std::string& replacement); + + class opts + { + public: + + opts (void) + : x_case_insensitive (false), x_dotexceptnewline (false), + x_emptymatch (false), x_freespacing (false), x_lineanchors (false), + x_once (false) { } + + opts (const opts& o) + : x_case_insensitive (o.x_case_insensitive), + x_dotexceptnewline (o.x_dotexceptnewline), + x_emptymatch (o.x_emptymatch), + x_freespacing (o.x_freespacing), + x_lineanchors (o.x_lineanchors), + x_once (o.x_once) + { } + + opts& operator = (const opts& o) + { + if (this != &o) + { + x_case_insensitive = o.x_case_insensitive; + x_dotexceptnewline = o.x_dotexceptnewline; + x_emptymatch = o.x_emptymatch; + x_freespacing = o.x_freespacing; + x_lineanchors = o.x_lineanchors; + x_once = o.x_once; + } + + return *this; + } + + ~opts (void) { } + + void case_insensitive (bool val) { x_case_insensitive = val; } + void dotexceptnewline (bool val) { x_dotexceptnewline = val; } + void emptymatch (bool val) { x_emptymatch = val; } + void freespacing (bool val) { x_freespacing = val; } + void lineanchors (bool val) { x_lineanchors = val; } + void once (bool val) { x_once = val; } + + bool case_insensitive (void) const { return x_case_insensitive; } + bool dotexceptnewline (void) const { return x_dotexceptnewline; } + bool emptymatch (void) const { return x_emptymatch; } + bool freespacing (void) const { return x_freespacing; } + bool lineanchors (void) const { return x_lineanchors; } + bool once (void) const { return x_once; } + + private: + + bool x_case_insensitive; + bool x_dotexceptnewline; + bool x_emptymatch; + bool x_freespacing; + bool x_lineanchors; + bool x_once; + }; + + class match_element + { + public: + + match_element (const string_vector& nt, const string_vector& t, + const std::string& ms, const Matrix& te, + double s, double e) + : x_match_string (ms), x_named_tokens (nt), x_tokens (t), + x_token_extents (te), x_start (s), x_end (e) + { } + + match_element (const match_element &a) + : x_match_string (a.x_match_string), + x_named_tokens (a.x_named_tokens), x_tokens (a.x_tokens), + x_token_extents (a.x_token_extents), + x_start (a.x_start), x_end (a.x_end) + { } + + std::string match_string (void) const { return x_match_string; } + string_vector named_tokens (void) const { return x_named_tokens; } + string_vector tokens (void) const { return x_tokens; } + Matrix token_extents (void) const { return x_token_extents; } + double start (void) const { return x_start; } + double end (void) const { return x_end; } + + private: + + std::string x_match_string; + string_vector x_named_tokens; + string_vector x_tokens; + Matrix x_token_extents; + double x_start; + double x_end; + }; + + class match_data : public octave_base_list + { + public: + + match_data (void) + : octave_base_list (), named_pats () + { } + + match_data (const std::list& l, const string_vector& np) + : octave_base_list (l), named_pats (np) + { } + + match_data (const match_data& rx_lst) + : octave_base_list (rx_lst), + named_pats (rx_lst.named_pats) + { } + + match_data& operator = (const match_data& rx_lst) + { + if (this != &rx_lst) + { + octave_base_list::operator = (rx_lst); + named_pats = rx_lst.named_pats; + } + + return *this; + } + + ~match_data (void) { } + + string_vector named_patterns (void) { return named_pats; } + + private: + + string_vector named_pats; + }; + +private: + + // The pattern we've been asked to match. + std::string pattern; + + opts options; + + // Internal data describing the regular expression. + void *data; + + std::string m; + string_vector named_pats; + int nnames; + Array named_idx; + std::string who; + + void free (void); + + void compile_internal (void); +}; + +inline regexp::match_data +regexp_match (const std::string& pat, + const std::string& buffer, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.match (buffer); +} + +inline bool +is_regexp_match (const std::string& pat, + const std::string& buffer, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.is_match (buffer); +} + +inline Array +is_regexp_match (const std::string& pat, + const string_vector& buffer, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.is_match (buffer); +} + +inline std::string +regexp_replace (const std::string& pat, + const std::string& buffer, + const std::string& replacement, + const regexp::opts& opt = regexp::opts (), + const std::string& who = "regexp") +{ + regexp rx (pat, opt, who); + + return rx.replace (buffer, replacement); +} + +#endif diff -r 06897f865f0b -r 99122191d3dd liboctave/util/module.mk --- a/liboctave/util/module.mk Tue Jul 09 16:08:16 2013 -0700 +++ b/liboctave/util/module.mk Tue Jul 09 16:47:57 2013 -0700 @@ -34,7 +34,7 @@ util/oct-sort.h \ util/oct-sparse.h \ util/pathsearch.h \ - util/regexp.h \ + util/lo-regexp.h \ util/singleton-cleanup.h \ util/sparse-sort.h \ util/sparse-util.h \ @@ -66,7 +66,7 @@ util/oct-mutex.cc \ util/oct-shlib.cc \ util/pathsearch.cc \ - util/regexp.cc \ + util/lo-regexp.cc \ util/singleton-cleanup.cc \ util/sparse-sort.cc \ util/sparse-util.cc \ diff -r 06897f865f0b -r 99122191d3dd liboctave/util/regexp.cc --- a/liboctave/util/regexp.cc Tue Jul 09 16:08:16 2013 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,620 +0,0 @@ -/* - -Copyright (C) 2012 John W. Eaton -Copyright (C) 2005-2012 David Bateman -Copyright (C) 2002-2005 Paul Kienzle - -This file is part of Octave. - -Octave is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -Octave is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with Octave; see the file COPYING. If not, see -. - -*/ - -#ifdef HAVE_CONFIG_H -#include -#endif - -#include -#include -#include -#include - -#if defined (HAVE_PCRE_H) -#include -#elif defined (HAVE_PCRE_PCRE_H) -#include -#endif - -#include "Matrix.h" -#include "base-list.h" -#include "lo-error.h" -#include "oct-locbuf.h" -#include "quit.h" -#include "regexp.h" -#include "str-vec.h" - -// Define the maximum number of retries for a pattern that possibly -// results in an infinite recursion. -#define PCRE_MATCHLIMIT_MAX 10 - -// FIXME -- should this be configurable? -#define MAXLOOKBEHIND 10 - -static bool lookbehind_warned = false; - -// FIXME -- don't bother collecting and composing return values the user -// doesn't want. - -void -regexp::free (void) -{ - if (data) - pcre_free (static_cast (data)); -} - -void -regexp::compile_internal (void) -{ - // If we had a previously compiled pattern, release it. - free (); - - size_t max_length = MAXLOOKBEHIND; - - size_t pos = 0; - size_t new_pos; - int inames = 0; - std::ostringstream buf; - - while ((new_pos = pattern.find ("(?", pos)) != std::string::npos) - { - if (pattern.at (new_pos + 2) == '<' - && !(pattern.at (new_pos + 3) == '=' - || pattern.at (new_pos + 3) == '!')) - { - // The syntax of named tokens in pcre is "(?P...)" while - // we need a syntax "(?...)", so fix that here. Also an - // expression like - // "(?\w+)\s+(?\w+)|(?\w+),\s+(?\w+)" - // should be perfectly legal, while pcre does not allow the same - // named token name on both sides of the alternative. Also fix - // that here by replacing name tokens by dummy names, and dealing - // with the dummy names later. - - size_t tmp_pos = pattern.find_first_of ('>', new_pos); - - if (tmp_pos == std::string::npos) - { - (*current_liboctave_error_handler) - ("regexp: syntax error in pattern"); - return; - } - - std::string tmp_name = - pattern.substr (new_pos+3, tmp_pos-new_pos-3); - - bool found = false; - - for (int i = 0; i < nnames; i++) - { - if (named_pats(i) == tmp_name) - { - named_idx.resize (dim_vector (inames+1, 1)); - named_idx(inames) = i; - found = true; - break; - } - } - - if (! found) - { - named_idx.resize (dim_vector (inames+1, 1)); - named_idx(inames) = nnames; - named_pats.append (tmp_name); - nnames++; - } - - if (new_pos - pos > 0) - buf << pattern.substr (pos, new_pos-pos); - if (inames < 10) - buf << "(?P 0) - { - char ch = pattern.at (tmp_pos1); - - if (ch == '(') - brackets++; - else if (ch == ')') - { - if (brackets > 1) - tmp_pos2 = tmp_pos1; - - brackets--; - } - - tmp_pos1++; - } - - if (brackets != 0) - { - buf << pattern.substr (pos, new_pos - pos) << "(?"; - pos = new_pos + 2; - } - else - { - size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); - - if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) - { - if (!lookbehind_warned) - { - lookbehind_warned = true; - (*current_liboctave_warning_handler) - ("%s: arbitrary length lookbehind patterns are only supported up to length %d", - who.c_str (), MAXLOOKBEHIND); - } - - buf << pattern.substr (pos, new_pos - pos) << "("; - - size_t i; - - if (pattern.at (tmp_pos3) == '*') - i = 0; - else - i = 1; - - for (; i < max_length + 1; i++) - { - buf << pattern.substr (new_pos, tmp_pos3 - new_pos) - << "{" << i << "}"; - buf << pattern.substr (tmp_pos3 + 1, - tmp_pos1 - tmp_pos3 - 1); - if (i != max_length) - buf << "|"; - } - buf << ")"; - } - else - buf << pattern.substr (pos, tmp_pos1 - pos); - - pos = tmp_pos1; - } - } - else - { - buf << pattern.substr (pos, new_pos - pos) << "(?"; - pos = new_pos + 2; - } - - } - - buf << pattern.substr (pos); - - const char *err; - int erroffset; - std::string buf_str = buf.str (); - - int pcre_options - = ((options.case_insensitive () ? PCRE_CASELESS : 0) - | (options.dotexceptnewline () ? 0 : PCRE_DOTALL) - | (options.lineanchors () ? PCRE_MULTILINE : 0) - | (options.freespacing () ? PCRE_EXTENDED : 0)); - - data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0); - - if (! data) - (*current_liboctave_error_handler) - ("%s: %s at position %d of expression", who.c_str (), - err, erroffset); -} - -regexp::match_data -regexp::match (const std::string& buffer) -{ - regexp::match_data retval; - - std::list lst; - - int subpatterns; - int namecount; - int nameentrysize; - char *nametable; - size_t idx = 0; - - pcre *re = static_cast (data); - - pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); - pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount); - pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); - pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable); - - OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); - OCTAVE_LOCAL_BUFFER (int, nidx, namecount); - - for (int i = 0; i < namecount; i++) - { - // Index of subpattern in first two bytes MSB first of name. - // Extract index. - nidx[i] = (static_cast (nametable[i*nameentrysize])) << 8 - | static_cast (nametable[i*nameentrysize+1]); - } - - while (true) - { - OCTAVE_QUIT; - - int matches = pcre_exec (re, 0, buffer.c_str (), - buffer.length (), idx, - (idx ? PCRE_NOTBOL : 0), - ovector, (subpatterns+1)*3); - - if (matches == PCRE_ERROR_MATCHLIMIT) - { - // Try harder; start with default value for MATCH_LIMIT - // and increase it. - (*current_liboctave_warning_handler) - ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow"); - - pcre_extra pe; - - pcre_config (PCRE_CONFIG_MATCH_LIMIT, - static_cast (&pe.match_limit)); - - pe.flags = PCRE_EXTRA_MATCH_LIMIT; - - int i = 0; - while (matches == PCRE_ERROR_MATCHLIMIT - && i++ < PCRE_MATCHLIMIT_MAX) - { - OCTAVE_QUIT; - - pe.match_limit *= 10; - matches = pcre_exec (re, &pe, buffer.c_str (), - buffer.length (), idx, - (idx ? PCRE_NOTBOL : 0), - ovector, (subpatterns+1)*3); - } - } - - if (matches < 0 && matches != PCRE_ERROR_NOMATCH) - { - (*current_liboctave_error_handler) - ("%s: internal error calling pcre_exec; error code from pcre_exec is %i", - who.c_str (), matches); - return retval; - } - else if (matches == PCRE_ERROR_NOMATCH) - break; - else if (ovector[1] <= ovector[0] && ! options.emptymatch ()) - { - // Zero length match. Skip to next char. - idx = ovector[0] + 1; - if (idx < buffer.length ()) - continue; - else - break; - } - else - { - int pos_match = 0; - Matrix token_extents (matches-1, 2); - - for (int i = 1; i < matches; i++) - { - if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 - && (i == 1 || ovector[2*i] != ovector[2*i-2] - || ovector[2*i-1] != ovector[2*i+1])) - { - token_extents(pos_match,0) = double (ovector[2*i]+1); - token_extents(pos_match++,1) = double (ovector[2*i+1]); - } - } - - token_extents.resize (pos_match, 2); - - double start = double (ovector[0]+1); - double end = double (ovector[1]); - - const char **listptr; - int status = pcre_get_substring_list (buffer.c_str (), ovector, - matches, &listptr); - - if (status == PCRE_ERROR_NOMEMORY) - { - (*current_liboctave_error_handler) - ("%s: cannot allocate memory in pcre_get_substring_list", - who.c_str ()); - return retval; - } - - string_vector tokens (pos_match); - string_vector named_tokens (nnames); - int pos_offset = 0; - pos_match = 0; - - for (int i = 1; i < matches; i++) - { - if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) - { - if (i == 1 || ovector[2*i] != ovector[2*i-2] - || ovector[2*i-1] != ovector[2*i+1]) - { - if (namecount > 0) - { - // FIXME: Should probably do this with a map() - // rather than a linear search. However, - // the number of captured, named expressions - // is usually pretty small (< 4) - for (int j = 0; j < namecount; j++) - { - if (nidx[j] == i) - { - named_tokens(named_idx(j)) = - std::string (*(listptr+i-pos_offset)); - break; - } - } - } - - tokens(pos_match++) = std::string (*(listptr+i)); - } - else - pos_offset++; - } - } - - std::string match_string = std::string (*listptr); - - pcre_free_substring_list (listptr); - - regexp::match_element new_elem (named_tokens, tokens, match_string, - token_extents, start, end); - lst.push_back (new_elem); - - if (ovector[1] <= ovector[0]) - { - // Zero length match. Skip to next char. - idx = ovector[0] + 1; - if (idx <= buffer.length ()) - continue; - } - else - idx = ovector[1]; - - if (options.once () || idx >= buffer.length ()) - break; - } - } - - retval = regexp::match_data (lst, named_pats); - - return retval; -} - -bool -regexp::is_match (const std::string& buffer) -{ - regexp::match_data rx_lst = match (buffer); - - return rx_lst.size () > 0; -} - -Array -regexp::is_match (const string_vector& buffer) -{ - octave_idx_type len = buffer.length (); - - Array retval (dim_vector (len, 1)); - - for (octave_idx_type i = 0; i < buffer.length (); i++) - retval(i) = is_match (buffer(i)); - - return retval; -} - -// Declare rep_token_t used in processing replacement string -typedef struct - { - size_t pos; - int num; - } rep_token_t; - - -std::string -regexp::replace (const std::string& buffer, const std::string& replacement) -{ - std::string retval; - - regexp::match_data rx_lst = match (buffer); - - size_t num_matches = rx_lst.size (); - - if (num_matches == 0) - { - retval = buffer; - return retval; - } - - // Identify replacement tokens; build a vector of group numbers in - // the replacement string so that we can quickly calculate the size - // of the replacement. - - // FIXME: All code assumes that only 10 tokens ($0-$9) exist. - // $11 represents $1 followed by the character '1' rather than - // the eleventh capture buffer. - - std::string repstr = replacement; - std::vector tokens; - tokens.reserve (5); // Reserve memory for 5 pattern replacements - - for (size_t i=0; i < repstr.size (); i++) - { - if (repstr[i] == '\\') - { - if (i < repstr.size () - 1 && repstr[i+1] == '$') - { - repstr.erase (i,1); // erase backslash - i++; // skip over '$' - continue; - } - if (i < repstr.size () - 1 && repstr[i+1] == '\\') - { - repstr.erase (i,1); // erase 1st backslash - continue; - } - } - else if (repstr[i] == '$') - { - if (i < repstr.size () - 1 && isdigit (repstr[i+1])) - { - rep_token_t tmp_token; - - tmp_token.pos = i; - tmp_token.num = repstr[i+1]-'0'; - tokens.push_back (tmp_token); - } - } - } - - std::string rep; - int num_tokens = tokens.size (); - - if (num_tokens > 0) - { - // Determine replacement length - const size_t replen = repstr.size () - 2*num_tokens; - int delta = 0; - regexp::match_data::const_iterator p = rx_lst.begin (); - for (size_t i = 0; i < num_matches; i++) - { - OCTAVE_QUIT; - - double start = p->start (); - double end = p->end (); - - const Matrix pairs (p->token_extents ()); - size_t pairlen = 0; - for (int j = 0; j < num_tokens; j++) - { - if (tokens[j].num == 0) - pairlen += static_cast (end - start) + 1; - else if (tokens[j].num <= pairs.rows ()) - pairlen += static_cast (pairs(tokens[j].num-1,1) - - pairs(tokens[j].num-1,0)) + 1; - } - delta += (static_cast (replen + pairlen) - - static_cast (end - start + 1)); - p++; - } - - // Build replacement string - rep.reserve (buffer.size () + delta); - size_t from = 0; - p = rx_lst.begin (); - for (size_t i = 0; i < num_matches; i++) - { - OCTAVE_QUIT; - - double start = p->start (); - double end = p->end (); - - const Matrix pairs (p->token_extents ()); - rep.append (&buffer[from], static_cast (start - 1) - from); - from = static_cast (end - 1) + 1; - - size_t cur_pos = 0; - - for (int j = 0; j < num_tokens; j++) - { - rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos); - cur_pos = tokens[j].pos+2; - - int k = tokens[j].num; - if (k == 0) - { - // replace with entire match - rep.append (&buffer[static_cast (end - 1)], - static_cast (end - start) + 1); - } - else if (k <= pairs.rows ()) - { - // replace with group capture - rep.append (&buffer[static_cast (pairs(k-1,0)-1)], - static_cast (pairs(k-1,1) - - pairs(k-1,0)) + 1); - } - else - { - // replace with nothing - } - } - if (cur_pos < repstr.size ()) - rep.append (&repstr[cur_pos], repstr.size () - cur_pos); - - p++; - } - rep.append (&buffer[from], buffer.size () - from); - } - else - { - // Determine repstr length - const size_t replen = repstr.size (); - int delta = 0; - regexp::match_data::const_iterator p = rx_lst.begin (); - for (size_t i = 0; i < num_matches; i++) - { - OCTAVE_QUIT; - delta += static_cast (replen) - - static_cast (p->end () - p->start () + 1); - p++; - } - - // Build replacement string - rep.reserve (buffer.size () + delta); - size_t from = 0; - p = rx_lst.begin (); - for (size_t i = 0; i < num_matches; i++) - { - OCTAVE_QUIT; - rep.append (&buffer[from], - static_cast (p->start () - 1) - from); - from = static_cast (p->end () - 1) + 1; - rep.append (repstr); - p++; - } - rep.append (&buffer[from], buffer.size () - from); - } - - retval = rep; - return retval; -} diff -r 06897f865f0b -r 99122191d3dd liboctave/util/regexp.h --- a/liboctave/util/regexp.h Tue Jul 09 16:08:16 2013 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,289 +0,0 @@ -/* - -Copyright (C) 2012 John W. Eaton -Copyright (C) 2005-2012 David Bateman - -This file is part of Octave. - -Octave is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -Octave is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with Octave; see the file COPYING. If not, see -. - -*/ - -#if !defined (octave_regexp_match_h) -#define octave_regexp_match_h 1 - -#include -#include -#include - -#include "Array.h" -#include "Matrix.h" -#include "base-list.h" -#include "str-vec.h" - -class -OCTAVE_API -regexp -{ -public: - - class opts; - class match_data; - - regexp (const std::string& pat = "", - const regexp::opts& opt = regexp::opts (), - const std::string& w = "regexp") - : pattern (pat), options (opt), data (0), named_pats (), - nnames (0), named_idx (), who (w) - { - compile_internal (); - } - - regexp (const regexp& rx) - : pattern (rx.pattern), data (rx.data), named_pats (rx.named_pats), - nnames (rx.nnames), named_idx (rx.named_idx) - { } - - regexp& operator = (const regexp& rx) - { - if (this != &rx) - { - pattern = rx.pattern; - data = rx.data; - named_pats = rx.named_pats; - nnames = rx.nnames; - named_idx = rx.named_idx; - } - - return *this; - } - - ~regexp (void) { free (); } - - void compile (const std::string& pat, - const regexp::opts& opt = regexp::opts ()) - { - pattern = pat; - options = opt; - compile_internal (); - } - - match_data match (const std::string& buffer); - - bool is_match (const std::string& buffer); - - Array is_match (const string_vector& buffer); - - std::string replace (const std::string& buffer, - const std::string& replacement); - - class opts - { - public: - - opts (void) - : x_case_insensitive (false), x_dotexceptnewline (false), - x_emptymatch (false), x_freespacing (false), x_lineanchors (false), - x_once (false) { } - - opts (const opts& o) - : x_case_insensitive (o.x_case_insensitive), - x_dotexceptnewline (o.x_dotexceptnewline), - x_emptymatch (o.x_emptymatch), - x_freespacing (o.x_freespacing), - x_lineanchors (o.x_lineanchors), - x_once (o.x_once) - { } - - opts& operator = (const opts& o) - { - if (this != &o) - { - x_case_insensitive = o.x_case_insensitive; - x_dotexceptnewline = o.x_dotexceptnewline; - x_emptymatch = o.x_emptymatch; - x_freespacing = o.x_freespacing; - x_lineanchors = o.x_lineanchors; - x_once = o.x_once; - } - - return *this; - } - - ~opts (void) { } - - void case_insensitive (bool val) { x_case_insensitive = val; } - void dotexceptnewline (bool val) { x_dotexceptnewline = val; } - void emptymatch (bool val) { x_emptymatch = val; } - void freespacing (bool val) { x_freespacing = val; } - void lineanchors (bool val) { x_lineanchors = val; } - void once (bool val) { x_once = val; } - - bool case_insensitive (void) const { return x_case_insensitive; } - bool dotexceptnewline (void) const { return x_dotexceptnewline; } - bool emptymatch (void) const { return x_emptymatch; } - bool freespacing (void) const { return x_freespacing; } - bool lineanchors (void) const { return x_lineanchors; } - bool once (void) const { return x_once; } - - private: - - bool x_case_insensitive; - bool x_dotexceptnewline; - bool x_emptymatch; - bool x_freespacing; - bool x_lineanchors; - bool x_once; - }; - - class match_element - { - public: - - match_element (const string_vector& nt, const string_vector& t, - const std::string& ms, const Matrix& te, - double s, double e) - : x_match_string (ms), x_named_tokens (nt), x_tokens (t), - x_token_extents (te), x_start (s), x_end (e) - { } - - match_element (const match_element &a) - : x_match_string (a.x_match_string), - x_named_tokens (a.x_named_tokens), x_tokens (a.x_tokens), - x_token_extents (a.x_token_extents), - x_start (a.x_start), x_end (a.x_end) - { } - - std::string match_string (void) const { return x_match_string; } - string_vector named_tokens (void) const { return x_named_tokens; } - string_vector tokens (void) const { return x_tokens; } - Matrix token_extents (void) const { return x_token_extents; } - double start (void) const { return x_start; } - double end (void) const { return x_end; } - - private: - - std::string x_match_string; - string_vector x_named_tokens; - string_vector x_tokens; - Matrix x_token_extents; - double x_start; - double x_end; - }; - - class match_data : public octave_base_list - { - public: - - match_data (void) - : octave_base_list (), named_pats () - { } - - match_data (const std::list& l, const string_vector& np) - : octave_base_list (l), named_pats (np) - { } - - match_data (const match_data& rx_lst) - : octave_base_list (rx_lst), - named_pats (rx_lst.named_pats) - { } - - match_data& operator = (const match_data& rx_lst) - { - if (this != &rx_lst) - { - octave_base_list::operator = (rx_lst); - named_pats = rx_lst.named_pats; - } - - return *this; - } - - ~match_data (void) { } - - string_vector named_patterns (void) { return named_pats; } - - private: - - string_vector named_pats; - }; - -private: - - // The pattern we've been asked to match. - std::string pattern; - - opts options; - - // Internal data describing the regular expression. - void *data; - - std::string m; - string_vector named_pats; - int nnames; - Array named_idx; - std::string who; - - void free (void); - - void compile_internal (void); -}; - -inline regexp::match_data -regexp_match (const std::string& pat, - const std::string& buffer, - const regexp::opts& opt = regexp::opts (), - const std::string& who = "regexp") -{ - regexp rx (pat, opt, who); - - return rx.match (buffer); -} - -inline bool -is_regexp_match (const std::string& pat, - const std::string& buffer, - const regexp::opts& opt = regexp::opts (), - const std::string& who = "regexp") -{ - regexp rx (pat, opt, who); - - return rx.is_match (buffer); -} - -inline Array -is_regexp_match (const std::string& pat, - const string_vector& buffer, - const regexp::opts& opt = regexp::opts (), - const std::string& who = "regexp") -{ - regexp rx (pat, opt, who); - - return rx.is_match (buffer); -} - -inline std::string -regexp_replace (const std::string& pat, - const std::string& buffer, - const std::string& replacement, - const regexp::opts& opt = regexp::opts (), - const std::string& who = "regexp") -{ - regexp rx (pat, opt, who); - - return rx.replace (buffer, replacement); -} - -#endif