# HG changeset patch # User David Bateman # Date 1220978213 14400 # Node ID 806c1e8a07c8374b7c8b7203dd1199daa8464509 # Parent 233de4b9b25901a3c77fef2b97b6e0d0a2ca8899 Treat PCRE lookbehind operators in a manner that is approximately correct diff -r 233de4b9b259 -r 806c1e8a07c8 src/ChangeLog --- a/src/ChangeLog Mon Sep 08 15:21:44 2008 -0400 +++ b/src/ChangeLog Tue Sep 09 12:36:53 2008 -0400 @@ -1,3 +1,10 @@ +2008-09-09 David Bateman + + * DLD-FUNCTIONS/regexp.cc (octregexp_list): Distinguish between + matlab named tokens and perl lookbehind expressions. For + lookbehind expression replace "*" and "+" with a limited number of + fixed length expressions to simulate arbitrary length look behind. + 2008-09-08 John W. Eaton * ls-oct-ascii.cc (std::string extract_keyword (std::istream&, diff -r 233de4b9b259 -r 806c1e8a07c8 src/DLD-FUNCTIONS/regexp.cc --- a/src/DLD-FUNCTIONS/regexp.cc Mon Sep 08 15:21:44 2008 -0400 +++ b/src/DLD-FUNCTIONS/regexp.cc Tue Sep 09 12:36:53 2008 -0400 @@ -80,6 +80,9 @@ typedef std::list::const_iterator const_iterator; +#define MAXLOOKBEHIND 10 +static bool lookbehind_warned = false; + static int octregexp_list (const octave_value_list &args, const std::string &nm, bool case_insensitive, std::list &lst, @@ -96,6 +99,9 @@ once = false; std::string buffer = args(0).string_value (); + size_t max_length = (buffer.length () > MAXLOOKBEHIND ? + MAXLOOKBEHIND: buffer.length ()); + if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(0)); @@ -190,12 +196,6 @@ // named tokens "(?<name>...)" are only treated with PCRE not regex. #if HAVE_PCRE - // The syntax of named tokens in pcre is "(?P<name>...)" while we need - // a syntax "(?<name>...)", so fix that here. Also an expression like - // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" should - // be perfectly legal, while pcre does not allow the same named token - // name on both sides of the alternative. Also fix that here by replacing - // name tokens by dummy names, and dealing with the dummy names later.
size_t pos = 0; size_t new_pos; @@ -204,44 +204,131 @@ std::ostringstream buf; Array named_idx; - while ((new_pos = pattern.find ("(?<",pos)) != NPOS) + while ((new_pos = pattern.find ("(?",pos)) != NPOS) { - size_t tmp_pos = pattern.find_first_of ('>',new_pos); + if (pattern.at (new_pos + 2) == '<' && + !(pattern.at (new_pos + 3) == '=' || + pattern.at (new_pos + 3) == '!')) + { + // The syntax of named tokens in pcre is "(?P<name>...)" while + // we need a syntax "(?<name>...)", so fix that here. Also an + // expression like + // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" + // should be perfectly legal, while pcre does not allow the same + // named token name on both sides of the alternative. Also fix + // that here by replacing name tokens by dummy names, and dealing + // with the dummy names later. + + size_t tmp_pos = pattern.find_first_of ('>',new_pos); + + if (tmp_pos == NPOS) + { + error ("syntax error in pattern"); + break; + } + + std::string tmp_name = + pattern.substr(new_pos+3,tmp_pos-new_pos-3); + bool found = false; + + for (int i = 0; i < nnames; i++) + if (named(i) == tmp_name) + { + named_idx.resize(inames+1); + named_idx(inames) = i; + found = true; + break; + } + if (! 
found) + { + named_idx.resize(inames+1); + named_idx(inames) = nnames; + named.append(tmp_name); + nnames++; + } + + if (new_pos - pos > 0) + buf << pattern.substr(pos,new_pos-pos); + if (inames < 10) + buf << "(?P 0) + { + char ch = pattern.at (tmp_pos1); + if (ch == '(') + brackets++; + else if (ch == ')') + { + if (brackets > 1) + tmp_pos2 = tmp_pos1; + + brackets--; + } + tmp_pos1++; + } + + if (brackets != 0) + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + else + { + size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); + if (tmp_pos3 != NPOS && tmp_pos3 < tmp_pos1) + { + if (!lookbehind_warned) + { + lookbehind_warned = true; + warning ("%s: arbitrary length lookbehind patterns are only support up to length %d", nm.c_str(), MAXLOOKBEHIND); + } + + buf << pattern.substr (pos, new_pos - pos) << "("; + + size_t i; + if (pattern.at (tmp_pos3) == '*') + i = 0; + else + i = 1; + + for (; i < max_length + 1; i++) + { + buf < 0) - buf << pattern.substr(pos,new_pos-pos); - if (inames < 10) - buf << "(?P