Mercurial > octave
diff src/DLD-FUNCTIONS/regexp.cc @ 5785:6b9cec830d72
[project @ 2006-05-03 19:32:46 by dbateman]
author | dbateman |
---|---|
date | Wed, 03 May 2006 19:32:48 +0000 |
parents | 12eeebfa7ead |
children | beb5b95de2c0 |
line wrap: on
line diff
--- a/src/DLD-FUNCTIONS/regexp.cc Wed May 03 05:57:16 2006 +0000 +++ b/src/DLD-FUNCTIONS/regexp.cc Wed May 03 19:32:48 2006 +0000 @@ -20,9 +20,6 @@ */ -// FIXME -// regexprep should be written as an m-file based on regexp - #ifdef HAVE_CONFIG_H #include <config.h> #endif @@ -39,6 +36,8 @@ #include "Cell.h" #include "oct-map.h" #include "str-vec.h" +#include "quit.h" +#include "parse.h" #ifdef HAVE_PCRE #include <pcre.h> @@ -51,37 +50,67 @@ #endif #endif -static octave_value_list -octregexp (const octave_value_list &args, int nargout, const std::string &nm, - bool case_insensitive) +// The regexp is constructed as a linked list to avoid resizing the +// return values in arrays at each new match. + +// FIXME don't bother collecting and composing return values the user +// doesn't want. + +class regexp_elem { - octave_value_list retval; +public: + regexp_elem (const string_vector _named_token, const Cell _t, + const std::string _m, const Matrix _te, const double _s, + const double _e) : + named_token (_named_token), t (_t), m (_m), te (_te), s (_s), e (_e) { } + + regexp_elem (const regexp_elem &a) : named_token (a.named_token), t (a.t), + m (a.m), te (a.te), s (a.s), e (a.e) + { } + + string_vector named_token; + Cell t; + std::string m; + Matrix te; + double s; + double e; +}; + +typedef std::list<regexp_elem>::const_iterator const_iterator; + +static int +octregexp_list (const octave_value_list &args, const std::string &nm, + bool case_insensitive, std::list<regexp_elem> &lst, + string_vector &named, int &nopts) +{ + int sz = 0; #if defined (HAVE_REGEX) || defined (HAVE_PCRE) int nargin = args.length(); - int nopts = nargin - 2; bool once = false; bool lineanchors = false; bool dotexceptnewline = false; bool freespacing = false; + nopts = nargin - 2; + if (nargin < 2) { print_usage(nm); - return retval; + return 0; } std::string buffer = args(0).string_value (); if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(0)); - return retval; + return 0; } std::string pattern = args(1).string_value (); if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(1)); - return retval; + return 0; } for (int i = 2; i < nargin; i++) @@ -98,28 +127,6 @@ once = true; nopts--; } -#if HAVE_PCRE - // Only accept these options with pcre - else if (str.find("dotall", 0) == 0) - { - dotexceptnewline = false; - nopts--; - } - else if (str.find("dotexceptnewline", 0) == 0) - { - dotexceptnewline = true; - nopts--; - } - else if (str.find("stringanchors", 0) == 0) - { - lineanchors = false; - nopts--; - } - else if (str.find("lineanchors", 0) == 0) - { - lineanchors = true; - nopts--; - } else if (str.find("matchcase", 0) == 0) { case_insensitive = false; @@ -130,9 +137,14 @@ case_insensitive = true; nopts--; } - else if (str.find("freespacing", 0) == 0) + else if (str.find("dotall", 0) == 0) { - freespacing = true; + dotexceptnewline = false; + nopts--; + } + else if (str.find("stringanchors", 0) == 0) + { + lineanchors = false; nopts--; } else if (str.find("literalspacing", 0) == 0) @@ -140,6 +152,23 @@ freespacing = false; nopts--; } +#if HAVE_PCRE + // Only accept these options with pcre + else if (str.find("dotexceptnewline", 0) == 0) + { + dotexceptnewline = true; + nopts--; + } + else if (str.find("lineanchors", 0) == 0) + { + lineanchors = true; + nopts--; + } + else if (str.find("freespacing", 0) == 0) + { + freespacing = true; + nopts--; + } else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0) && str.find("names", 0)) @@ -149,7 +178,7 @@ str.find("dotexceptnewline", 0) == 0 || str.find("lineanchors", 0) == 0 || str.find("freespacing", 0) == 0) - error ("%s: named tokens not implemented in this version", nm.c_str()); + error ("%s: %s not implemented in this version", str.c_str(), nm.c_str()); else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0)) @@ -159,9 +188,9 @@ if (!error_state) { - Octave_map nmap; - Cell t, m, te; - NDArray s, e; + Cell t; + std::string m; + double s, e; // named tokens "(?<name>...)" are only treated with PCRE not regex. #if HAVE_PCRE @@ -174,13 +203,11 @@ size_t pos = 0; size_t new_pos; - string_vector named; int nnames = 0; int inames = 0; std::ostringstream buf; Array<int> named_idx; - // Add mode flags while ((new_pos = pattern.find ("(?<",pos)) != NPOS) { size_t tmp_pos = pattern.find_first_of ('>',new_pos); @@ -224,7 +251,7 @@ buf << pattern.substr(pos); if (error_state) - return retval; + return 0; // Compile expression pcre *re; @@ -241,7 +268,7 @@ if (re == NULL) { error("%s: %s at position %d of expression", nm.c_str(), err, erroffset); - return retval; + return 0; } int subpatterns; @@ -249,7 +276,6 @@ int nameentrysize; char *nametable; int idx = 0; - int sz = 0; pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &subpatterns); pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &namecount); @@ -267,10 +293,10 @@ static_cast<int>(nametable[i*nameentrysize+1]); } - Cell named_tokens(dim_vector(nnames,1)); - while(true) { + OCTAVE_QUIT; + int matches = pcre_exec(re, NULL, buffer.c_str(), buffer.length(), idx, (idx ? PCRE_NOTBOL : 0), @@ -280,7 +306,7 @@ { error ("%s: internal error calling pcre_exec", nm.c_str()); pcre_free(re); - return retval; + return 0; } else if (matches == PCRE_ERROR_NOMATCH) break; @@ -288,26 +314,19 @@ break; else { - // FIXME Should collect arguments in a linked structure and - // resize and assign the return value a single time to make - // this function O(n) rather than O(n^2) as it currently is. int pos_match = 0; - s.resize (dim_vector(1, sz+1)); - s(sz) = double (ovector[0]+1); - e.resize (dim_vector(1, sz+1)); - e(sz) = double (ovector[1]); - te.resize(dim_vector(1, sz+1)); - Matrix mat_te(matches-1,2); + Matrix te(matches-1,2); for (int i = 1; i < matches; i++) { if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) { - mat_te(pos_match,0) = double (ovector[2*i]+1); - mat_te(pos_match++,1) = double (ovector[2*i+1]); + te(pos_match,0) = double (ovector[2*i]+1); + te(pos_match++,1) = double (ovector[2*i+1]); } } - mat_te.resize(pos_match,2); - te(sz) = mat_te; + te.resize(pos_match,2); + s = double (ovector[0]+1); + e = double (ovector[1]); const char **listptr; int status = pcre_get_substring_list(buffer.c_str(), ovector, @@ -317,53 +336,42 @@ error("%s: cannot allocate memory in pcre_get_substring_list", nm.c_str()); pcre_free(re); - return retval; + return 0; } - m.resize (dim_vector(1, sz+1)); - m(sz) = std::string(*listptr); - - t.resize (dim_vector(1, sz+1)); Cell cell_t (dim_vector(1,pos_match)); pos_match = 0; for (int i = 1; i < matches; i++) if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) cell_t(pos_match++) = std::string(*(listptr+i)); - t(sz) = cell_t; + m = std::string(*listptr); + t = cell_t; + + string_vector named_tokens(nnames); if (namecount > 0) for (int i = 1; i < matches; i++) { if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) { - if (sz == 0) - { - named_tokens(named_idx(i-1)) = - std::string(*(listptr+nidx[i-1])); - } - else - { - Cell tmp = named_tokens(named_idx(i-1)); - tmp.resize(dim_vector(1,sz+1)); - tmp(sz) = std::string(*(listptr+nidx[i-1])); - named_tokens(named_idx(i-1)) = tmp; - } + named_tokens(named_idx(i-1)) = + std::string(*(listptr+nidx[i-1])); } } pcre_free_substring_list(listptr); + regexp_elem new_elem (named_tokens, t, m, te, s, e); + lst.push_back (new_elem); + idx = ovector[1]; + sz++; + if (once) break; - idx = ovector[1]; - sz++; } } - for (int i = 0; i < nnames; i++) - nmap.assign (named(i), named_tokens(i)); - pcre_free(re); #else regex_t compiled; @@ -377,18 +385,19 @@ error("%s: %s in pattern (%s)", nm.c_str(), errmsg, pattern.c_str()); regfree(&compiled); - return retval; + return 0; } int subexpr = 1; int idx = 0; - int sz = 0; for (unsigned int i=0; i < pattern.length(); i++) subexpr += ( pattern[i] == '(' ? 1 : 0 ); OCTAVE_LOCAL_BUFFER (regmatch_t, match, subexpr ); while(true) { + OCTAVE_QUIT; + if (regexec(&compiled, buffer.c_str() + idx, subexpr, match, (idx ? REG_NOTBOL : 0)) == 0) { @@ -397,31 +406,28 @@ while (matches < subexpr && match[matches].rm_so >= 0) matches++; - s.resize (dim_vector(1, sz+1)); - s(sz) = double (match[0].rm_so+1+idx); - e.resize (dim_vector(1, sz+1)); - e(sz) = double (match[0].rm_eo+idx); - te.resize(dim_vector(1, sz+1)); - Matrix mat_te(matches-1,2); + s = double (match[0].rm_so+1+idx); + e = double (match[0].rm_eo+idx); + Matrix te(matches-1,2); for (int i = 1; i < matches; i++) { - mat_te(i-1,0) = double (match[i].rm_so+1+idx); - mat_te(i-1,1) = double (match[i].rm_eo+idx); + te(i-1,0) = double (match[i].rm_so+1+idx); + te(i-1,1) = double (match[i].rm_eo+idx); } - te(sz) = mat_te; - m.resize (dim_vector(1, sz+1)); - m(sz) = buffer.substr (match[0].rm_so+idx, + m = buffer.substr (match[0].rm_so+idx, match[0].rm_eo-match[0].rm_so); - t.resize (dim_vector(1, sz+1)); Cell cell_t (dim_vector(1,matches-1)); for (int i = 1; i < matches; i++) cell_t(i-1) = buffer.substr (match[i].rm_so+idx, match[i].rm_eo-match[i].rm_so); - t(sz) = cell_t; + t = cell_t; idx += match[0].rm_eo; + + regexp_elem new_elem (Octave_map(), t, m, te, s, e); + lst.push_back (new_elem); sz++; if (once) @@ -432,12 +438,82 @@ } regfree(&compiled); #endif + } +#else + error ("%s: not available in this version of Octave", nm.c_str()); +#endif + return sz; +} - retval(5) = nmap; +static octave_value_list +octregexp (const octave_value_list &args, int nargout, const std::string &nm, + bool case_insensitive) +{ + octave_value_list retval; + int nargin = args.length(); + std::list<regexp_elem> lst; + string_vector named; + int nopts; + int sz = octregexp_list (args, nm, case_insensitive, lst, named, nopts); + + if (! error_state) + { + // Converted the linked list in the correct form for the return values + + octave_idx_type i = 0; +#ifdef HAVE_PCRE + Octave_map nmap; + if (sz == 1) + { + for (int j = 0; j < named.length(); j++) + nmap.assign (named(j), lst.begin()->named_token(j)); + retval(5) = nmap; + } + else + { + for (int j = 0; j < named.length (); j++) + { + i = 0; + Cell tmp(dim_vector (1, sz)); + for (const_iterator p = lst.begin(); p != lst.end(); p++) + tmp(i++) = p->named_token(j); + nmap.assign (named(j), octave_value (tmp)); + } + retval(5) = nmap; + } +#else + retval(5) = Octave_map(); +#endif + + Cell t (dim_vector(1, sz)); + i = 0; + for (const_iterator p = lst.begin(); p != lst.end(); p++) + t(i++) = p->t; retval(4) = t; + + Cell m (dim_vector(1, sz)); + i = 0; + for (const_iterator p = lst.begin(); p != lst.end(); p++) + m(i++) = p->m; retval(3) = m; + + + Cell te (dim_vector(1, sz)); + i = 0; + for (const_iterator p = lst.begin(); p != lst.end(); p++) + te(i++) = p->te; retval(2) = te; + + NDArray e (dim_vector(1, sz)); + i = 0; + for (const_iterator p = lst.begin(); p != lst.end(); p++) + e(i++) = p->e; retval(1) = e; + + NDArray s (dim_vector(1, sz)); + i = 0; + for (const_iterator p = lst.begin(); p != lst.end(); p++) + s(i++) = p->s; retval(0) = s; // Alter the order of the output arguments @@ -448,16 +524,15 @@ new_retval.resize(nargout); OCTAVE_LOCAL_BUFFER (int, arg_used, 6); - for (int i = 0; i < 6; i++) - arg_used[i] = false; + for (int j = 0; j < 6; j++) + arg_used[j] = false; - for (int i = 2; i < nargin; i++) + for (int j = 2; j < nargin; j++) { int k = 0; - std::string str = args(i).string_value(); + std::string str = args(j).string_value(); std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find("once", 0) == 0 -#if HAVE_PCRE || str.find("stringanchors", 0) == 0 || str.find("lineanchors", 0) == 0 || str.find("matchcase", 0) == 0 @@ -466,7 +541,6 @@ || str.find("dotexceptnewline", 0) == 0 || str.find("literalspacing", 0) == 0 || str.find("freespacing", 0) == 0 -#endif ) continue; else if (str.find("start", 0) == 0) @@ -492,10 +566,10 @@ // Fill in the rest of the arguments if (n < nargout) { - for (int i = 0; i < 6; i++) + for (int j = 0; j < 6; j++) { - if (! arg_used[i]) - new_retval(n++) = retval(i); + if (! arg_used[j]) + new_retval(n++) = retval(j); } } @@ -503,9 +577,6 @@ } } -#else - error ("%s: not available in this version of Octave", nm.c_str()); -#endif return retval; } @@ -652,23 +723,23 @@ ## Check that anchoring of pattern works correctly %!assert(regexp('abcabc','^abc'),1); %!assert(regexp('abcabc','abc$'),4); -%!assert(regexp('abcabc','^abc$'),[]); +%!assert(regexp('abcabc','^abc$'),zeros(1,0)); %!test %! [s, e, te, m, t] = regexp(' No Match ', 'f(.*)uck'); -%! assert (s,[]) -%! assert (e,[]) -%! assert (te,{}) -%! assert (m, {}) -%! assert (t, {}) +%! assert (s,zeros(1,0)) +%! assert (e,zeros(1,0)) +%! assert (te,cell(1,0)) +%! assert (m, cell(1,0)) +%! assert (t, cell(1,0)) %!test %! [s, e, te, m, t] = regexp(' FiRetrUck ', 'f(.*)uck'); -%! assert (s,[]) -%! assert (e,[]) -%! assert (te,{}) -%! assert (m, {}) -%! assert (t, {}) +%! assert (s,zeros(1,0)) +%! assert (e,zeros(1,0)) +%! assert (te,cell(1,0)) +%! assert (m, cell(1,0)) +%! assert (t, cell(1,0)) %!test %! [s, e, te, m, t] = regexp(' firetruck ', 'f(.*)uck'); @@ -797,8 +868,8 @@ %!test %! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE"))) %! assert (regexp("this word",'(?-x)s w','literalspacing'),4) -%! assert (regexp("this word",'s w','freespacing'),[]) -%! assert (regexp("this word",'(?x)s w'),[]) +%! assert (regexp("this word",'s w','freespacing'),zeros(1,0)) +%! assert (regexp("this word",'(?x)s w'),zeros(1,0)) %! endif %!error regexp('string', 'tri', 'BadArg'); @@ -827,15 +898,15 @@ ## Check that anchoring of pattern works correctly %!assert(regexpi('abcabc','^abc'),1); %!assert(regexpi('abcabc','abc$'),4); -%!assert(regexpi('abcabc','^abc$'),[]); +%!assert(regexpi('abcabc','^abc$'),zeros(1,0)); %!test %! [s, e, te, m, t] = regexpi(' No Match ', 'f(.*)uck'); -%! assert (s,[]) -%! assert (e,[]) -%! assert (te,{}) -%! assert (m, {}) -%! assert (t, {}) +%! assert (s,zeros(1,0)) +%! assert (e,zeros(1,0)) +%! assert (te,cell(1,0)) +%! assert (m, cell(1,0)) +%! assert (t, cell(1,0)) %!test %! [s, e, te, m, t] = regexpi(' FiRetrUck ', 'f(.*)uck'); @@ -956,8 +1027,8 @@ %!test %! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE"))) %! assert (regexpi("this word",'(?-x)s w','literalspacing'),4) -%! assert (regexpi("this word",'s w','freespacing'),[]) -%! assert (regexpi("this word",'(?x)s w'),[]) +%! assert (regexpi("this word",'s w','freespacing'),zeros(1,0)) +%! assert (regexpi("this word",'(?x)s w'),zeros(1,0)) %! endif %!error regexpi('string', 'tri', 'BadArg'); @@ -965,6 +1036,293 @@ */ +DEFUN_DLD(regexprep, args, , + "-*- texinfo -*-\n\ +@deftypefn {Function File} @var{string} = regexprep(@var{string}, @var{pat}, @var{repstr}, @var{options})\n\ +Replace matches of @var{pat} in @var{string} with @var{repstr}.\n\ +\n\ +\n\ +The replacement can contain @code{$i}, which subsubstitutes\n\ +for the ith set of parentheses in the match string. E.g.,\n\ +@example\n\ +\n\ + regexprep(\"Bill Dunn\",'(\\w+) (\\w+)','$2, $1')\n\ +\n\ +@end example\n\ +returns \"Dunn, Bill\"\n\ +\n\ +@var{options} may be zero or more of\n\ +@table @samp\n\ +\n\ +@item once\n\ +Replace only the first occurance of @var{pat} in the result.\n\ +\n\ +@item warnings\n\ +This option is present for compatibility but is ignored.\n\ +\n\ +@item ignorecase or matchcase\n\ +Ignore case for the pattern matching (see @code{regexpi}).\n\ +Alternatively, use (?i) or (?-i) in the pattern.\n\ +\n\ +@item lineanchors and stringanchors\n\ +Whether characters ^ and $ match the beginning and ending of lines.\n\ +Alternatively, use (?m) or (?-m) in the pattern.\n\ +\n\ +@item dotexceptnewline and dotall\n\ +Whether . matches newlines in the string.\n\ +Alternatively, use (?s) or (?-s) in the pattern.\n\ +\n\ +@item freespacing or literalspacing\n\ +Whether whitespace and # comments can be used to make the regular expression more readable.\n\ +Alternatively, use (?x) or (?-x) in the pattern.\n\ +\n\ +@end table\n\ +@seealso{regexp,regexpi}\n\ +@end deftypefn") +{ + octave_value_list retval; + + int nargin = args.length(); + + if (nargin < 3) + { + print_usage("regexprep"); + return retval; + } + + // Make sure we have string,pattern,replacement + const std::string buffer = args(0).string_value (); + if (error_state) return retval; + const std::string pattern = args(1).string_value (); + if (error_state) return retval; + const std::string replacement = args(2).string_value (); + if (error_state) return retval; + + // Pack options excluding 'tokenize' and various output + // reordering strings into regexp arg list + octave_value_list regexpargs(nargin-1,octave_value()); + regexpargs(0) = args(0); + regexpargs(1) = args(1); + int len=2; + for (int i = 3; i < nargin; i++) + { + const std::string opt = args(i).string_value(); + if (opt != "tokenize" && opt != "start" && opt != "end" + && opt != "tokenextents" && opt != "match" && opt != "tokens" + && opt != "names" && opt != "warnings") + { + regexpargs(len++) = args(i); + } + } + regexpargs.resize(len); + + // Identify replacement tokens; build a vector of group numbers in + // the replacement string so that we can quickly calculate the size + // of the replacement. + int tokens = 0; + for (size_t i=1; i < replacement.size(); i++) + { + if (replacement[i-1]=='$' && isdigit(replacement[i])) + { + tokens++, i++; + } + } + std::vector<int> token(tokens); + int kk = 0; + for (size_t i = 1; i < replacement.size(); i++) + { + if (replacement[i-1]=='$' && isdigit(replacement[i])) + { + token[kk++] = replacement[i]-'0'; + i++; + } + } + + // Perform replacement + std::string rep; + if (tokens > 0) + { + std::list<regexp_elem> lst; + string_vector named; + int nopts; + int sz = octregexp_list (regexpargs, "regexprep", false, lst, named, + nopts); + + if (error_state) + return retval; + if (sz == 0) + { + retval(0) = args(0); + return retval; + } + + // Determine replacement length + const size_t replen = replacement.size() - 2*tokens; + int delta = 0; + const_iterator p = lst.begin(); + for (int i = 0; i < sz; i++) + { + OCTAVE_QUIT; + + const Matrix pairs(p->te); + size_t pairlen = 0; + for (int j = 0; j < tokens; j++) + { + if (token[j] == 0) + pairlen += static_cast<size_t>(p->e - p->s) + 1; + else if (token[j] <= pairs.rows()) + pairlen += static_cast<size_t>(pairs(token[j]-1,1) - + pairs(token[j]-1,0)) + 1; + } + delta += static_cast<int>(replen + pairlen) - + static_cast<int>(p->e - p->s + 1); + p++; + } + + // Build replacement string + rep.reserve(buffer.size()+delta); + size_t from = 0; + p = lst.begin(); + for (int i=0; i < sz; i++) + { + OCTAVE_QUIT; + + const Matrix pairs(p->te); + rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); + from = static_cast<size_t>(p->e - 1) + 1; + for (size_t j = 1; j < replacement.size(); j++) + { + if (replacement[j-1]=='$' && isdigit(replacement[j])) + { + int k = replacement[j]-'0'; + if (k == 0) + { + // replace with entire match + rep.append(&buffer[static_cast<size_t>(p->e - 1)], + static_cast<size_t>(p->e - p->s) + 1); + } + else if (k <= pairs.rows()) + { + // replace with group capture + rep.append(&buffer[static_cast<size_t>(pairs(k-1,0)-1)], + static_cast<size_t>(pairs(k-1,1) - + pairs(k-1,0))+1); + } + else + { + // replace with nothing + } + j++; + } + else + { + rep.append(1,replacement[j-1]); + } + if (j+1 == replacement.size()) + { + rep.append(1,replacement[j]); + } + } + p++; + } + rep.append(&buffer[from],buffer.size()-from); + } + else + { + std::list<regexp_elem> lst; + string_vector named; + int nopts; + int sz = octregexp_list (regexpargs, "regexprep", false, lst, named, + nopts); + + if (error_state) + return retval; + if (sz == 0) + { + retval(0) = args(0); + return retval; + } + + // Determine replacement length + const size_t replen = replacement.size(); + int delta = 0; + const_iterator p = lst.begin(); + for (int i = 0; i < sz; i++) + { + OCTAVE_QUIT; + delta += static_cast<int>(replen) - + static_cast<int>(p->e - p->s + 1); + p++; + } + + // Build replacement string + rep.reserve(buffer.size()+delta); + size_t from = 0; + p = lst.begin(); + for (int i=0; i < sz; i++) + { + OCTAVE_QUIT; + rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); + from = static_cast<size_t>(p->e - 1) + 1; + rep.append(replacement); + p++; + } + rep.append(&buffer[from],buffer.size()-from); + } + + retval(0) = rep; + return retval; +} + +/* +%!test # Replace with empty +%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; +%! t = regexprep(xml,'<[!?][^>]*>',''); +%! assert(t,' <tag v="hello">some stuff</tag>') + +%!test # Replace with non-empty +%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; +%! t = regexprep(xml,'<[!?][^>]*>','?'); +%! assert(t,'? <tag v="hello">some stuff?</tag>') + +%!test # Check that 'tokenize' is ignored +%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; +%! t = regexprep(xml,'<[!?][^>]*>','','tokenize'); +%! assert(t,' <tag v="hello">some stuff</tag>') + +%!test # Capture replacement +%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE"))) +%! data = "Bob Smith\nDavid Hollerith\nSam Jenkins"; +%! result = "Smith, Bob\nHollerith, David\nJenkins, Sam"; +%! t = regexprep(data,'(?m)^(\w+)\s+(\w+)$','$2, $1'); +%! assert(t,result) +%! end + +# Return the original if no match +%!assert(regexprep('hello','world','earth'),'hello') + +## Test a general replacement +%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g"); + +## Make sure it works at the beginning and end +%!assert(regexprep("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g"); +%!assert(regexprep("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_"); + +## Options +%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"), "a_b]c{d}e-f=g"); +%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"), "a_b_c_d_e_f_g"); + +## Option combinations +%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"), "a_b]c{d}e-f=g"); + +## End conditions on replacement +%!assert(regexprep("abc","(b)",".$1"),"a.bc"); +%!assert(regexprep("abc","(b)","$1"),"abc"); +%!assert(regexprep("abc","(b)","$1."),"ab.c"); +%!assert(regexprep("abc","(b)","$1.."),"ab..c"); + +*/ + /* ;;; Local Variables: *** ;;; mode: C++ ***