Mercurial > octave
view libinterp/corefcn/regexp.cc @ 30564:796f54d4ddbf stable
update Octave Project Developers copyright for the new year
In files that have the "Octave Project Developers" copyright notice,
update for 2021.
In all .txi and .texi files except gpl.txi and gpl.texi in the
doc/liboctave and doc/interpreter directories, change the copyright
to "Octave Project Developers", the same as used for other source
files. Update copyright notices for 2022 (not done since 2019). For
gpl.txi and gpl.texi, change the copyright notice to be "Free Software
Foundation, Inc." and leave the date at 2007 only because this file
only contains the text of the GPL, not anything created by the Octave
Project Developers.
Add Paul Thomas to contributors.in.
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Tue, 28 Dec 2021 18:22:40 -0500 |
parents | 7d6709900da7 |
children | 83f9f8bda883 5cf18ef0377c |
line wrap: on
line source
//////////////////////////////////////////////////////////////////////// // // Copyright (C) 2002-2022 The Octave Project Developers // // See the file COPYRIGHT.md in the top-level directory of this // distribution or <https://octave.org/copyright/>. // // This file is part of Octave. // // Octave is free software: you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // Octave is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with Octave; see the file COPYING. If not, see // <https://www.gnu.org/licenses/>. // //////////////////////////////////////////////////////////////////////// #if defined (HAVE_CONFIG_H) # include "config.h" #endif #include <list> #include <sstream> #include <pcre.h> #include "base-list.h" #include "oct-locbuf.h" #include "quit.h" #include "lo-regexp.h" #include "str-vec.h" #include "defun.h" #include "Cell.h" #include "error.h" #include "errwarn.h" #include "oct-map.h" #include "ovl.h" #include "utils.h" OCTAVE_NAMESPACE_BEGIN // Replace backslash escapes in a string with the real values. We need // two special functions instead of the one in utils.cc because the set // of escape sequences used for regexp patterns and replacement strings // is different from those used in the *printf functions. static std::string do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str) { std::string retval; std::size_t i = 0; std::size_t j = 0; std::size_t len = s.length (); retval.resize (len); while (j < len) { if (s[j] == '\\' && j+1 < len) { switch (s[++j]) { case 'b': // backspace if (is_sq_str) retval[i] = '\b'; else { // Pass escape sequence through retval[i] = '\\'; retval[++i] = 'b'; } break; // Translate \< and \> to PCRE patterns for pseudo-word boundary case '<': // begin word boundary retval.insert (i, "(?<=\\W|^)"); i += 8; break; case '>': // end word boundary retval.insert (i, "(?=\\W|$)"); i += 7; break; case 'o': // octal input { bool bad_esc_seq = (j+1 >= len); bool brace = false; if (! bad_esc_seq && s[++j] == '{') { brace = true; j++; } int tmpi = 0; std::size_t k; for (k = j; k < std::min (j+3+brace, len); k++) { int digit = s[k] - '0'; if (digit < 0 || digit > 7) break; tmpi <<= 3; tmpi += digit; } if (bad_esc_seq || (brace && s[k++] != '}')) { tmpi = 0; warning (R"(malformed octal escape sequence '\o' -- converting to '\0')"); } retval[i] = tmpi; j = k - 1; break; } default: // pass escape sequence through retval[i] = '\\'; retval[++i] = s[j]; break; } } else { retval[i] = s[j]; } i++; j++; } retval.resize (i); return retval; } static std::string do_regexp_rep_string_escapes (const std::string& s) { std::string retval; std::size_t i = 0; std::size_t j = 0; std::size_t len = s.length (); retval.resize (len); while (j < len) { if (s[j] == '\\' && j+1 < len) { switch (s[++j]) { case 'a': // alarm retval[i] = '\a'; break; case 'b': // backspace retval[i] = '\b'; break; case 'f': // formfeed retval[i] = '\f'; break; case 'n': // newline retval[i] = '\n'; break; case 'r': // carriage return retval[i] = '\r'; break; case 't': // horizontal tab retval[i] = '\t'; break; case 'v': // vertical tab retval[i] = '\v'; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': // octal input { std::size_t k; int tmpi = s[j] - '0'; for (k = j+1; k < std::min (j+3, len); k++) { int digit = s[k] - '0'; if (digit < 0 || digit > 7) break; tmpi <<= 3; tmpi += digit; } retval[i] = tmpi; j = k - 1; break; } case 'o': // octal input { bool bad_esc_seq = (j+1 >= len); bool brace = false; if (! bad_esc_seq && s[++j] == '{') { brace = true; j++; } int tmpi = 0; std::size_t k; for (k = j; k < std::min (j+3+brace, len); k++) { int digit = s[k] - '0'; if (digit < 0 || digit > 7) break; tmpi <<= 3; tmpi += digit; } if (bad_esc_seq || (brace && s[k++] != '}')) { warning (R"(malformed octal escape sequence '\o' -- converting to '\0')"); tmpi = 0; } retval[i] = tmpi; j = k - 1; break; } case 'x': // hex input { bool bad_esc_seq = (j+1 >= len); bool brace = false; if (! bad_esc_seq && s[++j] == '{') { brace = true; j++; } int tmpi = 0; std::size_t k; for (k = j; k < std::min (j+2+brace, len); k++) { if (! isxdigit (s[k])) break; tmpi <<= 4; int digit = s[k]; if (digit >= 'a') tmpi += digit - 'a' + 10; else if (digit >= 'A') tmpi += digit - 'A' + 10; else tmpi += digit - '0'; } if (bad_esc_seq || (brace && s[k++] != '}')) { warning (R"(malformed hex escape sequence '\x' -- converting to '\0')"); tmpi = 0; } retval[i] = tmpi; j = k - 1; break; } // Both dollar sign (for capture buffer) and backslash are // passed through with their escape backslash. The processing // for these must occur during the actual replacement operation // in lo-regexp.cc. case '$': // pass dollar sign through with escape retval[i] = '\\'; retval[++i] = '$'; break; case '\\': // pass backslash through with escape retval[i] = '\\'; retval[++i] = '\\'; break; default: // convert escaped character to unescaped char retval[i] = s[j]; break; } } else { retval[i] = s[j]; } i++; j++; } retval.resize (i); return retval; } static void parse_options (regexp::opts& options, const octave_value_list& args, const std::string& who, int skip, bool& extra_args) { extra_args = false; for (int i = skip; i < args.length (); i++) { std::string str; str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ()); std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find ("once", 0) == 0) options.once (true); else if (str.find ("matchcase", 0) == 0) options.case_insensitive (false); else if (str.find ("ignorecase", 0) == 0) options.case_insensitive (true); else if (str.find ("dotall", 0) == 0) options.dotexceptnewline (false); else if (str.find ("stringanchors", 0) == 0) options.lineanchors (false); else if (str.find ("literalspacing", 0) == 0) options.freespacing (false); else if (str.find ("noemptymatch", 0) == 0) options.emptymatch (false); else if (str.find ("dotexceptnewline", 0) == 0) options.dotexceptnewline (true); else if (str.find ("lineanchors", 0) == 0) options.lineanchors (true); else if (str.find ("freespacing", 0) == 0) options.freespacing (true); else if (str.find ("emptymatch", 0) == 0) options.emptymatch (true); else if (str.find ("start", 0) == 0 || str.find ("end", 0) == 0 || str.find ("tokenextents", 0) == 0 || str.find ("match", 0) == 0 || str.find ("tokens", 0) == 0 || str.find ("names", 0) == 0 || str.find ("split", 0) == 0) extra_args = true; else error ("%s: unrecognized option", who.c_str ()); } } static octave_value_list octregexp (const octave_value_list& args, int nargout, const std::string& who, bool case_insensitive = false) { octave_value_list retval; int nargin = args.length (); // Make sure we have string, pattern const std::string buffer = args(0).string_value (); std::string pattern = args(1).string_value (); // Rewrite pattern for PCRE pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ()); regexp::opts options; options.case_insensitive (case_insensitive); bool extra_options = false; parse_options (options, args, who, 2, extra_options); const regexp::match_data rx_lst = regexp::match (pattern, buffer, options, who); string_vector named_pats = rx_lst.named_patterns (); std::size_t sz = rx_lst.size (); // Converted the linked list in the correct form for the return values octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats); retval.resize (7); if (sz != 0) { for (int j = 0; j < named_pats.numel (); j++) { Cell ctmp (dim_vector (1, sz)); octave_idx_type i = 0; for (const auto& match_data : rx_lst) { string_vector named_tokens = match_data.named_tokens (); ctmp(i++) = named_tokens(j); } nmap.assign (named_pats(j), ctmp); } } retval(5) = nmap; if (options.once ()) { auto p = rx_lst.begin (); retval(4) = (sz ? p->tokens () : Cell ()); retval(3) = (sz ? p->match_string () : ""); retval(2) = (sz ? p->token_extents () : Matrix ()); if (sz) { double start = p->start (); double end = p->end (); Cell split (dim_vector (1, 2)); split(0) = buffer.substr (0, start-1); split(1) = buffer.substr (end); retval(6) = split; retval(1) = end; retval(0) = start; } else { retval(6) = buffer; retval(1) = Matrix (); retval(0) = Matrix (); } } else { Cell tokens (dim_vector (1, sz)); Cell match_string (dim_vector (1, sz)); Cell token_extents (dim_vector (1, sz)); NDArray end (dim_vector (1, sz)); NDArray start (dim_vector (1, sz)); Cell split (dim_vector (1, sz+1)); std::size_t sp_start = 0; octave_idx_type i = 0; for (const auto& match_data : rx_lst) { double s = match_data.start (); double e = match_data.end (); string_vector tmp = match_data.tokens (); tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp); match_string(i) = match_data.match_string (); token_extents(i) = match_data.token_extents (); end(i) = e; start(i) = s; split(i) = buffer.substr (sp_start, s-sp_start-1); sp_start = e; i++; } split(i) = buffer.substr (sp_start); retval(6) = split; retval(4) = tokens; retval(3) = match_string; retval(2) = token_extents; retval(1) = end; retval(0) = start; } // Alter the order of the output arguments if (extra_options) { int n = 0; octave_value_list new_retval; new_retval.resize (nargout); bool arg_used[7] {}; for (int j = 2; j < nargin; j++) { int k = 0; std::string str = args(j).string_value (); std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find ("once", 0) == 0 || str.find ("stringanchors", 0) == 0 || str.find ("lineanchors", 0) == 0 || str.find ("matchcase", 0) == 0 || str.find ("ignorecase", 0) == 0 || str.find ("dotall", 0) == 0 || str.find ("dotexceptnewline", 0) == 0 || str.find ("literalspacing", 0) == 0 || str.find ("freespacing", 0) == 0 || str.find ("noemptymatch", 0) == 0 || str.find ("emptymatch", 0) == 0) continue; else if (str.find ("start", 0) == 0) k = 0; else if (str.find ("end", 0) == 0) k = 1; else if (str.find ("tokenextents", 0) == 0) k = 2; else if (str.find ("match", 0) == 0) k = 3; else if (str.find ("tokens", 0) == 0) k = 4; else if (str.find ("names", 0) == 0) k = 5; else if (str.find ("split", 0) == 0) k = 6; new_retval(n++) = retval(k); arg_used[k] = true; if (n == nargout) break; } // Fill in the rest of the arguments if (n < nargout) { for (int j = 0; j < 7; j++) { if (! arg_used[j]) new_retval(n++) = retval(j); } } retval = new_retval; } return retval; } static octave_value_list octcellregexp (const octave_value_list& args, int nargout, const std::string& who, bool case_insensitive = false) { octave_value_list retval; if (args(0).iscell ()) { OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); octave_value_list new_args = args; Cell cellstr = args(0).cell_value (); if (args(1).iscell ()) { Cell cellpat = args(1).cell_value (); if (cellpat.numel () == 1) { for (int j = 0; j < nargout; j++) newretval[j].resize (cellstr.dims ()); new_args(1) = cellpat(0); for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } else if (cellstr.numel () == 1) { for (int j = 0; j < nargout; j++) newretval[j].resize (cellpat.dims ()); new_args(0) = cellstr(0); for (octave_idx_type i = 0; i < cellpat.numel (); i++) { new_args(1) = cellpat(i); octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } else if (cellstr.numel () == cellpat.numel ()) { if (cellstr.dims () != cellpat.dims ()) error ("%s: inconsistent cell array dimensions", who.c_str ()); for (int j = 0; j < nargout; j++) newretval[j].resize (cellstr.dims ()); for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); new_args(1) = cellpat(i); octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } else error ("regexp: cell array arguments must be scalar or equal size"); } else { for (int j = 0; j < nargout; j++) newretval[j].resize (cellstr.dims ()); for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } for (int j = 0; j < nargout; j++) retval(j) = octave_value (newretval[j]); } else if (args(1).iscell ()) { OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); octave_value_list new_args = args; Cell cellpat = args(1).cell_value (); for (int j = 0; j < nargout; j++) newretval[j].resize (cellpat.dims ()); for (octave_idx_type i = 0; i < cellpat.numel (); i++) { new_args(1) = cellpat(i); octave_value_list tmp = octregexp (new_args, nargout, who, case_insensitive); for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } for (int j = 0; j < nargout; j++) retval(j) = octave_value (newretval[j]); } else retval = octregexp (args, nargout, who, case_insensitive); return retval; } DEFUN (regexp, args, nargout, doc: /* -*- texinfo -*- @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat}) @deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{}) Regular expression string matching. Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and substrings of any matches, or empty values if there are none. The matched pattern @var{pat} can include any of the standard regex operators, including: @table @code @item . Match any character @item * + ? @{@} Repetition operators, representing @table @code @item * Match zero or more times @item + Match one or more times @item ? Match zero or one times @item @{@var{n}@} Match exactly @var{n} times @item @{@var{n},@} Match @var{n} or more times @item @{@var{m},@var{n}@} Match between @var{m} and @var{n} times @end table @item [@dots{}] [^@dots{}] List operators. The pattern will match any character listed between @qcode{"["} and @qcode{"]"}. If the first character is @qcode{"^"} then the pattern is inverted and any character except those listed between brackets will match. Escape sequences defined below can also be used inside list operators. For example, a template for a floating point number might be @code{[-+.\d]+}. @item () (?:) Grouping operator. The first form, parentheses only, also creates a token. @item | Alternation operator. Match one of a choice of regular expressions. The alternatives must be delimited by the grouping operator @code{()} above. @item ^ $ Anchoring operators. Requires pattern to occur at the start (@code{^}) or end (@code{$}) of the string. @end table In addition, the following escaped characters have special meaning. @table @code @item \d Match any digit @item \D Match any non-digit @item \s Match any whitespace character @item \S Match any non-whitespace character @item \w Match any word character @item \W Match any non-word character @item \< Match the beginning of a word @item \> Match the end of a word @item \B Match within a word @end table Implementation Note: For compatibility with @sc{matlab}, escape sequences in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded even when @var{pat} has been defined with single quotes. To disable expansion use a second backslash before the escape sequence (e.g., "@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate} function. The outputs of @code{regexp} default to the order given below @table @var @item s The start indices of each matching substring @item e The end indices of each matching substring @item te The extents of each matched token surrounded by @code{(@dots{})} in @var{pat} @item m A cell array of the text of each match @item t A cell array of the text of each token matched @item nm A structure containing the text of each matched named token, with the name being used as the fieldname. A named token is denoted by @code{(?<name>@dots{})}. @item sp A cell array of the text not returned by match, i.e., what remains if you split the string based on @var{pat}. @end table Particular output arguments, or the order of the output arguments, can be selected by additional @var{opt} arguments. These are strings and the correspondence between the output arguments and the optional argument are @multitable @columnfractions 0.2 0.3 0.3 0.2 @item @tab @qcode{'start'} @tab @var{s} @tab @item @tab @qcode{'end'} @tab @var{e} @tab @item @tab @qcode{'tokenExtents'} @tab @var{te} @tab @item @tab @qcode{'match'} @tab @var{m} @tab @item @tab @qcode{'tokens'} @tab @var{t} @tab @item @tab @qcode{'names'} @tab @var{nm} @tab @item @tab @qcode{'split'} @tab @var{sp} @tab @end multitable Additional arguments are summarized below. @table @samp @item once Return only the first occurrence of the pattern. @item matchcase Make the matching case sensitive. (default) Alternatively, use (?-i) in the pattern. @item ignorecase Ignore case when matching the pattern to the string. Alternatively, use (?i) in the pattern. @item stringanchors Match the anchor characters at the beginning and end of the string. (default) Alternatively, use (?-m) in the pattern. @item lineanchors Match the anchor characters at the beginning and end of the line. Alternatively, use (?m) in the pattern. @item dotall The pattern @code{.} matches all characters including the newline character. (default) Alternatively, use (?s) in the pattern. @item dotexceptnewline The pattern @code{.} matches all characters except the newline character. Alternatively, use (?-s) in the pattern. @item literalspacing All characters in the pattern, including whitespace, are significant and are used in pattern matching. (default) Alternatively, use (?-x) in the pattern. @item freespacing The pattern may include arbitrary whitespace and also comments beginning with the character @samp{#}. Alternatively, use (?x) in the pattern. @item noemptymatch Zero-length matches are not returned. (default) @item emptymatch Return zero-length matches. @code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there are zero or more @qcode{'b'} characters at positions 1 and end-of-string. @end table Stack Limitation Note: Pattern searches are done with a recursive function which can overflow the program stack when there are a high number of matches. For example, @example @code{regexp (repmat ('a', 1, 1e5), '(a)+')} @end example @noindent may lead to a segfault. As an alternative, consider constructing pattern searches that reduce the number of matches (e.g., by creatively using set complement), and then further processing the return variables (now reduced in size) with successive @code{regexp} searches. @seealso{regexpi, strfind, regexprep} @end deftypefn */) { if (args.length () < 2) print_usage (); octave_value_list retval; if (args(0).iscell () || args(1).iscell ()) retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp")); else retval = octregexp (args, nargout, "regexp"); return retval; } /* ## PCRE_ERROR_MATCHLIMIT test %!test %! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-'); %! ws = warning ("query"); %! unwind_protect %! warning ("off"); %! regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n'); %! unwind_protect_cleanup %! warning (ws); %! end_unwind_protect ## segfault test %!assert (regexp ("abcde", "."), [1,2,3,4,5]) ## Infinite loop test %!assert (isempty (regexp ("abcde", ""))) ## Check that anchoring of pattern works correctly %!assert (regexp ('abcabc', '^abc'), 1) %!assert (regexp ('abcabc', 'abc$'), 4) %!assert (regexp ('abcabc', '^abc$'), zeros (1,0)) %!test %! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck'); %! assert (s, zeros (1,0)); %! assert (e, zeros (1,0)); %! assert (te, cell (1,0)); %! assert (m, cell (1,0)); %! assert (t, cell (1,0)); %!test %! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck'); %! assert (s, zeros (1,0)); %! assert (e, zeros (1,0)); %! assert (te, cell (1,0)); %! assert (m, cell (1,0)); %! assert (t, cell (1,0)); %!test %! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck'); %! assert (s, 2); %! assert (e, 10); %! assert (te{1}, [3, 7]); %! assert (m{1}, 'firetruck'); %! assert (t{1}{1}, 'iretr'); %!test %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*'); %! assert (s, [1, 12]); %! assert (e, [5, 17]); %! assert (size (te), [1, 2]); %! assert (isempty (te{1})); %! assert (isempty (te{2})); %! assert (m{1}, 'short'); %! assert (m{2}, 'string'); %! assert (size (t), [1, 2]); %! assert (isempty (t{1})); %! assert (isempty (t{2})); %!test %! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once'); %! assert (s, 1); %! assert (e, 5); %! assert (isempty (te)); %! assert (m, 'short'); %! assert (isempty (t)); %!test %! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s, 1); %! assert (e, 5); %! assert (isempty (te)); %! assert (m, 'short'); %! assert (isempty (t)); %!test %! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s, 1); %! assert (e, 10); %! assert (size (te), [1, 1]); %! assert (te{1}, [1,5; 7,10]); %! assert (m{1}, 'short test'); %! assert (size (t), [1, 1]); %! assert (t{1}{1}, 'short'); %! assert (t{1}{2}, 'test'); %! assert (size (nm), [1, 1]); %! assert (! isempty (fieldnames (nm))); %! assert (sort (fieldnames (nm)), {'word1';'word2'}); %! assert (nm.word1, 'short'); %! assert (nm.word2, 'test'); %!test %! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s, 1); %! assert (e, 10); %! assert (size (te), [1, 1]); %! assert (te{1}, [1,5; 7,10]); %! assert (m{1}, 'short test'); %! assert (size (t), [1, 1]); %! assert (t{1}{1}, 'short'); %! assert (t{1}{2}, 'test'); %! assert (size (nm), [1, 1]); %! assert (! isempty (fieldnames (nm))); %! assert (sort (fieldnames (nm)), {'word1';'word2'}); %! assert (nm.word1, 'short'); %! assert (nm.word2, 'test'); %!test %! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names'); %! assert (size (t), [1, 2]); %! assert (t{1}{1}, "John"); %! assert (t{1}{2}, "Davis"); %! assert (t{2}{1}, "Rogers"); %! assert (t{2}{2}, "James"); %! assert (size (nm), [1, 2]); %! assert (nm(1).first, "John"); %! assert (nm(1).last, "Davis"); %! assert (nm(2).first, "James"); %! assert (nm(2).last, "Rogers"); ## Tests for nulls in strings properly matching %!test %! str = "A\0B\0\0C"; %! ptn = '(\0+)'; # also test null in single-quote pattern %! M = regexp (str, ptn, "match"); %! assert (size (M), [1, 2]); %! assert (double (M{1}), [0]); %! assert (double (M{2}), [0, 0]); %!test %! str = "A\0B\0\0C"; %! ptn = "(\0+)"; # also test null in double-quote pattern %! T = regexp (str, ptn, "tokens"); %! assert (size (T), [1, 2]); %! assert (double (T{1}{1}), [0]); %! assert (double (T{2}{1}), [0, 0]); %!test %! str = "A\0B\0\0C"; %! ptn = '(?<namedtoken>\0+)'; %! NT = regexp (str, ptn, "names"); %! assert (size (NT), [1, 2]); %! assert (double (NT(1).namedtoken), [0]); %! assert (double (NT(2).namedtoken), [0, 0]); ## Tests for named tokens %!test %! ## Parenthesis in named token (ie (int)) causes a problem %! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'), %! struct ('typestr', 'int')); %!test <*35683> %! ## Mix of named and unnamed tokens can cause segfault %! str = "abcde"; %! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)'; %! tokens = regexp (str, ptn, "names"); %! assert (isstruct (tokens) && numel (tokens) == 1); %! assert (tokens.T1, "a"); %! assert (tokens.T2, "de"); ## Test options to regexp %!assert (regexp ("abc\nabc", '.'), [1:7]) %!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7]) %!test %! assert (regexp ("abc\nabc", '(?s).'), [1:7]); %! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]); %! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]); %!assert (regexp ("caseCaSe", 'case'), 1) %!assert (regexp ("caseCaSe", 'case', "matchcase"), 1) %!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5]) %!test %! assert (regexp ("caseCaSe", '(?-i)case'), 1); %! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]); %!assert (regexp ("abc\nabc", 'c$'), 7) %!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7) %!test %! assert (regexp ("abc\nabc", '(?-m)c$'), 7); %! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]); %! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]); %!assert (regexp ("this word", 's w'), 4) %!assert (regexp ("this word", 's w', 'literalspacing'), 4) %!test %! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4); %! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0)); %! assert (regexp ("this word", '(?x)s w'), zeros (1,0)); %!test %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch'); %! assert (s, [1 5]); %! assert (e, [3 5]); %! assert (te, { zeros(0,2), zeros(0,2) }); %! assert (m, { "OCT", "V" }); %! assert (t, { cell(1,0), cell(1,0) }); %! assert (isempty (fieldnames (nm))); %! assert (sp, { "", "A", "E" }); %!test %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch'); %! assert (s, [1 5]); %! assert (e, [3 5]); %! assert (te, { [1 3], [5 5] }); %! assert (m, { "OCT", "V" }); %! assert (t, { {"OCT"}, {"V"} }); %! assert (isempty (fieldnames (nm))); %! assert (sp, { "", "A", "E" }); %!test %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch'); %! assert (s, [1 4 5 6 7]); %! assert (e, [3 3 5 5 6]); %! assert (te, repmat ({zeros(0,2)}, [1, 5])); %! assert (m, { "OCT", "", "V", "", "" }); %! assert (t, repmat({cell(1,0)}, [1, 5])); %! assert (isempty (fieldnames (nm))); %! assert (sp, { "", "", "A", "", "E", "" }); %!test %! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch'); %! assert (s, [1 4 5 6 7]); %! assert (e, [3 3 5 5 6]); %! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] }); %! assert (m, { "OCT", "", "V", "", "" }); %! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} }); %! assert (isempty (fieldnames (nm))); %! assert (sp, { "", "", "A", "", "E", "" }); %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'), {6;[1,5,9];zeros(1,0)}) %!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}), {6;[3,7];[1,9]}) %!assert (regexp ('Strings', {'t','s'}), {2, 7}) ## Test case for lookaround operators %!test %! assert (regexp ('Iraq', 'q(?!u)'), 4); %! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0)); %! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'}); %! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'}); %! assert (regexp ("qit", 'q(?=u+)', 'match'), cell (1, 0)); %! assert (regexp ("qit", 'q(?=u*)', 'match'), {'q'}); %! assert (regexp ('thingamabob', '(?<=a)b'), 9); ## Tests for split option. %!shared str %! str = "foo bar foo"; %!test %! [a, b] = regexp (str, "f..", "match", "split"); %! assert (a, {"foo", "foo"}); %! assert (b, {"", " bar ", ""}); %!test %! [a, b] = regexp (str, "f..", "match", "split", "once"); %! assert (a, "foo"); %! assert (b, {"", " bar foo"}); %!test %! [a, b] = regexp (str, "fx.", "match", "split"); %! assert (a, cell (1, 0)); %! assert (b, {"foo bar foo"}); %!test %! [a, b] = regexp (str, "fx.", "match", "split", "once"); %! assert (a, "");; %! assert (b, "foo bar foo"); %!shared str %! str = "foo bar"; %!test %! [a, b] = regexp (str, "f..", "match", "split"); %! assert (a, {"foo"}); %! assert (b, {"", " bar"}); %!test %! [a, b] = regexp (str, "b..", "match", "split"); %! assert (a, {"bar"}); %! assert (b, {"foo ", ""}); %!test %! [a, b] = regexp (str, "x", "match", "split"); %! assert (a, cell (1, 0)); %! assert (b, {"foo bar"}); %!test %! [a, b] = regexp (str, "[o]+", "match", "split"); %! assert (a, {"oo"}); %! assert (b, {"f", " bar"}); ## Test escape sequences are expanded even in single-quoted strings %!assert (regexp ("\n", '\n'), 1) %!assert (regexp ("\n", "\n"), 1) ## Test escape sequences are silently converted %!test <*45407> %! assert (regexprep ('s', 's', 'x\.y'), 'x.y'); %! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y'); %! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy'); ## Test start-of-word / end-of-word patterns for Matlab compatibility %!test <*59992> %! assert (regexp ('foo!+bar', '\<\w'), [1, 6]); %! assert (regexp ('foo!+bar', '.\>'), [3, 4, 8]); %! assert (regexp ('foo!+bar\nbar!+foo', '.\>'), [3, 4, 8, 13, 14, 18]); %! assert (regexp ('foo!+bar\nbar!+foo', '\<\w'), [1, 6, 10, 16]); ## Test input validation %!error regexp ('string', 'tri', 'BadArg') %!error regexp ('string') */ DEFUN (regexpi, args, nargout, doc: /* -*- texinfo -*- @deftypefn {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat}) @deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{}) Case insensitive regular expression string matching. Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and substrings of any matches, or empty values if there are none. @xref{XREFregexp,,@code{regexp}}, for details on the syntax of the search pattern. @seealso{regexp} @end deftypefn */) { if (args.length () < 2) print_usage (); if (args(0).iscell () || args(1).iscell ()) return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true); else return octregexp (args, nargout, "regexpi", true); } /* ## segfault test %!assert (regexpi ("abcde", "."), [1,2,3,4,5]) ## Check that anchoring of pattern works correctly %!assert (regexpi ('abcabc', '^ABC'), 1) %!assert (regexpi ('abcabc', 'ABC$'), 4) %!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0)) %!test %! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck'); %! assert (s, zeros (1,0)); %! assert (e, zeros (1,0)); %! assert (te, cell (1,0)); %! assert (m, cell (1,0)); %! assert (t, cell (1,0)); %!test %! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck'); %! assert (s, 2); %! assert (e, 10); %! assert (te{1}, [3, 7]); %! assert (m{1}, 'FiRetrUck'); %! assert (t{1}{1}, 'iRetr'); %!test %! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck'); %! assert (s, 2); %! assert (e, 10); %! assert (te{1}, [3, 7]); %! assert (m{1}, 'firetruck'); %! assert (t{1}{1}, 'iretr'); %!test %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*'); %! assert (s, [1, 12]); %! assert (e, [5, 17]); %! assert (size (te), [1, 2]); %! assert (isempty (te{1})); %! assert (isempty (te{2})); %! assert (m{1}, 'ShoRt'); %! assert (m{2}, 'String'); %! assert (size (t), [1, 2]); %! assert (isempty (t{1})); %! assert (isempty (t{2})); %!test %! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once'); %! assert (s, 1); %! assert (e, 5); %! assert (isempty (te)); %! assert (m, 'ShoRt'); %! assert (isempty (t)); %!test %! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s, 1); %! assert (e, 5); %! assert (isempty (te)); %! assert (m, 'ShoRt'); %! assert (isempty (t)); %!test %! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s, 1); %! assert (e, 10); %! assert (size (te), [1, 1]); %! assert (te{1}, [1,5; 7,10]); %! assert (m{1}, 'ShoRt Test'); %! assert (size (t), [1, 1]); %! assert (t{1}{1}, 'ShoRt'); %! assert (t{1}{2}, 'Test'); %! assert (size (nm), [1, 1]); %! assert (! isempty (fieldnames (nm))); %! assert (sort (fieldnames (nm)), {'word1';'word2'}); %! assert (nm.word1, 'ShoRt'); %! assert (nm.word2, 'Test'); %!test %! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s, 1); %! assert (e, 10); %! assert (size (te), [1, 1]); %! assert (te{1}, [1,5; 7,10]); %! assert (m{1}, 'ShoRt Test'); %! assert (size (t), [1, 1]); %! assert (t{1}{1}, 'ShoRt'); %! assert (t{1}{2}, 'Test'); %! assert (size (nm), [1, 1]); %! assert (! isempty (fieldnames (nm))); %! assert (sort (fieldnames (nm)), {'word1';'word2'}); %! assert (nm.word1, 'ShoRt'); %! assert (nm.word2, 'Test'); %!assert (regexpi ("abc\nabc", '.'), [1:7]) %!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7]) %!test %! assert (regexpi ("abc\nabc", '(?s).'), [1:7]); %! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]); %! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]); %!assert (regexpi ("caseCaSe", 'case'), [1, 5]) %!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1) %!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5]) %!test %! assert (regexpi ("caseCaSe", '(?-i)case'), 1); %! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]); %!assert (regexpi ("abc\nabc", 'C$'), 7) %!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7) %!test %! assert (regexpi ("abc\nabc", '(?-m)C$'), 7); %! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]); %! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]); %!assert (regexpi ("this word", 'S w'), 4) %!assert (regexpi ("this word", 'S w', 'literalspacing'), 4) %!test %! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4); %! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0)); %! assert (regexpi ("this word", '(?x)S w'), zeros (1,0)); %!error regexpi ('string', 'tri', 'BadArg') %!error regexpi ('string') %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'), {6;[1,5,9];zeros(1, 0)}) %!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'), {6, [1,5,9], zeros(1,0)}) %!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}), {6;[3,7];[1,9]}) %!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]}) %!assert (regexpi ("\n", '\n'), 1) %!assert (regexpi ("\n", "\n"), 1) */ static octave_value octregexprep (const octave_value_list& args, const std::string& who) { int nargin = args.length (); // Make sure we have string, pattern, replacement const std::string buffer = args(0).string_value (); std::string pattern = args(1).string_value (); // Rewrite pattern for PCRE pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ()); std::string replacement = args(2).string_value (); // Matlab compatibility. if (args(2).is_sq_string ()) replacement = do_regexp_rep_string_escapes (replacement); // Pack options excluding 'tokenize' and various output // reordering strings into regexp arg list octave_value_list regexpargs (nargin-3, octave_value ()); int len = 0; for (int i = 3; i < nargin; i++) { const std::string opt = args(i).string_value (); if (opt != "tokenize" && opt != "start" && opt != "end" && opt != "tokenextents" && opt != "match" && opt != "tokens" && opt != "names" && opt != "split" && opt != "warnings") { regexpargs(len++) = args(i); } } regexpargs.resize (len); regexp::opts options; bool extra_args = false; parse_options (options, regexpargs, who, 0, extra_args); return regexp::replace (pattern, buffer, replacement, options, who); } DEFUN (regexprep, args, , doc: /* -*- texinfo -*- @deftypefn {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}) @deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{}) Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}. The pattern is a regular expression as documented for @code{regexp}. @xref{XREFregexp,,@code{regexp}}. All strings must be UTF-8 encoded. The replacement string may contain @code{$i}, which substitutes for the ith set of parentheses in the match string. For example, @example regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1') @end example @noindent returns @qcode{"Dunn, Bill"} Options in addition to those of @code{regexp} are @table @samp @item once Replace only the first occurrence of @var{pat} in the result. @item warnings This option is present for compatibility but is ignored. @end table Implementation Note: For compatibility with @sc{matlab}, escape sequences in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded even when @var{pat} has been defined with single quotes. To disable expansion use a second backslash before the escape sequence (e.g., "@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate} function. @seealso{regexp, regexpi, strrep} @end deftypefn */) { if (args.length () < 3) print_usage (); octave_value_list retval; if (args(0).iscell () || args(1).iscell () || args(2).iscell ()) { Cell str, pat, rep; dim_vector dv0; dim_vector dv1 (1, 1); if (args(0).iscell ()) str = args(0).cell_value (); else str = Cell (args(0)); if (args(1).iscell ()) pat = args(1).cell_value (); else pat = Cell (args(1)); if (args(2).iscell ()) rep = args(2).cell_value (); else rep = Cell (args(2)); dv0 = str.dims (); if (pat.numel () != 1) { dv1 = pat.dims (); if (rep.numel () != 1 && dv1 != rep.dims ()) error ("regexprep: inconsistent cell array dimensions"); } else if (rep.numel () != 1) dv1 = rep.dims (); Cell ret (dv0); octave_value_list new_args = args; for (octave_idx_type i = 0; i < dv0.numel (); i++) { new_args(0) = str(i); if (pat.numel () == 1) new_args(1) = pat(0); if (rep.numel () == 1) new_args(2) = rep(0); for (octave_idx_type j = 0; j < dv1.numel (); j++) { if (pat.numel () != 1) new_args(1) = pat(j); if (rep.numel () != 1) new_args(2) = rep(j); new_args(0) = octregexprep (new_args, "regexprep"); } ret(i) = new_args(0); } retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0))); } else retval = octregexprep (args, "regexprep"); return retval; } /* %!test # Replace with empty %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; %! t = regexprep (xml, '<[!?][^>]*>', ''); %! assert (t, ' <tag v="hello">some stuff</tag>'); %!test # Replace with non-empty %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; %! t = regexprep (xml, '<[!?][^>]*>', '?'); %! assert (t, '? <tag v="hello">some stuff?</tag>'); %!test # Check that 'tokenize' is ignored %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; %! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize'); %! assert (t, ' <tag v="hello">some stuff</tag>'); ## Test capture replacement %!test %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins"; %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam"; %! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1'); %! assert (t, result); ## Return the original if no match %!assert (regexprep ('hello', 'world', 'earth'), 'hello') ## Test emptymatch option %!assert (regexprep ('World', '^', 'Hello '), 'World') %!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World') ## Test a general replacement %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g") ## Make sure replacements work at the beginning and end of string %!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g") %!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_") ## Test options "once" and "ignorecase" %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"), %! "a_b]c{d}e-f=g") %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"), %! "a_b_c_d_e_f_g") ## Option combinations %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"), %! "a_b]c{d}e-f=g") ## End conditions on replacement %!assert (regexprep ("abc", "(b)", ".$1"), "a.bc") %!assert (regexprep ("abc", "(b)", "$1"), "abc") %!assert (regexprep ("abc", "(b)", "$1."), "ab.c") %!assert (regexprep ("abc", "(b)", "$1.."), "ab..c") ## Test cell array arguments %!assert (regexprep ("abc", {"b","a"}, "?"), "??c") %!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"}) %!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"}) ## Nasty lookbehind expression %!test %! warning ("off", "Octave:regexp-lookbehind-limit", "local"); %! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)', %! '_minus1'),'x^(-1)+y_minus1+z_minus1=0'); ## Verify escape sequences in pattern %!assert (regexprep ("\n", '\n', "X"), "X") %!assert (regexprep ("\n", "\n", "X"), "X") ## Verify NULLs in pattern and replacement string %!assert (regexprep ("A\0A", "\0", ","), "A,A") %!assert (regexprep ("A\0A", '\0', ","), "A,A") %!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B") %!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B") ## Empty matches were broken on ARM architecture %!test <*52810> %! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"), "\nabc")) */ OCTAVE_NAMESPACE_END