Mercurial > jwe > octave
diff src/DLD-FUNCTIONS/regexp.cc @ 12462:e4dbfe3019b1
Use PCRE regular expressions throughout Octave.
author | Rik <octave@nomad.inbox5.com> |
---|---|
date | Sat, 19 Feb 2011 18:21:58 -0800 |
parents | 02669a1aa070 |
children | dfeea9cae79e |
line wrap: on
line diff
--- a/src/DLD-FUNCTIONS/regexp.cc Sat Feb 19 14:25:40 2011 -0800 +++ b/src/DLD-FUNCTIONS/regexp.cc Sat Feb 19 18:21:58 2011 -0800 @@ -885,9 +885,7 @@ @deftypefnx {Loadable Function} {[@dots{}] =} regexp (@var{str}, @var{pat}, \"@var{opt1}\", @dots{})\n\ Regular expression string matching. Search for @var{pat} in @var{str} and\n\ return the positions and substrings of any matches, or empty values if there\n\ -are none. Note, some features and extended options are only available when\n\ -Octave is compiled with support for Perl Compatible Regular Expressions\n\ -(PCRE).\n\ +are none.\n\ \n\ The matched pattern @var{pat} can include any of the standard regex\n\ operators, including:\n\ @@ -924,11 +922,9 @@ and \"]\". If the first character is \"^\" then the pattern is inverted and\n\ any character except those listed between brackets will match.\n\ \n\ -With PCRE support, escape sequences defined below can be used inside list\n\ +Escape sequences defined below can also be used inside list\n\ operators. For example, a template for a floating point number might be\n\ -@code{[-+.\\d]+}. POSIX regular expressions do not use escape sequences\n\ -and any backslash @samp{\\} will be interpreted literally as one\n\ -of the list of characters to match.\n\ +@code{[-+.\\d]+}.\n\ \n\ @item ()\n\ Grouping operator\n\ @@ -975,14 +971,8 @@ @item \\d\n\ Match any digit\n\ \n\ -This sequence is only available with PCRE support. For POSIX regular\n\ -expressions use the following list operator @code{[0-9]}.\n\ -\n\ @item \\D\n\ Match any non-digit\n\ -\n\ -This sequence is only available with PCRE support. For POSIX regular\n\ -expressions use the following list operator @code{[^0-9]}.\n\ @end table\n\ \n\ The outputs of @code{regexp} default to the order given below\n\ @@ -1007,7 +997,7 @@ @item nm\n\ A structure containing the text of each matched named token, with the name\n\ being used as the fieldname. A named token is denoted by\n\ -@code{(?<name>@dots{})} and is only available with PCRE support.\n\ +@code{(?<name>@dots{})}.\n\ @end table\n\ \n\ Particular output arguments, or the order of the output arguments, can be\n\ @@ -1033,49 +1023,46 @@ @item matchcase\n\ Make the matching case sensitive. (default)\n\ \n\ -Alternatively, use (?-i) in the pattern when PCRE is available.\n\ +Alternatively, use (?-i) in the pattern.\n\ \n\ @item ignorecase\n\ Ignore case when matching the pattern to the string.\n\ \n\ -Alternatively, use (?i) in the pattern when PCRE is available.\n\ +Alternatively, use (?i) in the pattern.\n\ \n\ @item stringanchors\n\ Match the anchor characters at the beginning and end of the string. \n\ (default)\n\ \n\ -Alternatively, use (?-m) in the pattern when PCRE is available.\n\ +Alternatively, use (?-m) in the pattern.\n\ \n\ @item lineanchors\n\ Match the anchor characters at the beginning and end of the line.\n\ -Only available when Octave is compiled with PCRE.\n\ \n\ -Alternatively, use (?m) in the pattern when PCRE is available.\n\ +Alternatively, use (?m) in the pattern.\n\ \n\ @item dotall\n\ The pattern @code{.} matches all characters including the newline character.\n\ (default)\n\ \n\ -Alternatively, use (?s) in the pattern when PCRE is available.\n\ +Alternatively, use (?s) in the pattern.\n\ \n\ @item dotexceptnewline\n\ The pattern @code{.} matches all characters except the newline character.\n\ -Only available when Octave is compiled with PCRE.\n\ \n\ -Alternatively, use (?-s) in the pattern when PCRE is available.\n\ +Alternatively, use (?-s) in the pattern.\n\ \n\ @item literalspacing\n\ All characters in the pattern, including whitespace, are significant and are\n\ used in pattern matching. (default)\n\ \n\ -Alternatively, use (?-x) in the pattern when PCRE is available.\n\ +Alternatively, use (?-x) in the pattern.\n\ \n\ @item freespacing\n\ The pattern may include arbitrary whitespace and also comments beginning with\n\ the character @samp{#}.\n\ -Only available when Octave is compiled with PCRE.\n\ \n\ -Alternatively, use (?x) in the pattern when PCRE is available.\n\ +Alternatively, use (?x) in the pattern.\n\ \n\ @end table\n\ @seealso{regexpi, strfind, regexprep}\n\ @@ -1168,7 +1155,7 @@ %! assert (m,'short') %! assert (isempty(t)) -%!testif HAVE_PCRE +%!test %! [s, e, te, m, t, nm] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s,1) %! assert (e,10) @@ -1184,7 +1171,7 @@ %! assert (nm.word1,'short') %! assert (nm.word2,'test') -%!testif HAVE_PCRE +%!test %! [nm, m, te, e, s, t] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,10) @@ -1200,7 +1187,7 @@ %! assert (nm.word1,'short') %! assert (nm.word2,'test') -%!testif HAVE_PCRE +%!test %! [t, nm] = regexp("John Davis\nRogers, James",'(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)','tokens','names'); %! assert (size(t), [1,2]); %! assert (t{1}{1},'John'); @@ -1213,13 +1200,13 @@ %! assert (nm.last{1},'Davis'); %! assert (nm.last{2},'Rogers'); -%!testif HAVE_PCRE +%!test %! # Parenthesis in named token (ie (int)) causes a problem %! assert (regexp('qwe int asd', ['(?<typestr>(int))'], 'names'), struct ('typestr', 'int')); %!assert(regexp("abc\nabc",'.'),[1:7]) %!assert(regexp("abc\nabc",'.','dotall'),[1:7]) -%!testif HAVE_PCRE +%!test %! assert(regexp("abc\nabc",'(?s).'),[1:7]) %! assert(regexp("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) %! assert(regexp("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) @@ -1227,20 +1214,20 @@ %!assert(regexp("caseCaSe",'case'),1) %!assert(regexp("caseCaSe",'case',"matchcase"),1) %!assert(regexp("caseCaSe",'case',"ignorecase"),[1,5]) -%!testif HAVE_PCRE +%!test %! assert(regexp("caseCaSe",'(?-i)case'),1) %! assert(regexp("caseCaSe",'(?i)case'),[1,5]) %!assert (regexp("abc\nabc",'c$'),7) %!assert (regexp("abc\nabc",'c$',"stringanchors"),7) -%!testif HAVE_PCRE +%!test %! assert (regexp("abc\nabc",'(?-m)c$'),7) %! assert (regexp("abc\nabc",'c$',"lineanchors"),[3,7]) %! assert (regexp("abc\nabc",'(?m)c$'),[3,7]) %!assert (regexp("this word",'s w'),4) %!assert (regexp("this word",'s w','literalspacing'),4) -%!testif HAVE_PCRE +%!test %! assert (regexp("this word",'(?-x)s w','literalspacing'),4) %! assert (regexp("this word",'s w','freespacing'),zeros(1,0)) %! assert (regexp("this word",'(?x)s w'),zeros(1,0)) @@ -1254,7 +1241,7 @@ %!assert(regexp('Strings',{'t','s'}),{2,7}) ## Test case for lookaround operators -%!testif HAVE_PCRE +%!test %! assert(regexp('Iraq','q(?!u)'),4) %! assert(regexp('quit','q(?!u)'), zeros(1,0)) %! assert(regexp('quit','q(?=u)','match'), {'q'}) @@ -1353,7 +1340,7 @@ %! assert (m,'ShoRt') %! assert (isempty(t)) -%!testif HAVE_PCRE +%!test %! [s, e, te, m, t, nm] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s,1) %! assert (e,10) @@ -1369,7 +1356,7 @@ %! assert (nm.word1,'ShoRt') %! assert (nm.word2,'Test') -%!testif HAVE_PCRE +%!test %! [nm, m, te, e, s, t] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,10) @@ -1387,7 +1374,7 @@ %!assert(regexpi("abc\nabc",'.'),[1:7]) %!assert(regexpi("abc\nabc",'.','dotall'),[1:7]) -%!testif HAVE_PCRE +%!test %! assert(regexpi("abc\nabc",'(?s).'),[1:7]) %! assert(regexpi("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) %! assert(regexpi("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) @@ -1395,20 +1382,20 @@ %!assert(regexpi("caseCaSe",'case'),[1,5]) %!assert(regexpi("caseCaSe",'case',"matchcase"),1) %!assert(regexpi("caseCaSe",'case',"ignorecase"),[1,5]) -%!testif HAVE_PCRE +%!test %! assert(regexpi("caseCaSe",'(?-i)case'),1) %! assert(regexpi("caseCaSe",'(?i)case'),[1,5]) %!assert (regexpi("abc\nabc",'C$'),7) %!assert (regexpi("abc\nabc",'C$',"stringanchors"),7) -%!testif HAVE_PCRE +%!test %! assert (regexpi("abc\nabc",'(?-m)C$'),7) %! assert (regexpi("abc\nabc",'C$',"lineanchors"),[3,7]) %! assert (regexpi("abc\nabc",'(?m)C$'),[3,7]) %!assert (regexpi("this word",'S w'),4) %!assert (regexpi("this word",'S w','literalspacing'),4) -%!testif HAVE_PCRE +%!test %! assert (regexpi("this word",'(?-x)S w','literalspacing'),4) %! assert (regexpi("this word",'S w','freespacing'),zeros(1,0)) %! assert (regexpi("this word",'(?x)S w'),zeros(1,0)) @@ -1746,7 +1733,7 @@ %! assert(t,' <tag v="hello">some stuff</tag>') ## Test capture replacement -%!testif HAVE_PCRE +%!test %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins"; %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam"; %! t = regexprep(data,'(?m)^(\w+)\s+(\w+)$','$2, $1'); @@ -1781,7 +1768,7 @@ %!assert(regexprep({"abc","cba"},{"b","a"},{"?","!"}),{"!?c","c?!"}) # Nasty lookbehind expression -%!testif HAVE_PCRE +%!test %! assert(regexprep('x^(-1)+y(-1)+z(-1)=0','(?<=[a-z]+)\(\-[1-9]*\)','_minus1'),'x^(-1)+y_minus1+z_minus1=0') */