changeset 14536:6d5c951ec520

Add 'emptymatch', 'noemptymatch' options to regular expressions. * NEWS: Announce new options. * liboctave/regexp.cc (regexp::match): Add processing option for zero length matches. * liboctave/regexp.h (class opts): Add emptymatch option to constructors, setter/getter routines, private variable. * DLD-FUNCTIONS/regexp.cc (parse_options): Add emptymatch to options parsing routine. * DLD-FUNCTIONS/regexp.cc (octregexp): Ignore emptymatch when determing output ordering of arguments. * DLD-FUNCTIONS/regexp.cc (Fregexp): Add new options to docstring. Add %!tests for new behavior. * DLD-FUNCTIONS/regexp.cc (Fregexprep): Add %!tests for new behavior.
author Rik <octave@nomad.inbox5.com>
date Sat, 07 Apr 2012 09:43:53 -0700
parents 8150ccfffa22
children f8e041f11b18
files NEWS liboctave/regexp.cc liboctave/regexp.h src/DLD-FUNCTIONS/regexp.cc
diffstat 4 files changed, 95 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS	Fri Apr 06 19:21:16 2012 -0400
+++ b/NEWS	Sat Apr 07 09:43:53 2012 -0700
@@ -1,6 +1,18 @@
 Summary of important user-visible changes for version 3.8:
 ---------------------------------------------------------
 
+ ** 'emptymatch', 'noemptymatch' options added to regular expressions.
+
+    With this addition Octave now accepts the entire set of Matlab options
+    for regular expressions.  'noemptymatch' is the default, but 'emptymatch'
+    has certain uses where you need to match an assertion rather than actual
+    characters.  For example, 
+
+    regexprep ('World', '^', 'Hello ', 'emptymatch')
+      => Hello World
+
+    where the pattern is actually the assertion '^' or start-of-line.
+
  ** Redundant terminal comma accepted by parser
 
     A redundant terminal comma is now accepted in matrix
--- a/liboctave/regexp.cc	Fri Apr 06 19:21:16 2012 -0400
+++ b/liboctave/regexp.cc	Sat Apr 07 09:43:53 2012 -0700
@@ -315,9 +315,9 @@
         }
       else if (matches == PCRE_ERROR_NOMATCH)
         break;
-      else if (ovector[1] <= ovector[0])
+      else if (ovector[1] <= ovector[0] && ! options.emptymatch ())
         {
-          // Zero sized match.  Skip to next char.
+          // Zero length match.  Skip to next char.
           idx = ovector[0] + 1;
           if (idx < buffer.length ())
             continue;
@@ -400,7 +400,16 @@
           regexp::match_element new_elem (named_tokens, tokens, match_string,
                                           token_extents, start, end);
           lst.push_back (new_elem);
-          idx = ovector[1];
+
+          if (ovector[1] <= ovector[0])
+          {
+            // Zero length match.  Skip to next char.
+            idx = ovector[0] + 1;
+            if (idx <= buffer.length ())
+              continue;
+          }
+          else 
+            idx = ovector[1];
 
           if (options.once () || idx >= buffer.length ())
             break;
--- a/liboctave/regexp.h	Fri Apr 06 19:21:16 2012 -0400
+++ b/liboctave/regexp.h	Sat Apr 07 09:43:53 2012 -0700
@@ -95,11 +95,13 @@
 
     opts (void)
       : x_case_insensitive (false), x_dotexceptnewline (false),
-        x_freespacing (false), x_lineanchors (false), x_once (false) { }
+        x_emptymatch (false), x_freespacing (false), x_lineanchors (false),
+        x_once (false) { }
 
     opts (const opts& o)
       : x_case_insensitive (o.x_case_insensitive),
         x_dotexceptnewline (o.x_dotexceptnewline),
+        x_emptymatch (o.x_emptymatch),
         x_freespacing (o.x_freespacing),
         x_lineanchors (o.x_lineanchors),
         x_once (o.x_once)
@@ -111,6 +113,7 @@
         {
           x_case_insensitive = o.x_case_insensitive;
           x_dotexceptnewline = o.x_dotexceptnewline;
+          x_emptymatch = o.x_emptymatch;
           x_freespacing = o.x_freespacing;
           x_lineanchors = o.x_lineanchors;
           x_once = o.x_once;
@@ -123,12 +126,14 @@
 
     void case_insensitive (bool val) { x_case_insensitive = val; }
     void dotexceptnewline (bool val) { x_dotexceptnewline = val; }
+    void emptymatch (bool val) { x_emptymatch = val; }
     void freespacing (bool val) { x_freespacing = val; }
     void lineanchors (bool val) { x_lineanchors = val; }
     void once (bool val) { x_once = val; }
 
     bool case_insensitive (void) const { return x_case_insensitive; }
     bool dotexceptnewline (void) const { return x_dotexceptnewline; }
+    bool emptymatch (void) const { return x_emptymatch; }
     bool freespacing (void) const { return x_freespacing; }
     bool lineanchors (void) const { return x_lineanchors; }
     bool once (void) const { return x_once; }
@@ -137,6 +142,7 @@
 
     bool x_case_insensitive;
     bool x_dotexceptnewline;
+    bool x_emptymatch;
     bool x_freespacing;
     bool x_lineanchors;
     bool x_once;
--- a/src/DLD-FUNCTIONS/regexp.cc	Fri Apr 06 19:21:16 2012 -0400
+++ b/src/DLD-FUNCTIONS/regexp.cc	Sat Apr 07 09:43:53 2012 -0700
@@ -77,12 +77,16 @@
         options.lineanchors (false);
       else if (str.find ("literalspacing", 0) == 0)
         options.freespacing (false);
+      else if (str.find ("noemptymatch", 0) == 0)
+        options.emptymatch (false);
       else if (str.find ("dotexceptnewline", 0) == 0)
         options.dotexceptnewline (true);
       else if (str.find ("lineanchors", 0) == 0)
         options.lineanchors (true);
       else if (str.find ("freespacing", 0) == 0)
         options.freespacing (true);
+      else if (str.find ("emptymatch", 0) == 0)
+        options.emptymatch (true);
       else if (str.find ("start", 0) == 0
                || str.find ("end", 0) == 0
                || str.find ("tokenextents", 0) == 0
@@ -257,7 +261,9 @@
                   || str.find ("dotall", 0) == 0
                   || str.find ("dotexceptnewline", 0) == 0
                   || str.find ("literalspacing", 0) == 0
-                  || str.find ("freespacing", 0) == 0)
+                  || str.find ("freespacing", 0) == 0
+                  || str.find ("noemptymatch", 0) == 0
+                  || str.find ("emptymatch", 0) == 0)
                 continue;
               else if (str.find ("start", 0) == 0)
                 k = 0;
@@ -488,8 +494,8 @@
 operators.  For example, a template for a floating point number might be\n\
 @code{[-+.\\d]+}.\n\
 \n\
-@item ()\n\
-Grouping operator\n\
+@item () (?:)\n\
+Grouping operator.  The first form, parentheses only, also creates a token.\n\
 \n\
 @item |\n\
 Alternation operator.  Match one of a choice of regular expressions.  The\n\
@@ -562,7 +568,8 @@
 @code{(?<name>@dots{})}.\n\
 \n\
 @item sp\n\
-A cell array of the text not returned by match.\n\
+A cell array of the text not returned by match, i.e., what remains if you\n\
+split the string based on @var{pat}.\n\
 @end table\n\
 \n\
 Particular output arguments, or the order of the output arguments, can be\n\
@@ -630,6 +637,15 @@
 \n\
 Alternatively, use (?x) in the pattern.\n\
 \n\
+@item noemptymatch\n\
+Zero-length matches are not returned.  (default)\n\
+\n\
+@item emptymatch\n\
+Return zero-length matches.\n\
+\n\
+@code{regexp ('a', 'b*', 'emptymatch'} returns @code{[1 2]} because there are\n\
+zero or more 'b' characters at positions 1 and end-of-string.\n\
+\n\
 @end table\n\
 @seealso{regexpi, strfind, regexprep}\n\
 @end deftypefn")
@@ -810,6 +826,46 @@
 %! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
 %! assert (regexp ("this word", '(?x)s w'), zeros (1,0));
 
+%!test
+%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
+%! assert (s, [1 5]);
+%! assert (e, [3 5]);
+%! assert (te, { zeros(0,2), zeros(0,2) });
+%! assert (m, { "OCT", "V" });
+%! assert (t, { cell(1,0), cell(1,0) });
+%! assert (isempty (fieldnames (nm)));
+%! assert (sp, { "", "A", "E" });
+
+%!test
+%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
+%! assert (s, [1 5]);
+%! assert (e, [3 5]);
+%! assert (te, { [1 3], [5 5] });
+%! assert (m, { "OCT", "V" });
+%! assert (t, { {"OCT"}, {"V"} });
+%! assert (isempty (fieldnames (nm)));
+%! assert (sp, { "", "A", "E" });
+
+%!test
+%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
+%! assert (s, [1 4 5 6 7]);
+%! assert (e, [3 3 5 5 6]);
+%! assert (te, repmat ({zeros(0,2)}, [1, 5]));
+%! assert (m, { "OCT", "", "V", "", "" });
+%! assert (t, repmat({cell(1,0)}, [1, 5]));
+%! assert (isempty (fieldnames (nm)));
+%! assert (sp, { "", "", "A", "", "E", "" });
+
+%!test
+%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
+%! assert (s, [1 4 5 6 7]);
+%! assert (e, [3 3 5 5 6]);
+%! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
+%! assert (m, { "OCT", "", "V", "", "" });
+%! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
+%! assert (isempty (fieldnames (nm)));
+%! assert (sp, { "", "", "A", "", "E", "" });
+
 %!error regexp ('string', 'tri', 'BadArg')
 %!error regexp ('string')
 
@@ -1213,6 +1269,10 @@
 ## Return the original if no match
 %!assert (regexprep ('hello', 'world', 'earth'), 'hello')
 
+## Test emptymatch
+%!assert (regexprep ('World', '^', 'Hello '), 'World')
+%!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')
+
 ## Test a general replacement
 %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")