changeset 5779:12eeebfa7ead

[project @ 2006-04-27 19:30:14 by dbateman]
author dbateman
date Thu, 27 Apr 2006 19:30:14 +0000
parents 70f67d85558d
children cbf717bf8150
files src/ChangeLog src/DLD-FUNCTIONS/regexp.cc
diffstat 2 files changed, 172 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/src/ChangeLog	Thu Apr 27 01:34:08 2006 +0000
+++ b/src/ChangeLog	Thu Apr 27 19:30:14 2006 +0000
@@ -1,3 +1,8 @@
+2006-04-27  David Bateman  <dbateman@free.fr>
+
+	* DLD-FUNCTIONS/regexp.cc (octregexp): Fix for infinite loop in
+	regexp. Include new regexp arguments and associated tests.
+
 2006-04-26  Bill Denney  <denney@seas.upenn.edu>
 
  	* pager.cc (Fterminal_size): Add list_in_columns to @seealso.
--- a/src/DLD-FUNCTIONS/regexp.cc	Thu Apr 27 01:34:08 2006 +0000
+++ b/src/DLD-FUNCTIONS/regexp.cc	Thu Apr 27 19:30:14 2006 +0000
@@ -60,6 +60,9 @@
   int nargin = args.length();
   int nopts = nargin - 2;
   bool once = false;
+  bool lineanchors = false;
+  bool dotexceptnewline = false;
+  bool freespacing = false;
 
   if (nargin < 2)
     {
@@ -95,13 +98,57 @@
 	  once = true;
 	  nopts--;
 	}
-#ifdef HAVE_PCRE
+#if HAVE_PCRE
+      // Only accept these options with pcre
+      else if (str.find("dotall", 0) == 0)
+	{
+	  dotexceptnewline = false;
+	  nopts--;
+	}
+      else if (str.find("dotexceptnewline", 0) == 0)
+	{
+	  dotexceptnewline = true;
+	  nopts--;
+	}
+      else if (str.find("stringanchors", 0) == 0)
+	{
+	  lineanchors = false;
+	  nopts--;
+	}
+      else if (str.find("lineanchors", 0) == 0)
+	{
+	  lineanchors = true;
+	  nopts--;
+	}
+      else if (str.find("matchcase", 0) == 0)
+	{
+	  case_insensitive = false;
+	  nopts--;
+	}
+      else if (str.find("ignorecase", 0) == 0)
+	{
+	  case_insensitive = true;
+	  nopts--;
+	}
+      else if (str.find("freespacing", 0) == 0)
+	{
+	  freespacing = true;
+	  nopts--;
+	}
+      else if (str.find("literalspacing", 0) == 0)
+	{
+	  freespacing = false;
+	  nopts--;
+	}
       else if (str.find("start", 0) && str.find("end", 0) &&
 	       str.find("tokenextents", 0) && str.find("match", 0) &&
 	       str.find("tokens", 0) && str.find("names", 0))
 	error ("%s: unrecognized option", nm.c_str());
 #else
-      else if (str.find("names", 0) == 0)
+      else if (str.find("names", 0) == 0 ||
+	       str.find("dotexceptnewline", 0) == 0 ||
+	       str.find("lineanchors", 0) == 0 ||
+	       str.find("freespacing", 0) == 0)
 	error ("%s: named tokens not implemented in this version", nm.c_str());
       else if (str.find("start", 0) && str.find("end", 0) &&
 	       str.find("tokenextents", 0) && str.find("match", 0) &&
@@ -133,6 +180,7 @@
       std::ostringstream buf;
       Array<int> named_idx;
 
+      // Add mode flags
       while ((new_pos = pattern.find ("(?<",pos)) != NPOS)
 	{
 	  size_t tmp_pos = pattern.find_first_of ('>',new_pos);
@@ -184,7 +232,10 @@
       int erroffset;
       std::string buf_str = buf.str ();
       re = pcre_compile (buf_str.c_str (),
-			 (case_insensitive ? PCRE_CASELESS : 0),
+			 (case_insensitive ? PCRE_CASELESS : 0) |
+			 (dotexceptnewline ? 0 : PCRE_DOTALL) |
+			 (lineanchors ? PCRE_MULTILINE : 0) |
+			 (freespacing ? PCRE_EXTENDED : 0),
 			 &err, &erroffset, NULL);
     
       if (re == NULL) {
@@ -212,8 +263,8 @@
 	{
 	  // Index of subpattern in first two bytes MSB first of name.
 	  // Extract index.
-	  nidx[i] = ((int)nametable[i*nameentrysize]) << 8 |
-	    (int)nametable[i*nameentrysize+1];
+	  nidx[i] = (static_cast<int>(nametable[i*nameentrysize])) << 8 |
+	    static_cast<int>(nametable[i*nameentrysize+1]);
 	}
 
       Cell named_tokens(dim_vector(nnames,1));
@@ -233,8 +284,13 @@
 	    }
 	  else if (matches == PCRE_ERROR_NOMATCH)
 	    break;
+	  else if (ovector[1] <= ovector[0])
+	    break;
 	  else
 	    {
+	      // FIXME Should collect arguments in a linked structure and
+	      // resize and assign the return value a single time to make
+	      // this function O(n) rather than O(n^2) as it currently is.
 	      int pos_match = 0;
 	      s.resize (dim_vector(1, sz+1));
 	      s(sz) = double (ovector[0]+1);
@@ -400,7 +456,18 @@
 	      int k = 0;
 	      std::string str = args(i).string_value();
 	      std::transform (str.begin (), str.end (), str.begin (), tolower);
-	      if (str.find("once", 0) == 0)
+	      if (str.find("once", 0) == 0
+#if HAVE_PCRE
+		  || str.find("stringanchors", 0) == 0
+		  || str.find("lineanchors", 0) == 0
+		  || str.find("matchcase", 0) == 0
+		  || str.find("ignorecase", 0) == 0
+		  || str.find("dotall", 0) == 0
+		  || str.find("dotexceptnewline", 0) == 0
+		  || str.find("literalspacing", 0) == 0
+		  || str.find("freespacing", 0) == 0
+#endif
+	      )
 		continue;
 	      else if (str.find("start", 0) == 0)
 		k = 0;
@@ -437,7 +504,7 @@
     }
 
 #else
-  error ("%s: not available in this version of Octave", nm);
+  error ("%s: not available in this version of Octave", nm.c_str());
 #endif
   return retval;
 }
@@ -551,7 +618,27 @@
 @end multitable\n\
 \n\
 A further optional argument is 'once', that limits the number of returned\n\
-matches to the first match.\n\
+matches to the first match. Additional arguments are\n\
+\n\
+@table @asis\n\
+@item matchcase\n\
+Make the matching case sensitive.\n\
+@item ignorecase\n\
+Make the matching case insensitive.\n\
+@item stringanchors\n\
+Match the anchor characters at the beginning and end of the string.\n\
+@item lineanchors\n\
+Match the anchor characters at the beginning and end of the line.\n\
+@item dotall\n\
+The character @code{.} matches the newline character.\n\
+@item dotexceptnewline\n\
+The character @code{.} matches all but the newline character.\n\
+@item freespacing\n\
+The pattern can include arbitrary whitespace and comments starting with\n\
+@code{#}.\n\
+@item literalspacing\n\
+The pattern is taken literally.\n\
+@end table\n\
 @end deftypefn")
 {
   return octregexp (args, nargout, "regexp", false);
@@ -678,6 +765,42 @@
 %!   assert (nm.last{2},'Rogers');
 %! endif
 
+%!assert(regexp("abc\nabc",'.'),[1:7])
+%!assert(regexp("abc\nabc",'.','dotall'),[1:7])
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert(regexp("abc\nabc",'(?s).'),[1:7])
+%!   assert(regexp("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7])
+%!   assert(regexp("abc\nabc",'(?-s).'),[1,2,3,5,6,7])
+%! endif
+
+%!assert(regexp("caseCaSe",'case'),1)
+%!assert(regexp("caseCaSe",'case',"matchcase"),1)
+%!assert(regexp("caseCaSe",'case',"ignorecase"),[1,5])
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert(regexp("caseCaSe",'(?-i)case'),1)
+%!   assert(regexp("caseCaSe",'(?i)case'),[1,5])
+%! endif
+
+%!assert (regexp("abc\nabc",'c$'),7)
+%!assert (regexp("abc\nabc",'c$',"stringanchors"),7)
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert (regexp("abc\nabc",'(?-m)c$'),7)
+%!   assert (regexp("abc\nabc",'c$',"lineanchors"),[3,7])
+%!   assert (regexp("abc\nabc",'(?m)c$'),[3,7])
+%! endif
+
+%!assert (regexp("this word",'s w'),4)
+%!assert (regexp("this word",'s w','literalspacing'),4)
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert (regexp("this word",'(?-x)s w','literalspacing'),4)
+%!   assert (regexp("this word",'s w','freespacing'),[])
+%!   assert (regexp("this word",'(?x)s w'),[])
+%! endif
+
 %!error regexp('string', 'tri', 'BadArg');
 %!error regexp('string');
 
@@ -801,6 +924,42 @@
 %!   assert (nm.word2,'Test')
 %! endif
 
+%!assert(regexpi("abc\nabc",'.'),[1:7])
+%!assert(regexpi("abc\nabc",'.','dotall'),[1:7])
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert(regexpi("abc\nabc",'(?s).'),[1:7])
+%!   assert(regexpi("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7])
+%!   assert(regexpi("abc\nabc",'(?-s).'),[1,2,3,5,6,7])
+%! endif
+
+%!assert(regexpi("caseCaSe",'case'),[1,5])
+%!assert(regexpi("caseCaSe",'case',"matchcase"),1)
+%!assert(regexpi("caseCaSe",'case',"ignorecase"),[1,5])
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert(regexpi("caseCaSe",'(?-i)case'),1)
+%!   assert(regexpi("caseCaSe",'(?i)case'),[1,5])
+%! endif
+
+%!assert (regexpi("abc\nabc",'c$'),7)
+%!assert (regexpi("abc\nabc",'c$',"stringanchors"),7)
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert (regexpi("abc\nabc",'(?-m)c$'),7)
+%!   assert (regexpi("abc\nabc",'c$',"lineanchors"),[3,7])
+%!   assert (regexpi("abc\nabc",'(?m)c$'),[3,7])
+%! endif
+
+%!assert (regexpi("this word",'s w'),4)
+%!assert (regexpi("this word",'s w','literalspacing'),4)
+%!test
+%! if (!isempty(findstr(octave_config_info ("DEFS"),"HAVE_PCRE")))
+%!   assert (regexpi("this word",'(?-x)s w','literalspacing'),4)
+%!   assert (regexpi("this word",'s w','freespacing'),[])
+%!   assert (regexpi("this word",'(?x)s w'),[])
+%! endif
+
 %!error regexpi('string', 'tri', 'BadArg');
 %!error regexpi('string');