diff libinterp/corefcn/regexp.cc @ 29350:8f0d0d4690c0

Change regexp start-of-word/end-of-word behavior for Matlab compatibility (bug #59992). * regexp.cc (do_regexp_ptn_string_escapes): Resize retval to length of input string rather than "length + 0" (random, unrelated change). Translate start-of-word ('\<') to positive look-behind pattern '(?<=\W|^)'. Translate end-of-word ('\>') to positive look-ahead pattern '(?=\W|$)'. Add BIST test for bug #59992.
author Rik <rik@octave.org>
date Tue, 02 Feb 2021 14:50:47 -0800
parents 10a35049bad7
children 7854d5752dd2
line wrap: on
line diff
--- a/libinterp/corefcn/regexp.cc	Tue Feb 02 16:40:33 2021 -0500
+++ b/libinterp/corefcn/regexp.cc	Tue Feb 02 14:50:47 2021 -0800
@@ -60,7 +60,7 @@
   size_t j = 0;
   size_t len = s.length ();
 
-  retval.resize (len+i);
+  retval.resize (len);
 
   while (j < len)
     {
@@ -79,11 +79,15 @@
                 }
               break;
 
-            // Translate \< and \> to PCRE word boundary
+            // Translate \< and \> to PCRE patterns for pseudo-word boundary
             case '<': // begin word boundary
+              retval.insert (i, "(?<=\\W|^)");
+              i += 8;
+              break;
+
             case '>': // end word boundary
-              retval[i] = '\\';
-              retval[++i] = 'b';
+              retval.insert (i, "(?=\\W|$)");
+              i += 7;
               break;
 
             case 'o': // octal input
@@ -1178,12 +1182,19 @@
 %!assert (regexp ("\n", '\n'), 1)
 %!assert (regexp ("\n", "\n"), 1)
 
-# Test escape sequences are silently converted
+## Test escape sequences are silently converted
 %!test <*45407>
 %! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
 %! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
 %! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');
 
+## Test start-of-word / end-of-word patterns for Matlab compatibility
+%!test <*59992>
+%! assert (regexp ('foo!+bar', '\<\w'), [1, 6]);
+%! assert (regexp ('foo!+bar', '.\>'), [3, 4, 8]);
+%! assert (regexp ('foo!+bar\nbar!+foo', '.\>'), [3, 4, 8, 13, 14, 18]);
+%! assert (regexp ('foo!+bar\nbar!+foo', '\<\w'), [1, 6, 10, 16]);
+
 ## Test input validation
 %!error regexp ('string', 'tri', 'BadArg')
 %!error regexp ('string')