diff libinterp/corefcn/regexp.cc @ 15541:9db32cabeacf

Fix backslash handling in regexp pattern (Bug #37092) * NEWS: Give an example of how escape sequence processing in single-quoted regular expressions works. * libinterp/corefcn/regexp.cc(do_regexp_string_escapes): Rename to do_regexp_ptn_string_escapes. The only sequence to expand is '\b' for backspace; the others are handled by PCRE. * libinterp/corefcn/regexp.cc(do_regexp_rep_string_escapes): New function to do escape sequence processing for the replacement string, since the sequences to expand differ from those of the regexp pattern. * liboctave/util/regexp.cc(regexp::replace): Process backslashes in the replacement string so that '\$1' results in a literal '$1' rather than substitution of the first capture buffer.
author Rik <rik@octave.org>
date Wed, 17 Oct 2012 20:13:19 -0700
parents 2fc554ffbc28
children 7eff3032d144
line wrap: on
line diff
--- a/libinterp/corefcn/regexp.cc	Wed Oct 17 15:56:33 2012 -0700
+++ b/libinterp/corefcn/regexp.cc	Wed Oct 17 20:13:19 2012 -0700
@@ -45,12 +45,12 @@
 #include "utils.h"
 
 // Replace backslash escapes in a string with the real values.  We need
-// this special function instead of the one in utils.cc because the set
-// of escape sequences used in regexps is different from those used in
-// the *printf functions.
+// two special functions instead of the one in utils.cc because the set
+// of escape sequences used for regexp patterns and replacement strings
+// is different from those used in the *printf functions.
 
 static std::string
-do_regexp_string_escapes (const std::string& s)
+do_regexp_ptn_string_escapes (const std::string& s)
 {
   std::string retval;
 
@@ -66,11 +66,56 @@
         {
           switch (s[++j])
             {
-            case '$':
-              retval[i] = '$';
+            case 'b': // backspace
+              retval[i] = '\b';
               break;
 
-            case 'a':
+#if 0
+// FIXME : To be complete, we need to handle \oN, \o{N}.
+//         The PCRE library already handles \N where N
+//         is an octal number.  New code needs to merely
+//         replace \oN or \o{N} with \N.
+            case 'o': // octal number
+#endif
+
+            default:  // pass escape sequence through
+              retval[i] = '\\';
+              retval[++i] = s[j];
+              break;
+            }
+        }
+      else
+        {
+          retval[i] = s[j];
+        }
+
+      i++;
+      j++;
+    }
+
+  retval.resize (i);
+
+  return retval;
+}
+
+static std::string
+do_regexp_rep_string_escapes (const std::string& s)
+{
+  std::string retval;
+
+  size_t i = 0;
+  size_t j = 0;
+  size_t len = s.length ();
+
+  retval.resize (len);
+
+  while (j < len)
+    {
+      if (s[j] == '\\' && j+1 < len)
+        {
+          switch (s[++j])
+            {
+            case 'a': // alarm
               retval[i] = '\a';
               break;
 
@@ -98,10 +143,6 @@
               retval[i] = '\v';
               break;
 
-            case '\\': // backslash
-              retval[i] = '\\';
-              break;
-
 #if 0
 // FIXME -- to be complete, we need to handle \oN, \o{N}, \xN, and
 // \x{N}.  Hex digits may be upper or lower case.  Brackets are
@@ -110,8 +151,8 @@
             case 'o': // octal number
             case 'x': // hex number
 #endif
-
-            default:
+ 
+            default:  // pass escape sequence through
               retval[i] = '\\';
               retval[++i] = s[j];
               break;
@@ -205,7 +246,7 @@
     return retval;
   // Matlab compatibility.
   if (args(1).is_sq_string ())
-    pattern = do_regexp_string_escapes (pattern);
+    pattern = do_regexp_ptn_string_escapes (pattern);
 
   regexp::opts options;
   options.case_insensitive (case_insensitive);
@@ -1196,14 +1237,14 @@
     return retval;
   // Matlab compatibility.
   if (args(1).is_sq_string ())
-    pattern = do_regexp_string_escapes (pattern);
+    pattern = do_regexp_ptn_string_escapes (pattern);
 
   std::string replacement = args(2).string_value ();
   if (error_state)
     return retval;
   // Matlab compatibility.
   if (args(2).is_sq_string ())
-    replacement = do_regexp_string_escapes (replacement);
+    replacement = do_regexp_rep_string_escapes (replacement);
 
   // Pack options excluding 'tokenize' and various output
   // reordering strings into regexp arg list