changeset 20246:f2bc7d23295d

Add special hex/octal escape sequence processing for regexp. * NEWS: Announce change done for Matlab compatibility. * regexp.cc (do_regexp_ptn_string_escapes): Add case for parsing '\oNNN' or '\o{NNN}' octal sequences. PCRE already handles the hex case. * regexp.cc (do_regexp_rep_string_escapes): Add case for parsing '\oNNN' or '\o{NNN}' octal sequences. Add case for parsing '\xNN' or '\x{NN}'.
author Rik <rik@octave.org>
date Thu, 21 May 2015 10:22:27 -0700
parents 29eb47fe8e8c
children 561af1ab6099
files NEWS libinterp/corefcn/regexp.cc
diffstat 2 files changed, 106 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS	Wed May 20 17:18:41 2015 -0700
+++ b/NEWS	Thu May 21 10:22:27 2015 -0700
@@ -6,6 +6,12 @@
     The *printf family of functions now supports octal and hex escape
     sequences in single-quoted strings for Matlab compatibility.
 
+ ** Special octal and hex escape sequences for the pattern and replacement
+    strings in regular expressions are now interpreted for Matlab compatibility.
+
+    octal: '\oNNN' or '\o{NNN}'
+    hex  : '\xNN'  or '\x{NN}'
+
  ** mkfifo now interprets the MODE argument as an octal, not decimal, integer.
     This is consistent with the equivalent shell command. 
 
--- a/libinterp/corefcn/regexp.cc	Wed May 20 17:18:41 2015 -0700
+++ b/libinterp/corefcn/regexp.cc	Thu May 21 10:22:27 2015 -0700
@@ -77,13 +77,38 @@
               retval[++i] = 'b';
               break;
 
-#if 0
-// FIXME: To be complete, we need to handle \oN, \o{N}.
-//        The PCRE library already handles \N where N
-//        is an octal number.  New code needs to merely
-//        replace \oN or \o{N} with \N.
-            case 'o': // octal number
-#endif
+            case 'o': // octal input
+            {
+              bool bad_esc_seq = (j+1 >= len);
+
+              bool brace = false;
+              if (! bad_esc_seq && s[++j] == '{')
+                {
+                  brace = true;
+                  j++;
+                }
+
+              int tmpi = 0;
+              size_t k;
+              for (k = j; k < std::min (j+3+brace, len); k++)
+                {
+                  int digit = s[k] - '0';
+                  if (digit < 0 || digit > 7)
+                    break;
+                  tmpi <<= 3;
+                  tmpi += digit;
+                }
+              if (bad_esc_seq || (brace && s[k++] != '}'))
+                {
+                  bad_esc_seq = true;
+                  tmpi = 0;
+                  warning ("malformed octal escape sequence '\\o' --\
+ converting to '\\0'");
+                }
+              retval[i] = tmpi;
+              j = k - 1;
+              break;
+            }
 
             default:  // pass escape sequence through
               retval[i] = '\\';
@@ -150,14 +175,75 @@
               retval[i] = '\v';
               break;
 
-#if 0
-// FIXME: to be complete, we need to handle \oN, \o{N}, \xN, and
-// \x{N}.  Hex digits may be upper or lower case.  Brackets are
-// optional, so \x5Bz is the same as \x{5B}z.
+            case 'o': // octal input
+            {
+              bool bad_esc_seq = (j+1 >= len);
+
+              bool brace = false;
+              if (! bad_esc_seq && s[++j] == '{')
+                {
+                  brace = true;
+                  j++;
+                }
+
+              int tmpi = 0;
+              size_t k;
+              for (k = j; k < std::min (j+3+brace, len); k++)
+                {
+                  int digit = s[k] - '0';
+                  if (digit < 0 || digit > 7)
+                    break;
+                  tmpi <<= 3;
+                  tmpi += digit;
+                }
+              if (bad_esc_seq || (brace && s[k++] != '}'))
+                {
+                  warning ("malformed octal escape sequence '\\o' --\
+ converting to '\\0'");
+                  tmpi = 0;
+                }
+              retval[i] = tmpi;
+              j = k - 1;
+              break;
+            }
 
-            case 'o': // octal number
-            case 'x': // hex number
-#endif
+            case 'x': // hex input
+            {
+              bool bad_esc_seq = (j+1 >= len);
+
+              bool brace = false;
+              if (! bad_esc_seq && s[++j] == '{')
+                {
+                  brace = true;
+                  j++;
+                }
+
+              int tmpi = 0;
+              size_t k;
+              for (k = j; k < std::min (j+2+brace, len); k++)
+                {
+                  if (! isxdigit (s[k]))
+                    break;
+
+                  tmpi <<= 4;
+                  int digit = s[k];
+                  if (digit >= 'a')
+                    tmpi += digit - 'a' + 10;
+                  else if (digit >= 'A')
+                    tmpi += digit - 'A' + 10;
+                  else
+                    tmpi += digit - '0';
+                }
+              if (bad_esc_seq || (brace && s[k++] != '}'))
+                {
+                  warning ("malformed hex escape sequence '\\x' --\
+ converting to '\\0'");
+                  tmpi = 0;
+                }
+              retval[i] = tmpi;
+              j = k - 1;
+              break;
+            }
 
             default:  // pass escape sequence through
               retval[i] = '\\';