changeset 24741:00dfa167c1fe

Fix handling of NULL character in regular expressions for Matlab compatibility. * regexp.cc (Fregexp): Add BIST tests for null characters in single- and double-quoted patterns. Add BIST tests for returning NULL values in matches, tokens, and named tokens. * lo-regexp.cc (regexp::compile_internal): Replace NULLs in pattern with escaped octal sequence '\000' so that c_str() conversion of std::string does not truncate the pattern. * lo-regexp.cc (regexp::match): Use constructor std::string (const char* s, size_t n) with explicitly specified length so that embedded NULLs in character buffers do not result in a truncated string. Use for construction of match, tokens, and named_tokens.
author Rik <rik@octave.org>
date Mon, 12 Feb 2018 11:52:46 -0800
parents d85470c4f09c
children 9db2f757c7c0
files libinterp/corefcn/regexp.cc liboctave/util/lo-regexp.cc
diffstat 2 files changed, 41 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/regexp.cc	Mon Feb 12 10:18:12 2018 -0500
+++ b/libinterp/corefcn/regexp.cc	Mon Feb 12 11:52:46 2018 -0800
@@ -985,10 +985,36 @@
 %! assert (nm(2).first, "James");
 %! assert (nm(2).last, "Rogers");
 
+## Tests for nulls in strings properly matching
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = '(\0+)';  # also test null in single-quote pattern
+%! M = regexp (str, ptn, "match");
+%! assert (size (M), [1, 2]);
+%! assert (double (M{1}), [0]);
+%! assert (double (M{2}), [0, 0]);
+
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = "(\0+)";  # also test null in double-quote pattern
+%! T = regexp (str, ptn, "tokens");
+%! assert (size (T), [1, 2]);
+%! assert (double (T{1}{1}), [0]);
+%! assert (double (T{2}{1}), [0, 0]);
+
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = '(?<namedtoken>\0+)';
+%! NT = regexp (str, ptn, "names");
+%! assert (size (NT), [1, 2]);
+%! assert (double (NT(1).namedtoken), [0]);
+%! assert (double (NT(2).namedtoken), [0, 0]);
+
 ## Tests for named tokens
 %!test
 %! ## Parenthesis in named token (ie (int)) causes a problem
-%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'), struct ('typestr', 'int'));
+%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
+%!         struct ('typestr', 'int'));
 
 %!test <*35683>
 %! ## Mix of named and unnamed tokens can cause segfault
--- a/liboctave/util/lo-regexp.cc	Mon Feb 12 10:18:12 2018 -0500
+++ b/liboctave/util/lo-regexp.cc	Mon Feb 12 11:52:46 2018 -0800
@@ -220,9 +220,14 @@
 
     buf << pattern.substr (pos);
 
+    // Replace NULLs with escape sequence because conversion function c_str() 
+    // will terminate string early at embedded NULLs.
+    std::string buf_str = buf.str ();
+    while ((pos = buf_str.find ('\0')) != std::string::npos)
+      buf_str.replace (pos, 1, "\\000");
+
     const char *err;
     int erroffset;
-    std::string buf_str = buf.str ();
 
     int pcre_options
       = (  (options.case_insensitive () ? PCRE_CASELESS : 0)
@@ -353,6 +358,9 @@
                 ("%s: cannot allocate memory in pcre_get_substring_list",
                  who.c_str ());
 
+            // Must use explicit length constructor as match can contain '\0'.
+            std::string match_string = std::string (*listptr, end - start + 1);
+
             string_vector tokens (pos_match);
             string_vector named_tokens (nnames);
             int pos_offset = 0;
@@ -375,22 +383,23 @@
                               {
                                 if (nidx[j] == i)
                                   {
+                                    size_t len = ovector[2*i+1] - ovector[2*i];
                                     named_tokens(named_idx(j)) =
-                                      std::string (*(listptr+i-pos_offset));
+                                      std::string (*(listptr+i-pos_offset),
+                                                   len);
                                     break;
                                   }
                               }
                           }
 
-                        tokens(pos_match++) = std::string (*(listptr+i));
+                        size_t len = ovector[2*i+1] - ovector[2*i];
+                        tokens(pos_match++) = std::string (*(listptr+i), len);
                       }
                     else
                       pos_offset++;
                   }
               }
 
-            std::string match_string = std::string (*listptr);
-
             pcre_free_substring_list (listptr);
 
             regexp::match_element new_elem (named_tokens, tokens, match_string,