Mercurial > jwe > octave

--- a/libinterp/corefcn/regexp.cc	Mon Feb 12 10:18:12 2018 -0500
+++ b/libinterp/corefcn/regexp.cc	Mon Feb 12 11:52:46 2018 -0800
@@ -985,10 +985,36 @@
 %! assert (nm(2).first, "James");
 %! assert (nm(2).last, "Rogers");

+## Tests that NUL characters embedded in strings are matched properly
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = '(\0+)';  # also test null in single-quote pattern
+%! M = regexp (str, ptn, "match");
+%! assert (size (M), [1, 2]);
+%! assert (double (M{1}), [0]);
+%! assert (double (M{2}), [0, 0]);
+
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = "(\0+)";  # also test null in double-quote pattern
+%! T = regexp (str, ptn, "tokens");
+%! assert (size (T), [1, 2]);
+%! assert (double (T{1}{1}), [0]);
+%! assert (double (T{2}{1}), [0, 0]);
+
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = '(?<namedtoken>\0+)';
+%! NT = regexp (str, ptn, "names");
+%! assert (size (NT), [1, 2]);
+%! assert (double (NT(1).namedtoken), [0]);
+%! assert (double (NT(2).namedtoken), [0, 0]);
+
 ## Tests for named tokens
 %!test
 %! ## Parenthesis in named token (ie (int)) causes a problem
-%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'), struct ('typestr', 'int'));
+%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
+%!         struct ('typestr', 'int'));

 %!test <*35683>
 %! ## Mix of named and unnamed tokens can cause segfault
--- a/liboctave/util/lo-regexp.cc	Mon Feb 12 10:18:12 2018 -0500
+++ b/liboctave/util/lo-regexp.cc	Mon Feb 12 11:52:46 2018 -0800
@@ -220,9 +220,14 @@

     buf << pattern.substr (pos);

+    // Replace NUL characters with an escape sequence because the C string
+    // obtained from c_str() is treated as ending at the first embedded NUL.
+    std::string buf_str = buf.str ();
+    while ((pos = buf_str.find ('\0')) != std::string::npos)
+      buf_str.replace (pos, 1, "\\000");
+
     const char *err;
     int erroffset;
-    std::string buf_str = buf.str ();

     int pcre_options
       = (  (options.case_insensitive () ? PCRE_CASELESS : 0)
@@ -353,6 +358,9 @@
                 ("%s: cannot allocate memory in pcre_get_substring_list",
                  who.c_str ());

+            // Must use explicit length constructor as match can contain '\0'.
+            std::string match_string = std::string (*listptr, end - start + 1);
+
             string_vector tokens (pos_match);
             string_vector named_tokens (nnames);
             int pos_offset = 0;
@@ -375,22 +383,23 @@
                               {
                                 if (nidx[j] == i)
                                   {
+                                    size_t len = ovector[2*i+1] - ovector[2*i];
                                     named_tokens(named_idx(j)) =
-                                      std::string (*(listptr+i-pos_offset));
+                                      std::string (*(listptr+i-pos_offset),
+                                                   len);
                                     break;
                                   }
                               }
                           }

-                        tokens(pos_match++) = std::string (*(listptr+i));
+                        size_t len = ovector[2*i+1] - ovector[2*i];
+                        tokens(pos_match++) = std::string (*(listptr+i), len);
                       }
                     else
                       pos_offset++;
                   }
               }

-            std::string match_string = std::string (*listptr);
-
             pcre_free_substring_list (listptr);

             regexp::match_element new_elem (named_tokens, tokens, match_string,