diff liboctave/util/lo-regexp.cc @ 24741:00dfa167c1fe

Fix handling of NULL character in regular expressions for Matlab compatibility. * regexp.cc (Fregexp): Add BIST tests for null characters in single- and double-quoted patterns. Add BIST tests for returning NULL values in matches, tokens, and named tokens. * lo-regexp.cc (regexp::compile_internal): Replace NULLs in pattern with escaped octal sequence '\000' so that c_str() conversion of std::string does not truncate the pattern. * lo-regexp.cc (regexp::match): Use constructor std::string (const char* s, size_t n) with explicitly specified length so that embedded NULLs in character buffers do not result in a truncated string. Use for construction of match, tokens, and named_tokens.
author Rik <rik@octave.org>
date Mon, 12 Feb 2018 11:52:46 -0800
parents a27dcb26f872
children 6652d3823428
line wrap: on
line diff
--- a/liboctave/util/lo-regexp.cc	Mon Feb 12 10:18:12 2018 -0500
+++ b/liboctave/util/lo-regexp.cc	Mon Feb 12 11:52:46 2018 -0800
@@ -220,9 +220,14 @@
 
     buf << pattern.substr (pos);
 
+    // Replace NULLs with the escape sequence "\000" because the C string
+    // obtained from c_str() is read only up to the first embedded NULL.
+    std::string buf_str = buf.str ();
+    while ((pos = buf_str.find ('\0')) != std::string::npos)
+      buf_str.replace (pos, 1, "\\000");
+
     const char *err;
     int erroffset;
-    std::string buf_str = buf.str ();
 
     int pcre_options
       = (  (options.case_insensitive () ? PCRE_CASELESS : 0)
@@ -353,6 +358,9 @@
                 ("%s: cannot allocate memory in pcre_get_substring_list",
                  who.c_str ());
 
+            // Must use explicit length constructor as match can contain '\0'.
+            std::string match_string = std::string (*listptr, end - start + 1);
+
             string_vector tokens (pos_match);
             string_vector named_tokens (nnames);
             int pos_offset = 0;
@@ -375,22 +383,23 @@
                               {
                                 if (nidx[j] == i)
                                   {
+                                    size_t len = ovector[2*i+1] - ovector[2*i];
                                     named_tokens(named_idx(j)) =
-                                      std::string (*(listptr+i-pos_offset));
+                                      std::string (*(listptr+i-pos_offset),
+                                                   len);
                                     break;
                                   }
                               }
                           }
 
-                        tokens(pos_match++) = std::string (*(listptr+i));
+                        size_t len = ovector[2*i+1] - ovector[2*i];
+                        tokens(pos_match++) = std::string (*(listptr+i), len);
                       }
                     else
                       pos_offset++;
                   }
               }
 
-            std::string match_string = std::string (*listptr);
-
             pcre_free_substring_list (listptr);
 
             regexp::match_element new_elem (named_tokens, tokens, match_string,