diff libinterp/corefcn/regexp.cc @ 24741:00dfa167c1fe

Fix handling of NULL character in regular expressions for Matlab compatibility. * regexp.cc (Fregexp): Add BIST tests for null characters in single- and double-quoted patterns. Add BIST tests for returning NULL values in matches, tokens, and named tokens. * lo-regexp.cc (regexp::compile_internal): Replace NULLs in pattern with escaped octal sequence '\000' so that c_str() conversion of std::string does not truncate the pattern. * lo-regexp.cc (regexp::match): Use constructor std::string (const char* s, size_t n) with explicitly specified length so that embedded NULLs in character buffers do not result in a truncated string. Use for construction of match, tokens, and named_tokens.
author Rik <rik@octave.org>
date Mon, 12 Feb 2018 11:52:46 -0800
parents a27dcb26f872
children 6e670c58c6f0
line wrap: on
line diff
--- a/libinterp/corefcn/regexp.cc	Mon Feb 12 10:18:12 2018 -0500
+++ b/libinterp/corefcn/regexp.cc	Mon Feb 12 11:52:46 2018 -0800
@@ -985,10 +985,36 @@
 %! assert (nm(2).first, "James");
 %! assert (nm(2).last, "Rogers");
 
+## Tests for nulls in strings properly matching
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = '(\0+)';  # also test null in single-quote pattern
+%! M = regexp (str, ptn, "match");
+%! assert (size (M), [1, 2]);
+%! assert (double (M{1}), [0]);
+%! assert (double (M{2}), [0, 0]);
+
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = "(\0+)";  # also test null in double-quote pattern
+%! T = regexp (str, ptn, "tokens");
+%! assert (size (T), [1, 2]);
+%! assert (double (T{1}{1}), [0]);
+%! assert (double (T{2}{1}), [0, 0]);
+
+%!test
+%! str = "A\0B\0\0C";
+%! ptn = '(?<namedtoken>\0+)';
+%! NT = regexp (str, ptn, "names");
+%! assert (size (NT), [1, 2]);
+%! assert (double (NT(1).namedtoken), [0]);
+%! assert (double (NT(2).namedtoken), [0, 0]);
+
 ## Tests for named tokens
 %!test
 %! ## Parenthesis in named token (ie (int)) causes a problem
-%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'), struct ('typestr', 'int'));
+%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
+%!         struct ('typestr', 'int'));
 
 %!test <*35683>
 %! ## Mix of named and unnamed tokens can cause segfault