Mercurial > jwe > octave
changeset 24741:00dfa167c1fe
Fix handling of NULL character in regular expressions for Matlab compatibility.
* regexp.cc (Fregexp): Add BIST tests for null characters in single- and
double-quoted patterns. Add BIST tests for returning NULL values in
matches, tokens, and named tokens.
* lo-regexp.cc (regexp::compile_internal): Replace NULLs in pattern with
escaped octal sequence '\000' so that c_str() conversion of std::string
does not truncate the pattern.
* lo-regexp.cc (regexp::match): Use constructor
std::string (const char* s, size_t n) with explicitly specified length
so that embedded NULLs in character buffers do not result in a truncated
string. Use it for construction of match, tokens, and named_tokens.
author | Rik <rik@octave.org> |
---|---|
date | Mon, 12 Feb 2018 11:52:46 -0800 |
parents | d85470c4f09c |
children | 9db2f757c7c0 |
files | libinterp/corefcn/regexp.cc liboctave/util/lo-regexp.cc |
diffstat | 2 files changed, 41 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/libinterp/corefcn/regexp.cc Mon Feb 12 10:18:12 2018 -0500 +++ b/libinterp/corefcn/regexp.cc Mon Feb 12 11:52:46 2018 -0800 @@ -985,10 +985,36 @@ %! assert (nm(2).first, "James"); %! assert (nm(2).last, "Rogers"); +## Tests for nulls in strings properly matching +%!test +%! str = "A\0B\0\0C"; +%! ptn = '(\0+)'; # also test null in single-quote pattern +%! M = regexp (str, ptn, "match"); +%! assert (size (M), [1, 2]); +%! assert (double (M{1}), [0]); +%! assert (double (M{2}), [0, 0]); + +%!test +%! str = "A\0B\0\0C"; +%! ptn = "(\0+)"; # also test null in double-quote pattern +%! T = regexp (str, ptn, "tokens"); +%! assert (size (T), [1, 2]); +%! assert (double (T{1}{1}), [0]); +%! assert (double (T{2}{1}), [0, 0]); + +%!test +%! str = "A\0B\0\0C"; +%! ptn = '(?<namedtoken>\0+)'; +%! NT = regexp (str, ptn, "names"); +%! assert (size (NT), [1, 2]); +%! assert (double (NT(1).namedtoken), [0]); +%! assert (double (NT(2).namedtoken), [0, 0]); + ## Tests for named tokens %!test %! ## Parenthesis in named token (ie (int)) causes a problem -%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'), struct ('typestr', 'int')); +%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'), +%! struct ('typestr', 'int')); %!test <*35683> %! ## Mix of named and unnamed tokens can cause segfault
--- a/liboctave/util/lo-regexp.cc Mon Feb 12 10:18:12 2018 -0500 +++ b/liboctave/util/lo-regexp.cc Mon Feb 12 11:52:46 2018 -0800 @@ -220,9 +220,14 @@ buf << pattern.substr (pos); + // Replace NULLs with escape sequence because conversion function c_str() + // will terminate string early at embedded NULLs. + std::string buf_str = buf.str (); + while ((pos = buf_str.find ('\0')) != std::string::npos) + buf_str.replace (pos, 1, "\\000"); + const char *err; int erroffset; - std::string buf_str = buf.str (); int pcre_options = ( (options.case_insensitive () ? PCRE_CASELESS : 0) @@ -353,6 +358,9 @@ ("%s: cannot allocate memory in pcre_get_substring_list", who.c_str ()); + // Must use explicit length constructor as match can contain '\0'. + std::string match_string = std::string (*listptr, end - start + 1); + string_vector tokens (pos_match); string_vector named_tokens (nnames); int pos_offset = 0; @@ -375,22 +383,23 @@ { if (nidx[j] == i) { + size_t len = ovector[2*i+1] - ovector[2*i]; named_tokens(named_idx(j)) = - std::string (*(listptr+i-pos_offset)); + std::string (*(listptr+i-pos_offset), + len); break; } } } - tokens(pos_match++) = std::string (*(listptr+i)); + size_t len = ovector[2*i+1] - ovector[2*i]; + tokens(pos_match++) = std::string (*(listptr+i), len); } else pos_offset++; } } - std::string match_string = std::string (*listptr); - pcre_free_substring_list (listptr); regexp::match_element new_elem (named_tokens, tokens, match_string,