# HG changeset patch # User Rik # Date 1427596654 25200 # Node ID 7569e880d56c2b162c5e402ad1b586567a82764f # Parent 30681f6e3f658459ac8030b1a4b89c0b978ce291 strsplit.m: Overhaul function and fix bug #44641. * strsplit.m: Rewrite docstring. Match variables in documentation to those in function. Change undocumented 3rd argument to only accept logical args. Use single quotes around regexp patterns to avoid lots of backslash escaping. Define is_simple variable to avoid repeated strncmpi calls. Escape special regexp characters in "simple" split expressions (bug #44641). Wrap some BIST tests to less than 80 columns. Add BIST tests for bug #44641. diff -r 30681f6e3f65 -r 7569e880d56c scripts/strings/strsplit.m --- a/scripts/strings/strsplit.m Sat Mar 28 17:18:05 2015 +0300 +++ b/scripts/strings/strsplit.m Sat Mar 28 19:37:34 2015 -0700 @@ -17,21 +17,35 @@ ## . ## -*- texinfo -*- -## @deftypefn {Function File} {[@var{cstr}] =} strsplit (@var{s}) -## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del}) +## @deftypefn {Function File} {[@var{cstr}] =} strsplit (@var{str}) +## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{str}, @var{del}) ## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value}) ## @deftypefnx {Function File} {[@var{cstr}, @var{matches}] =} strsplit (@dots{}) -## Split the string @var{s} using the delimiters specified by @var{del} -## and return a cell string array of substrings. If a delimiter is not -## specified the string, @var{s}, is split at whitespace. The delimiter, -## @var{del} may be a string, a scalar cell string, or cell string array. -## By default, consecutive delimiters in the input string @var{s} are -## collapsed into one. +## Split the string @var{str} using the delimiters specified by @var{del} +## and return a cell string array of substrings. +## +## If a delimiter is not specified the string is split at whitespace +## @code{@{" ", "\f", "\n", "\r", "\t", "\v"@}}. Otherwise, the delimiter, +## @var{del} must be a string or cell array of strings. By default, +## consecutive delimiters in the input string @var{s} are collapsed into one +## resulting in a single split. +## +## Supported @var{name}/@var{value} pair arguments are: ## -## The second output, @var{matches}, returns the delimiters which were matched -## in the original string. +## @itemize +## @item @var{collapsedelimiters} which may take the value of @code{true} +## (default) or @code{false}. ## -## Example: +## @item @var{delimitertype} which may take the value of @qcode{"simple"} +## (default) or @qcode{"regularexpression"}. A simple delimiter matches the +## text exactly as written. Otherwise, the syntax for regular expressions +## outlined in @code{regexp} is used. +## @end itemize +## +## The optional second output, @var{matches}, returns the delimiters which were +## matched in the original string. +## +## Examples with simple delimiters: ## ## @example ## strsplit ("a b c") @@ -50,7 +64,7 @@ ## [1,3] = c ## @} ## -## strsplit ("a foo b,bar c", @{"\s", "foo", "bar"@}) +## strsplit ("a foo b,bar c", @{" ", ",", "foo", "bar"@}) ## @result{} ## @{ ## [1,1] = a @@ -58,7 +72,7 @@ ## [1,3] = c ## @} ## -## strsplit ("a,,b, c", @{",", " "@}, false) +## strsplit ("a,,b, c", @{",", " "@}, "collapsedelimiters", false) ## @result{} ## @{ ## [1,1] = a @@ -70,21 +84,10 @@ ## ## @end example ## -## Supported @var{name}/@var{value} pair arguments are; -## -## @itemize -## @item @var{collapsedelimiters} may take the value of @var{true} or -## @var{false} with the default being @var{false}. -## -## @item @var{delimitertype} may take the value of @code{simple} or -## @code{regularexpression}. The default is @var{delimitertype} is -## @code{simple}. -## @end itemize -## -## Example: +## Examples with regularexpression delimiters: ## ## @smallexample -## strsplit ("a foo b,bar c", ",|\\s|foo|bar", "delimitertype", "regularexpression") +## strsplit ("a foo b,bar c", ',|\s|foo|bar', "delimitertype", "regularexpression") ## @result{} ## @{ ## [1,1] = a @@ -92,7 +95,7 @@ ## [1,3] = c ## @} ## -## strsplit ("a,,b, c", "[, ]", false, "delimitertype", "regularexpression") +## strsplit ("a,,b, c", '[, ]', "collapsedelimiters", false, "delimitertype", "regularexpression") ## @result{} ## @{ ## [1,1] = a @@ -125,10 +128,10 @@ ## @seealso{ostrsplit, strjoin, strtok, regexp} ## @end deftypefn -function [result, matches] = strsplit (str, del, varargin) +function [cstr, matches] = strsplit (str, del, varargin) args.collapsedelimiters = true; - args.delimitertype = "default"; + args.delimitertype = "simple"; [reg, params] = parseparams (varargin); @@ -137,16 +140,17 @@ elseif (numel (reg) > 1) print_usage (); elseif (numel (reg) == 1) - if (islogical (reg{1}) || isnumeric (reg{1})) + ## This is undocumented behavior to accept a logical 3rd arg. + if (islogical (reg{1})) args.collapsedelimiters = reg{1}; else print_usage (); endif endif fields = fieldnames (args); - for n = 1:2:numel(params) + for n = 1:2:numel (params) if (any (strcmpi (params{n}, fields))) - args.(lower(params{n})) = params{n+1}; + args.(tolower (params{n})) = params{n+1}; elseif (ischar (varargin{n})) error ("strsplit:invalid_parameter_name", "strsplit: invalid parameter name, '%s'", varargin{n}); @@ -155,23 +159,19 @@ endif endfor - if (strcmpi (args.delimitertype, "default")) - args.delimitertype = "simple"; - endif - ## Save the length of the "delimitertype" parameter length_deltype = length (args.delimitertype); - if (nargin == 1 || (nargin > 1 && (islogical (del) || isnumeric (del)))) + if (nargin == 1 || (nargin > 1 && islogical (del))) if (nargin > 1) ## Second input is the "collapsedelimiters" parameter args.collapsedelimiters = del; endif ## Set proper default for the delimiter type - if (strncmpi (args.delimitertype, "simple", length (args.delimitertype))) - del = {" ","\f","\n","\r","\t","\v"}; + if (strncmpi (args.delimitertype, "simple", length_deltype)) + del = {" ", "\f", "\n", "\r", "\t", "\v"}; else - del = "\\s"; + del = '\s'; endif endif @@ -182,58 +182,69 @@ endif if (strncmpi (args.delimitertype, "simple", length_deltype)) + is_simple = true; + elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype)) + is_simple = false; + else + error ("strsplit:invalid_delimitertype", "strsplit: Invalid DELIMITERTYPE"); + endif + + if (is_simple) if (iscellstr (del)) del = cellfun (@do_string_escapes, del, "uniformoutput", false); else del = do_string_escapes (del); endif - ## This is clumsy, but needed for multi-row strings - del = regexprep (del, '([^\w])', '\\$1'); + ## Escape characters which have a special meaning in regexp. + del = regexprep (del, '([{}()[\]^$.*?|\\])', '\\$1'); endif if (isempty (str)) - result = {str}; - elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype) - || strncmpi (args.delimitertype, "simple", length_deltype)) + cstr = {str}; + else if (iscellstr (del)) - del = sprintf ('%s|', del{:}); + del = sprintf ("%s|", del{:}); del(end) = []; endif if (args.collapsedelimiters) - del = ["(", del, ")+"]; + del = [ "(" del ")+" ]; endif - [result, ~, ~, ~, matches] = regexp (str, del, "split"); - else - error ("strsplit:invalid_delimitertype", - "strsplit: Invalid DELIMITERTYPE"); + [cstr, matches] = regexp (str, del, "split", "match"); endif + endfunction %!shared str %! str = "The rain in Spain stays mainly in the plain."; -% Split on all whitespace. + +## Split on all whitespace. %!assert (strsplit (str), {"The", "rain", "in", "Spain", "stays", ... -%! "mainly", "in", "the", "plain."}) -% Split on "ain". +%! "mainly", "in", "the", "plain."}) +## Split on "ain". %!assert (strsplit (str, "ain"), {"The r", " in Sp", " stays m", ... -%! "ly in the pl", "."}) -% Split on " " and "ain" (treating multiple delimiters as one). +%! "ly in the pl", "."}) +## Split on " " and "ain" (treating multiple delimiters as one). +%!test +%! s = strsplit (str, '\s|ain', true, "delimitertype", "r"); +%! assert (s, {"The", "r", "in", "Sp", "stays", ... +%! "m", "ly", "in", "the", "pl", "."}); %!test %! s = strsplit (str, '\s|ain', true, "delimitertype", "r"); -%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."}) -%!test -%! s = strsplit (str, "\\s|ain", true, "delimitertype", "r"); -%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."}) +%! assert (s, {"The", "r", "in", "Sp", "stays", ... +%! "m", "ly", "in", "the", "pl", "."}); %!test -%! [s, m] = strsplit (str, {"\\s", "ain"}, true, "delimitertype", "r"); -%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."}) -%! assert (m, {" ", "ain ", " ", "ain ", " ", "ain", " ", " ", " ", "ain"}) -% Split on " " and "ain", and treat multiple delimiters separately. +%! [s, m] = strsplit (str, {'\s', 'ain'}, true, "delimitertype", "r"); +%! assert (s, {"The", "r", "in", "Sp", "stays", ... +%! "m", "ly", "in", "the", "pl", "."}); +%! assert (m, {" ", "ain ", " ", "ain ", " ", "ain", " ", " ", " ", "ain"}); +## Split on " " and "ain", and treat multiple delimiters separately. %!test %! [s, m] = strsplit (str, {" ", "ain"}, "collapsedelimiters", false); -%! assert (s, {"The", "r", "", "in", "Sp", "", "stays", "m", "ly", "in", "the", "pl", "."}) -%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", " ", " ", " ", "ain"}) +%! assert (s, {"The", "r", "", "in", "Sp", "", "stays", ... +%! "m", "ly", "in", "the", "pl", "."}); +%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", ... +%! " ", " ", " ", "ain"}); %!assert (strsplit ("road to hell"), {"road", "to", "hell"}) %!assert (strsplit ("road to hell", " "), {"road", "to", "hell"}) @@ -252,19 +263,19 @@ %!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "r"), {"a", "bc", "", "de"}) %!assert (strsplit (["a,bc,de"], ",", true, "delimitertype", "r"), {"a", "bc", "de"}) %!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"}) -%!assert (strsplit ("hello \t world", 1, "delimitertype", "r"), {"hello", "world"}); +%!assert (strsplit ("hello \t world", true, "delimitertype", "r"), {"hello", "world"}); %!assert (strsplit ("foo\tbar", '\t', "delimitertype", "r"), {"foo", "bar"}) %!assert (strsplit ("foo\tbar", '\t', "delimitertype", "s"), {"foo", "bar"}) ## Test "match" for consecutive delmiters %!test -%! [a, m] = strsplit ("a\t \nb", '\s', 'delimitertype', 'regularexpression', -%! 'collapsedelimiters', false); +%! [a, m] = strsplit ("a\t \nb", '\s', "delimitertype", "regularexpression", +%! "collapsedelimiters", false); %! assert (a, {"a", "", "", "b"}) %! assert (m, {"\t", " ", "\n"}) %!test -%! [a, m] = strsplit ("a\t \nb", '\s', false, 'delimitertype', 'regularexpression'); +%! [a, m] = strsplit ("a\t \nb", '\s', false, "delimitertype", "regularexpression"); %! assert (a, {"a", "", "", "b"}) %! assert (m, {"\t", " ", "\n"}) %!test @@ -276,7 +287,7 @@ %! assert (a, {"a", "b"}) %! assert (m, {"\t \n"}) %!test -%! [s, m] = strsplit ("hello \t world", 1); +%! [s, m] = strsplit ("hello \t world", true); %! assert (s, {"hello", "world"}); %! assert (m, {" \t "}); @@ -286,9 +297,15 @@ %! assert (strsplit ("aa", "a"), {"", ""}) %! assert (strsplit ("aaa", "a"), {"", ""}) +## Bug #44641 +%!assert (strsplit ("xxx strsplit ("abc", "b", "foo", "true") %!error strsplit (123, "b") -%!error strsplit ("abc", "def", ones (3,3)) +%!error strsplit ("abc", "def", "collapsedelimiters", ones (3,3)) +%!error strsplit ("abc", "b", "delimitertype", "foobar")