Mercurial > octave-nkf

--- a/scripts/strings/strsplit.m	Thu Jun 06 10:48:57 2013 +0800
+++ b/scripts/strings/strsplit.m	Thu Jun 06 11:03:22 2013 +0800
@@ -19,18 +19,17 @@
 ## -*- texinfo -*-
 ## @deftypefn  {Function File} {[@var{cstr}] =} strsplit (@var{s})
 ## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del})
-## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del}, @var{collapsedelimiters})
 ## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value})
 ## @deftypefnx {Function File} {[@var{cstr}, @var{matches}] =} strsplit (@dots{})
 ## Split the string @var{s} using the delimiters specified by @var{del}
-## and return a cell array of strings.  For a single delimiter, @var{del}
-## may be a string, or a scalar cell-string.  For multible delimiters,
-## @var{del} must be a cell-string array.  Unless @var{collapsedelimiters} is
-## specified to be @var{false}, consecutive delimiters are collapsed into one.
+## and return a cell-string array of sub-strings.  If a delimiter is not
+## specified, the string, @var{s}, is split at whitespace.  The delimiter,
+## @var{del}, may be a string, a scalar cell-string, or a cell-string array;
+## for multiple delimiters it must be a cell-string array.  By default,
+## consecutive delimiters in the input string, @var{s}, are collapsed into one.
 ##
 ## The second output, @var{matches}, returns the delmiters which were matched
-## in the original string.  The matched delimiters are uneffected by the
-## @var{collapsedelimiters}.
+## in the original string.
 ##
 ## Example:
 ##
@@ -76,21 +75,11 @@
 ## Supported @var{name}/@var{value} pair arguments are;
 ##
 ## @itemize
-## @item @var{collapsedelimiters} may take the value of @var{true} or @var{false}
-## with the default being @var{false}.
-## @item @var{delimitertype} may take the value of @code{legacy},
-## @code{simple} or @code{regularexpression}.
-## If @var{delimitertype} is equal to @code{legacy}, each individual
-## character of @var{del} is used to split the input.  For both @code{simple}
-## and @code{regularexpression}, the string is split at the boundaries of the
-## delimiter string.  If @var{delimiter} is a cell-string, then the string
-## is split at the boundaries of each of the cells' strings.  @var{simple}
-## delimiters may contain escaped characters, but are otherwise treated as
-## literal strings.
-##
-## If the specified delimiters are single characters, the default is
-## @var{delimitertype} is @code{legacy}.  Otherwise the default
-## @var{delimitertype} is @code{simple}.
+## @item @var{collapsedelimiters} may take the value of @var{true} or
+## @var{false}, with the default being @var{true}.
+## @item @var{delimitertype} may take the value of @code{simple} or
+## @code{regularexpression}.  The default @var{delimitertype} is
+## @code{simple}.
 ## @end itemize
 ##
 ## Example:
@@ -115,16 +104,6 @@
 ##             [1,5] = c
 ##           @}
 ##
-## strsplit ("a,,b, c", ", ", false, "delimitertype", "legacy")
-##       @result{}
-##           @{
-##             [1,1] = a
-##             [1,2] =
-##             [1,3] = b
-##             [1,4] =
-##             [1,5] = c
-##           @}
-##
 ## strsplit ("a,\t,b, c", @{',', '\s'@}, "delimitertype", "regularexpression")
 ##       @result{}
 ##           @{
@@ -132,10 +111,21 @@
 ##             [1,2] = b
 ##             [1,3] = c
 ##           @}
+##
+## strsplit ("a,\t,b, c", @{',', ' ', '\t'@}, "collapsedelimiters", false)
+##       @result{}
+##           @{
+##             [1,1] = a
+##             [1,2] =
+##             [1,3] =
+##             [1,4] = b
+##             [1,5] =
+##             [1,6] = c
+##           @}
 ## @end group
 ## @end example
 ##
-## @seealso{strjoin, strtok, regexp}
+## @seealso{ostrsplit, strjoin, strtok, regexp}
 ## @end deftypefn

 function [result, matches] = strsplit (str, del, varargin)
@@ -169,15 +159,7 @@
   endfor

   if (strcmpi (args.delimitertype, "default"))
-    if (nargin == 1 || numel (del) == 1
-      || (nargin > 1 && (islogical (del) || isnumeric (del)))
-      || iscell (del) && all (cellfun (@numel, del) < 2))
-      ## For single character delimiters, default to "legacy"
-      args.delimitertype = "legacy";
-    else
-      ## For multi-character delimiters, default to "simple"
-      args.delimitertype = "simple";
-    endif
+    args.delimitertype = "simple";
   endif

   # Save the length of the "delimitertype" parameter
@@ -191,8 +173,6 @@
     ## Set proper default for the delimiter type
     if (strncmpi (args.delimitertype, "simple", numel (args.delimitertype)))
       del = {" ","\f","\n","\r","\t","\v"};
-    elseif (strncmpi (args.delimitertype, "legacy", numel (args.delimitertype)))
-      del = " \f\n\r\t\v";
     else
       del = "\\s";
     endif
@@ -210,70 +190,12 @@
     else
       del = do_string_escapes (del);
     endif
+    % This is clumsy, but needed for multi-row strings
     del = regexprep (del, '([^\w])', '\\$1');
   endif

-  if (rows (str) > 1)
-    tmp = char (del(1));
-    str = [str, repmat(tmp,rows(str),1)];
-    str = reshape (str.', 1, numel (str));
-    str(end-numel(tmp)+1:end) = [];
-  endif
-
   if (isempty (str))
     result = {str};
-  elseif (strncmpi (args.delimitertype, "legacy", length_deltype))
-    ## Legacy splitting is fast
-    if (! ischar (del))
-      if (iscell (del) && all (cellfun (@numel, del) < 2))
-        del = [del{:}];
-      else
-        error ("strsplit:legacy_delimiter_must_be_char",
-          "%s %s", "strsplit: for DELIMITERTYPE = ""legacy"" ",
-           "DEL must be a string, or a cell array scalar character elements.")
-      endif
-    endif
-    if (strcmp (typeinfo (del), "sq_string"))
-      del = do_string_escapes (del);
-    endif
-    ## Split str at each character contained in del
-    if (isscalar (del))
-      ## Single separator
-      idx = find (str == del);
-    else
-      ## Multiple separators
-      idx = strchr (str, del);
-    endif
-
-    ## Get substring lengths.
-    if (isempty (idx))
-      strlens = length (str);
-    else
-      strlens = [idx(1)-1, diff(idx)-1, numel(str)-idx(end)];
-    endif
-    if (nargout > 1)
-      ## Grab the separators
-      matches = num2cell (str(idx)(:)).';
-      if (args.collapsedelimiters)
-        ## Collapse the consequtive delimiters
-        ## TODO - is there a vectorized way?
-        for m = numel(matches):-1:2
-          if (strlens(m) == 0)
-            matches{m-1} = [matches{m-1:m}];
-            matches(m) = [];
-          endif
-        end
-      endif
-    endif
-    ## Remove separators.
-    str(idx) = [];
-    if (args.collapsedelimiters)
-      ## Omit zero lengths.
-      strlens = strlens(strlens != 0);
-    endif
-
-    ## Convert!
-    result = mat2cell (str, 1, strlens);
   elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype)
           || strncmpi (args.delimitertype, "simple", length_deltype))
     if (iscellstr (del))
@@ -290,9 +212,6 @@
   endif
 endfunction

-% Mimic the old strsplit()
-%!assert (cellfun (@numel, strsplit (["a,b,c";"1,2   "], ",")), [1 1 2 1 4])
-
 %!shared str
 %! str = "The rain in Spain stays mainly in the plain.";
 % Split on all whitespace.
@@ -323,7 +242,7 @@
 %!assert (strsplit ("road to^hell", {" ","^"}), {"road", "to", "hell"})
 %!assert (strsplit ("road   to--hell", {" ","-"}, true), {"road", "to", "hell"})
 %!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "s"), {"a", "bc", "", "de"})
-%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", char(ones(1,0)), "de"})
+%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", "", "de"})
 %!assert (strsplit (["a,bc,de"], ",", true), {"a", "bc", "de"})
 %!assert (strsplit (["a,bc,de"], {","," "}, true), {"a", "bc", "de"})

@@ -337,14 +256,6 @@
 %!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"})
 %!assert (strsplit ("hello \t world", 1, "delimitertype", "r"), {"hello", "world"});

-%!assert (strsplit ("road to hell", " ", false, "delimitertype", "l"), {"road", "to", "hell"})
-%!assert (strsplit ("road to^hell", " ^", false, "delimitertype", "l"), {"road", "to", "hell"})
-%!assert (strsplit ("road   to--hell", " -", true, "delimitertype", "l"), {"road", "to", "hell"})
-%!assert (strsplit (["a,bc";",de"], ",", false, "delimitertype", "l"), {"a", "bc", char(ones(1,0)), "de "})
-%!assert (strsplit (["a,bc";",de"], ",", true, "delimitertype", "l"), {"a", "bc", "de "})
-%!assert (strsplit (["a,bc";",de"], ", ", true, "delimitertype", "l"), {"a", "bc", "de"})
-
-%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "l"), {"foo", "bar"})
 %!assert (strsplit ("foo\tbar", '\t', "delimitertype", "r"), {"foo", "bar"})
 %!assert (strsplit ("foo\tbar", '\t', "delimitertype", "s"), {"foo", "bar"})

@@ -367,14 +278,16 @@
 %! assert (a, {"a", "b"})
 %! assert (m, {"\t \n"})
 %!test
-%! [a, m] = strsplit ("a123b", "123", "delimitertype", "legacy");
-%! assert (a, {"a", "b"})
-%! assert (m, {"123"})
-%!test
 %! [s, m] = strsplit ("hello \t world", 1);
 %! assert (s, {"hello", "world"});
 %! assert (m, {" \t "});

+%% Compatibility: leading/trailing and repeated delimiters, default collapsing
+%!assert (strsplit ("", "a"), {""})
+%!assert (strsplit ("a", "a"), {"", ""})
+%!assert (strsplit ("aa", "a"), {"", ""})
+%!assert (strsplit ("aaa", "a"), {"", ""})
+
 %% Test input validation
 %!error strsplit ()
 %!error strsplit ("abc", "b", true, 4)