diff scripts/strings/strsplit.m @ 16403:1de4ec2a856d

Matlab compatibility for strsplit() * scripts/strings/strsplit.m: Matlab compatible version. * NEWS: Mention break in backward compatibility. * scripts/deprecated/javafields.m: Modify call to strsplit(). * scripts/deprecated/javamethods.m: ditto * scripts/general/fieldnames.m: ditto * scripts/general/int2str.m: ditto * scripts/general/methods.m: ditto * scripts/general/num2str.m: ditto * scripts/help/gen_doc_cache.m: ditto * scripts/help/help.m: ditto * scripts/help/lookfor.m: ditto * scripts/io/strread.m: ditto * scripts/java/javaclasspath.m: ditto * scripts/miscellaneous/compare_versions.m: ditto * scripts/miscellaneous/computer.m: ditto * scripts/miscellaneous/fact.m: ditto * scripts/miscellaneous/tar.m: ditto * scripts/miscellaneous/unpack.m: ditto * scripts/miscellaneous/what.m: ditto * scripts/miscellaneous/zip.m: ditto * scripts/pkg/private/configure_make.m: ditto * scripts/pkg/private/fix_depends.m: ditto * scripts/pkg/private/generate_lookfor_cache.m: ditto * scripts/pkg/private/list_forge_packages.m: ditto * scripts/pkg/private/unload_packages.m: ditto * scripts/pkg/private/write_index.m: ditto * scripts/plot/private/__file_filter__.m: ditto * scripts/plot/private/__fltk_file_filter__.m: ditto * scripts/plot/private/__go_draw_axes__.m: ditto * scripts/plot/private/__next_line_style__.m: ditto * scripts/strings/untabify.m: ditto * scripts/testfun/rundemos.m: ditto * scripts/testfun/runtests.m: ditto
author Ben Abbott <bpabbott@mac.com>
date Sun, 31 Mar 2013 19:19:04 -0400
parents e92dde30f754
children 5be43435bd5b
line wrap: on
line diff
--- a/scripts/strings/strsplit.m	Sun Mar 31 23:56:11 2013 +0200
+++ b/scripts/strings/strsplit.m	Sun Mar 31 19:19:04 2013 -0400
@@ -17,20 +17,33 @@
 ## <http://www.gnu.org/licenses/>.
 
 ## -*- texinfo -*-
-## @deftypefn  {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{sep})
-## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{sep}, @var{strip_empty})
-## Split the string @var{s} using one or more separators @var{sep} and return
-## a cell array of strings.  Consecutive separators and separators at
-## boundaries result in empty strings, unless @var{strip_empty} is true.
-## The default value of @var{strip_empty} is false.
+## @deftypefn  {Function File} {[@var{cstr}] =} strsplit (@var{s})
+## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del})
+## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del}, @var{collapsedelimiters})
+## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value})
+## @deftypefnx {Function File} {[@var{cstr}, @var{matches}] =} strsplit (@dots{})
+## Split the string @var{s} using the delimiters specified by @var{del} and return
+## a cell array of strings.  For a single delimiter, @var{del} may be a string,
+## or a scalar cell-string.  For multiple delimiters, @var{del} must be a cell-string
+## array.  Unless @var{collapsedelimiters} is specified to be @var{false}, consecutive
+## delimiters are collapsed into one.
 ##
-## 2-D character arrays are split at separators and at the original column
-## boundaries.
+## The second output, @var{matches}, returns the delimiters which were matched
+## in the original string.  The matched delimiters are unaffected by the
+## @var{collapsedelimiters} setting.
 ##
 ## Example:
 ##
 ## @example
 ## @group
+## strsplit ("a b c")
+##       @result{}
+##           @{
+##             [1,1] = a
+##             [1,2] = b
+##             [1,3] = c
+##           @}
+##
 ## strsplit ("a,b,c", ",")
 ##       @result{}
 ##           @{
@@ -39,80 +52,221 @@
 ##             [1,3] = c
 ##           @}
 ##
-## strsplit (["a,b" ; "cde"], ",")
+## strsplit ("a foo b,bar c", @{" ", ",", "foo", "bar"@})
+##       @result{}
+##           @{
+##             [1,1] = a
+##             [1,2] = b
+##             [1,3] = c
+##           @}
+##
+## strsplit ("a,,b, c", @{",", " "@}, false)
+##       @result{}
+##           @{
+##             [1,1] = a
+##             [1,2] = 
+##             [1,3] = b
+##             [1,4] = 
+##             [1,5] = c
+##           @}
+##
+## @end group
+## @end example
+##
+## Supported @var{name}/@var{value} pair arguments are:
+##
+## @itemize
+## @item @code{collapsedelimiters} may take the value of @var{true} or @var{false}
+## with the default being @var{true}.
+## @item @code{delimitertype} may take the value of @code{simple} or @code{regularexpression},
+## with the default being @code{simple}.
+## @end itemize
+## 
+## Example:
+##
+## @example
+## @group
+## strsplit ("a foo b,bar c", ",|\\s|foo|bar", "delimitertype", "regularexpression")
+##       @result{}
+##           @{
+##             [1,1] = a
+##             [1,2] = b
+##             [1,3] = c
+##           @}
+## 
+## strsplit ("a,,b, c", "[, ]", false, "delimitertype", "regularexpression")
+##       @result{}
+##           @{
+##             [1,1] = a
+##             [1,2] = 
+##             [1,3] = b
+##             [1,4] = 
+##             [1,5] = c
+##           @}
+## 
+## strsplit ("a,\t,b, c", @{',', '\s'@}, "delimitertype", "regularexpression")
 ##       @result{}
 ##           @{
 ##             [1,1] = a
 ##             [1,2] = b
-##             [1,3] = cde
+##             [1,3] = c
 ##           @}
 ## @end group
 ## @end example
-## @seealso{strtok}
+## 
+## @seealso{strtok, regexp}
 ## @end deftypefn
 
-function cstr = strsplit (s, sep, strip_empty = false)
+function [result, matches] = strsplit (str, del, varargin)
 
-  if (nargin < 2 || nargin > 3)
+  args.collapsedelimiters = true;
+  args.delimitertype = "simple";
+
+  [reg, params] = parseparams (varargin);
+
+  if (numel (reg) > 1)
     print_usage ();
-  elseif (! ischar (s) || ! ischar (sep))
-    error ("strsplit: S and SEP must be string values");
-  elseif (! isscalar (strip_empty))
-    error ("strsplit: STRIP_EMPTY must be a scalar value");
+  elseif (numel (reg) == 1)
+    if (islogical (reg{1}) || isnumeric (reg{1}))
+      args.collapsedelimiters = reg{1};
+    else
+      print_usage ();
+    endif
+  endif
+  fields = fieldnames (args);
+  for n = 1:2:numel(params)
+    if (any (strcmpi (params{n}, fields)))
+      args.(lower(params{n})) = params{n+1};
+    elseif (ischar (varargin{n}))
+      error ("strsplit:invalid_parameter_name",
+        sprintf ("strsplit: Invalid parameter name, `%s'", varargin{n}))
+    else
+      print_usage ();
+    endif
+  endfor
+
+  # Save the length of the "delimitertype" parameter
+  length_deltype = numel (args.delimitertype);
+
+  if (nargin == 1 || (nargin > 1 && (islogical (del) || isnumeric (del))))
+    if (nargin > 1)
+      ## Second input is the "collapsedelimiters" parameter
+      args.collapsedelimiters = del;
+    endif
+    ## Set proper default for the delimiter type
+    if (strncmpi (args.delimitertype, "simple", numel (args.delimitertype)))
+      del = {" ","\f","\n","\r","\t","\v"};
+    else
+      del = "\\s";
+    endif
+  endif
+
+  if (nargin < 1)
+    print_usage ();
+  elseif (! ischar (str) || (! ischar (del) && ! iscellstr (del)))
+    error ("strsplit: S and DEL must be string values");
+  elseif (rows (str) > 1)
+    error ("strsplit: S must be a string value");
+  elseif (! isscalar (args.collapsedelimiters))
+    error ("strsplit: COLLAPSEDELIMITERS must be a scalar value");
+  endif
+
+  if (strncmpi (args.delimitertype, "simple", length_deltype))
+    if (iscellstr (del))
+      del = cellfun (@(x) regexp2simple (x, false), del, "uniformoutput",
+        false);
+    else
+      del = regexp2simple (del, false);
+    endif
   endif
 
-  if (isempty (s))
-    cstr = cell (size (s));
-  else
-    if (rows (s) > 1)
-      ## For 2-D arrays, add separator character at line boundaries
-      ## and transform to single string
-      s(:, end+1) = sep(1);
-      s = reshape (s.', 1, numel (s));
-      s(end) = []; 
+  if (isempty (str))
+    result = {str};
+  elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype)
+          || strncmpi (args.delimitertype, "simple", length_deltype))
+    if (iscellstr (del))
+      del = sprintf ('%s|', del{:});
+      del(end) = [];
     endif
-
-    ## Split s according to delimiter
-    if (isscalar (sep))
-      ## Single separator
-      idx = find (s == sep);
-    else
-      ## Multiple separators
-      idx = strchr (s, sep);
+    [result, ~, ~, ~, matches] = regexp (str, del, "split");
+    if (args.collapsedelimiters)
+      result(cellfun (@isempty, result)) = [];
+    endif
+    if (strncmpi (args.delimitertype, "simple", length_deltype))
+      matches = cellfun (@(x) regexp2simple (x, true), matches,
+        "uniformoutput", false);
     endif
+  else
+    error ("strsplit:invalid_delimitertype", 
+      sprintf ("strsplit: Invalid DELIMITERTYPE"))
+  endif
+endfunction
 
-    ## Get substring lengths.
-    if (isempty (idx))
-      strlens = length (s);
-    else
-      strlens = [idx(1)-1, diff(idx)-1, numel(s)-idx(end)];
-    endif
-    ## Remove separators.
-    s(idx) = [];
-    if (strip_empty)
-      ## Omit zero lengths.
-      strlens = strlens(strlens != 0);
-    endif
-
-    ## Convert!
-    cstr = mat2cell (s, 1, strlens);
+function str = regexp2simple (str, reverse = false)
+  rep = {'\', '[', ']', '{', '}', '$', '^', '(', ')', '*', '+', '.', '?', '|'};
+  if (reverse)
+    ## backslash must go last
+    for r = numel(rep):-1:1
+      str = strrep (str, [char(92), rep{r}], rep{r});
+    endfor
+  else
+    ## backslash must go first
+    for r = 1:numel(rep)
+      str = strrep (str, rep{r}, [char(92), rep{r}]);
+    endfor
   endif
-
 endfunction
 
+%!shared str
+%! str = "The rain in Spain stays mainly in the plain.";
+% Split on all whitespace.
+%!assert (strsplit (str), {"The", "rain", "in", "Spain", "stays", ...
+%! "mainly", "in", "the", "plain."})
+% Split on "ain".
+%!assert (strsplit (str, "ain"), {"The r", " in Sp", " stays m", ...
+%!  "ly in the pl", "."})
+% Split on " " and "ain" (treating multiple delimiters as one).
+%!test
+%! s = strsplit (str, '\s|ain', true, "delimitertype", "r");
+%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."})
+%!test
+%! s = strsplit (str, "\\s|ain", true, "delimitertype", "r");
+%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."})
+%!test
+%! [s, m] = strsplit (str, {"\\s", "ain"}, true, "delimitertype", "r");
+%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."})
+%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", " ", " ", " ", "ain"})
+% Split on " " and "ain", and treat multiple delimiters separately.
+%!test
+%! [s, m] = strsplit (str, {" ", "ain"}, "collapsedelimiters", false);
+%! assert (s, {"The", "r", "", "in", "Sp", "", "stays", "m", "ly", "in", "the", "pl", "."})
+%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", " ", " ", " ", "ain"})
 
+%!assert (strsplit ("road to hell"), {"road", "to", "hell"})
 %!assert (strsplit ("road to hell", " "), {"road", "to", "hell"})
-%!assert (strsplit ("road to^hell", " ^"), {"road", "to", "hell"})
-%!assert (strsplit ("road   to--hell", " -", true), {"road", "to", "hell"})
-%!assert (strsplit (["a,bc";",de"], ","), {"a", "bc", char(ones(1,0)), "de "})
-%!assert (strsplit (["a,bc";",de"], ",", true), {"a", "bc", "de "})
-%!assert (strsplit (["a,bc";",de"], ", ", true), {"a", "bc", "de"})
+%!assert (strsplit ("road to^hell", {" ","^"}), {"road", "to", "hell"})
+%!assert (strsplit ("road   to--hell", {" ","-"}, true), {"road", "to", "hell"})
+%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", "", "de"})
+%!assert (strsplit (["a,bc,de"], ",", true), {"a", "bc", "de"})
+%!assert (strsplit (["a,bc,de"], {","," "}, true), {"a", "bc", "de"})
+%!test
+%! [s, m] = strsplit ("hello \t world", 1);
+%! assert (s, {"hello", "world"});
+%! assert (m, {" ", "\t", " "});
+
+%!assert (strsplit ("road to hell", " ", "delimitertype", "r"), {"road", "to", "hell"})
+%!assert (strsplit ("road to^hell", '\^| ', "delimitertype", "r"), {"road", "to", "hell"})
+%!assert (strsplit ("road to^hell", "[ ^]", "delimitertype", "r"), {"road", "to", "hell"})
+%!assert (strsplit ("road   to--hell", "[ -]", false, "delimitertype", "r"), {"road", "", "", "to", "", "hell"})
+%!assert (strsplit (["a,bc,de"], ",", "delimitertype", "r"), {"a", "bc", "de"})
+%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "r"), {"a", "bc", "", "de"})
+%!assert (strsplit (["a,bc,de"], ",", true, "delimitertype", "r"), {"a", "bc", "de"})
+%!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"})
+%!assert (strsplit ("hello \t world", 1, "delimitertype", "r"), {"hello", "world"});
 
 %% Test input validation
 %!error strsplit ()
-%!error strsplit ("abc")
 %!error strsplit ("abc", "b", true, 4)
-%!error <S and SEP must be string values> strsplit (123, "b")
-%!error <S and SEP must be string values> strsplit ("abc", 1)
-%!error <STRIP_EMPTY must be a scalar value> strsplit ("abc", "def", ones (3,3))
+%!error <S and DEL must be string values> strsplit (123, "b")
+%!error <COLLAPSEDELIMITERS must be a scalar value> strsplit ("abc", "def", ones (3,3))