changeset 12866:fe6e2afcd9ee

Revamp strread, textscan, textread functions for Matlab compatibility. Implemented ML-compatible whitespace and delimiter defaults. Implemented ML-compatible options: 'whitespace', 'treatasempty', format string repeat count, user-specified comment style, uneven-length output arrays, %n and %u conversion specifiers (provisionally). Implemented processing of given-width format specifiers. * textscan.m: Add new tests. Implement EndofLine, ReturnOnError, TreatAsEmpty options. Improve whitespace handling. * textread.m: Add new tests. Implement EndofLine option. Improve whitespace handling. * strread.m: Major rewrite.
author Philip Nienhuis <prnienhuis@users.sf.net>
date Fri, 22 Jul 2011 13:05:26 -0700
parents a4d1581f9e72
children b00181c65533
files NEWS doc/interpreter/contributors.in scripts/io/strread.m scripts/io/textread.m scripts/io/textscan.m
diffstat 5 files changed, 704 insertions(+), 148 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS	Wed Jul 20 10:41:59 2011 -0700
+++ b/NEWS	Fri Jul 22 13:05:26 2011 -0700
@@ -3,6 +3,16 @@
 
  ** The PCRE library is now required to build Octave.
 
+ ** strread, textscan, and textread have been completely revamped.
+
+    They now support nearly all Matlab functionality including:
+
+    * ML-compatible whitespace and delimiter defaults
+
+    * ML-compatible options: 'whitespace', 'treatasempty',
+    format string repeat count, user-specified comment style, uneven-length
+    output arrays, %n and %u conversion specifiers (provisionally)
+ 
  ** New functions added.
 
     iscolumn
--- a/doc/interpreter/contributors.in	Wed Jul 20 10:41:59 2011 -0700
+++ b/doc/interpreter/contributors.in	Fri Jul 22 13:05:26 2011 -0700
@@ -152,6 +152,7 @@
 Victor Munoz
 Carmen Navarrete
 Todd Neal
+Philip Nienhuis
 Al Niessner
 Rick Niles
 Takuji Nishimura
--- a/scripts/io/strread.m	Wed Jul 20 10:41:59 2011 -0700
+++ b/scripts/io/strread.m	Fri Jul 22 13:05:26 2011 -0700
@@ -19,7 +19,9 @@
 ## -*- texinfo -*-
 ## @deftypefn  {Function File} {[@var{a}, @dots{}] =} strread (@var{str})
 ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format})
+## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat})
 ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{prop1}, @var{value1}, @dots{})
+## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}, @var{prop1}, @var{value1}, @dots{})
 ## Read data from a string.
 ##
 ## The string @var{str} is split into words that are repeatedly matched to the
@@ -38,17 +40,27 @@
 ##
 ## @item %d
 ## @itemx %f
-## The word is parsed as a number.
+## @itemx %u
+## @itemx %n
+## The word is parsed as a number (and converted to double).
+##
+## @item %*, %*f, %*s
+## The word is skipped.
 ##
-## @item %*
-## The word is skipped.
+## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an
+## optional width can be specified as %Ns, etc. where N is an integer > 1.
+## For %f, formats like %N.Mf are allowed.
+##
+## @item literals
+## In addition the format may contain literal character strings; these will be
+## skipped during reading.
 ## @end table
 ##
 ## Parsed word corresponding to the first specifier are returned in the first
 ## output argument and likewise for the rest of the specifiers.
 ##
 ## By default, @var{format} is @t{"%f"}, meaning that numbers are read from
-## @var{str}.
+## @var{str}.  This will do if @var{str} contains only numeric fields.
 ##
 ## For example, the string
 ##
@@ -68,6 +80,18 @@
 ## [@var{a}, @var{b}, @var{c}] = strread (@var{str}, "%s %s %f");
 ## @end example
 ##
+## Optional numeric argument @var{format_repeat} can be used for
+## limiting the number of items read:
+## @table @asis
+## @item -1
+## (default) read all of the string until the end.
+##
+## @item N
+## Read N times @var{nargout} items.  0 (zero) is an acceptable
+## value for @var{format_repeat}.
+##
+## @end table
+##
 ## The behavior of @code{strread} can be changed via property-value
 ## pairs.  The following properties are recognized:
 ##
@@ -87,83 +111,144 @@
 ##
 ## @item "matlab"
 ## Everything from @code{%} characters to the nearest end-line is skipped.
+##
+## @item user-supplied.  Two options:
+## (1) One string, or 1x1 cell string: Skip everything to the right of it;
+## (2) 2x1 cell string array: Everything between the left and right strings
+## is skipped.
 ## @end itemize
 ##
 ## @item "delimiter"
-## Any character in @var{value} will be used to split @var{str} into words.
+## Any character in @var{value} will be used to split @var{str} into words 
+## (default value = any whitespace).
+##
+## @item "whitespace"
+## Any character in @var{value} will be interpreted as whitespace and
+## trimmed; the string defining whitespace must be enclosed in double
+## quotes for proper processing of special characters like \t.
+## The default value for whitespace = " \b\r\n\t" (note the space).
 ##
 ## @item "emptyvalue"
 ## Parts of the output where no word is available is filled with @var{value}.
+##
+## @item "treatasempty"
+## Treat single occurrences (surrounded by delimiters or whitespace) of the
+## string(s) in @var{value} as missing values.
+##
+## @item "returnonerror"
+## If @var{value} is true (1, default), ignore read errors and return normally.
+## If false (0), return an error.
+## 
 ## @end table
 ##
-## @seealso{textread, load, dlmread, fscanf}
+## @seealso{textscan, textread, load, dlmread, fscanf}
 ## @end deftypefn
 
 function varargout = strread (str, format = "%f", varargin)
+
   ## Check input
   if (nargin < 1)
     print_usage ();
   endif
 
-  if (!ischar (str) || !ischar (format))
+  if (isempty (format))
+    format = "%f";
+  endif
+
+  if (! ischar (str) || ! ischar (format))
     error ("strread: STR and FORMAT arguments must be strings");
   endif
 
-  ## Parse options
+  ## Check for format string repeat count
+  format_repeat_count = -1;
+  if (nargin > 2 && isnumeric (varargin{1}))
+    if (varargin{1} >= 0)
+      format_repeat_count = varargin{1};
+    endif
+    if (nargin > 3)
+      varargin = varargin(2:end);
+    else
+      varargin = {};
+    endif
+  endif
+
+  ## Parse options.  First initialize defaults
   comment_flag = false;
-  numeric_fill_value = 0;
-  white_spaces = " \n\r\t\b";
+  numeric_fill_value = NaN;
+  white_spaces = " \b\r\n\t";
   delimiter_str = "";
+  eol_char = "";
+  empty_str = "";
+  err_action = 0;
   for n = 1:2:length (varargin)
-    switch (lower (varargin {n}))
+    switch (lower (varargin{n}))
+      case "bufsize"
+        ## We could synthesize this, but that just seems weird...
+        warning ('strread: property "bufsize" is not implemented');
       case "commentstyle"
         comment_flag = true;
-        switch (lower (varargin {n+1}))
+        switch (lower (varargin{n+1}))
           case "c"
-            comment_specif = {"/*", "*/"};
+            [comment_start, comment_end] = deal ("/*", "*/");
           case "c++"
-            comment_specif = {"//", "\n"};
+            [comment_start, comment_end] = deal ("//", "\n");
           case "shell"
-            comment_specif = {"#", "\n"};
+            [comment_start, comment_end] = deal ("#", "\n");
           case "matlab"
-            comment_specif = {"%", "\n"};
+            [comment_start, comment_end] = deal ("%", "\n");
           otherwise
-            warning ("strread: unknown comment style '%s'", val);
+            if (ischar (varargin{n+1}) ||
+               (numel (varargin{n+1}) == 1 && iscellstr (varargin{n+1})))
+              tmp = char (varargin{n+1});
+              [comment_start, comment_end] = deal (tmp, "\n");
+            elseif (iscellstr (varargin{n+1}) && numel (varargin{n+1}) == 2)
+              [comment_start, comment_end] = deal (varargin{n+1}{:});
+            else
+              ## FIXME - a user may have numeric values specified: {'//', 7}
+              ##         this will lead to an error in the warning message
+              error ("strread: unknown or unrecognized comment style '%s'", 
+                      varargin{n+1});
+            endif
         endswitch
       case "delimiter"
-        delimiter_str = varargin {n+1};
+        delimiter_str = varargin{n+1};
       case "emptyvalue"
-        numeric_fill_value = varargin {n+1};
-      case "bufsize"
-        ## XXX: We could synthesize this, but that just seems weird...
-        warning ("strread: property \"bufsize\" is not implemented");
+        numeric_fill_value = varargin{n+1};
+      case "expchars"
+        warning ('strread: property "expchars" is not implemented');
       case "whitespace"
-        white_spaces = varargin {n+1};
-      case "expchars"
-        warning ("strread: property \"expchars\" is not implemented");
+        white_spaces = varargin{n+1};
+      ## The following parameters are specific to textscan and textread
+      case "endofline"
+        eol_char = varargin{n+1};
+      case "returnonerror"
+        err_action = varargin{n+1};
+      case "treatasempty"
+        empty_str = varargin{n+1};
+        if (ischar (empty_str))
+          empty_str = {empty_str};
+        endif
       otherwise
-        warning ("strread: unknown property \"%s\"", varargin {n});
+        warning ('strread: unknown property "%s"', varargin{n});
     endswitch
   endfor
-  if (isempty (delimiter_str))
-    delimiter_str = white_spaces;
-  endif
 
-  ## Parse format string
+  ## Parse format string to compare nr. of conversion fields and nargout
   idx = strfind (format, "%")';
-  specif = format ([idx, idx+1]);
+  specif = format([idx, idx+1]);
   nspecif = length (idx);
   idx_star = strfind (format, "%*");
   nfields = length (idx) - length (idx_star);
-
-  if (max (nargout, 1) != nfields)
-    error ("strread: the number of output variables must match that specified byFORMAT");
+  ## If str only has numeric fields, a (default) format ("%f") will do.
+  ## Otherwise:
+  if ((max (nargout, 1) != nfields) && ! strcmp (format, "%f"))
+    error ("strread: the number of output variables must match that specified by FORMAT");
   endif
 
   ## Remove comments
   if (comment_flag)
-    cstart = strfind (str, comment_specif{1});
-    cstop  = strfind (str, comment_specif{2});
+    cstart = strfind (str, comment_start);
+    cstop  = strfind (str, comment_end);
     if (length (cstart) > 0)
       ## Ignore nested openers.
       [idx, cidx] = unique (lookup (cstop, cstart), "first");
@@ -181,71 +266,323 @@
       cstop = cstop(cidx);
     endif
     len = length (str);
-    c2len = length (comment_specif{2});
+    c2len = length (comment_end);
     str = cellslices (str, [1, cstop + c2len], [cstart - 1, len]);
     str = [str{:}];
   endif
 
-  ## Determine the number of words per line
-  format = strrep (format, "%", " %");
-  [~, ~, ~, fmt_words] = regexp (format, '[^ ]+');
-
+  if (strcmpi (strtrim (format), "%f"))
+    ## Default format specified.  Expand it (to desired nargout)
+    num_words_per_line = nargout;
+    fmt_words = cell (nargout, 1);
+    fmt_words (1:nargout) = format;
+  else
+    ## Determine the number of words per line as a first guess.  Forms
+    ## like %f<literal> (w/o delimiter in between) are fixed further on
+    format = strrep (format, "%", " %");
+    fmt_words = regexp (format, '[^ ]+', 'match');
+    ## Format conversion specifiers following literals w/o space/delim
+    ## in between are separate now.  Separate those w trailing literals
+    idy2 = find (! cellfun ("isempty", strfind (fmt_words, "%")));
+    a = strfind (fmt_words(idy2), "%");
+    b = regexp (fmt_words(idy2), '[nfdus]', 'end');
+    for jj = 1:numel (a)
+      ii = numel (a) - jj + 1;
+      if (! (length (fmt_words{idy2(ii)}) == b{ii}(1)))
+        ## Fix format_words
+        fmt_words(idy2(ii)+1 : end+1) = fmt_words(idy2(ii) : end);
+        fmt_words{idy2(ii)} = fmt_words{idy2(ii)}(a{ii} : b{ii}(1));
+        fmt_words{idy2(ii)+1} = fmt_words{idy2(ii)+1}(b{ii}+1:end);
+      endif
+    endfor
+  endif
   num_words_per_line = numel (fmt_words);
-  for m = 1:numel(fmt_words)
-    ## Convert formats such as "%Ns" to "%s" (see the FIXME below)
-    if (length (fmt_words{m}) > 2)
-      if (strcmp (fmt_words{m}(1:2), "%*"))
-        fmt_words{m} = "%*";
-      elseif (fmt_words{m}(1) == "%")
-        fmt_words{m} = fmt_words{m}([1, end]);
+
+  if (! isempty (white_spaces))
+    ## Check for overlapping whitespaces and delimiters & trim whitespace
+    if (! isempty (delimiter_str))
+      [ovlp, iw] = intersect (white_spaces, delimiter_str);
+      if (! isempty (ovlp))
+        ## Remove delimiter chars from white_spaces
+        white_spaces = cell2mat (strsplit (white_spaces, white_spaces(iw)));
       endif
     endif
-  endfor
+  endif
+
+  if (isempty (delimiter_str))
+    delimiter_str = " ";
+  endif
+
+  if (! isempty (eol_char))
+    ## eol_char is delimiter by default. First separate CRLF from single CR & LF
+    if (strcmp (eol_char, "\r\n"))
+      ## Strip CR from CRLF sequences
+      str = strrep (str, "\r\n", "\n");
+      ## CR serves no further purpose in function
+      eol_char = "\n";  
+    endif
+    ## Add eol_char to delimiter collection
+    delimiter_str = unique ([delimiter_str eol_char]);
+  endif
+
+  pad_out = 0;
+  ## If needed, trim whitespace
+  if (! isempty (white_spaces))
+    ## Check if trailing "\n" might signal padding output arrays to equal size
+    ## before it is trimmed away below
+    if ((str(end) == 10) && (nargout > 1)) 
+      pad_out = 1;
+    endif
+    ## Remove repeated white_space chars.  First find white_space positions
+    idx = strchr (str, white_spaces);
+    ## Find repeated white_spaces
+    idx2 = ! (idx(2:end) - idx(1:end-1) - 1);
+    ## Set all whitespace chars to spaces
+    ## FIXME: this implies real spaces are always part of white_spaces
+    str(idx) = ' ';
+    ## Set all repeated white_space to \0
+    str(idx(idx2)) = "\0";
+    str = strsplit (str, "\0");
+    ## Reconstruct trimmed str
+    str = cell2mat (str);
+    ## Remove leading & trailing space, but preserve delimiters.
+    str = strtrim (str);
+  endif
 
   ## Split 'str' into words
   words = split_by (str, delimiter_str);
+  if (! isempty (white_spaces))
+    ## Trim leading and trailing white_spaces
+    words = strtrim (words);
+  endif
   num_words = numel (words);
+  ## First guess at number of lines in file (ignoring leading/trailing literals)
   num_lines = ceil (num_words / num_words_per_line);
 
-  ## For each specifier
+  ## Replace TreatAsEmpty char sequences by empty strings
+  if (! isempty (empty_str))
+    ## FIXME: There should be a simpler way to do this with cellfun
+    for ii = 1:numel (empty_str)
+      idz = strmatch (empty_str{ii}, words, "exact");
+      words(idz) = {""};
+    endfor
+  endif
+  
+  ## We now may have to cope with 3 cases:
+  ## A: Trailing literals (%f<literal>) w/o delimiter in between.
+  ## B: Leading literals (<literal>%f) w/o delimiter in between.
+  ## C. Skipping leftover parts of specified skip fields (%*N )
+  ## fmt_words has been split properly now, but words{} has only been split on
+  ## delimiter positions.  Some words columns may have to be split further.
+  ## We also don't know the number of lines (as EndOfLine may have been set to
+  ## "" (empty) by the caller).
+
+  ## Find indices and pointers to possible literals in fmt_words
+  idf = cellfun ("isempty", strfind (fmt_words, "%"));
+  ## Find indices and pointers to "%*" (skip) conversion specifiers
+  idg = ! cellfun ("isempty", strfind (fmt_words, "%*"));
+  ## Unselect those with specified width ("%*N")
+  st = regexp (fmt_words, '\d');
+  idy = find (idf);
+
+  ## If needed, split up columns in three steps:
+  if (! isempty (idy))
+    ## Try-catch because complexity of strings to read can be infinite    
+    try
+
+      ## 1. Assess "period" in the split-up words array ( < num_words_per_line).
+      ## Could be done using EndOfLine but that prohibits EndOfLine = "" option.
+      fmt_in_word = cell (num_words_per_line, 1);
+      words_period = 1;
+      ## For each literal in turn
+      for ii = 1:numel (idy)
+        fmt_in_word(idy(ii)) = num_words;
+        ## Find *current* "return period" for fmt_word{idy(ii)} in words 
+        ## Search in (at most) the first 10*num_words_per_line words
+        litptrs = find (! cellfun ("isempty", strfind ...
+                   (words(1:min (10*num_words_per_line, num_words)), ...
+                   fmt_words{idy(ii)})));
+        if (length (litptrs) > 1)
+          litptr = sum (unique (litptrs(2:end) .- litptrs(1:end-1)));
+        endif
+      endfor
+      words_period = max (words_period, litptr);
+      num_lines = ceil (num_words / words_period);
+
+      ## 2. Pad words array so that it can be reshaped
+      tmp_lines = ceil (num_words / words_period);
+      num_words_padded = tmp_lines * words_period - num_words;
+      if (num_words_padded)
+        words = [words'; cell(num_words_padded, 1)]; 
+      endif
+      words = reshape (words, words_period, tmp_lines);
+
+      ## 3. Do the column splitting on rectangular words array
+      icol = 1; ii = 1;    # icol = current column, ii = current fmt_word
+      while (ii <= num_words_per_line)
+
+        ## Check if fmt_words(ii) contains a literal
+        if (idf(ii))             # Yes, fmt_words(ii) = literal
+          [s, e] = regexp (words{icol, 1}, fmt_words{ii});
+          if (isempty (s))
+            warning ("Literal '%s' not found in column %d", fmt_words{ii}, icol);
+          else
+            if (! strcmp (fmt_words{ii}, words{icol, 1}))
+              ## Column doesn't exactly match literal => split needed.  Add a column
+              words(icol+1:end+1, :) = words(icol:end, :); 
+              ## Watch out for empty cells
+              jptr = find (! cellfun ("isempty", words(icol, :)));
+
+              ## Distinguish leading or trailing literals
+              if (!isempty (s) && s(1) == 1)
+                ## Leading literal.  Assign literal to icol, paste rest in icol + 1
+                ## Apply only to those cells that do have something beyond literal
+                jptr = find ([cellfun(@(x) length(x), words(icol+1, jptr), ...
+                              "UniformOutput", false){:}] > e(1));
+                words(icol+1, jptr) = cellfun ...
+                  (@(x) substr(x, e(1)+1, length(x)-e(1)), words(icol, jptr), ...
+                  "UniformOutput", false);
+                words(icol, jptr) = fmt_words{ii};
+
+              else
+                ## Trailing literal.  If preceding format == '%s' this is an error
+                if (! isempty (strfind (fmt_words{ii-1}, "%s")))
+                  warning ("Ambiguous '%s' specifier next to literal in column %d", icol);
+                else
+                  ## Some invoked code to avoid regexp which seems demanding
+                  ## on large files
+                  ## FIXME: this assumes char(254)/char(255) won't occur in input!
+                  clear wrds;
+                  wrds(1:2:2*numel (words(icol, jptr))) = ...
+                       strrep (words(icol, jptr), fmt_words{ii}, ...
+                       [char(255) char(254)]);
+                  wrds(2:2:2*numel (words(icol, jptr))-1) = char(255);
+                  wrds = strsplit ([wrds{:}], char(255));
+                  words(icol, jptr) = ...
+                    wrds(find (cellfun ("isempty", strfind (wrds, char(254)))));
+                  wrds(find (cellfun ("isempty", strfind (wrds, char(254))))) ...
+                     = char(255);
+                  words(icol+1, jptr) = strsplit (strrep ([wrds{2:end}], ...
+                     char(254), fmt_words{ii}), char(255));
+                endif
+                ## Former trailing literal may now be leading for next specifier
+                --ii;
+              endif
+            endif
+          endif
+
+        else
+          ## Conv. specifier.  Peek if next fmt_word needs split from current column
+          if (ii < num_words_per_line && idf(ii+1))
+            if (! isempty (strfind (words{icol, 1}, fmt_words{ii+1})))
+              --icol;
+            endif
+          endif
+        endif
+        ## Next fmt_word, next column
+        ++ii; ++icol;
+      endwhile
+
+      ## Done.  Reshape words back into 1 long vector and strip padded empty words
+      words = reshape (words, 1, numel (words))(1 : end-num_words_padded);
+
+    catch
+      warning ("strread: unable to parse text or file with given format string");
+      return;
+
+    end_try_catch
+  endif
+  
+  ## For each specifier, process corresponding column
   k = 1;
   for m = 1:num_words_per_line
-    data = words (m:num_words_per_line:end);
-    ## Map to format
-    ## FIXME - add support for formats like "%4s" or "<%s>", "%[a-zA-Z]"
-    ##         Someone with regexp experience is needed.
-    switch fmt_words{m}
-      case "%s"
-        data (end+1:num_lines) = {""};
-        varargout {k} = data';
-        k++;
-      case {"%d", "%f"}
-        n = cellfun (@isempty, data);
-        data = str2double (data);
-        data(n) = numeric_fill_value;
-        data (end+1:num_lines) = numeric_fill_value;
-        varargout {k} = data.';
-        k++;
-      case {"%*", "%*s"}
-        ## skip the word
-      otherwise
-        ## Ensure descriptive content is consistent
-        if (numel (unique (data)) > 1
-            || ! strcmpi (unique (data), fmt_words{m}))
-          error ("strread: FORMAT does not match data");
-        endif
-    endswitch
+    try
+      if (format_repeat_count < 0)
+        data = words(m:num_words_per_line:end);
+      elseif (format_repeat_count == 0)
+        data = {};
+      else
+        lastline = ...
+          min (num_words_per_line * format_repeat_count + m - 1, numel (words));
+        data = words(m:num_words_per_line:lastline);
+      endif
+
+      ## Map to format
+      ## FIXME - add support for formats like "<%s>", "%[a-zA-Z]"
+      ##         Someone with regexp experience is needed.
+      switch fmt_words{m}(1:min (2, length (fmt_words{m})))
+        case "%s"
+          if (pad_out)
+            data(end+1:num_lines) = {""}; 
+          endif
+          varargout{k} = data';
+          k++;
+        case {"%d", "%u", "%f", "%n"}
+          n = cellfun ("isempty", data);
+          ### FIXME - erroneously formatted data lead to NaN, not an error
+          data = str2double (data);
+          data(n) = numeric_fill_value;
+          if (pad_out)
+            data(end+1:num_lines) = numeric_fill_value;
+          endif
+          varargout{k} = data.';
+          k++;
+        case {"%0", "%1", "%2", "%3", "%4", "%5", "%6", "%7", "%8", "%9"}
+          nfmt = strsplit (fmt_words{m}(2:end-1), '.');
+          swidth = str2num (nfmt{1});
+          switch fmt_words{m}(end)
+            case {"d", "u", "f", "n%"}
+              n = cellfun ("isempty", data);
+              ### FIXME - erroneously formatted data lead to NaN, not an error
+              ###         => ReturnOnError can't be implemented for numeric data
+              data = str2double (strtrunc (data, swidth));
+              data(n) = numeric_fill_value;
+              if (pad_out)
+                data(end+1:num_lines) = numeric_fill_value;
+              endif
+              if (numel (nfmt) > 1)
+                sprec = str2num (nfmt{2});
+                data = 10^-sprec * round (10^sprec * data);
+              endif
+              varargout{k} = data.';
+              k++;
+            case "s"
+              if (pad_out)
+                data(end+1:num_lines) = {""}
+              endif
+              varargout{k} = strtrunc (data, 3)';
+              k++;
+            otherwise
+          endswitch
+        case {"%*", "%*s"}
+          ## skip the word
+        otherwise
+          ## Ensure descriptive content is consistent
+          if (numel (unique (data)) > 1
+              || ! strcmpi (unique (data), fmt_words{m}))
+            error ("strread: FORMAT does not match data");
+          endif
+      endswitch
+    catch
+      ## As strread processes columnwise, ML-compatible error processing
+      ## (row after row) is not feasible. In addition Octave sets unrecognizable
+      ## numbers to NaN w/o error.  But maybe Octave is better in this respect.
+      if (err_action)
+        ## Just try the next column where ML bails out
+      else
+        rethrow (lasterror);
+      endif
+    end_try_catch
   endfor
+
 endfunction
 
 function out = split_by (text, sep)
-  sep = union (sep, "\n");
-  pat = sprintf ('[^%s]+', sep);
-  [~, ~, ~, out] = regexp (text, pat);
-  out(cellfun (@isempty, out)) = {""};
-  out = strtrim (out);
+  out = strsplit (text, sep);
+  out(cellfun ("isempty", out)) = {""};
 endfunction
 
+
 %!test
 %! [a, b] = strread ("1 2", "%f%f");
 %! assert (a == 1 && b == 2);
@@ -254,14 +591,14 @@
 %! str = "# comment\n# comment\n1 2 3";
 %! [a, b] = strread (str, '%d %s', 'commentstyle', 'shell');
 %! assert (a, [1; 3]);
-%! assert (b, {"2"; ""});
+%! assert (b, {"2"});
 
 %!test
 %! str = '';
 %! a = rand (10, 1);
-%! b = char (round (65 + 20 * rand (10, 1)));
+%! b = char (randi ([65, 85], 10, 1));
 %! for k = 1:10
-%!   str = sprintf ('%s %.6f %s\n', str, a (k), b (k));
+%!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
 %! endfor
 %! [aa, bb] = strread (str, '%f %s');
 %! assert (a, aa, 1e-5);
@@ -270,9 +607,9 @@
 %!test
 %! str = '';
 %! a = rand (10, 1);
-%! b = char (round (65 + 20 * rand (10, 1)));
+%! b = char (randi ([65, 85], 10, 1));
 %! for k = 1:10
-%!   str = sprintf ('%s %.6f %s\n', str, a (k), b (k));
+%!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
 %! endfor
 %! aa = strread (str, '%f %*s');
 %! assert (a, aa, 1e-5);
@@ -294,3 +631,63 @@
 %! a = strread ("a b c, d e, , f", "%s", "delimiter", ",");
 %! assert (a, {"a b c"; "d e"; ""; "f"});
 
+%!test
+%! # Bug #33536
+%! [a, b, c] = strread ("1,,2", "%s%s%s", "delimiter", ",");
+%! assert (a{1}, '1');
+%! assert (b{1}, '');
+%! assert (c{1}, '2');
+
+%!test
+%! # Bug #33536
+%! a = strread ("[SomeText]", "%s", "delimiter", "]");
+%! assert (a{1}, "[SomeText");
+%! assert (a{2}, '');
+
+%!test
+%! dat = "Data file.\r\n=  =  =  =  =\r\nCOMPANY    : <Company name>\r\n";
+%! a = strread (dat, "%s", 'delimiter', "\n", 'whitespace', '', 'endofline', "\r\n");
+%! assert (a{2}, "=  =  =  =  =");
+%! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]);
+
+%!test
+%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%d%d%d", 'delimiter', ',');
+%! assert (c, 3);
+%! assert (d, NaN);
+
+%!test
+%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%d%d", 'delimiter', ',');
+%! assert (c, [3; NaN]);
+%! assert (d, [NaN; NaN]);
+
+%!test
+%! # Default format (= %f)
+%! [a, b, c] = strread ("0.12 0.234 0.3567");
+%! assert (a, 0.12);
+%! assert (b, 0.234);
+%! assert (c, 0.3567);
+
+%!test
+%! [a, b] = strread('0.41 8.24 3.57 6.24 9.27', "%f%f", 2, 'delimiter', ' ');
+%! assert (a, [0.41; 3.57]);
+
+%!test
+%! # TreatAsEmpty
+%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%d", 'delimiter', ',', 'TreatAsEmpty', 'NN');
+%! assert (c, [3; NaN]);
+%! assert (d, [NaN; NaN]);
+
+%!test
+%! # No delimiters at all besides EOL.  Plain reading numbers & strings
+%! str = "Text1Text2Text\nText398Text4Text\nText57Text";
+%! c = textscan (str, "Text%dText%1sText");
+%! assert (c{1}, [1; 398; 57]);
+%! assert (c{2}(1:2), {'2'; '4'});
+%! assert (isempty (c{2}{3}), true);
+
+%!test
+%! # No delimiters at all besides EOL.  Skip fields, even empty fields
+%! str = "Text1Text2Text\nTextText4Text\nText57Text";
+%! c = textscan (str, "Text%*dText%dText");
+%! assert (c{1}, [2; 4; NaN]);
+
--- a/scripts/io/textread.m	Wed Jul 20 10:41:59 2011 -0700
+++ b/scripts/io/textread.m	Fri Jul 22 13:05:26 2011 -0700
@@ -25,13 +25,21 @@
 ## The file @var{filename} is read and parsed according to @var{format}.  The
 ## function behaves like @code{strread} except it works by parsing a file
 ## instead of a string.  See the documentation of @code{strread} for details.
+##
 ## In addition to the options supported by @code{strread}, this function
-## supports one more:
+## supports two more:
+##
 ## @itemize
 ## @item "headerlines":
+## The first @var{value} number of lines of @var{filename} are skipped.
+##
+## @item "endofline":
+## Specify a single character or "\r\n".  If no value is given, it will be
+## inferred from the file.  If set to "" (empty string) EOLs are ignored as
+## delimiters.
 ## @end itemize
-## The first @var{value} number of lines of @var{str} are skipped.
-## @seealso{strread, load, dlmread, fscanf}
+##
+## @seealso{strread, load, dlmread, fscanf, textscan}
 ## @end deftypefn
 
 function varargout = textread (filename, format = "%f", varargin)
@@ -51,22 +59,55 @@
     error ("textread: could not open '%s' for reading", filename);
   endif
 
-  ## Maybe skip header lines
+  ## Skip header lines if requested
   headerlines = find (strcmpi (varargin, "headerlines"), 1);
-  if (! isempty (headerlines))
-    hdr_lines = floor (varargin{headerlines + 1});
-    ## Beware of zero valued headerline, fskipl will count lines to EOF then
-    if (hdr_lines > 0)
-      fskipl (fid, hdr_lines);
-    endif
+  ## Beware of zero valued headerline, fskipl would skip to EOF
+  if (! isempty (headerlines) && (varargin{headerlines + 1} > 0))
+    fskipl (fid, varargin{headerlines + 1});
     varargin(headerlines:headerlines+1) = [];
   endif
 
   str = fread (fid, "char=>char").';
   fclose (fid);
 
-  ## Call strread to make it do the real work
-  [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:});
+  if (isempty (str))
+    warning ("textread: empty file");
+  else
+    endofline = find (strcmpi (varargin, "endofline"), 1);
+    if (! isempty (endofline))
+      ## 'endofline' option set by user.  
+      endofline = find (strcmpi (varargin, "endofline"), 1);
+      if (! ischar (varargin{endofline + 1})); 
+        error ("textscan: character value required for EndOfLine"); 
+      endif
+    else
+      ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
+      eol_srch_len = min (length (str), 3000);
+      ## First try DOS (CRLF)
+      if (! isempty (findstr ("\r\n", str(1 : eol_srch_len))))
+        eol_char = "\r\n";
+      ## Perhaps old Macintosh? (CR)
+      elseif (! isempty (findstr ("\r", str(1 : eol_srch_len))))
+        eol_char = "\r";
+      ## Otherwise, use plain UNIX (LF)
+      else
+        eol_char = "\n";
+      endif
+      ## Set up default endofline param value
+      nargs = numel (varargin);
+      varargin(nargs+1:nargs+2) = {'endofline', eol_char};
+    endif
+
+    ## Set up default whitespace param value if needed
+    if (isempty (find (strcmpi ('whitespace', varargin))))
+      nargs = numel (varargin);
+      varargin(nargs+1:nargs+2) = {'whitespace', " \b\t"};
+    endif
+
+    ## Call strread to make it do the real work
+    [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:});
+
+  endif
 
 endfunction
 
--- a/scripts/io/textscan.m	Wed Jul 20 10:41:59 2011 -0700
+++ b/scripts/io/textscan.m	Fri Jul 22 13:05:26 2011 -0700
@@ -28,12 +28,24 @@
 ## The file associated with @var{fid} is read and parsed according to
 ## @var{format}.  The function behaves like @code{strread} except it works by
 ## parsing a file instead of a string.  See the documentation of
-## @code{strread} for details.  In addition to the options supported by
-## @code{strread}, this function supports one more:
+## @code{strread} for details.  
+##
+## In addition to the options supported by
+## @code{strread}, this function supports a few more:
+##
 ## @itemize
 ## @item "headerlines":
+## The first @var{value} number of lines of @var{str} are skipped.
+##
+## @item "endofline":
+## Specify a single character or "\r\n".  If no value is given, it will be
+## inferred from the file.  If set to "" (empty string) EOLs are ignored as
+## delimiters.
+##
+## @item "returnonerror":
+## If set to numerical 1 or true (default), return normally when read errors
+## have been encountered.  If set to 0 or false, return an error and no data.
 ## @end itemize
-## The first @var{value} number of lines of @var{str} are skipped.
 ##
 ## The optional input, @var{n}, specifes the number of lines to be read from
 ## the file, associated with @var{fid}.
@@ -47,15 +59,25 @@
 ## @seealso{dlmread, fscanf, load, strread, textread}
 ## @end deftypefn
 
-function [C, p] = textscan (fid, format, varargin)
+function [C, position] = textscan (fid, format = "%f", varargin)
 
   ## Check input
   if (nargin < 1)
     print_usage ();
-  elseif (nargin == 1 || isempty (format))
+  endif
+
+  if (isempty (format))
     format = "%f";
   endif
 
+  if (! (isa (fid, "double") && fid > 0) && ! ischar (fid))
+    error ("textscan: first argument must be a file id or character string");
+  endif
+
+  if (! ischar (format))
+    error ("textscan: FORMAT must be a valid specification");
+  endif
+
   if (nargin > 2 && isnumeric (varargin{1}))
     nlines = varargin{1};
     args = varargin(2:end);
@@ -70,66 +92,132 @@
     args{end+1} = NaN;
   endif
 
-  if (isa (fid, "double") && fid > 0 || ischar (fid))
-    if (ischar (format))
-      if (ischar (fid))
-        if (nargout == 2)
-          error ("textscan: cannot provide position information for character input");
-        endif
-        str = fid;
+  ## Set parameter defaults where textscan differs from strread
+
+  ipos = find (strcmpi (args, "whitespace"), 1);
+  if (isempty (ipos))
+    ## Matlab default whitespace = " \b\t"
+    whitespace = " \b\t";
+    args{end+1} = "whitespace";
+    args{end+1} = whitespace;
+  else
+    ## Check if there's at least one string format specifier (contains 's')
+    fmt = strrep (format, "%", " %");
+    [~, ~, ~, fmt] = regexp (fmt, '[^ ]+');
+    fmt = strtrim (fmt(strmatch ("%", fmt)));
+    has_str_fmt = ! all (cellfun ("isempty", strfind (fmt, 's')));
+    ## If there is a string format AND the whitespace value is empty,
+    ## don't add a space (char(32)) to the user-supplied whitespace
+    if (! (isempty (args{ipos+1}) && has_str_fmt))
+      args{ipos+1} = unique ([" " args{ipos+1}]);
+    endif
+  endif
+
+  if (! any (strcmpi (args, "delimiter")))
+    ## Matlab says default delimiter = whitespace.
+    ## An empty delimiter is passed on; strread() will pick this up further
+    args{end+1} = "delimiter";
+    args{end+1} = "";
+  endif
+
+  if (any (strcmpi (args, "returnonerror")))
+    ## Because of the way strread() reads data (columnwise) this parameter
+    ## can't be neatly implemented.  strread() will pick it up anyway
+    warning ('textscan: ReturnOnError is not fully implemented');
+  else
+    ## Set default value (=true)
+    args{end+1} = "returnonerror";
+    args{end+1} = 1;
+  endif
+
+  if (ischar (fid))
+    ## Read from a text string; there is no file position to report
+    if (nargout == 2)
+      error ("textscan: cannot provide position information for character input");
+    endif
+    str = fid;
+  else
+    ## Skip header lines if requested.  Index into ARGS (not varargin, which may
+    ## start with N).  Beware of zero valued headerline, fskipl would skip to EOF
+    headerlines = find (strcmpi (args, "headerlines"), 1);
+    if (! isempty (headerlines) && (args{headerlines + 1} > 0))
+      fskipl (fid, args{headerlines + 1});
+    endif
+    if (isfinite (nlines))
+      str = "";
+      ## FIXME: Can this be done without slow for loop?
+      for n = 1:nlines
+        str = strcat (str, fgets (fid));
+      endfor
+    else
+      str = fread (fid, "char=>char").';
+    endif
+  endif
+
+  ## Check for empty result
+  if (isempty (str))
+    warning ("textscan: no data read");
+    C = [];
+  else
+    ## Check value of 'endofline' and keep it in eol_char (needed further below)
+    endofline = find (strcmpi (args, "endofline"), 1);
+    if (! isempty (endofline))
+      eol_char = args{endofline + 1};
+      if (! ischar (eol_char))
+        error ("textscan: character value required for EndOfLine");
+      endif
+    else
+      ## Determine EOL from the data: search for candidates in first BUFLEN chars
+      BUFLEN = 3000;
+      eol_srch_len = min (length (str), BUFLEN);
+      ## First try DOS (CRLF)
+      if (! isempty (findstr ("\r\n", str(1 : eol_srch_len))))
+        eol_char = "\r\n";
+      ## Perhaps old Macintosh? (CR)
+      elseif (! isempty (findstr ("\r", str(1 : eol_srch_len))))
+        eol_char = "\r";
+      ## Otherwise, use plain UNIX (LF)
       else
-        ## Maybe skip header lines
-        headerlines = find (strcmpi (args, "headerlines"), 1);
-        if (! isempty (headerlines))
-          hdr_lines = floor (varargin{headerlines + 1});
-          ## Beware of zero valued headerline, fskipl will count lines to EOF
-          if (hdr_lines > 0)
-            fskipl (fid, hdr_lines);
-          endif
-        endif
-        if (isfinite (nlines))
-          str = "";
-          for n = 1:nlines
-            str = strcat (str, fgets (fid));
-          endfor
-            else
-          str = fread (fid, "char=>char").';
-        endif
+        eol_char = "\n";
       endif
+      ## Set up the default endofline param value
+      args(end+1:end+2) = {"endofline", eol_char};
+    endif
+
+    ## Determine the number of data fields ("%*" fields are skipped on output)
+    num_fields = numel (strfind (format, "%")) - numel (strfind (format, "%*"));
 
-      ## Determine the number of data fields
-      num_fields = numel (strfind (format, "%")) - ...
-                   numel (idx_star = strfind (format, "%*"));
+    ## Strip trailing EOL to avoid returning stray missing values (cf. strread)
+    n_eol = numel (eol_char);
+    if (numel (str) >= n_eol && strcmp (str(end-n_eol+1 : end), eol_char))
+      str = str(1 : end-n_eol);
+    endif
 
-      ## Call strread to make it do the real work
-      C = cell (1, num_fields);
-      [C{:}] = strread (str, format, args{:});
+    ## Call strread to make it do the real work
+    C = cell (1, num_fields);
+    [C{:}] = strread (str, format, args{:});
 
-      if (ischar (fid) && isfinite (nlines))
-        C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false);
-      endif
+    if (ischar (fid) && isfinite (nlines))  # string input: keep at most N rows
+      C = cellfun (@(x) x(1:min (nlines, numel (x))), C, "uniformoutput", false);
+    endif
 
-      if (nargout == 2)
-        p = ftell (fid);
-      endif
+    if (nargout == 2)
+      position = ftell (fid);
+    endif
 
-    else
-      error ("textscan: FORMAT must be a valid specification");
-    endif
-  else
-    error ("textscan: first argument must be a file id or character string");
   endif
 
 endfunction
 
+
 %!test
 %! str = "1,  2,  3,  4\n 5,  ,  ,  8\n 9, 10, 11, 12";
 %! fmtstr = "%f %d %f %s";
 %! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf);
-%! assert (isequal (c{1}, [1;5]))
+%! assert (isequal (c{1}, [1;5]));
 %! assert (length (c{1}), 2);
-%! assert (iscellstr (c{4}))
-%! assert (isequal (c{3}, [3; -Inf]))
+%! assert (iscellstr (c{4}));
+%! assert (isequal (c{3}, [3; -Inf]));
 
 %!test
 %! b = [10:10:100];
@@ -137,7 +225,26 @@
 %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b);
 %! fmt = "%f miles/hr = %f kilometers/hr";
 %! c = textscan (str, fmt);
-%! assert (b(1,:)', c{1})
-%! assert (b(2,:)', c{2})
+%! assert (b(1,:)', c{1});
+%! assert (b(2,:)', c{2});
+
+#%!test
+#%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6";
+#%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//');
+#%! assert (a{1}, [13; 36]);
+#%! assert (a{2}, [72; NaN]);
+#%! assert (a{3}, [NaN; 5]);
+#%! assert (a{4}, {"str1"; "str3"});
+#%! assert (a{5}, [25; 6]);
 
+%!test
+%! str = "Km:10 = hhhBjjj miles16hour\r\n";
+%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"];
+%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"];
+%! str = [str "Km:25 = hhhZ\r\n"];
+%! fmt = "Km:%d = hhh%1sjjj miles%dhour";
+%! a = textscan (str, fmt, 'delimiter', ' ');
+%! assert (a{1}', [10 15 2 25], 1e-5);
+%! assert (a{2}', {'B' 'J' 'R' 'Z'});
+%! assert (a{3}', [16 241 3 NaN], 1e-5);