changeset 16357:0cbe330f39a2

textscan.m, textread.m: allow reading multi-column data files with empty format + tests (bug #38317)
author Philip Nienhuis <prnienhuis@users.sf.net>
date Fri, 22 Mar 2013 17:46:04 +0100
parents df643a532b61
children 0db0926c2d0f
files scripts/io/textread.m scripts/io/textscan.m
diffstat 2 files changed, 255 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/io/textread.m	Thu Mar 21 21:38:36 2013 -0700
+++ b/scripts/io/textread.m	Fri Mar 22 17:46:04 2013 +0100
@@ -44,6 +44,11 @@
 ## The optional input @var{n} specifies the number of data lines to read; in
 ## this sense it differs slightly from the format repeat count in strread.
 ##
+## If the format string is empty (as opposed to omitted) and the file contains
+## only numeric data (excluding headerlines), textread returns a rectangular
+## matrix whose number of columns matches the number of numeric fields on the
+## first data line of the file.  Empty fields are returned as zero values.
+##
 ## @seealso{strread, load, dlmread, fscanf, textscan}
 ## @end deftypefn
 
@@ -174,9 +179,46 @@
   ## Call strread to make it do the real work
   [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:});
 
+  ## Hack to concatenate/reshape numeric output into 2D array (undocumented ML)
+  ## In ML this only works in case of an empty format string
+  if (isempty (format))
+    ## Get number of fields per line. 
+    ## 1. Get eol_char position
+    iwhsp = find (strcmpi ("whitespace", varargin));
+    whsp = varargin{iwhsp + 1};
+    idx = regexp (str, eol_char, "once");
+    ## 2. Get first data line til EOL. Avoid corner case of just one line
+    if (! isempty (idx))
+      str = str(1:idx-1);
+    endif
+    idelimiter = find (strcmpi (varargin, "delimiter"), 1);
+    if (isempty (idelimiter))
+      ## Assume delimiter = whitespace
+      ## 3A. whitespace incl. consecutive whitespace => single space
+      str = regexprep (str, sprintf ("[%s]+", whsp), ' ');
+      ## 4A. Remove possible leading & trailing spaces
+      str = strtrim (str);
+      ## 5A. Count spaces, add one to get nr of data fields per line
+      ncols = numel (strfind (str, " ")) + 1;
+    else
+      ## 3B. Just count delimiters. FIXME: delimiters could occur in literals
+      delimiter = varargin {idelimiter+1};
+      ncols = numel (regexp (str, sprintf ("[%s]", delimiter))) + 1;
+    endif
+    ## 6. Reshape; watch out, we need a transpose
+    nrows = ceil (numel (varargout{1}) / ncols);
+    pad = mod (numel (varargout{1}), ncols);
+    if (pad > 0)
+      pad = ncols - pad;
+      varargout{1}(end+1 : end+pad) = NaN;
+    endif
+    varargout{1} = reshape (varargout{1}, ncols, nrows)';
+    ## ML returns zeros for empty fields: replace NaNs (incl. padding) with 0
+    varargout{1}(find (isnan (varargout{1}))) = 0;
+  endif
+
 endfunction
 
-
 %!test
 %! f = tmpnam ();
 %! d = rand (5, 3);
@@ -195,6 +237,76 @@
 %! unlink (f);
 %! assert (a, d(2:7, 1), 1e-2);
 
+%% Test reading 2D matrix with empty format
+%!test
+%! f = tmpnam ();
+%! d = rand (5, 2);
+%! dlmwrite (f, d, "precision", "%5.2f");
+%! A = textread (f, "", "headerlines", 3);
+%! unlink (f);
+%! assert (A, d(4:5, :), 1e-2);
+
+%% Read a single line with irregular whitespace using empty format string
+%!test
+%! f = tmpnam ();
+%! unlink (f);
+%! fid = fopen (f, "w");
+%! d = rand (1, 4);
+%! fprintf (fid, "  %f %f   %f  %f ", d);
+%! fclose (fid);
+%! A = textread (f, "");
+%! unlink (f);
+%! assert (A, d, 1e-6);
+
+%% Empty format, corner case = one line w/o EOL
+%!test
+%! f = tmpnam ();
+%! unlink (f);
+%! fid = fopen (f, "w");
+%! d = rand (1, 4);
+%! fprintf (fid, "  %f %f   %f  %f ", d);
+%! fclose (fid);
+%! A = textread (f, "");
+%! unlink (f);
+%! assert (A, d, 1e-6);
+
+%% Read a single line using empty format string, missing data (should be 0)
+%!test
+%! f = tmpnam ();
+%! unlink (f);
+%! fid = fopen (f, "w");
+%! d = rand (1, 4);
+%! fprintf (fid, "%f, %f, ,  %f,  %f ", d);
+%! fclose (fid);
+%! A = textread (f, "");
+%! unlink (f);
+%! assert (A, [ d(1:2) 0 d(3:4)], 1e-6);
+
+%% Test with empty positions - ML returns 0 for empty fields
+%!test
+%! f = tmpnam ();
+%! unlink (f);
+%! fid = fopen (f, "w");
+%! d = rand (1, 4);
+%! fprintf (fid, ",2,,4\n5,,7,\n");
+%! fclose (fid);
+%! A = textread (f, "", "delimiter", ",");
+%! unlink (f);
+%! assert (A, [0 2 0 4; 5 0 7 0], 1e-6);
+
+%% Another test with empty format + positions, now with more incomplete lower
+%% row (must be appended with zeros to get rectangular matrix)
+%!test
+%! f = tmpnam ();
+%! unlink (f);
+%! fid = fopen (f, "w");
+%! d = rand (1, 4);
+%! fprintf (fid, ",2,,4\n5,\n");
+%! fclose (fid);
+%! A = textread (f, "", "delimiter", ",");
+%! unlink (f);
+%! assert (A, [0 2 0 4; 5 0 0 0], 1e-6);
+
 %% Test input validation
 %!error textread ()
 %!error textread (1)
--- a/scripts/io/textscan.m	Thu Mar 21 21:38:36 2013 -0700
+++ b/scripts/io/textscan.m	Fri Mar 22 17:46:04 2013 +0100
@@ -67,19 +67,26 @@
 ## The second output, @var{position}, provides the position, in characters,
 ## from the beginning of the file.
 ##
+## If the format string is empty (as opposed to omitted) and the file contains
+## only numeric data (excluding headerlines), textscan returns the data in a
+## number of columns matching the number of numeric fields on the first data
+## line of the file.
+##
 ## @seealso{dlmread, fscanf, load, strread, textread}
 ## @end deftypefn
 
 function [C, position] = textscan (fid, format = "%f", varargin)
 
   BUFLENGTH = 4096;               ## Read buffer
-  
+  emptfmt = 0;                    ## Signals deliberately empty format string
+
   ## Check input
   if (nargin < 1)
     print_usage ();
   endif
 
   if (isempty (format))
+    emptfmt = 1;
     format = "%f";
   endif
 
@@ -132,6 +139,9 @@
     ## Matlab says default delimiter = whitespace.
     ## strread() will pick this up further
     args(end+1:end+2) = {'delimiter', ""};
+    delimiter = "";
+  else
+    delimiter = args{find (strcmpi (args, "delimiter")) + 1};
   endif
 
   collop = false;
@@ -157,6 +167,15 @@
     args(end+1:end+2) = {"returnonerror", 1};
   endif
 
+  ## Check if a headerlines argument is specified
+  headerlines = find (strcmpi (args, "headerlines"), 1);
+  if (! isempty (headerlines))
+    ## Yep. But it is stray when reading from strings...
+    if (ischar (fid))
+      warning ("textscan: 'headerlines' ignored when reading from strings");
+    endif
+  endif
+
   if (ischar (fid))
     ## Read from a text string
     if (nargout == 2)
@@ -166,7 +185,6 @@
   else
     st_pos = ftell (fid);
     ## Skip header lines if requested
-    headerlines = find (strcmpi (args, "headerlines"), 1);
     if (! isempty (headerlines))
       ## Beware of missing or wrong headerline value
       if (headerlines  == numel (args)
@@ -268,9 +286,10 @@
   endif
 
   ## Strip trailing EOL to avoid returning stray missing values (f. strread).
-  ## However, in case of CollectOutput request, presence of EOL is required
+  ## However, in case of CollectOutput request, presence of EOL is required;
+  ## also in case of deliberately entered empty format string
   eol_at_end = strcmp (str(end-length (eol_char) + 1 : end), eol_char);
-  if (collop)
+  if (collop || emptfmt)
     if (! eol_at_end)
       str(end+1 : end+length (eol_char)) = eol_char;
     endif
@@ -284,6 +303,36 @@
   C = cell (1, num_fields);
   [C{:}] = strread (str, format, args{:});
 
+  ## With empty format, match nr. of cols to nr. of fields on first read line
+  if (emptfmt)
+    ## Find end of first line
+    eoi = index (str, eol_char);
+    if (eoi)
+      ## str contains an EOL, proceed with assessing nr. of columns
+      ncols = countcols (C, str(1 : eoi-1), delimiter, whitespace);
+      ## See if lowermost data row must be completed
+      pad = mod (numel (C{1}), ncols);
+      if (pad)
+        ## Textscan returns NaNs for empty fields
+        C(1) = [C{1}; NaN(ncols - pad, 1)]; 
+      endif
+      ## Replace NaNs with EmptyValue, if any
+      ipos = find (strcmpi (args, "emptyvalue"));
+      if (ipos)
+        C{1}(find (isnan (C{1}))) = args{ipos+1};
+      endif
+      ## Compute nr. of rows
+      nrows = floor (numel (C{1}) / ncols);
+      ## Reshape C; watch out, transpose needed
+      C(1) = reshape (C{1}, ncols, numel (C{1}) / ncols)';
+      ## Distribute columns over C and wipe cols 2:end of C{1}
+      for ii=2:ncols
+        C(ii) = C{1}(:, ii);
+      endfor
+      C{1} = C{1}(:, 1);
+    endif 
+  endif
+
   ## If requested, collect output columns of same class
   if (collop)
     C = colloutp (C);
@@ -297,6 +346,21 @@
 endfunction
 
 
+## Count data fields on first data line STR; C is unused, DLM/WSP = char sets
+function ncols = countcols (C, str, dlm, wsp)
+
+  if (isempty (dlm))
+    ## Field separator = whitespace.  Fold each whitespace run into one space
+    str = regexprep (str, sprintf ("[%s]+", wsp), " ");
+    str = strtrim (str);      ## leading/trailing space separates no fields
+    ncols = numel (strfind (str, " ")) + 1;
+  else                        ## every delimiter char separates two fields
+    ncols = numel (regexp (str, sprintf ("[%s]", dlm))) + 1;
+  endif
+
+endfunction
+
+
 ## Collect consecutive columns of same class into one cell column
 function C = colloutp (C)
 
@@ -520,6 +584,77 @@
 %! assert (A{1}, [d(1); d(3)], 1e-6);
 %! assert (A{2}, [d(2); d(4)], 1e-6);
 
-%!error <missing or illegal value for> textread (file_in_loadpath ("textscan.m"), "", "headerlines")
-%!error <missing or illegal value for> textread (file_in_loadpath ("textscan.m"), "", "headerlines", 'hh')
-%!error <character value required for> textread (file_in_loadpath ("textscan.m"), "", "endofline", true)
+%% Tests reading with empty format, should return proper nr of columns
+%!test
+%! f = tmpnam ();
+%! fid = fopen (f, "w+");
+%! fprintf (fid, " 1 2 3 4\n5 6 7 8");
+%! fseek (fid, 0, "bof");
+%! A = textscan (fid, "");
+%! fclose (fid);
+%! unlink (f);
+%! assert (A{1}, [1 ; 5], 1e-6);
+%! assert (A{2}, [2 ; 6], 1e-6);
+%! assert (A{3}, [3 ; 7], 1e-6);
+%! assert (A{4}, [4 ; 8], 1e-6);
+
+%% Tests reading with empty format; empty fields & incomplete lower row
+%!test
+%! f = tmpnam ();
+%! fid = fopen (f, "w+");
+%! fprintf (fid, " ,2,,4\n5,6");
+%! fseek (fid, 0, "bof");
+%! A = textscan (fid, "", "delimiter", ",", "EmptyValue", 999, "CollectOutput" , 1);
+%! fclose (fid);
+%! unlink (f);
+%! assert (A{1}, [999, 2, 999, 4; 5, 6, 999, 999], 1e-6);
+
+%% Error message tests
+
+%!test
+%! f = tmpnam ();
+%! fid = fopen (f, "w+");
+%! msg1 = "Missing or illegal value for 'headerlines'";
+%! try
+%! A = textscan (fid, "", "headerlines");
+%! end_try_catch;
+%! fclose (fid);
+%! unlink (f);
+%! assert (msg1, lasterr);
+
+%!test
+%! f = tmpnam ();
+%! fid = fopen (f, "w+");
+%! msg1 = "Missing or illegal value for 'headerlines'";
+%! try
+%! A = textscan (fid, "", "headerlines", "hh");
+%! end_try_catch;
+%! fclose (fid);
+%! unlink (f);
+%! assert (msg1, lasterr);
+
+%!test
+%! f = tmpnam ();
+%! fid = fopen (f, "w+");
+%! fprintf (fid,"some_string");
+%! fseek (fid, 0, "bof");
+%! msg1 = "textscan: illegal EndOfLine character value specified";
+%! try
+%! A = textscan (fid, "%f", "EndOfLine", "\n\r");
+%! end_try_catch;
+%! fclose (fid);
+%! unlink (f);
+%! assert (msg1, lasterr);
+
+%!test
+%! f = tmpnam ();
+%! fid = fopen (f, "w+");
+%! fprintf (fid,"some_string");
+%! fseek (fid, 0, "bof");
+%! msg1 = "textscan: character value required for EndOfLine";
+%! try
+%! A = textscan (fid, "%f", "EndOfLine", 33);
+%! end_try_catch;
+%! fclose (fid);
+%! unlink (f);
+%! assert (msg1, lasterr);