changeset 14565:98aaebc56d7c

2012-03-25 Philip Nienhuis <prnienhuis@users.sf.net> * textscan.m, textread.m Updated texinfo header (@var{n} format repeat count section) Replaced slow fgets / str concat section by block reading Supplied varargout in some cases to avoid unneeded errors Improvements to coding style * textscan.m Moved some code upward to avoid having multiple fclose statements
author Philip Nienhuis <prnienhuis@users.sf.net>
date Fri, 30 Mar 2012 18:44:01 +0200
parents a459d42bb0a6
children 3a9a56999ce5
files scripts/io/textread.m scripts/io/textscan.m
diffstat 2 files changed, 123 insertions(+), 46 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/io/textread.m	Sat Apr 14 19:33:45 2012 -0700
+++ b/scripts/io/textread.m	Fri Mar 30 18:44:01 2012 +0200
@@ -41,14 +41,16 @@
 ## delimiters.
 ## @end itemize
 ##
-## The optional input @var{n} specifes the number of times to use
-## @var{format} when parsing, i.e., the format repeat count.
+## The optional input @var{n} specifies the number of data lines to read; in
+## this sense it differs slightly from the format repeat count in strread.
 ##
 ## @seealso{strread, load, dlmread, fscanf, textscan}
 ## @end deftypefn
 
 function varargout = textread (filename, format = "%f", varargin)
 
+  BUFLENGTH = 4096;       # Read buffer to speed up processing @var{n}
+
   ## Check input
   if (nargin < 1)
     print_usage ();
@@ -58,6 +60,17 @@
     error ("textread: FILENAME and FORMAT arguments must be strings");
   endif
 
+  if (! isempty (varargin) && isnumeric (varargin{1}))
+    nlines = varargin{1};
+  else
+    nlines = Inf;
+  endif
+  if (nlines < 1)
+    printf ("textread: N = 0, no data read\n");
+    varargout = cell (1, nargout);
+    return
+  endif
+
   ## Read file
   fid = fopen (filename, "r");
   if (fid == -1)
@@ -71,28 +84,13 @@
     fskipl (fid, varargin{headerlines + 1});
     varargin(headerlines:headerlines+1) = [];
   endif
-  
-  if (! isempty (varargin) && isnumeric (varargin{1}))
-    nlines = varargin{1};
-  else
-    nlines = Inf;
-  endif
+  st_pos = ftell (fid);
 
-  if (isfinite (nlines) && (nlines >= 0))
-    str = tmp_str = "";
-    n = 0;
-    ## FIXME: Can this be done without slow loop?
-    while (ischar (tmp_str) && n++ <= nlines)
-      str = strcat (str, tmp_str);
-      tmp_str = fgets (fid);
-    endwhile
-  else
-    str = fread (fid, "char=>char").';
-  endif
-  fclose (fid);
-
-  if (isempty (str))
+  ## Read a first file chunk. Rest follows after endofline processing
+  [str, count] = fscanf (fid, "%c", BUFLENGTH);
+  if (isempty (str) || count < 1)
     warning ("textread: empty file");
+    varargout = cell (1, nargout);
     return;
   endif
 
@@ -103,8 +101,8 @@
       error ("textread: character value required for EndOfLine");
     endif
   else
-    ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
-    eol_srch_len = min (length (str), 3000);
+    ## Determine EOL from file.  Search for EOL candidates in first BUFLENGTH chars
+    eol_srch_len = min (length (str), BUFLENGTH);
     ## First try DOS (CRLF)
     if (! isempty (strfind ("\r\n", str(1 : eol_srch_len))))
       eol_char = "\r\n";
@@ -116,12 +114,49 @@
       eol_char = "\n";
     endif
     ## Set up default endofline param value
-    varargin(end+1:end+2) = {'endofline', eol_char};
+    varargin(end+1:end+2) = {"endofline", eol_char};
   endif
-
+ 
+  ## Now that we know what EOL looks like, we can process format_repeat_count.
+  ## FIXME The below isn't ML-compatible: counts lines, not format string uses
+  if (isfinite (nlines) && (nlines > 0))
+    l_eol_char = length (eol_char);
+    eoi = findstr (str, eol_char);
+    n_eoi = length (eoi);
+    nblks = 0;
+    ## Avoid slow repeated str concatenation, first seek requested end of data
+    while (n_eoi < nlines && count == BUFLENGTH)
+      [nstr, count] = fscanf (fid, "%c", BUFLENGTH);
+      if (count > 0)
+        ## Watch out for multichar EOL being missed across buffer boundaries
+        if (l_eol_char > 1)
+          str = [str(end - length (eol_char) + 2 : end) nstr];
+        else
+          str = nstr;
+        endif
+        eoi = findstr (str, eol_char);
+        n_eoi += numel (eoi);
+        ++nblks;
+      endif
+    endwhile
+    ## Found EOL delimiting last requested line. Compute ptr (incl. EOL)
+    if (isempty (eoi))
+      printf ("textread: format repeat count specified but no endofline found\n");
+      eoi_pos = nblks * BUFLENGTH + count;
+    else
+      eoi_pos = (nblks * BUFLENGTH) + eoi(end + min (nlines, n_eoi) - n_eoi);
+    endif
+    fseek (fid, st_pos, "bof");
+    str = fscanf (fid, "%c", eoi_pos);
+  else
+    fseek (fid, st_pos, "bof");
+    str = fread(fid, "char=>char").';
+  endif
+  fclose (fid);
+ 
   ## Set up default whitespace param value if needed
-  if (isempty (find (strcmpi ('whitespace', varargin))))
-    varargin(end+1:end+2) = {'whitespace', " \b\t"};
+  if (isempty (find (strcmpi ("whitespace", varargin))))
+    varargin(end+1:end+2) = {"whitespace", " \b\t"};
   endif
 
   ## Call strread to make it do the real work
--- a/scripts/io/textscan.m	Sat Apr 14 19:33:45 2012 -0700
+++ b/scripts/io/textscan.m	Fri Mar 30 18:44:01 2012 +0200
@@ -52,8 +52,11 @@
 ## have been encountered.  If set to 0 or false, return an error and no data.
 ## @end itemize
 ##
-## The optional input @var{n} specifes the number of times to use
-## @var{format} when parsing, i.e., the format repeat count.
+## When reading from a character string, optional input argument @var{n}
+## specifies the number of times @var{format} should be used (i.e., to limit
+## the amount of data read).
+## When reading from file, @var{n} specifies the number of data lines to read;
+## in this sense it differs slightly from the format repeat count in strread.
 ##
 ## The output @var{C} is a cell array whose length is given by the number
 ## of format specifiers.
@@ -66,6 +69,8 @@
 
 function [C, position] = textscan (fid, format = "%f", varargin)
 
+  BUFLENGTH = 4096;               ## Read buffer
+  
   ## Check input
   if (nargin < 1)
     print_usage ();
@@ -89,6 +94,11 @@
   else
     nlines = Inf;
   endif
+  if (nlines < 1)
+    printf ("textscan: N = 0, no data read\n");
+    C = [];
+    return
+  endif
 
   if (! any (strcmpi (args, "emptyvalue")))
     ## Matlab returns NaNs for missing values
@@ -148,26 +158,17 @@
     endif
     str = fid;
   else
+    st_pos = ftell (fid);
     ## Skip header lines if requested
     headerlines = find (strcmpi (args, "headerlines"), 1);
     ## Beware of zero valued headerline, fskipl would skip to EOF
     if (! isempty (headerlines) && (args{headerlines + 1} > 0))
       fskipl (fid, varargin{headerlines + 1});
       args(headerlines:headerlines+1) = [];
+      st_pos = ftell (fid);
     endif
-    if (isfinite (nlines) && (nlines >= 0))
-      str = tmp_str = "";
-      n = 0;
-      ## FIXME: Can this be done without slow loop?
-      while (ischar (tmp_str) && n++ < nlines)
-        tmp_str = fgets (fid);
-        if (ischar (tmp_str))
-          str = strcat (str, tmp_str);
-        endif
-      endwhile
-    else
-      str = fread (fid, "char=>char").';
-    endif
+    ## Read a first file chunk. Rest follows after endofline processing
+    [str, count] = fscanf (fid, "%c", BUFLENGTH);
   endif
 
   ## Check for empty result
@@ -189,8 +190,8 @@
       error ("textscan: character value required for EndOfLine");
     endif
   else
-    ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
-    eol_srch_len = min (length (str), 3000);
+    ## Determine EOL from file.  Search for EOL candidates in first BUFLENGTH chars
+    eol_srch_len = min (length (str), BUFLENGTH);
     ## First try DOS (CRLF)
     if (! isempty (strfind ("\r\n", str(1 : eol_srch_len))))
       eol_char = "\r\n";
@@ -202,7 +203,47 @@
       eol_char = "\n";
     endif
     ## Set up the default endofline param value
-    args(end+1:end+2) = {'endofline', eol_char};
+    args(end+1:end+2) = {"endofline", eol_char};
+  endif
+
+  if (!ischar (fid))
+    ## Now that we know what EOL looks like, we can process format_repeat_count.
+    ## FIXME The below isn't ML-compatible: counts lines, not format string uses
+    if (isfinite (nlines) && (nlines >= 0))
+      l_eol_char = length (eol_char);
+      eoi = findstr (str, eol_char);
+      n_eoi = length (eoi);
+      nblks = 0;
+      ## Avoid slow repeated str concatenation, first seek requested end of data
+      while (n_eoi < nlines && count == BUFLENGTH)
+        [nstr, count] = fscanf (fid, "%c", BUFLENGTH);
+        if (count > 0)
+          ## Watch out for multichar EOL being missed across buffer boundaries
+          if (l_eol_char > 1)
+            str = [str(end - length (eol_char) + 2 : end) nstr];
+          else
+            str = nstr;
+          endif
+          eoi = findstr (str, eol_char);
+          n_eoi += numel (eoi);
+          ++nblks;
+        endif
+      endwhile
+      ## OK, found EOL delimiting last requested line. Compute ptr (incl. EOL)
+      if (isempty (eoi))
+        printf ("textscan: format repeat count specified but no endofline found\n");
+        data_size = nblks * BUFLENGTH + count;
+      else
+        ## Compute data size to read incl complete EOL
+        data_size = (nblks * BUFLENGTH) + eoi(end + min (nlines, n_eoi) - n_eoi) \
+                    + l_eol_char - 1;
+      endif
+      fseek (fid, st_pos, "bof");
+      str = fscanf (fid, "%c", data_size);
+    else
+      fseek (fid, st_pos, "bof");
+      str = fread (fid, "char=>char").';
+    endif
   endif
 
   ## Determine the number of data fields
@@ -223,6 +264,7 @@
   endif
 
   if (nargout == 2)
+    ## Remember file position (persistent var)
     position = ftell (fid);
   endif