diff scripts/io/textread.m @ 14565:98aaebc56d7c

2012-03-25 Philip Nienhuis <prnienhuis@users.sf.net> * textscan.m, textread.m Updated texinfo header (@var{n} format repeat count section) Replaced slow fgets / str concat section by block reading Supplied varargout in some cases to avoid unneeded errors Improvements to coding style * textscan.m Moved some code upward to avoid having multiple fclose statements
author Philip Nienhuis <prnienhuis@@users.sf.net>
date Fri, 30 Mar 2012 18:44:01 +0200
parents df5488e46dca
children 1804d5422f61
line wrap: on
line diff
--- a/scripts/io/textread.m	Sat Apr 14 19:33:45 2012 -0700
+++ b/scripts/io/textread.m	Fri Mar 30 18:44:01 2012 +0200
@@ -41,14 +41,16 @@
 ## delimiters.
 ## @end itemize
 ##
-## The optional input @var{n} specifes the number of times to use
-## @var{format} when parsing, i.e., the format repeat count.
+## The optional input @var{n} specifes the number of data lines to read; in
+## this sense it differs slightly from the format repeat count in strread.
 ##
 ## @seealso{strread, load, dlmread, fscanf, textscan}
 ## @end deftypefn
 
 function varargout = textread (filename, format = "%f", varargin)
 
+  BUFLENGTH = 4096;       # Read buffer to speed up processing @var{n}
+
   ## Check input
   if (nargin < 1)
     print_usage ();
@@ -58,6 +60,17 @@
     error ("textread: FILENAME and FORMAT arguments must be strings");
   endif
 
+  if (! isempty (varargin) && isnumeric (varargin{1}))
+    nlines = varargin{1};
+  else
+    nlines = Inf;
+  endif
+  if (nlines < 1)
+    printf ("textread: N = 0, no data read\n");
+    varargout = cell (1, nargout);
+    return
+  endif
+
   ## Read file
   fid = fopen (filename, "r");
   if (fid == -1)
@@ -71,28 +84,13 @@
     fskipl (fid, varargin{headerlines + 1});
     varargin(headerlines:headerlines+1) = [];
   endif
-  
-  if (! isempty (varargin) && isnumeric (varargin{1}))
-    nlines = varargin{1};
-  else
-    nlines = Inf;
-  endif
+  st_pos = ftell (fid);
 
-  if (isfinite (nlines) && (nlines >= 0))
-    str = tmp_str = "";
-    n = 0;
-    ## FIXME: Can this be done without slow loop?
-    while (ischar (tmp_str) && n++ <= nlines)
-      str = strcat (str, tmp_str);
-      tmp_str = fgets (fid);
-    endwhile
-  else
-    str = fread (fid, "char=>char").';
-  endif
-  fclose (fid);
-
-  if (isempty (str))
+  ## Read a first file chunk. Rest follows after endofline processing
+  [str, count] = fscanf (fid, "%c", BUFLENGTH);
+  if (isempty (str) || count < 1)
     warning ("textread: empty file");
+    varargout = cell (1, nargout);
     return;
   endif
 
@@ -103,8 +101,8 @@
       error ("textread: character value required for EndOfLine");
     endif
   else
-    ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
-    eol_srch_len = min (length (str), 3000);
+    ## Determine EOL from file.  Search for EOL candidates in first BUFLENGTH chars
+    eol_srch_len = min (length (str), BUFLENGTH);
     ## First try DOS (CRLF)
     if (! isempty (strfind ("\r\n", str(1 : eol_srch_len))))
       eol_char = "\r\n";
@@ -116,12 +114,49 @@
       eol_char = "\n";
     endif
     ## Set up default endofline param value
-    varargin(end+1:end+2) = {'endofline', eol_char};
+    varargin(end+1:end+2) = {"endofline", eol_char};
   endif
-
+ 
+  ## Now that we know what EOL looks like, we can process format_repeat_count.
+  ## FIXME The below isn't ML-compatible: counts lines, not format string uses
+  if (isfinite (nlines) && (nlines > 0))
+    l_eol_char = length (eol_char);
+    eoi = findstr (str, eol_char);
+    n_eoi = length (eoi);
+    nblks = 0;
+    ## Avoid slow repeated str concatenation, first seek requested end of data
+    while (n_eoi < nlines && count == BUFLENGTH)
+      [nstr, count] = fscanf (fid, "%c", BUFLENGTH);
+      if (count > 0)
+        ## Watch out for multichar EOL being missed across buffer boundaries
+        if (l_eol_char > 1)
+          str = [str(end - length (eol_char) + 2 : end) nstr];
+        else
+          str = nstr;
+        endif
+        eoi = findstr (str, eol_char);
+        n_eoi += numel (eoi);
+        ++nblks;
+      endif
+    endwhile
+    ## Found EOL delimiting last requested line. Compute ptr (incl. EOL)
+    if (isempty (eoi))
+      printf ("textread: format repeat count specified but no endofline found\n");
+      eoi_pos = nblks * BUFLENGTH + count;
+    else
+      eoi_pos = (nblks * BUFLENGTH) + eoi(end + min (nlines, n_eoi) - n_eoi);
+    endif
+    fseek (fid, st_pos, "bof");
+    str = fscanf (fid, "%c", eoi_pos);
+  else
+    fseek (fid, st_pos, "bof");
+    str = fread(fid, "char=>char").';
+  endif
+  fclose (fid);
+ 
   ## Set up default whitespace param value if needed
-  if (isempty (find (strcmpi ('whitespace', varargin))))
-    varargin(end+1:end+2) = {'whitespace', " \b\t"};
+  if (isempty (find (strcmpi ("whitespace", varargin))))
+    varargin(end+1:end+2) = {"whitespace", " \b\t"};
   endif
 
   ## Call strread to make it do the real work