changeset 14835:821708f96efd

strread.m: more robust word parsing and style improvements
author Philip Nienhuis <prnienhuis@users.sf.net>
date Tue, 03 Jul 2012 21:58:23 +0200
parents 619fedc6ea61
children a1e1f914ae79
files scripts/io/strread.m
diffstat 1 files changed, 78 insertions(+), 60 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/io/strread.m	Tue Jul 03 08:50:43 2012 -0700
+++ b/scripts/io/strread.m	Tue Jul 03 21:58:23 2012 +0200
@@ -226,7 +226,7 @@
     switch (lower (varargin{n}))
       case "bufsize"
         ## We could synthesize this, but that just seems weird...
-        warning ('strread: property "bufsize" is not implemented');
+        warning ("strread: property 'bufsize' is not implemented");
       case "commentstyle"
         comment_flag = true;
         switch (lower (varargin{n+1}))
@@ -259,7 +259,7 @@
       case "emptyvalue"
         numeric_fill_value = varargin{n+1};
       case "expchars"
-        warning ('strread: property "expchars" is not implemented');
+        warning ("strread: property 'expchars' is not implemented");
       case "whitespace"
         white_spaces = varargin{n+1};
         if (strcmp (typeinfo (white_spaces), "sq_string"))
@@ -281,10 +281,10 @@
         elseif (ischar (varargin{n+1}))
           empty_str = varargin(n+1);
         else
-          error ('strread: "treatasempty" value must be string or cellstr');
+          error ("strread: 'treatasempty' value must be string or cellstr");
         endif
       otherwise
-        warning ('strread: unknown property "%s"', varargin{n});
+        warning ("strread: unknown property '%s'", varargin{n});
     endswitch
   endfor
 
@@ -297,10 +297,10 @@
     ## Determine the number of words per line as a first guess.  Forms
     ## like %f<literal>) (w/o delimiter in between) are fixed further on
     format = strrep (format, "%", " %");
-    fmt_words = regexp (format, '[^ ]+', 'match');
-    ## Format conversion specifiers following literals w/o space/delim
-    ## in between are separate now.  Separate those w trailing literals
-    idy2 = find (! cellfun ("isempty", strfind (fmt_words, "%")));
+    fmt_words = regexp (format, '[^ ]+', "match");
+    
+    ## Find position of conversion specifiers (they start with %)
+    idy2 = find (! cellfun ("isempty", regexp (fmt_words, '^%')));
 
     ## Check for unsupported format specifiers
     errpat = '(\[.*\]|[cq]|[nfdu]8|[nfdu]16|[nfdu]32|[nfdu]64)';
@@ -308,12 +308,15 @@
       error ("strread: %q, %c, %[] or bit width format specifiers are not supported yet.");
     endif
 
+    ## Format conversion specifiers following literals w/o space/delim
+    ## in between are separate now.  Separate those w trailing literals
     a = strfind (fmt_words(idy2), "%");
-    b = regexp (fmt_words(idy2), '[nfdus]', 'end');
+    b = regexp (fmt_words(idy2), '[nfdus]', "end");
     for jj = 1:numel (a)
+      ## Work from right to left so insertions don't shift pending indices
       ii = numel (a) - jj + 1;
       if (! (length (fmt_words{idy2(ii)}) == b{ii}(1)))
-        ## Fix format_words
+        ## Split fmt_words{idy2(ii)} into % conv specifier and trailing literal
         fmt_words(idy2(ii)+1 : end+1) = fmt_words(idy2(ii) : end);
         fmt_words{idy2(ii)} = fmt_words{idy2(ii)}(a{ii} : b{ii}(1));
         fmt_words{idy2(ii)+1} = fmt_words{idy2(ii)+1}(b{ii}+1:end);
@@ -333,7 +336,7 @@
   ## Remove comments in str
   if (comment_flag)
     ## Expand 'eol_char' here, after option processing which may have set value
-    comment_end = regexprep (comment_end, 'eol_char', eol_char);
+    comment_end = regexprep (comment_end, "eol_char", eol_char);
     cstart = strfind (str, comment_start);
     cstop  = strfind (str, comment_end);
     ## Treat end of string as additional comment stop
@@ -363,7 +366,8 @@
   endif
 
   if (! isempty (white_spaces))
-    ## For numeric fields, whitespace is always a delimiter, but not for text fields
+    ## For numeric fields, whitespace is always a delimiter, but not for text
+    ## fields
     if (isempty (strfind (format, "%s")))
       ## Add whitespace to delimiter set
       delimiter_str = unique ([white_spaces delimiter_str]);
@@ -399,16 +403,15 @@
        str = str(2:end);
     endif
     ## Check for single delimiter followed/preceded by whitespace
-    ## FIXME: Double strrep on str is enormously expensive of CPU time.
-    ## Can this be eliminated
     if (! isempty (delimiter_str))
       dlmstr = setdiff (delimiter_str, " ");
       rxp_dlmwsp = sprintf ("( [%s]|[%s] )", dlmstr, dlmstr);
       str = regexprep (str, rxp_dlmwsp, delimiter_str(1));
     endif
+    ## Wipe leading and trailing whitespace on each line (it may be
+    ## delimiter too)
     ## FIXME: Double strrep on str is enormously expensive of CPU time.
     ## Can this be eliminated
-    ## Wipe leading and trailing whitespace on each line (it may be delimiter too)
     if (! isempty (eol_char))
       str = strrep (str, [eol_char " "], eol_char);
       str = strrep (str, [" " eol_char], eol_char);
@@ -418,13 +421,12 @@
   ## Split 'str' into words
   words = split_by (str, delimiter_str, mult_dlms_s1, eol_char);
   if (! isempty (white_spaces))
-    ## Trim leading and trailing white_spaces
-    ## FIXME: Is this correct?  strtrim clears what matches isspace(), not
-    ## necessarily what is in white_spaces.
+    ## Trim leading and trailing 'white_spaces'. All whitespace has
+    ## been converted to space above
     words = strtrim (words);
   endif
   num_words = numel (words);
-  ## First guess at number of lines in file (ignoring leading/trailing literals)
+  ## First guess at nr. of lines in file (ignoring leading/trailing literals)
   num_lines = ceil (num_words / num_words_per_line);
 
   ## Replace TreatAsEmpty char sequences by empty strings
@@ -436,9 +438,8 @@
   endif
 
   ## fmt_words has been split properly now, but words{} has only been split on
-  ## delimiter positions. 
-  ## As numeric fields can also be separated by whitespace, more splits may be
-  ## needed.
+  ## delimiter positions. As numeric fields can also be separated by
+  ## whitespace, more splits may be needed.
   ## We also don't know the number of lines (as EndOfLine may have been set to
   ## "" (empty) by the caller).
   ##
@@ -447,6 +448,8 @@
   ## B: Leading literals (<literal>%f) w/o delimiter in between.
   ## C. Skipping leftover parts of specified skip fields (%*N )
   ## Some words columns may have to be split further to fix these.
+  ## To find out, we'll match fmt_words to the words array to see what needs
+  ## to be done.  fwptr(k) is the index of the word fmt_words{k} starts in.
 
   ## Find indices and pointers to possible literals in fmt_words
   idf = cellfun ("isempty", strfind (fmt_words, "%"));
@@ -454,7 +457,7 @@
   idg = ! cellfun ("isempty", regexp (fmt_words, '%\*?\d'));
   idy = find (idf | idg);
   ## Find indices to numeric conversion specifiers
-  idn = ! cellfun ("isempty", regexp (fmt_words, "%[dnfu]"));
+  idn = ! cellfun ("isempty", regexp (fmt_words, '%[dnfu]'));
 
   ## If needed, split up columns in three steps:
   if (! isempty (idy))
@@ -464,12 +467,16 @@
       ## 1. Assess "period" in the split-up words array ( < num_words_per_line).
       ## Could be done using EndOfLine but that prohibits EndOfLine = "" option.
       ## Alternative below goes by simply parsing a first grab of words
-      ## and counting words until the fmt_words array is exhausted:
-      iwrd = 1; iwrdp = 0; iwrdl = length (words{iwrd});
+      ## and matching fmt_words to words until the fmt_words array is exhausted.
+      ## iwrd: ptr to current analyzed word; iwrdp: ptr to pos before analyzed char
+      iwrd = 1; iwrdp = 0; iwrdl = length (words{1});
+      fwptr = zeros (1, numel (fmt_words));
       ii = 1;
       while ii <= numel (fmt_words)
 
         nxt_wrd = 0;
+        ## Record the index of the word in which fmt_words{ii} starts.
+        fwptr(ii) = iwrd;
 
         if (idf(ii))
           ## Literal expected
@@ -493,8 +500,9 @@
 
         elseif (idg(ii))
           ## Fixed width specifier (%N or %*N): read just a part of word
-          iwrdp += floor ...
-           (str2double (fmt_words{ii}(regexp(fmt_words{ii}, '\d') : end-1)));
+          sw = regexp (fmt_words{ii}, '\d', "once");
+          ew = regexp (fmt_words{ii}, '[nfuds]') - 1;
+          iwrdp += floor (str2double (fmt_words{ii}(sw:ew)));
           if (iwrdp > iwrdl)
             ## Match error. Field extends beyond word boundary.
             warning  ...
@@ -532,10 +540,11 @@
           if (iwrd > numel (words))
             ## Apparently EOF; assume incomplete row already at L.1 of data
             ii = numel (fmt_words);
-          elseif (ii < numel (fmt_words))
+          elseif (ii < numel (fmt_words) && iwrd <= numel (words))
             iwrdl = length (words{iwrd});
           endif
         endif
+
         ++ii;
 
       endwhile
@@ -544,12 +553,11 @@
       num_lines = ceil (num_words / words_period);
 
       ## 2. Pad words array so that it can be reshaped
-      tmp_lines = ceil (num_words / words_period);
-      num_words_padded = tmp_lines * words_period - num_words;
+      num_words_padded = num_lines * words_period - num_words;
       if (num_words_padded)
         words = [words'; cell(num_words_padded, 1)];
       endif
-      words = reshape (words, words_period, tmp_lines);
+      words = reshape (words, words_period, num_lines);
 
       ## 3. Do the column splitting on rectangular words array
       icol = 1; ii = 1;    # icol = current column, ii = current fmt_word
@@ -582,6 +590,7 @@
                 (@(x) substr(x, e(1)+1, length(x)-e(1)), words(icol, jptr), ...
                 "UniformOutput", false);
               words(icol, jptr) = fmt_words{ii};
+              fwptr = [fwptr(1:ii) (++fwptr(ii+1:end))];
 
             else
               if (! idg(ii) && ! isempty (strfind (fmt_words{ii-1}, "%s")))
@@ -589,11 +598,19 @@
                 warning ("Ambiguous '%s' specifier next to literal in column %d", icol);
               elseif (idg(ii))
                 ## Current field = fixed width. Strip into icol, rest in icol+1
-                wdth = floor (str2double (fmt_words{ii}(regexp(fmt_words{ii}, ...
-                              '\d') : end-1)));
+                sw = regexp (fmt_words{ii}, '\d', "once");
+                ew = regexp (fmt_words{ii}, '[nfuds]') - 1;
+                wdth = floor (str2double (fmt_words{ii}(sw:ew)));
                 words(icol+1, jptr) = cellfun (@(x) x(wdth+1:end),
                      words(icol,jptr), "UniformOutput", false);
-                words(icol, jptr) = strtrunc (words(icol, jptr), wdth);
+                if (isempty ([words(icol+1, :){:}]))
+                  ## Apparently the split wasn't needed, as the field turns out
+                  ## to cover the entire column.  Delete the new column again.
+                  words(icol+1, :) = [];
+                else
+                  words(icol, jptr) = strtrunc (words(icol, jptr), wdth);
+                  fwptr = [fwptr(1:ii) (++fwptr(ii+1:end))];
+                endif
               else
                 ## FIXME: this assumes char(254)/char(255) won't occur in input!
                 clear wrds;
@@ -610,6 +627,7 @@
                    char(254), fmt_words{ii}), char(255));
                 ## Former trailing literal may now be leading for next specifier
                 --ii;
+                fwptr = [fwptr(1:ii) (++fwptr(ii+1:end))];
               endif
             endif
           endif
@@ -617,9 +635,7 @@
         else
           ## Conv. specifier.  Peek if next fmt_word needs split from current column
           if (ii < num_words_per_line)
-            if (idf(ii+1) && (! isempty (strfind (words{icol, 1}, fmt_words{ii+1}))))
-              --icol;
-            elseif (idg(ii+1))
+            if (fwptr(ii) == fwptr(ii+1))
               --icol;
             endif
           endif
@@ -678,10 +694,12 @@
           varargout{k} = data.';
           k++;
         case {"%0", "%1", "%2", "%3", "%4", "%5", "%6", "%7", "%8", "%9"}
-          nfmt = strsplit (fmt_words{m}(2:end-1), '.');
+          sw = regexp (fmt_words{m}, '\d', "once");
+          ew = regexp (fmt_words{m}, '[nfudsq]') - 1;
+          nfmt = strsplit (fmt_words{m}(2:ew), ".");
           swidth = str2double (nfmt{1});
-          switch fmt_words{m}(end)
-            case {"d", "u", "f", "n%"}
+          switch fmt_words{m}(ew+1)
+            case {"d", "u", "f", "n"}
               n = cellfun ("isempty", data);
               ### FIXME - erroneously formatted data lead to NaN, not an error
               ###         => ReturnOnError can't be implemented for numeric data
@@ -774,7 +792,7 @@
 %! a = rand (10, 1);
 %! b = char (randi ([65, 85], 10, 1));
 %! for k = 1:10
-%!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
+%!   str = sprintf ("%s %.6f %s\n", str, a(k), b(k));
 %! endfor
 %! [aa, bb] = strread (str, "%f %s");
 %! assert (a, aa, 1e-6);
@@ -785,19 +803,19 @@
 %! a = rand (10, 1);
 %! b = char (randi ([65, 85], 10, 1));
 %! for k = 1:10
-%!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
+%!   str = sprintf ("%s %.6f %s\n", str, a(k), b(k));
 %! endfor
 %! aa = strread (str, "%f %*s");
 %! assert (a, aa, 1e-6);
 
 %!test
-%! str = sprintf ('/* this is\nacomment*/ 1 2 3');
+%! str = sprintf ("/* this is\nacomment*/ 1 2 3");
 %! a = strread (str, "%f", "commentstyle", "c");
 %! assert (a, [1; 2; 3]);
 
 %!test
 %! str = "# comment\n# comment\n1 2 3";
-%! [a, b] = strread (str, '%n %s', 'commentstyle', 'shell', 'endofline', "\n");
+%! [a, b] = strread (str, "%n %s", "commentstyle", "shell", "endofline", "\n");
 %! assert (a, [1; 3]);
 %! assert (b, {"2"});
 
@@ -922,12 +940,12 @@
 %! assert (c, [0.94; 0.87], 0.01)
 
 %!test
-%! [a, b] = strread (['Empty 1' char(10)], 'Empty%s %f');
+%! [a, b] = strread (["Empty 1" char(10)], "Empty%s %f");
 %! assert (a{1}, '1');
 %! assert (b, NaN);
 
 %!test
-%! [a, b] = strread (['Empty' char(10)], 'Empty%f %f');
+%! [a, b] = strread (["Empty" char(10)], "Empty%f %f");
 %! assert (a, NaN);
 %! assert (b, NaN);
 
@@ -940,17 +958,17 @@
 
 %% Unsupported format specifiers
 %!test
-%!error <format specifiers are not supported> strread ('a', '%c')
-%!error <format specifiers are not supported> strread ('a', '%*c %d')
-%!error <format specifiers are not supported> strread ('a', '%q')
-%!error <format specifiers are not supported> strread ('a', '%*q %d')
-%!error <format specifiers are not supported> strread ('a', '%[a]')
-%!error <format specifiers are not supported> strread ('a', '%*[a] %d')
-%!error <format specifiers are not supported> strread ('a', '%[^a]')
-%!error <format specifiers are not supported> strread ('a', '%*[â] %d')
-%!error <format specifiers are not supported> strread ('a', '%d8')
-%!error <format specifiers are not supported> strread ('a', '%*d8 %s')
-%!error <format specifiers are not supported> strread ('a', '%f64')
-%!error <format specifiers are not supported> strread ('a', '%*f64 %s')
-%!error <format specifiers are not supported> strread ('a', '%u32')
-%!error <format specifiers are not supported> strread ('a', '%*u32 %d')
+%!error <format specifiers are not supported> strread ("a", "%c")
+%!error <format specifiers are not supported> strread ("a", "%*c %d")
+%!error <format specifiers are not supported> strread ("a", "%q")
+%!error <format specifiers are not supported> strread ("a", "%*q %d")
+%!error <format specifiers are not supported> strread ("a", "%[a]")
+%!error <format specifiers are not supported> strread ("a", "%*[a] %d")
+%!error <format specifiers are not supported> strread ("a", "%[^a]")
+%!error <format specifiers are not supported> strread ("a", "%*[^a] %d")
+%!error <format specifiers are not supported> strread ("a", "%d8")
+%!error <format specifiers are not supported> strread ("a", "%*d8 %s")
+%!error <format specifiers are not supported> strread ("a", "%f64")
+%!error <format specifiers are not supported> strread ("a", "%*f64 %s")
+%!error <format specifiers are not supported> strread ("a", "%u32")
+%!error <format specifiers are not supported> strread ("a", "%*u32 %d")