changeset 28277:01358faf49c9

importdata.m: Overhaul column header detection (bug #58328). * importdata.m: Change regular expression for delimiter detection to more accurately detect numbers. Move logic for determination of colheaders and rowheaders to occur immediately after first numeric data is found. For Matlab compatibility restrict rowheaders to only one text column. Adjust BIST tests.
author Rik <rik@octave.org>
date Fri, 08 May 2020 19:56:16 -0700
parents dd5ab20e547a
children 0421b1455758
files scripts/io/importdata.m
diffstat 1 files changed, 38 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/io/importdata.m	Fri May 08 22:23:03 2020 -0400
+++ b/scripts/io/importdata.m	Fri May 08 19:56:16 2020 -0700
@@ -175,8 +175,8 @@
 
     ## If no delimiter determined yet, make a guess.
     if (isempty (delimiter))
-      ## This pattern can be fooled, but mostly does the job just fine.
-      delim = regexpi (row, '[-+\d.e*ij ]+([^-+\de.ij])[-+\de*.ij ]',
+      ## Look for number, DELIMITER, DELIMITER*, number
+      delim = regexpi (row, '[-+]?\d*[.]?\d+(?:[ed][-+]?\d+)?[ij]?([^-+\d.deij])\1*[-+]?\d*[.]?\d+(?:[ed][-+]?\d+)?[ij]?',
                        'tokens', 'once');
       if (! isempty (delim))
         delimiter = delim{1};
@@ -200,6 +200,14 @@
       row = -1;
       break;
     else
+      ## The number of header rows and header columns is now known.
+      header_cols = find (! isnan (row_data), 1) - 1;
+      has_rowheaders = (header_cols == 1);
+
+      ## Set colheaders output from textdata if appropriate
+      ## NOTE: Octave chooses to be Matlab incompatible and return
+      ## both 'rowheaders' and 'colheaders' when they are present.
+      ## Matlab allows only one to be present at a time.
       if (! isempty (output.textdata))
         if (delimiter == " ")
           output.colheaders = regexp (strtrim (output.textdata{end}),
@@ -207,9 +215,22 @@
         else
           output.colheaders = ostrsplit (output.textdata{end}, delimiter);
         endif
+       
+        nc_hdr = numel (output.colheaders);
+        nc_dat = numel (row_data);
+        if (! has_rowheaders)
+          if (nc_hdr != nc_dat)
+            output = rmfield (output, {"rowheaders", "colheaders"});
+          else
+            output = rmfield (output, "rowheaders");
+          endif
+        else
+          if (nc_hdr != nc_dat-1)
+            output = rmfield (output, "colheaders");
+          endif
+        endif
       endif
-      header_cols = find (! isnan (row_data), 1) - 1;
-      ## The number of header rows and header columns is now known.
+
       break;
     endif
 
@@ -256,6 +277,10 @@
   endif
 
   ## Go back and correct any individual values that did not convert.
+  ## FIXME: This is only efficient when the number of bad conversions is small.
+  ##        Any file with 'rowheaders' will cause the for loop to execute over
+  ##        *every* line in the file.
+
   na_idx = isna (output.data);
   if (header_cols > 0)
     na_idx = [(true (rows (na_idx), header_cols)), na_idx];
@@ -284,21 +309,19 @@
 
       text = text(! strcmpi (text, "NA"));  #  Remove valid "NA" entries
       if (! isempty (text))
-        output.textdata = [output.textdata; text(:)];
+        output.textdata(end+1, 1:columns (text)) = text;
       endif
 
-      if (header_cols)
-        output.rowheaders(end+1, :) = fields(1:header_cols);
+      if (has_rowheaders)
+        output.rowheaders(end+1, 1) = fields(1);
       endif
     endfor
 
   endif
 
-  ## Final cleanup to satisfy output configuration
+  ## Final cleanup to satisfy Matlab compatibility
   if (all (cellfun ("isempty", output.textdata)))
     output = output.data;
-  elseif (! isempty (output.rowheaders) && ! isempty (output.colheaders))
-    output = struct ("data", {output.data}, "textdata", {output.textdata});
   endif
 
 endfunction
@@ -377,7 +400,6 @@
 %! A.data = [3.1 -7.2 0; 0.012 6.5 128];
 %! A.textdata = {"This is a header row."; ...
 %!               "this row does not contain any data, but the next one does."};
-%! A.colheaders = A.textdata (2);
 %! fn  = tempname ();
 %! fid = fopen (fn, "w");
 %! fprintf (fid, "%s\n", A.textdata{:});
@@ -393,6 +415,7 @@
 %! ## Column headers, only last row is returned in colheaders
 %! A.data = [3.1 -7.2 0; 0.012 6.5 128];
 %! A.textdata = {"Label1\tLabel2\tLabel3";
+%!               "";
 %!               "col 1\tcol 2\tcol 3"};
 %! A.colheaders = {"col 1", "col 2", "col 3"};
 %! fn  = tempname ();
@@ -404,7 +427,7 @@
 %! unlink (fn);
 %! assert (a, A);
 %! assert (d, "\t");
-%! assert (h, 2);
+%! assert (h, 3);
 
 %!test
 %! ## Row headers
@@ -425,9 +448,11 @@
 %! ## Row/Column headers and Header Text
 %! A.data = [3.1 -7.2 0; 0.012 6.5 128];
 %! A.textdata = {"This is introductory header text"
-%!               "      col1 col2 col3"
+%!               "col1\tcol2\tcol3"
 %!               "row1"
 %!               "row2"};
+%! A.rowheaders = A.textdata(3:4);
+%! A.colheaders = {"col1", "col2", "col3"};
 %! fn  = tempname ();
 %! fid = fopen (fn, "w");
 %! fprintf (fid, "%s\n", A.textdata{1:2});