Mercurial > forge
changeset 11403:e7510cc59705 octave-forge
Detect very bogus csv files
author | cdemills |
---|---|
date | Mon, 21 Jan 2013 20:57:31 +0000 |
parents | 87bfb970cf0b |
children | 07a36f1d7316 |
files | extra/dataframe/inst/@dataframe/dataframe.m |
diffstat | 1 files changed, 98 insertions(+), 94 deletions(-) [+] |
line wrap: on
line diff
--- a/extra/dataframe/inst/@dataframe/dataframe.m Sun Jan 20 19:00:38 2013 +0000 +++ b/extra/dataframe/inst/@dataframe/dataframe.m Mon Jan 21 20:57:31 2013 +0000 @@ -297,107 +297,111 @@ endif endwhile endif - x = cell (1+length (lines)-indl, size (dummy, 2)); - empty_lines = []; cmt_lines = []; - while (indl <= length (lines)) - dummy = content{indl}; - if (all (cellfun ('size', dummy, 2) == 0)) - empty_lines = [empty_lines indj]; - indl = indl + 1; indj = indj + 1; - continue; - endif - %# does it looks like a comment line ? - if (regexp (dummy{1}, ['^\s*' char(35)])) - empty_lines = [empty_lines indj]; - cmt_lines = strvcat (cmt_lines, horzcat (dummy{:})); - indl = indl + 1; indj = indj + 1; - continue; - endif - - %# try to convert to float - if (1) - the_line = cellfun (@(x) sscanf (x, "%f", locales), dummy, \ - 'UniformOutput', false); - else - %# this faster code requires a patch to src/file-io.cc in - %# the main Octave tree - the_line = sscanf (dummy, "%f", locales); - the_line = cellfun (@(x) x{1}, the_line, \ - 'UniformOutput', false); - endif - - indk = 1; indm = 1; - while (indk <= size (the_line, 2)) - if (isempty (the_line{indk}) || any (size (the_line{indk}) > 1)) - %#if indi > 1 && indk > 1, disp('line 117 '); keyboard; %#endif - if (unquot) - try - %# remove quotes and leading space(s) - x(indj, indm) = regexp (dummy{indk}, '[^''" ].*[^''"]', 'match'){1}; - catch - %# if the previous test fails, try a simpler one - in = regexp (dummy{indk}, '[^'' ]+', 'match'); - if (~isempty (in)) - x(indj, indm) = in{1}; + if (indl > length (lines)) + x = []; + else + x = cell (1+length (lines)-indl, size (dummy, 2)); + empty_lines = []; cmt_lines = []; + while (indl <= length (lines)) + dummy = content{indl}; + if (all (cellfun ('size', dummy, 2) == 0)) + empty_lines = [empty_lines indj]; + indl = indl + 1; indj = indj + 1; + continue; + endif + %# does it looks like a comment line ? + if (regexp (dummy{1}, ['^\s*' char(35)])) + empty_lines = [empty_lines indj]; + cmt_lines = strvcat (cmt_lines, horzcat (dummy{:})); + indl = indl + 1; indj = indj + 1; + continue; + endif + + %# try to convert to float + if (1) + the_line = cellfun (@(x) sscanf (x, "%f", locales), dummy, \ + 'UniformOutput', false); + else + %# this faster code requires a patch to src/file-io.cc in + %# the main Octave tree + the_line = sscanf (dummy, "%f", locales); + the_line = cellfun (@(x) x{1}, the_line, \ + 'UniformOutput', false); + endif + + indk = 1; indm = 1; + while (indk <= size (the_line, 2)) + if (isempty (the_line{indk}) || any (size (the_line{indk}) > 1)) + %#if indi > 1 && indk > 1, disp('line 117 '); keyboard; %#endif + if (unquot) + try + %# remove quotes and leading space(s) + x(indj, indm) = regexp (dummy{indk}, '[^''" ].*[^''"]', 'match'){1}; + catch + %# if the previous test fails, try a simpler one + in = regexp (dummy{indk}, '[^'' ]+', 'match'); + if (~isempty (in)) + x(indj, indm) = in{1}; %# else %# x(indj, indk) = []; + endif + end_try_catch + else + %# no conversion possible, store and remove leading space(s) + x(indj, indm) = regexp (dummy{indk}, '[^ ].*', 'match'); + endif + elseif (~isempty (regexp (dummy{indk}, '[/:-]')) && ... + ~isempty (datefmt)) + %# does it look like a date ? + datetime = dummy{indk}; + + if (datefields > 1) + %# concatenate the required number of fields + indc = 1; + for indc = (2:datefields) + datetime = cstrcat(datetime, ' ', dummy{indk+indc-1}); + endfor + else + %# ensure spaces are unique + datetime = regexprep (datetime, '[ ]+', ' '); + endif + + try + datetime = datevec (datetime, datefmt); + timeval = struct ("usec", 0, "sec", floor (datetime (6)), + "min", datetime(5), "hour", datetime(4), + "mday", datetime(3), "mon", datetime(2)-1, + "year", datetime(1)-1900); + timeval.usec = 1e6*(datetime(6) - timeval.sec); + x(indj, indm) = str2num (strftime ([char(37) 's'], timeval)) + ... + timeval.usec * 1e-6; + if (datefields > 1) + %# skip fields successfully converted + indk = indk + (datefields - 1); endif + catch + %# store it as is + x(indj, indm) = the_line{indk}; end_try_catch - else - %# no conversion possible, store and remove leading space(s) - x(indj, indm) = regexp (dummy{indk}, '[^ ].*', 'match'); - endif - elseif (~isempty (regexp (dummy{indk}, '[/:-]')) && ... - ~isempty (datefmt)) - %# does it look like a date ? - datetime = dummy{indk}; - - if (datefields > 1) - %# concatenate the required number of fields - indc = 1; - for indc = (2:datefields) - datetime = cstrcat(datetime, ' ', dummy{indk+indc-1}); - endfor - else - %# ensure spaces are unique - datetime = regexprep (datetime, '[ ]+', ' '); - endif - - try - datetime = datevec (datetime, datefmt); - timeval = struct ("usec", 0, "sec", floor (datetime (6)), - "min", datetime(5), "hour", datetime(4), - "mday", datetime(3), "mon", datetime(2)-1, - "year", datetime(1)-1900); - timeval.usec = 1e6*(datetime(6) - timeval.sec); - x(indj, indm) = str2num (strftime ([char(37) 's'], timeval)) + ... - timeval.usec * 1e-6; - if (datefields > 1) - %# skip fields successfully converted - indk = indk + (datefields - 1); - endif - catch - %# store it as is + else x(indj, indm) = the_line{indk}; - end_try_catch - else - x(indj, indm) = the_line{indk}; - endif - indk = indk + 1; indm = indm + 1; + endif + indk = indk + 1; indm = indm + 1; + endwhile + indl = indl + 1; indj = indj + 1; endwhile - indl = indl + 1; indj = indj + 1; - endwhile - - if (~isempty (empty_lines)) - x(empty_lines, :) = []; + + if (~isempty (empty_lines)) + x(empty_lines, :) = []; + endif + + %# detect empty columns + empty_lines = find (0 == sum (cellfun ('size', x, 2))); + if (~isempty (empty_lines)) + x(:, empty_lines) = []; + endif endif - - %# detect empty columns - empty_lines = find (0 == sum (cellfun ('size', x, 2))); - if (~isempty (empty_lines)) - x(:, empty_lines) = []; - endif - + clear UTF8_BOM fid in lines indl the_line content empty_lines clear datetime timeval idx