Mercurial > octave-nkf
changeset 12866:fe6e2afcd9ee
Revamp strread, textscan, textread functions for Matlab compatibility
Implemented ML-compatible whitespace and delimiter defaults
Implemented ML-compatible options: 'whitespace', 'treatasempty',
format string repeat count, user-specified comment style, uneven-length
output arrays, %n and %u conversion specifiers (provisionally)
Implemented processing of given-width format specifiers
* textscan.m: Add new tests. Implement EndofLine, ReturnOnError, TreatAsEmpty
options. Improve whitespace handling.
* textread.m: Add new tests. Implement EndofLine option. Improve whitespace
handling.
* strread.m: Major rewrite.
author | Philip Nienhuis <prnienhuis@users.sf.net> |
---|---|
date | Fri, 22 Jul 2011 13:05:26 -0700 |
parents | a4d1581f9e72 |
children | b00181c65533 |
files | NEWS doc/interpreter/contributors.in scripts/io/strread.m scripts/io/textread.m scripts/io/textscan.m |
diffstat | 5 files changed, 704 insertions(+), 148 deletions(-) [+] |
line wrap: on
line diff
--- a/NEWS Wed Jul 20 10:41:59 2011 -0700 +++ b/NEWS Fri Jul 22 13:05:26 2011 -0700 @@ -3,6 +3,16 @@ ** The PCRE library is now required to build Octave. + ** strread, textscan, and textread have been completely revamped. + + They now support nearly all Matlab functionality including: + + * ML-compatible whitespace and delimiter defaults + + * ML-compatible options: 'whitespace', treatasempty', + format string repeat count, user-specified comment style, uneven-length + output arrays, %n and %u conversion specifiers (provisionally) + ** New functions added. iscolumn
--- a/doc/interpreter/contributors.in Wed Jul 20 10:41:59 2011 -0700 +++ b/doc/interpreter/contributors.in Fri Jul 22 13:05:26 2011 -0700 @@ -152,6 +152,7 @@ Victor Munoz Carmen Navarrete Todd Neal +Philip Nienhuis Al Niessner Rick Niles Takuji Nishimura
--- a/scripts/io/strread.m Wed Jul 20 10:41:59 2011 -0700 +++ b/scripts/io/strread.m Fri Jul 22 13:05:26 2011 -0700 @@ -19,7 +19,9 @@ ## -*- texinfo -*- ## @deftypefn {Function File} {[@var{a}, @dots{}] =} strread (@var{str}) ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}) ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{prop1}, @var{value1}, @dots{}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}, @var{prop1}, @var{value1}, @dots{}) ## Read data from a string. ## ## The string @var{str} is split into words that are repeatedly matched to the @@ -38,17 +40,27 @@ ## ## @item %d ## @itemx %f -## The word is parsed as a number. +## @itemx %u +## @itemx %n +## The word is parsed as a number (and converted to double). +## +## @item %*', '%*f', '%*s +## The word is skipped. ## -## @item %* -## The word is skipped. +## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an +## optional width can be specified as %Ns, etc. where N is an integer > 1. +## For %f, formats like %N.Mf are allowed. +## +## @item literals +## In addition the format may contain literal character strings; these will be +## skipped during reading. ## @end table ## ## Parsed word corresponding to the first specifier are returned in the first ## output argument and likewise for the rest of the specifiers. ## ## By default, @var{format} is @t{"%f"}, meaning that numbers are read from -## @var{str}. +## @var{str}. This will do if @var{str} contains only numeric fields. 
## ## For example, the string ## @@ -68,6 +80,18 @@ ## [@var{a}, @var{b}, @var{c}] = strread (@var{str}, "%s %s %f"); ## @end example ## +## Optional numeric argument @var{format_repeat} can be used for +## limiting the number of items read: +## @table @asis +## @item -1 +## (default) read all of the string until the end. +## +## @item N +## Read N times @var{nargout} items. 0 (zero) is an acceptable +## value for @var{format_repeat}. +## +## @end table +## ## The behavior of @code{strread} can be changed via property-value ## pairs. The following properties are recognized: ## @@ -87,83 +111,144 @@ ## ## @item "matlab" ## Everything from @code{%} characters to the nearest end-line is skipped. +## +## @item user-supplied. Two options: +## (1) One string, or 1x1 cell string: Skip everything to the right of it; +## (2) 2x1 cell string array: Everything between the left and right strings +## is skipped. ## @end itemize ## ## @item "delimiter" -## Any character in @var{value} will be used to split @var{str} into words. +## Any character in @var{value} will be used to split @var{str} into words +## (default value = any whitespace). +## +## @item "whitespace" +## Any character in @var{value} will be interpreted as whitespace and +## trimmed; the string defining whitespace must be enclosed in double +## quotes for proper processing of special characters like \t. +## The default value for whitespace = " \b\r\n\t" (note the space). ## ## @item "emptyvalue" ## Parts of the output where no word is available is filled with @var{value}. +## +## @item "treatasempty" +## Treat single occurrences (surrounded by delimiters or whitespace) of the +## string(s) in @var{value} as missing values. +## +## @item "returnonerror" +## If @var{value} true (1, default), ignore read errors and return normally. +## If false (0), return an error. 
+## ## @end table ## -## @seealso{textread, load, dlmread, fscanf} +## @seealso{textscan, textread, load, dlmread, fscanf} ## @end deftypefn function varargout = strread (str, format = "%f", varargin) + ## Check input if (nargin < 1) print_usage (); endif - if (!ischar (str) || !ischar (format)) + if (isempty (format)) + format = "%f"; + endif + + if (! ischar (str) || ! ischar (format)) error ("strread: STR and FORMAT arguments must be strings"); endif - ## Parse options + ## Check for format string repeat count + format_repeat_count = -1; + if (nargin > 2 && isnumeric (varargin{1})) + if (varargin{1} >= 0) + format_repeat_count = varargin{1}; + endif + if (nargin > 3) + varargin = varargin(2:end); + else + varargin = {}; + endif + endif + + ## Parse options. First initialize defaults comment_flag = false; - numeric_fill_value = 0; - white_spaces = " \n\r\t\b"; + numeric_fill_value = NaN; + white_spaces = " \b\r\n\t"; delimiter_str = ""; + eol_char = ""; + empty_str = ""; + err_action = 0; for n = 1:2:length (varargin) - switch (lower (varargin {n})) + switch (lower (varargin{n})) + case "bufsize" + ## We could synthesize this, but that just seems weird... 
+ warning ('strread: property "bufsize" is not implemented'); case "commentstyle" comment_flag = true; - switch (lower (varargin {n+1})) + switch (lower (varargin{n+1})) case "c" - comment_specif = {"/*", "*/"}; + [comment_start, comment_end] = deal ("/*", "*/"); case "c++" - comment_specif = {"//", "\n"}; + [comment_start, comment_end] = deal ("//", "\n"); case "shell" - comment_specif = {"#", "\n"}; + [comment_start, comment_end] = deal ("#", "\n"); case "matlab" - comment_specif = {"%", "\n"}; + [comment_start, comment_end] = deal ("%", "\n"); otherwise - warning ("strread: unknown comment style '%s'", val); + if (ischar (varargin{n+1}) || + (numel (varargin{n+1}) == 1 && iscellstr (varargin{n+1}))) + tmp = char (varargin{n+1}); + [comment_start, comment_end] = deal (tmp, "\n"); + elseif (iscellstr (varargin{n+1}) && numel (varargin{n+1}) == 2) + [comment_start, comment_end] = deal (varargin{n+1}{:}); + else + ## FIXME - a user may have numeric values specified: {'//', 7} + ## this will lead to an error in the warning message + error ("strread: unknown or unrecognized comment style '%s'", + varargin{n+1}); + endif endswitch case "delimiter" - delimiter_str = varargin {n+1}; + delimiter_str = varargin{n+1}; case "emptyvalue" - numeric_fill_value = varargin {n+1}; - case "bufsize" - ## XXX: We could synthesize this, but that just seems weird... 
- warning ("strread: property \"bufsize\" is not implemented"); + numeric_fill_value = varargin{n+1}; + case "expchars" + warning ('strread: property "expchars" is not implemented'); case "whitespace" - white_spaces = varargin {n+1}; - case "expchars" - warning ("strread: property \"expchars\" is not implemented"); + white_spaces = varargin{n+1}; + ## The following parameters are specific to textscan and textread + case "endofline" + eol_char = varargin{n+1}; + case "returnonerror" + err_action = varargin{n+1}; + case "treatasempty" + empty_str = varargin{n+1}; + if (ischar (empty_str)) + empty_str = {empty_str}; + endif otherwise - warning ("strread: unknown property \"%s\"", varargin {n}); + warning ('strread: unknown property "%s"', varargin{n}); endswitch endfor - if (isempty (delimiter_str)) - delimiter_str = white_spaces; - endif - ## Parse format string + ## Parse format string to compare nr. of conversion fields and nargout idx = strfind (format, "%")'; - specif = format ([idx, idx+1]); + specif = format([idx, idx+1]); nspecif = length (idx); idx_star = strfind (format, "%*"); nfields = length (idx) - length (idx_star); - - if (max (nargout, 1) != nfields) - error ("strread: the number of output variables must match that specified byFORMAT"); + ## If str only has numeric fields, a (default) format ("%f") will do. + ## Otherwise: + if ((max (nargout, 1) != nfields) && ! strcmp (format, "%f")) + error ("strread: the number of output variables must match that specified by FORMAT"); endif ## Remove comments if (comment_flag) - cstart = strfind (str, comment_specif{1}); - cstop = strfind (str, comment_specif{2}); + cstart = strfind (str, comment_start); + cstop = strfind (str, comment_end); if (length (cstart) > 0) ## Ignore nested openers. 
[idx, cidx] = unique (lookup (cstop, cstart), "first"); @@ -181,71 +266,323 @@ cstop = cstop(cidx); endif len = length (str); - c2len = length (comment_specif{2}); + c2len = length (comment_end); str = cellslices (str, [1, cstop + c2len], [cstart - 1, len]); str = [str{:}]; endif - ## Determine the number of words per line - format = strrep (format, "%", " %"); - [~, ~, ~, fmt_words] = regexp (format, '[^ ]+'); - + if (strcmpi (strtrim (format), "%f")) + ## Default format specified. Expand it (to desired nargout) + num_words_per_line = nargout; + fmt_words = cell (nargout, 1); + fmt_words (1:nargout) = format; + else + ## Determine the number of words per line as a first guess. Forms + ## like %f<literal) (w/o delimiter in between) are fixed further on + format = strrep (format, "%", " %"); + fmt_words = regexp (format, '[^ ]+', 'match'); + ## Format conversion specifiers following literals w/o space/delim + ## in between are separate now. Separate those w trailing literals + idy2 = find (! cellfun ("isempty", strfind (fmt_words, "%"))); + a = strfind (fmt_words(idy2), "%"); + b = regexp (fmt_words(idy2), '[nfdus]', 'end'); + for jj = 1:numel (a) + ii = numel (a) - jj + 1; + if (! (length (fmt_words{idy2(ii)}) == b{ii}(1))) + ## Fix format_words + fmt_words(idy2(ii)+1 : end+1) = fmt_words(idy2(ii) : end); + fmt_words{idy2(ii)} = fmt_words{idy2(ii)}(a{ii} : b{ii}(1)); + fmt_words{idy2(ii)+1} = fmt_words{idy2(ii)+1}(b{ii}+1:end); + endif + endfor + endif num_words_per_line = numel (fmt_words); - for m = 1:numel(fmt_words) - ## Convert formats such as "%Ns" to "%s" (see the FIXME below) - if (length (fmt_words{m}) > 2) - if (strcmp (fmt_words{m}(1:2), "%*")) - fmt_words{m} = "%*"; - elseif (fmt_words{m}(1) == "%") - fmt_words{m} = fmt_words{m}([1, end]); + + if (! isempty (white_spaces)) + ## Check for overlapping whitespaces and delimiters & trim whitespace + if (! isempty (delimiter_str)) + [ovlp, iw] = intersect (white_spaces, delimiter_str); + if (! 
isempty (ovlp)) + ## Remove delimiter chars from white_spaces + white_spaces = cell2mat (strsplit (white_spaces, white_spaces(iw))); endif endif - endfor + endif + + if (isempty (delimiter_str)) + delimiter_str = " "; + endif + + if (! isempty (eol_char)) + ## eol_char is delimiter by default. First separate CRLF from single CR & LF + if (strcmp (eol_char, "\r\n")) + ## Strip CR from CRLF sequences + str = strrep (str, "\r\n", "\n"); + ## CR serves no further purpose in function + eol_char = "\n"; + endif + ## Add eol_char to delimiter collection + delimiter_str = unique ([delimiter_str eol_char]); + endif + + pad_out = 0; + ## If needed, trim whitespace + if (! isempty (white_spaces)) + ## Check if trailing "\n" might signal padding output arrays to equal size + ## before it is trimmed away below + if ((str(end) == 10) && (nargout > 1)) + pad_out = 1; + endif + ## Remove repeated white_space chars. First find white_space positions + idx = strchr (str, white_spaces); + ## Find repeated white_spaces + idx2 = ! (idx(2:end) - idx(1:end-1) - 1); + ## Set all whitespace chars to spaces + ## FIXME: this implies real spaces are always part of white_spaces + str(idx) = ' '; + ## Set all repeated white_space to \0 + str(idx(idx2)) = "\0"; + str = strsplit (str, "\0"); + ## Reconstruct trimmed str + str = cell2mat (str); + ## Remove leading & trailing space, but preserve delimiters. + str = strtrim (str); + endif ## Split 'str' into words words = split_by (str, delimiter_str); + if (! isempty (white_spaces)) + ## Trim leading and trailing white_spaces + words = strtrim (words); + endif num_words = numel (words); + ## First guess at number of lines in file (ignoring leading/trailing literals) num_lines = ceil (num_words / num_words_per_line); - ## For each specifier + ## Replace TreatAsEmpty char sequences by empty strings + if (! 
isempty (empty_str)) + ## FIXME: There should be a simpler way to do this with cellfun + for ii = 1:numel (empty_str) + idz = strmatch (empty_str{ii}, words, "exact"); + words(idz) = {""}; + endfor + endif + + ## We now may have to cope with 3 cases: + ## A: Trailing literals (%f<literal>) w/o delimiter in between. + ## B: Leading literals (<literal>%f) w/o delimiter in between. + ## C. Skipping leftover parts of specified skip fields (%*N ) + ## fmt_words has been split properly now, but words{} has only been split on + ## delimiter positions. Some words columns may have to be split further. + ## We also don't know the number of lines (as EndOfLine may have been set to + ## "" (empty) by the caller). + + ## Find indices and pointers to possible literals in fmt_words + idf = cellfun ("isempty", strfind (fmt_words, "%")); + ## Find indices and pointers to "%*" (skip) conversion specifiers + idg = ! cellfun ("isempty", strfind (fmt_words, "%*")); + ## Unselect those with specified width ("%*N") + st = regexp (fmt_words, '\d'); + idy = find (idf); + + ## If needed, split up columns in three steps: + if (! isempty (idy)) + ## Try-catch because complexity of strings to read can be infinite + try + + ## 1. Assess "period" in the split-up words array ( < num_words_per_line). + ## Could be done using EndOfLine but that prohibits EndOfLine = "" option. + fmt_in_word = cell (num_words_per_line, 1); + words_period = 1; + ## For each literal in turn + for ii = 1:numel (idy) + fmt_in_word(idy(ii)) = num_words; + ## Find *current* "return period" for fmt_word{idy(ii)} in words + ## Search in first num_words_per_line of words + litptrs = find (! cellfun ("isempty", strfind ... + (words(1:min (10*num_words_per_line, num_words)), ... + fmt_words{idy(ii)}))); + if (length (litptrs) > 1) + litptr = sum (unique (litptrs(2:end) .- litptrs(1:end-1))); + endif + endfor + words_period = max (words_period, litptr); + num_lines = ceil (num_words / words_period); + + ## 2. 
Pad words array so that it can be reshaped + tmp_lines = ceil (num_words / words_period); + num_words_padded = tmp_lines * words_period - num_words; + if (num_words_padded) + words = [words'; cell(num_words_padded, 1)]; + endif + words = reshape (words, words_period, tmp_lines); + + ## 3. Do the column splitting on rectangular words array + icol = 1; ii = 1; # icol = current column, ii = current fmt_word + while (ii <= num_words_per_line) + + ## Check if fmt_words(ii) contains a literal + if (idf(ii)) # Yes, fmt_words(ii) = literal + [s, e] = regexp (words{icol, 1}, fmt_words{ii}); + if (isempty (s)) + warning ("Literal '%s' not found in column %d", fmt_words{ii}, icol); + else + if (! strcmp (fmt_words{ii}, words{icol, 1})) + ## Column doesn't exactly match literal => split needed. Add a column + words(icol+1:end+1, :) = words(icol:end, :); + ## Watch out for empty cells + jptr = find (! cellfun ("isempty", words(icol, :))); + + ## Distinguish leading or trailing literals + if (!isempty (s) && s(1) == 1) + ## Leading literal. Assign literal to icol, paste rest in icol + 1 + ## Apply only to those cells that do have something beyond literal + jptr = find ([cellfun(@(x) length(x), words(icol+1, jptr), ... + "UniformOutput", false){:}] > e(1)); + words(icol+1, jptr) = cellfun ... + (@(x) substr(x, e(1)+1, length(x)-e(1)), words(icol, jptr), ... + "UniformOutput", false); + words(icol, jptr) = fmt_words{ii}; + + else + ## Trailing literal. If preceding format == '%s' this is an error + if (! isempty (strfind (fmt_words{ii-1}, "%s"))) + warning ("Ambiguous '%s' specifier next to literal in column %d", icol); + else + ## Some invoked code to avoid regexp which seems demanding + ## on large files + ## FIXME: this assumes char(254)/char(255) won't occur in input! + clear wrds; + wrds(1:2:2*numel (words(icol, jptr))) = ... + strrep (words(icol, jptr), fmt_words{ii}, ... 
+ [char(255) char(254)]); + wrds(2:2:2*numel (words(icol, jptr))-1) = char(255); + wrds = strsplit ([wrds{:}], char(255)); + words(icol, jptr) = ... + wrds(find (cellfun ("isempty", strfind (wrds, char(254))))); + wrds(find (cellfun ("isempty", strfind (wrds, char(254))))) ... + = char(255); + words(icol+1, jptr) = strsplit (strrep ([wrds{2:end}], ... + char(254), fmt_words{ii}), char(255)); + endif + ## Former trailing literal may now be leading for next specifier + --ii; + endif + endif + endif + + else + ## Conv. specifier. Peek if next fmt_word needs split from current column + if (ii < num_words_per_line && idf(ii+1)) + if (! isempty (strfind (words{icol, 1}, fmt_words{ii+1}))) + --icol; + endif + endif + endif + ## Next fmt_word, next column + ++ii; ++icol; + endwhile + + ## Done. Reshape words back into 1 long vector and strip padded empty words + words = reshape (words, 1, numel (words))(1 : end-num_words_padded); + + catch + warning ("strread: unable to parse text or file with given format string"); + return; + + end_try_catch + endif + + ## For each specifier, process corresponding column k = 1; for m = 1:num_words_per_line - data = words (m:num_words_per_line:end); - ## Map to format - ## FIXME - add support for formats like "%4s" or "<%s>", "%[a-zA-Z]" - ## Someone with regexp experience is needed. - switch fmt_words{m} - case "%s" - data (end+1:num_lines) = {""}; - varargout {k} = data'; - k++; - case {"%d", "%f"} - n = cellfun (@isempty, data); - data = str2double (data); - data(n) = numeric_fill_value; - data (end+1:num_lines) = numeric_fill_value; - varargout {k} = data.'; - k++; - case {"%*", "%*s"} - ## skip the word - otherwise - ## Ensure descriptive content is consistent - if (numel (unique (data)) > 1 - || ! 
strcmpi (unique (data), fmt_words{m})) - error ("strread: FORMAT does not match data"); - endif - endswitch + try + if (format_repeat_count < 0) + data = words(m:num_words_per_line:end); + elseif (format_repeat_count == 0) + data = {}; + else + lastline = ... + min (num_words_per_line * format_repeat_count + m - 1, numel (words)); + data = words(m:num_words_per_line:lastline); + endif + + ## Map to format + ## FIXME - add support for formats like "<%s>", "%[a-zA-Z]" + ## Someone with regexp experience is needed. + switch fmt_words{m}(1:min (2, length (fmt_words{m}))) + case "%s" + if (pad_out) + data(end+1:num_lines) = {""}; + endif + varargout{k} = data'; + k++; + case {"%d", "%u", "%f", "%n"} + n = cellfun ("isempty", data); + ### FIXME - erroneously formatted data lead to NaN, not an error + data = str2double (data); + data(n) = numeric_fill_value; + if (pad_out) + data(end+1:num_lines) = numeric_fill_value; + endif + varargout{k} = data.'; + k++; + case {"%0", "%1", "%2", "%3", "%4", "%5", "%6", "%7", "%8", "%9"} + nfmt = strsplit (fmt_words{m}(2:end-1), '.'); + swidth = str2num (nfmt{1}); + switch fmt_words{m}(end) + case {"d", "u", "f", "n%"} + n = cellfun ("isempty", data); + ### FIXME - erroneously formatted data lead to NaN, not an error + ### => ReturnOnError can't be implemented for numeric data + data = str2double (strtrunc (data, swidth)); + data(n) = numeric_fill_value; + if (pad_out) + data(end+1:num_lines) = numeric_fill_value; + endif + if (numel (nfmt) > 1) + sprec = str2num (nfmt{2}); + data = 10^-sprec * round (10^sprec * data); + endif + varargout{k} = data.'; + k++; + case "s" + if (pad_out) + data(end+1:num_lines) = {""} + endif + varargout{k} = strtrunc (data, 3)'; + k++; + otherwise + endswitch + case {"%*", "%*s"} + ## skip the word + otherwise + ## Ensure descriptive content is consistent + if (numel (unique (data)) > 1 + || ! 
strcmpi (unique (data), fmt_words{m})) + error ("strread: FORMAT does not match data"); + endif + endswitch + catch + ## As strread processes columnwise, ML-compatible error processing + ## (row after row) is not feasible. In addition Octave sets unrecognizable + ## numbers to NaN w/o error. But maybe Octave is better in this respect. + if (err_action) + ## Just try the next column where ML bails out + else + rethrow (lasterror); + endif + end_try_catch endfor + endfunction function out = split_by (text, sep) - sep = union (sep, "\n"); - pat = sprintf ('[^%s]+', sep); - [~, ~, ~, out] = regexp (text, pat); - out(cellfun (@isempty, out)) = {""}; - out = strtrim (out); + out = strsplit (text, sep); + out(cellfun ("isempty", out)) = {""}; endfunction + %!test %! [a, b] = strread ("1 2", "%f%f"); %! assert (a == 1 && b == 2); @@ -254,14 +591,14 @@ %! str = "# comment\n# comment\n1 2 3"; %! [a, b] = strread (str, '%d %s', 'commentstyle', 'shell'); %! assert (a, [1; 3]); -%! assert (b, {"2"; ""}); +%! assert (b, {"2"}); %!test %! str = ''; %! a = rand (10, 1); -%! b = char (round (65 + 20 * rand (10, 1))); +%! b = char (randi ([65, 85], 10, 1)); %! for k = 1:10 -%! str = sprintf ('%s %.6f %s\n', str, a (k), b (k)); +%! str = sprintf ('%s %.6f %s\n', str, a(k), b(k)); %! endfor %! [aa, bb] = strread (str, '%f %s'); %! assert (a, aa, 1e-5); @@ -270,9 +607,9 @@ %!test %! str = ''; %! a = rand (10, 1); -%! b = char (round (65 + 20 * rand (10, 1))); +%! b = char (randi ([65, 85], 10, 1)); %! for k = 1:10 -%! str = sprintf ('%s %.6f %s\n', str, a (k), b (k)); +%! str = sprintf ('%s %.6f %s\n', str, a(k), b(k)); %! endfor %! aa = strread (str, '%f %*s'); %! assert (a, aa, 1e-5); @@ -294,3 +631,63 @@ %! a = strread ("a b c, d e, , f", "%s", "delimiter", ","); %! assert (a, {"a b c"; "d e"; ""; "f"}); +%!test +%! # Bug #33536 +%! [a, b, c] = strread ("1,,2", "%s%s%s", "delimiter", ","); +%! assert (a{1}, '1'); +%! assert (b{1}, ''); +%! assert (c{1}, '2'); + +%!test +%! 
# Bug #33536 +%! a = strread ("[SomeText]", "%s", "delimiter", "]"); +%! assert (a{1}, "[SomeText"); +%! assert (a{2}, ''); + +%!test +%! dat = "Data file.\r\n= = = = =\r\nCOMPANY : <Company name>\r\n"; +%! a = strread (dat, "%s", 'delimiter', "\n", 'whitespace', '', 'endofline', "\r\n"); +%! assert (a{2}, "= = = = ="); +%! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]); + +%!test +%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%d%d%d", 'delimiter', ','); +%! assert (c, 3); +%! assert (d, NaN); + +%!test +%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%d%d", 'delimiter', ','); +%! assert (c, [3; NaN]); +%! assert (d, [NaN; NaN]); + +%!test +%! # Default format (= %f) +%1 [a, b, c] = strread ("0.12 0.234 0.3567"); +%1 assert (a, 0.12); +%1 assert (b, 0.234); +%1 assert (c, 0.3567); + +%!test +%! [a, b] = strread('0.41 8.24 3.57 6.24 9.27', "%f%f", 2, 'delimiter', ' '); +%1 assert (a, [0.41; 3.57]); + +%!test +%! # TreatAsEmpty +%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%d", 'delimiter', ',', 'TreatAsEmpty', 'NN'); +%! assert (c, [3; NaN]); +%! assert (d, [NaN; NaN]); + +%!test +%! # No delimiters at all besides EOL. Plain reading numbers & strings +%! str = "Text1Text2Text\nText398Text4Text\nText57Text"; +%! c = textscan (str, "Text%dText%1sText"); +%! assert (c{1}, [1; 398; 57]); +%! assert (c{2}(1:2), {'2'; '4'}); +%! assert (isempty (c{2}{3}), true); + +%!test +%! # No delimiters at all besides EOL. Skip fields, even empty fields +%! str = "Text1Text2Text\nTextText4Text\nText57Text"; +%! c = textscan (str, "Text%*dText%dText"); +%! assert (c{1}, [2; 4; NaN]); +
--- a/scripts/io/textread.m Wed Jul 20 10:41:59 2011 -0700 +++ b/scripts/io/textread.m Fri Jul 22 13:05:26 2011 -0700 @@ -25,13 +25,21 @@ ## The file @var{filename} is read and parsed according to @var{format}. The ## function behaves like @code{strread} except it works by parsing a file ## instead of a string. See the documentation of @code{strread} for details. +## ## In addition to the options supported by @code{strread}, this function -## supports one more: +## supports two more: +## ## @itemize ## @item "headerlines": +## The first @var{value} number of lines of @var{filename} are skipped. +## +## @item "endofline": +## Specify a single character or "\r\n". If no value is given, it will be +## inferred from the file. If set to "" (empty string) EOLs are ignored as +## delimiters. ## @end itemize -## The first @var{value} number of lines of @var{str} are skipped. -## @seealso{strread, load, dlmread, fscanf} +## +## @seealso{strread, load, dlmread, fscanf, textscan} ## @end deftypefn function varargout = textread (filename, format = "%f", varargin) @@ -51,22 +59,55 @@ error ("textread: could not open '%s' for reading", filename); endif - ## Maybe skip header lines + ## Skip header lines if requested headerlines = find (strcmpi (varargin, "headerlines"), 1); - if (! isempty (headerlines)) - hdr_lines = floor (varargin{headerlines + 1}); - ## Beware of zero valued headerline, fskipl will count lines to EOF then - if (hdr_lines > 0) - fskipl (fid, hdr_lines); - endif + ## Beware of zero valued headerline, fskipl would skip to EOF + if (! 
isempty (headerlines) && (varargin{headerlines + 1} > 0)) + fskipl (fid, varargin{headerlines + 1}); varargin(headerlines:headerlines+1) = []; endif str = fread (fid, "char=>char").'; fclose (fid); - ## Call strread to make it do the real work - [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:}); + if (isempty (str)) + warning ("textread: empty file"); + else + endofline = find (strcmpi (varargin, "endofline"), 1); + if (! isempty (endofline)) + ## 'endofline' option set by user. + endofline = find (strcmpi (varargin, "endofline"), 1); + if (! ischar (varargin{endofline + 1})); + error ("textscan: character value required for EndOfLine"); + endif + else + ## Determine EOL from file. Search for EOL candidates in first 3000 chars + eol_srch_len = min (length (str), 3000); + ## First try DOS (CRLF) + if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) + eol_char = "\r\n"; + ## Perhaps old Macintosh? (CR) + elseif (! isempty (findstr ("\r", str(1 : eol_srch_len)))) + eol_char = "\r"; + ## Otherwise, use plain UNIX (LF) + else + eol_char = "\n"; + endif + ## Set up default endofline param value + nargs = numel (varargin); + varargin(nargs+1:nargs+2) = {'endofline', eol_char}; + endif + + ## Set up default whitespace param value if needed + if (isempty (find (strcmpi ('whitespace', varargin)))) + nargs = numel (varargin); + varargin(nargs+1:nargs+2) = {'whitespace', " \b\t"}; + endif + + ## Call strread to make it do the real work + [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:}); + + endif endfunction
--- a/scripts/io/textscan.m Wed Jul 20 10:41:59 2011 -0700 +++ b/scripts/io/textscan.m Fri Jul 22 13:05:26 2011 -0700 @@ -28,12 +28,24 @@ ## The file associated with @var{fid} is read and parsed according to ## @var{format}. The function behaves like @code{strread} except it works by ## parsing a file instead of a string. See the documentation of -## @code{strread} for details. In addition to the options supported by -## @code{strread}, this function supports one more: +## @code{strread} for details. +## +## In addition to the options supported by +## @code{strread}, this function supports a few more: +## ## @itemize ## @item "headerlines": +## The first @var{value} number of lines of @var{str} are skipped. +## +## @item "endofline": +## Specify a single character or "\r\n". If no value is given, it will be +## inferred from the file. If set to "" (empty string) EOLs are ignored as +## delimiters. +## +## @item "returnonerror": +## If set to numerical 1 or true (default), return normally when read errors +## have been encountered. If set to 0 or false, return an error and no data. ## @end itemize -## The first @var{value} number of lines of @var{str} are skipped. ## ## The optional input, @var{n}, specifes the number of lines to be read from ## the file, associated with @var{fid}. @@ -47,15 +59,25 @@ ## @seealso{dlmread, fscanf, load, strread, textread} ## @end deftypefn -function [C, p] = textscan (fid, format, varargin) +function [C, position] = textscan (fid, format = "%f", varargin) ## Check input if (nargin < 1) print_usage (); - elseif (nargin == 1 || isempty (format)) + endif + + if (isempty (format)) format = "%f"; endif + if (! (isa (fid, "double") && fid > 0) && ! ischar (fid)) + error ("textscan: first argument must be a file id or character string"); + endif + + if (! 
ischar (format)) + error ("textscan: FORMAT must be a valid specification"); + endif + if (nargin > 2 && isnumeric (varargin{1})) nlines = varargin{1}; args = varargin(2:end); @@ -70,66 +92,132 @@ args{end+1} = NaN; endif - if (isa (fid, "double") && fid > 0 || ischar (fid)) - if (ischar (format)) - if (ischar (fid)) - if (nargout == 2) - error ("textscan: cannot provide position information for character input"); - endif - str = fid; + ## Check default parameter values that differ for strread & textread + + ipos = find (strcmpi (args, "whitespace")); + if (isempty (ipos)) + ## Matlab default whitespace = " \b\t" + args{end+1} = "whitespace"; + args{end+1} = " \b\t"; + whitespace = " \b\t"; + else + ## Check if there's at least one string format specifier + fmt = strrep (format, "%", " %"); + [~, ~, ~, fmt] = regexp (fmt, '[^ ]+'); + fmt = strtrim (fmt(strmatch ("%", fmt))) + has_str_fmt = all (cellfun ("isempty", strfind (strtrim (fmt(strmatch ("%", fmt))), 's'))); + ## If there is a format, AND whitespace value = empty, + ## don't add a space (char(32)) to whitespace + if (! (isempty (args{ipos+1}) && has_str_fmt)) + args {ipos+1} = unique ([" " whitespace]); + endif + endif + + if (! any (strcmpi (args, "delimiter"))) + ## Matlab says default delimiter = whitespace. + ## strread() will pick this up further + args{end+1} = "delimiter"; + args{end+1} = ""; + endif + + if (any (strcmpi (args, "returnonerror"))) + ## Because of the way strread() reads data (columnwise) this parameter + ## can't be neatly implemented. 
strread() will pick it up anyway + warning ('ReturnOnError is not fully implemented'); + else + ## Set default value (=true) + args{end+1} = "returnonerror"; + args{end+1} = 1; + endif + + if (ischar (fid)) + ## Read from a text string + if (nargout == 2) + error ("textscan: cannot provide position information for character input"); + endif + str = fid; + else + ## Skip header lines if requested + headerlines = find (strcmpi (args, "headerlines"), 1); + ## Beware of zero valued headerline, fskipl would skip to EOF + if (! isempty (headerlines) && (args{headerlines + 1} > 0)) + fskipl (fid, varargin{headerlines + 1}); + endif + if (isfinite (nlines)) + str = ""; + ## FIXME: Can this be done without slow for loop? + for n = 1:nlines + str = strcat (str, fgets (fid)); + endfor + else + str = fread (fid, "char=>char").'; + endif + endif + + ## Check for empty result + if (isempty (str)) + warning ("textscan: no data read"); + C = []; + else + ## Check value of 'endofline'. String or file doesn't seem to matter + endofline = find (strcmpi (args, "endofline"), 1); + if (! isempty (endofline)) + if (! ischar (args{endofline + 1})) + error ("textscan: character value required for EndOfLine"); + endif + else + ## Determine EOL from file. Search for EOL candidates in first 3000 chars + BUFLEN = 3000; + ## First try DOS (CRLF) + eol_srch_len = min (length (str), 3000); + if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) + eol_char = "\r\n"; + ## Perhaps old Macintosh? (CR) + elseif (! isempty (findstr ("\r", str(1 : eol_srch_len)))) + eol_char = "\r"; + ## Otherwise, use plain UNIX (LF) else - ## Maybe skip header lines - headerlines = find (strcmpi (args, "headerlines"), 1); - if (! 
isempty (headerlines)) - hdr_lines = floor (varargin{headerlines + 1}); - ## Beware of zero valued headerline, fskipl will count lines to EOF - if (hdr_lines > 0) - fskipl (fid, hdr_lines); - endif - endif - if (isfinite (nlines)) - str = ""; - for n = 1:nlines - str = strcat (str, fgets (fid)); - endfor - else - str = fread (fid, "char=>char").'; - endif + eol_char = "\n"; endif + ## Set up the default endofline param value + args{end+1} = "endofline"; + args{end+1} = eol_char; + endif + + ## Determine the number of data fields + num_fields = numel (strfind (format, "%")) - ... + numel (idx_star = strfind (format, "%*")); - ## Determine the number of data fields - num_fields = numel (strfind (format, "%")) - ... - numel (idx_star = strfind (format, "%*")); + ## Strip trailing EOL to avoid returning stray missing values (f. strread) + if (strcmp (str(end-length (eol_char) + 1 : end), eol_char)); + str = str(1 : end-length (eol_char)); + endif - ## Call strread to make it do the real work - C = cell (1, num_fields); - [C{:}] = strread (str, format, args{:}); + ## Call strread to make it do the real work + C = cell (1, num_fields); + [C{:}] = strread (str, format, args{:}); - if (ischar (fid) && isfinite (nlines)) - C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false); - endif + if (ischar (fid) && isfinite (nlines)) + C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false); + endif - if (nargout == 2) - p = ftell (fid); - endif + if (nargout == 2) + position = ftell (fid); + endif - else - error ("textscan: FORMAT must be a valid specification"); - endif - else - error ("textscan: first argument must be a file id or character string"); endif endfunction + %!test %! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12"; %! fmtstr = "%f %d %f %s"; %! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf); -%! assert (isequal (c{1}, [1;5])) +%! assert (isequal (c{1}, [1;5])); %! assert (length (c{1}), 2); -%! assert (iscellstr (c{4})) -%! 
assert (isequal (c{3}, [3; -Inf])) +%! assert (iscellstr (c{4})); +%! assert (isequal (c{3}, [3; -Inf])); %!test %! b = [10:10:100]; @@ -137,7 +225,26 @@ %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b); %! fmt = "%f miles/hr = %f kilometers/hr"; %! c = textscan (str, fmt); -%! assert (b(1,:)', c{1}) -%! assert (b(2,:)', c{2}) +%! assert (b(1,:)', c{1}); +%! assert (b(2,:)', c{2}); + +#%!test +#%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6"; +#%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//'); +#%! assert (a{1}, [13; 36]); +#%! assert (a{2}, [72; NaN]); +#%! assert (a{3}, [NaN; 5]); +#%! assert (a{4}, {"str1"; "str3"}); +#%! assert (a{5}, [25; 6]); +%!test +%! str = "Km:10 = hhhBjjj miles16hour\r\n"; +%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"]; +%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"]; +%! str = [str "Km:25 = hhhZ\r\n"]; +%! fmt = "Km:%d = hhh%1sjjj miles%dhour"; +%! a = textscan (str, fmt, 'delimiter', ' '); +%! assert (a{1}', [10 15 2 25], 1e-5); +%! assert (a{2}', {'B' 'J' 'R' 'Z'}); +%! assert (a{3}', [16 241 3 NaN], 1e-5);