Mercurial > octave-nkf

## Copyright (C) 2009-2013 Eric Chassande-Mottin, CNRS (France)
## Parts Copyright (C) 2012 Philip Nienhuis
##
## This file is part of Octave.
##
## Octave is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {Function File} {[@var{a}, @dots{}] =} strread (@var{str})
## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format})
## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat})
## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{prop1}, @var{value1}, @dots{})
## @deftypefnx {Function File} {[@var{a}, @dots{}] =} strread (@var{str}, @var{format}, @var{format_repeat}, @var{prop1}, @var{value1}, @dots{})
## Read data from a string.
##
## The string @var{str} is split into words that are repeatedly matched to the
## specifiers in @var{format}.  The first word is matched to the first
## specifier, the second to the second specifier and so forth.  If there are
## more words than specifiers, the process is repeated until all words have
## been processed.
##
## The string @var{format} describes how the words in @var{str} should be
## parsed.
## It may contain any combination of the following specifiers:
##
## @table @code
## @item %s
## The word is parsed as a string.
##
## @item  %f
## @itemx %n
## The word is parsed as a number and converted to double.
##
## @item  %d
## @itemx %u
## The word is parsed as a number and converted to int32.
##
## @item %*', '%*f', '%*s
## The word is skipped.
##
## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an
## optional width can be specified as %Ns, etc. where N is an integer > 1.
## For %f, format specifiers like %N.Mf are allowed.
##
## @item literals
## In addition the format may contain literal character strings; these will be
## skipped during reading.
## @end table
##
## Parsed word corresponding to the first specifier are returned in the first
## output argument and likewise for the rest of the specifiers.
##
## By default, @var{format} is @t{"%f"}, meaning that numbers are read from
## @var{str}.  This will do if @var{str} contains only numeric fields.
##
## For example, the string
##
## @example
## @group
## @var{str} = "\
## Bunny Bugs   5.5\n\
## Duck Daffy  -7.5e-5\n\
## Penguin Tux   6"
## @end group
## @end example
##
## @noindent
## can be read using
##
## @example
## [@var{a}, @var{b}, @var{c}] = strread (@var{str}, "%s %s %f");
## @end example
##
## Optional numeric argument @var{format_repeat} can be used for
## limiting the number of items read:
##
## @table @asis
## @item -1
## (default) read all of the string until the end.
##
## @item N
## Read N times @var{nargout} items.  0 (zero) is an acceptable
## value for @var{format_repeat}.
## @end table
##
## The behavior of @code{strread} can be changed via property-value
## pairs.  The following properties are recognized:
##
## @table @asis
## @item @qcode{"commentstyle"}
## Parts of @var{str} are considered comments and will be skipped.
## @var{value} is the comment style and can be any of the following.
##
## @itemize
## @item @qcode{"shell"}
## Everything from @code{#} characters to the nearest end-of-line is skipped.
##
## @item @qcode{"c"}
## Everything between @code{/*} and @code{*/} is skipped.
##
## @item @qcode{"c++"}
## Everything from @code{//} characters to the nearest end-of-line is skipped.
##
## @item @qcode{"matlab"}
## Everything from @code{%} characters to the nearest end-of-line is skipped.
##
## @item user-supplied.  Two options:
## (1) One string, or 1x1 cell string: Skip everything to the right of it;
## (2) 2x1 cell string array: Everything between the left and right strings
## is skipped.
## @end itemize
##
## @item @qcode{"delimiter"}
## Any character in @var{value} will be used to split @var{str} into words
## (default value = any whitespace).
##
## @item @qcode{"emptyvalue"}:
## Value to return for empty numeric values in non-whitespace delimited data.
## The default is NaN@.  When the data type does not support NaN
## (int32 for example), then default is zero.
##
## @item @qcode{"multipledelimsasone"}
## Treat a series of consecutive delimiters, without whitespace in between,
## as a single delimiter.  Consecutive delimiter series need not be vertically
## @qcode{"aligned"}.
##
## @item @qcode{"treatasempty"}
## Treat single occurrences (surrounded by delimiters or whitespace) of the
## string(s) in @var{value} as missing values.
##
## @item @qcode{"returnonerror"}
## If @var{value} true (1, default), ignore read errors and return normally.
## If false (0), return an error.
##
## @item @qcode{"whitespace"}
## Any character in @var{value} will be interpreted as whitespace and
## trimmed; the string defining whitespace must be enclosed in double
## quotes for proper processing of special characters like \t.
## The default value for whitespace = @qcode{" \b\r\n\t"} (note the space).
## Unless whitespace is set to '' (empty) AND at least one @qcode{"%s"} format
## conversion specifier is supplied, a space is always part of whitespace.
##
## @end table
##
## When the number of words in @var{str} doesn't match an exact multiple
## of the number of format conversion specifiers, strread's behavior
## depends on the last character of @var{str}:
##
## @table @asis
## @item last character = @qcode{"\n"}
## Data columns are padded with empty fields or Nan so that all columns
## have equal length
##
## @item last character is not @qcode{"\n"}
## Data columns are not padded; strread returns columns of unequal length
##
## @end table
##
## @seealso{textscan, textread, load, dlmread, fscanf}
## @end deftypefn

function varargout = strread (str, format = "%f", varargin)

  ## Check input
  if (nargin < 1)
    print_usage ();
  endif

  if (isempty (str))
    ## Return empty args (no match), rather than raising an error
    varargout = cell (1, nargout);
    return;
  endif

  if (isempty (format))
    format = "%f";
  endif

  if (! ischar (str) || ! ischar (format))
    error ("strread: STR and FORMAT arguments must be strings");
  endif

  if (strcmp (typeinfo (format), "sq_string"))
    format = do_string_escapes (format);
  endif

  ## Parse format string to compare number of conversion fields and nargout
  nfields = length (strfind (format, "%")) - length (strfind (format, "%*"));
  ## If str only has numeric fields, a (default) format ("%f") will do.
  ## Otherwise:
  if ((max (nargout, 1) != nfields) && ! strcmp (format, "%f"))
    error ("strread: the number of output variables must match that specified by FORMAT");
  endif

  ## Check for format string repeat count
  format_repeat_count = -1;
  if (nargin > 2 && isnumeric (varargin{1}))
    if (varargin{1} >= 0)
      format_repeat_count = varargin{1};
    endif
    if (nargin > 3)
      varargin = varargin(2:end);
    else
      varargin = {};
    endif
  endif

  ## Parse options.  First initialize defaults
  comment_flag = false;
  delimiter_str = "";
  empty_str = "";
  eol_char = "";
  err_action = 0;
  mult_dlms_s1 = false;
  numeric_fill_value = NaN;
  white_spaces = " \b\r\n\t";
  for n = 1:2:length (varargin)
    switch (lower (varargin{n}))
      case "bufsize"
        ## We could synthesize this, but that just seems weird...
        warning ("strread: property 'bufsize' is not implemented");
      case "commentstyle"
        comment_flag = true;
        switch (lower (varargin{n+1}))
          case "c"
            [comment_start, comment_end] = deal ("/*", "*/");
          case "c++"
            [comment_start, comment_end] = deal ("//", "eol_char");
          case "shell"
            [comment_start, comment_end] = deal ("#" , "eol_char");
          case "matlab"
            [comment_start, comment_end] = deal ("%" , "eol_char");
          otherwise
            if (ischar (varargin{n+1}) ||
               (numel (varargin{n+1}) == 1 && iscellstr (varargin{n+1})))
              [comment_start, comment_end] = deal (char (varargin{n+1}), "eol_char");
            elseif (iscellstr (varargin{n+1}) && numel (varargin{n+1}) == 2)
              [comment_start, comment_end] = deal (varargin{n+1}{:});
            else
              ## FIXME - a user may have numeric values specified: {'//', 7}
              ##         this will lead to an error in the warning message
              error ("strread: unknown or unrecognized comment style '%s'",
                      varargin{n+1});
            endif
        endswitch
      case "delimiter"
        delimiter_str = varargin{n+1};
        if (strcmp (typeinfo (delimiter_str), "sq_string"))
          delimiter_str = do_string_escapes (delimiter_str);
        endif
      case "emptyvalue"
        numeric_fill_value = varargin{n+1};
      case "expchars"
        warning ("strread: property 'expchars' is not implemented");
      case "whitespace"
        white_spaces = varargin{n+1};
        if (strcmp (typeinfo (white_spaces), "sq_string"))
          white_spaces = do_string_escapes (white_spaces);
        endif
      ## The following parameters are specific to textscan and textread
      case "endofline"
        eol_char = varargin{n+1};
        if (strcmp (typeinfo (eol_char), "sq_string"))
          eol_char = do_string_escapes (eol_char);
        endif
      case "returnonerror"
        err_action = varargin{n+1};
      case "multipledelimsasone"
        mult_dlms_s1 = varargin{n+1};
      case "treatasempty"
        if (iscellstr (varargin{n+1}))
          empty_str = varargin{n+1};
        elseif (ischar (varargin{n+1}))
          empty_str = varargin(n+1);
        else
          error ("strread: 'treatasempty' value must be string or cellstr");
        endif
      otherwise
        warning ("strread: unknown property '%s'", varargin{n});
    endswitch
  endfor

  ## First parse of FORMAT
  if (strcmpi (strtrim (format), "%f"))
    ## Default format specified.  Expand it (to desired nargout)
    fmt_words = cell (max (nargout, 1), 1);
    fmt_words (1:max (nargout, 1)) = format;
  else
    ## Determine the number of words per line as a first guess.  Forms
    ## like %f<literal>) (w/o delimiter in between) are fixed further on
    format = strrep (format, "%", " %");
    fmt_words = regexp (format, '[^ ]+', "match");

    ## Find position of conversion specifiers (they start with %)
    idy2 = find (! cellfun ("isempty", regexp (fmt_words, '^%')));

    ## Check for unsupported format specifiers
    errpat = '(\[.*\]|[cq]|[nfdu]8|[nfdu]16|[nfdu]32|[nfdu]64)';
    if (! all (cellfun ("isempty", regexp (fmt_words(idy2), errpat))))
      error ("strread: %q, %c, %[] or bit width format specifiers are not supported yet.");
    endif

    ## Format conversion specifiers following literals w/o space/delim
    ## in between are separate now.  Separate those w trailing literals
    a = strfind (fmt_words(idy2), "%");
    b = regexp (fmt_words(idy2), '[nfdus]', "end");
    for jj = 1:numel (a)
      ## From right to left to avoid losing track
      ii = numel (a) - jj + 1;
      ## Check for illegal format specifiers
      if (isempty (b{ii}))
        error ("strread: unknown format specifier #%d ('%s')\n",
              ii, fmt_words{idy2(ii)});
      endif
      if (! (length (fmt_words{idy2(ii)}) == b{ii}(1)))
        ## Split fmt_words(ii) into % conv specifier and trailing literal
        fmt_words(idy2(ii)+1 : end+1) = fmt_words(idy2(ii) : end);
        fmt_words{idy2(ii)} = fmt_words{idy2(ii)}(a{ii} : b{ii}(1));
        fmt_words{idy2(ii)+1} = fmt_words{idy2(ii)+1}(b{ii}+1:end);
      endif
    endfor
  endif
  num_words_per_line = numel (fmt_words);

  ## Special handling for CRLF EOL character in str
  if (! isempty (eol_char) && strcmp (eol_char, "\r\n"))
    ## Strip CR from CRLF sequences
    str = strrep (str, "\r\n", "\n");
    ## CR serves no further purpose in function
    eol_char = "\n";
  endif

  ## Remove comments in str
  if (comment_flag)
    ## Expand 'eol_char' here, after option processing which may have set value
    comment_end = regexprep (comment_end, "eol_char", eol_char);
    cstart = strfind (str, comment_start);
    cstop  = strfind (str, comment_end);
    ## Treat end of string as additional comment stop
    if (isempty (cstop) || cstop(end) != length (str))
      cstop(end+1) = length (str);
    endif
    if (! isempty (cstart))
      ## Ignore nested openers.
      [idx, cidx] = unique (lookup (cstop, cstart), "first");
      if (idx(end) == length (cstop))
        cidx(end) = []; # Drop the last one if orphaned.
      endif
      cstart = cstart(cidx);
    endif
    if (! isempty (cstop))
      ## Ignore nested closers.
      [idx, cidx] = unique (lookup (cstart, cstop), "first");
      if (idx(1) == 0)
        cidx(1) = []; # Drop the first one if orphaned.
      endif
      cstop = cstop(cidx);
    endif
    len = length (str);
    c2len = length (comment_end);
    str = cellslices (str, [1, cstop + c2len], [cstart - 1, len]);
    str = [str{:}];
  endif

  if (! isempty (white_spaces))
    ## For numeric fields, whitespace is always a delimiter, but not for text
    ## fields
    if (isempty (strfind (format, "%s")))
      ## Add whitespace to delimiter set
      delimiter_str = unique ([white_spaces delimiter_str]);
    else
      ## Remove any delimiter chars from white_spaces list
      white_spaces = setdiff (white_spaces, delimiter_str);
    endif
  endif
  if (isempty (delimiter_str))
    delimiter_str = " ";
  endif
  if (! isempty (eol_char))
    ## Add eol_char to delimiter collection
    delimiter_str = unique ([delimiter_str eol_char]);
    ## .. and remove it from whitespace collection
    white_spaces = strrep (white_spaces, eol_char, '');
  endif

  ii = numel (fmt_words);
  while (ii > 0)
    if (ismember (fmt_words{ii}, delimiter_str)(1))
      fmt_words(ii) = [];
      --num_words_per_line;
    endif
    --ii;
  endwhile

  pad_out = 0;
  ## Trim whitespace if needed
  if (! isempty (white_spaces))
    ## Check if trailing "\n" might signal padding output arrays to equal size
    ## before it is trimmed away below
    if (str(end) == "\n" && nargout > 1)
      pad_out = 1;
    endif
    ## Condense all repeated whitespace into one single space
    ## FIXME: this will also fold repeated whitespace in a char field
    rxp_wsp = sprintf ("[%s]+", white_spaces);
    str = regexprep (str, rxp_wsp, ' ');
    ## Remove possible leading space at string
    if (str(1) == " ")
       str = str(2:end);
    endif
    ## Check for single delimiter followed/preceded by whitespace
    if (! isempty (delimiter_str))
      dlmstr = setdiff (delimiter_str, " ");
      if (! isempty (dlmstr))
        rxp_dlmwsp = sprintf ('( [%s] | [%s]|[%s] )', dlmstr, dlmstr, dlmstr);
        str = regexprep (str, rxp_dlmwsp, delimiter_str(1));
      endif
    endif
    ## Wipe leading and trailing whitespace on each line (it may be
    ## delimiter too)
    ## FIXME: Double strrep on str is enormously expensive of CPU time.
    ## Can this be eliminated
    if (! isempty (eol_char))
      str = strrep (str, [eol_char " "], eol_char);
      str = strrep (str, [" " eol_char], eol_char);
    endif
  endif

  ## Split 'str' into words
  words = split_by (str, delimiter_str, mult_dlms_s1, eol_char);
  if (! isempty (white_spaces))
    ## Trim leading and trailing 'white_spaces'. All whitespace has
    ## been converted to space above
    words = strtrim (words);
  endif
  num_words = numel (words);
  ## First guess at nr. of lines in file (ignoring leading/trailing literals)
  num_lines = ceil (num_words / num_words_per_line);

  ## Replace TreatAsEmpty char sequences by empty strings
  if (! isempty (empty_str))
    for ii = 1:numel (empty_str)
      idz = strncmp (empty_str{ii}, words, length (empty_str{ii}));
      words(idz) = {""};
    endfor
  endif

  ## fmt_words has been split properly now, but words{} has only been split on
  ## delimiter positions. As numeric fields can also be separated by
  ## whitespace, more splits may be needed.
  ## We also don't know the number of lines (as EndOfLine may have been set to
  ## "" (empty) by the caller).
  ##
  ## We also may have to cope with 3 cases as far as literals go:
  ## A: Trailing literals (%f<literal>) w/o delimiter in between.
  ## B: Leading literals (<literal>%f) w/o delimiter in between.
  ## C. Skipping leftover parts of specified skip fields (%*N )
  ## Some words columns may have to be split further to fix these.
  ## To find out, we'll match fmt_words to the words array to see what
  ## needs to be done. fwptr tracks which {fmt_words}# starts in what {words}#

  ## Find indices and pointers to possible literals in fmt_words
  idf = cellfun ("isempty", strfind (fmt_words, "%"));
  ## Find indices and pointers to conversion specifiers with fixed width
  idg = ! cellfun ("isempty", regexp (fmt_words, '%\*?\d'));
  idy = find (idf | idg);
  ## Find indices to numeric conversion specifiers
  idn = ! cellfun ("isempty", regexp (fmt_words, '%[dnfu]'));

  ## If needed, split up columns in three steps:
  if (! isempty (idy))
    ## Try-catch because complexity of strings to read can be infinite
    try

      ## 1. Assess "period" in the split-up words array ( < num_words_per_line).
      ## Could be done using EndOfLine but that prohibits EndOfLine = "" option.
      ## Alternative below goes by simply parsing a first grab of words
      ## and matching fmt_words to words until the fmt_words array is exhausted.
      ## iwrd: ptr to current analyzed word; iwrdp: ptr to pos before analyzed char
      iwrd = 1; iwrdp = 0; iwrdl = length (words{1});
      fwptr = zeros (1, numel (fmt_words));
      ii = 1;
      while (ii <= numel (fmt_words))

        nxt_wrd = 0;
        ## Keep track of which words nr. every fmt_words{} is (starts) in.
        fwptr(ii) = iwrd;

        if (idf(ii))
          ## Literal expected
          if (isempty (strfind (fmt_words{ii}, words(iwrd))))
            ## Not found in current word; supposed to be in next word
            nxt_wrd = 1;
          else
            ## Found it in current word.  Subtract literal length
            iwrdp += length (fmt_words{ii});
            if (iwrdp > iwrdl)
              ## Parse error.  Literal extends beyond delimiter (word boundary)
              warning ("strread: literal '%s' (fmt spec # %d) does not match data", ...
                fmt_words{ii}, ii);
              ## Word assumed to be completely "used up". Next word
              nxt_wrd = 1;
            elseif (iwrdp == iwrdl)
              ## Word completely "used up". Next word
              nxt_wrd = 1;
            endif
          endif

        elseif (idg(ii))
          ## Fixed width specifier (%N or %*N): read just a part of word
          sw = regexp (fmt_words{ii}, '\d', "once");
          ew = regexp (fmt_words{ii}, '[nfuds]') - 1;
          iwrdp += floor (str2double (fmt_words{ii}(sw:ew)));
          if (iwrdp > iwrdl)
            ## Match error. Field extends beyond word boundary.
            warning  ...
            ("strread: field width '%s' (fmt spec # %d) extends beyond actual word limit", ...
               fmt_words{ii}, ii);
            ## Assume word to be completely "used up".  Next word
            nxt_wrd = 1;
          elseif (iwrdp == iwrdl)
            ## Word completely "used up".  Next word
            nxt_wrd = 1;
          endif

        else
          ## A simple format conv. specifier. Either (1) uses rest of word, or
          ## (2) is squeezed between current iwrdp and next literal, or (3) uses
          ## next word. (3) is already taken care of.  So just check (1) & (2)
          if (ii < numel (fmt_words) && idf(ii+1))
            ## Next fmt_word is a literal...
            if (! index (words{iwrd}(iwrdp+1:end), fmt_words{ii+1}))
              ## ...but not found in current word => field uses rest of word
              nxt_wrd = 1;
            else
              ## ..or it IS found.  Add inferred width of current conversion field
              iwrdp += index (words{iwrd}(iwrdp+1:end), fmt_words{ii+1}) - 1;
            endif
          elseif (iwrdp <= iwrdl)
            ## No bordering literal to the right => field occupies (rest of) word
            nxt_wrd = 1;
          endif

        endif

        if (nxt_wrd)
          ++iwrd; iwrdp = 0;
          if (iwrd > numel (words))
            ## Apparently EOF; assume incomplete row already at L.1 of data
            ii = numel (fmt_words);
          elseif (ii < numel (fmt_words) && iwrd <= numel (words))
            iwrdl = length (words{iwrd});
          endif
        endif

        ++ii;

      endwhile
      ## Done
      words_period = max (iwrd - 1, 1);
      num_lines = ceil (num_words / words_period);

      ## 2. Pad words array so that it can be reshaped
      num_words_padded = num_lines * words_period - num_words;
      if (num_words_padded)
        words = [words'; cell(num_words_padded, 1)];
      endif
      words = reshape (words, words_period, num_lines);

      ## 3. Do the column splitting on rectangular words array
      icol = 1; ii = 1;    # icol = current column, ii = current fmt_word
      while (ii <= num_words_per_line)

        ## Check if fmt_words(ii) contains a literal or fixed-width
        if ((idf(ii) || idg(ii)) && (rows (words) < num_words_per_line))
          if (idf(ii))
            s = strfind (words(icol, 1), fmt_words{ii});
            if (isempty (s{:}))
              error ("strread: Literal '%s' not found in column %d", fmt_words{ii}, icol);
            endif
            s = s{:}(1);
            e = s(1) + length (fmt_words{ii}) - 1;
          endif
          if (! strcmp (fmt_words{ii}, words{icol, 1}))
            ## Column doesn't exactly match literal => split needed.  Insert a column
            words(icol+1:end+1, :) = words(icol:end, :);
            ## Watch out for empty cells
            jptr = find (! cellfun ("isempty", words(icol, :)));

            ## Distinguish leading or trailing literals
            if (! idg(ii) && ! isempty (s) && s(1) == 1)
              ## Leading literal.  Assign literal to icol, paste rest in icol + 1
              ## Apply only to those cells that do have something beyond literal
              jptr = find (cellfun ("length", words(icol+1, jptr), ...
                                    "UniformOutput", false) > e(1));
              words(icol+1, :) = {""};
              words(icol+1, jptr) = cellfun ...
                (@(x) substr (x, e(1)+1, length (x) - e(1)), words(icol, jptr), ...
                "UniformOutput", false);
              words(icol, jptr) = fmt_words{ii};
              fwptr = [fwptr(1:ii) (++fwptr(ii+1:end))];

            else
              if (! idg(ii) && ! isempty (strfind (fmt_words{ii-1}, "%s")))
                ## Trailing literal.  If preceding format == '%s' this is an error
                warning ...
                 ("strread: ambiguous '%s' specifier next to literal in column %d", icol);
              elseif (idg(ii))
                ## Current field = fixed width. Strip into icol, rest in icol+1
                sw = regexp (fmt_words{ii}, '\d', "once");
                ew = regexp (fmt_words{ii}, '[nfuds]') - 1;
                wdth = floor (str2double (fmt_words{ii}(sw:ew)));
                words(icol+1, jptr) = cellfun (@(x) x(wdth+1:end),
                     words(icol,jptr), "UniformOutput", false);
                if (isempty ([words(icol+1, :){:}]))
                  ## Apparently split wasn't needed as turns out to cover
                  ## entire column. So delete column again
                  words(icol+1, :) = [];
                else
                  words(icol, jptr) = strtrunc (words(icol, jptr), wdth);
                  fwptr = [fwptr(1:ii) (++fwptr(ii+1:end))];
                endif
              else
                ## FIXME: this assumes char(254)/char(255) won't occur in input!
                clear wrds;
                wrds(1:2:2*numel (words(icol, jptr))) = ...
                     strrep (words(icol, jptr), fmt_words{ii}, ...
                     [char(255) char(254)]);
                wrds(2:2:2*numel (words(icol, jptr))-1) = char (255);
                wrds = ostrsplit ([wrds{:}], char (255));
                words(icol, jptr) = ...
                  wrds(find (cellfun ("isempty", strfind (wrds, char (254)))));
                wrds(find (cellfun ("isempty", strfind (wrds, char (254))))) ...
                   = char (255);
                words(icol+1, jptr) = ostrsplit (strrep ([wrds{2:end}], ...
                   char (254), fmt_words{ii}), char (255));
                ## Former trailing literal may now be leading for next specifier
                --ii;
                fwptr = [fwptr(1:ii) (++fwptr(ii+1:end))];
              endif
            endif
          endif

        else
          ## Conv. specifier.  Peek if next fmt_word needs split from current column
          if (ii < num_words_per_line)
            if (fwptr(ii) == fwptr(ii+1))
              --icol;
            endif
          endif
        endif
        ## Next fmt_word, next column
        ++ii; ++icol;
      endwhile

      ## Done.  Reshape words back into 1 long vector and strip padded empty words
      words = reshape (words, 1, numel (words))(1 : end-num_words_padded);

    catch
      warning ("strread: unable to parse text or file with given format string");
      return;

    end_try_catch
  endif

  ## For each specifier, process corresponding column
  k = 1;
  for m = 1:num_words_per_line
    try
      if (format_repeat_count < 0)
        data = words(m:num_words_per_line:end);
      elseif (format_repeat_count == 0)
        data = {};
      else
        lastline = ...
          min (num_words_per_line * format_repeat_count + m - 1, numel (words));
        data = words(m:num_words_per_line:lastline);
      endif

      ## Map to format
      ## FIXME - add support for formats like "<%s>", "%[a-zA-Z]"
      ##         Someone with regexp experience is needed.
      switch (fmt_words{m}(1:min (2, length (fmt_words{m}))))
        case "%s"
          if (pad_out)
            data(end+1:num_lines) = {""};
          endif
          varargout{k} = data';
          k++;
        case {"%d", "%u", "%f", "%n"}
          n = cellfun ("isempty", data);
          ### FIXME - erroneously formatted data lead to NaN, not an error
          data = str2double (data);
          if (! isempty (regexp (fmt_words{m}, "%[du]")))
            ## Cast to integer
            ## FIXME: NaNs will be transformed into zeros
            data = int32 (data);
          endif
          data(n) = numeric_fill_value;
          if (pad_out)
            data(end+1:num_lines) = numeric_fill_value;
          endif
          varargout{k} = data.';
          k++;
        case {"%0", "%1", "%2", "%3", "%4", "%5", "%6", "%7", "%8", "%9"}
          sw = regexp (fmt_words{m}, '\d', "once");
          ew = regexp (fmt_words{m}, '[nfudsq]') - 1;
          nfmt = ostrsplit (fmt_words{m}(2:ew), ".");
          swidth = str2double (nfmt{1});
          switch (fmt_words{m}(ew+1))
            case {"d", "u", "f", "n"}
              n = cellfun ("isempty", data);
              ### FIXME - erroneously formatted data lead to NaN, not an error
              ###         => ReturnOnError can't be implemented for numeric data
              data = str2double (strtrunc (data, swidth));
              data(n) = numeric_fill_value;
              if (pad_out)
                data(end+1:num_lines) = numeric_fill_value;
              endif
              if (numel (nfmt) > 1)
                sprec = str2double (nfmt{2});
                data = 10^-sprec * round (10^sprec * data);
              elseif (! isempty (regexp (fmt_words{m}, "[du]")))
                ## Cast to integer
                ## FIXME: NaNs will be transformed into zeros
                data = int32 (data);
              endif
              varargout{k} = data.';
              k++;
            case "s"
              if (pad_out)
                data(end+1:num_lines) = {""};
              endif
              varargout{k} = strtrunc (data, swidth)';
              k++;
            otherwise
          endswitch
        case {"%*", "%*s"}
          ## skip the word
        otherwise
          ## Ensure descriptive content is consistent.
          ## Test made a bit lax to accomodate for incomplete last lines
          n = find (! cellfun ("isempty", data));
          if (numel (unique (data(n))) > 1
              || ! strcmpi (unique (data), fmt_words{m}))
            error ("strread: FORMAT does not match data");
          endif
      endswitch
    catch
      ## As strread processes columnwise, ML-compatible error processing
      ## (row after row) is not feasible. In addition Octave sets unrecognizable
      ## numbers to NaN w/o error.  But maybe Octave is better in this respect.
      if (err_action)
        ## Just try the next column where ML bails out
      else
        rethrow (lasterror);
      endif
    end_try_catch
  endfor

endfunction

function out = split_by (text, sep, mult_dlms_s1, eol_char)

  ## Check & if needed, process MultipleDelimsAsOne parameter
  if (mult_dlms_s1)
    mult_dlms_s1 = true;
    ## FIXME: Should re-implement strsplit() function here in order
    ## to avoid strrep on megabytes of data.
    ## If \n is in sep collection we need to enclose it in text
    ## to avoid it being included in consecutive delim series
    enchr = ' ';
    ## However watch out if eol_char is also in delimiters
    if (index (sep, eol_char)); enchr = char (255); endif
    text = strrep (text, eol_char, [enchr eol_char enchr]);
  else
    mult_dlms_s1 = false;
  endif

  ## Split text string along delimiters
  out = ostrsplit (text, sep, mult_dlms_s1);
  if (index (sep, eol_char)); out = strrep (out, char (255), ''); endif
  ## In case of trailing delimiter, strip stray last empty word
  if (!isempty (out) && any (sep == text(end)))
    out(end) = [];
  endif

  ## Empty cells converted to empty cellstrings.
  out(cellfun ("isempty", out)) = {""};

endfunction


%!test
%! [a, b] = strread ("1 2", "%f%f");
%! assert (a, 1);
%! assert (b, 2);

%!test
%! str = "";
%! a = rand (10, 1);
%! b = char (randi ([65, 85], 10, 1));
%! for k = 1:10
%!   str = sprintf ("%s %.6f %s\n", str, a(k), b(k));
%! endfor
%! [aa, bb] = strread (str, "%f %s");
%! assert (aa, a, 1e-6);
%! assert (bb, cellstr (b));

%!test
%! str = "";
%! a = rand (10, 1);
%! b = char (randi ([65, 85], 10, 1));
%! for k = 1:10
%!   str = sprintf ("%s %.6f %s\n", str, a(k), b(k));
%! endfor
%! aa = strread (str, "%f %*s");
%! assert (aa, a, 1e-6);

%!test
%! str = sprintf ("/* this is\nacomment*/ 1 2 3");
%! a = strread (str, "%f", "commentstyle", "c");
%! assert (a, [1; 2; 3]);

%!test
%! str = "# comment\n# comment\n1 2 3";
%! [a, b] = strread (str, "%n %s", "commentstyle", "shell", "endofline", "\n");
%! assert (a, [1; 3]);
%! assert (b, {"2"});

%!test
%! str = sprintf ("Tom 100 miles/hr\nDick 90 miles/hr\nHarry 80 miles/hr");
%! fmt = "%s %f miles/hr";
%! c = cell (1, 2);
%! [c{:}] = strread (str, fmt);
%! assert (c{1}, {"Tom"; "Dick"; "Harry"})
%! assert (c{2}, [100; 90; 80])

%!test
%! a = strread ("a b c, d e, , f", "%s", "delimiter", ",");
%! assert (a, {"a b c"; "d e"; ""; "f"});

%!test
%! ## Bug #33536
%! [a, b, c] = strread ("1,,2", "%s%s%s", "delimiter", ",");
%! assert (a{1}, "1");
%! assert (b{1}, "");
%! assert (c{1}, "2");

%!test
%! ## Bug #33536
%! a = strread ("[SomeText]", "[%s", "delimiter", "]");
%! assert (a{1}, "SomeText");

%!test
%! dat = "Data file.\r\n=  =  =  =  =\r\nCOMPANY    : <Company name>\r\n";
%! a = strread (dat, "%s", "delimiter", "\n", "whitespace", "", "endofline", "\r\n");
%! assert (a{2}, "=  =  =  =  =");
%! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]);

%!test
%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%f%d%f", "delimiter", ",");
%! assert (c, int32 (3));
%! assert (d, NaN);

%!test
%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%f%d", "delimiter", ",");
%! assert (c, [3; NaN]);
%! assert (d, int32 ([0; 0]));

%!test
%! ## Default format (= %f)
%1 [a, b, c] = strread ("0.12 0.234 0.3567");
%1 assert (a, 0.12);
%1 assert (b, 0.234);
%1 assert (c, 0.3567);

%!test
%! [a, b] = strread ("0.41 8.24 3.57 6.24 9.27", "%f%f", 2, "delimiter", " ");
%1 assert (a, [0.41; 3.57]);

%!test
%! ## TreatAsEmpty
%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%f", "delimiter", ",", "TreatAsEmpty", "NN");
%! assert (c, int32 ([3; 0]));
%! assert (d, [NaN; NaN]);

%!test
%! ## No delimiters at all besides EOL.  Plain reading numbers & strings
%! str = "Text1Text2Text\nText398Text4Text\nText57Text";
%! [a, b] = strread (str, "Text%dText%1sText");
%! assert (a, int32 ([1; 398; 57]));
%! assert (b(1:2), {"2"; "4"});
%! assert (isempty (b{3}), true);

%% MultipleDelimsAsOne
%!test
%! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35";
%! [a b c d] = strread (str, "%f %f %f %f", "delimiter", ",", "multipledelimsasone", 1, "endofline", "\n");
%! assert (a', [11, 21, NaN]);
%! assert (b', [12, 23, 33]);
%! assert (c', [13, 24, 34]);
%! assert (d', [15, 25, 35]);

%% delimiter as sq_string and dq_string
%!test
%! assert (strread ("1\n2\n3", "%d", "delimiter", "\n"),
%!         strread ("1\n2\n3", "%d", "delimiter", '\n'))

%% whitespace as sq_string and dq_string
%!test
%! assert (strread ("1\b2\r3\b4\t5", "%d", "whitespace", "\b\r\n\t"),
%!         strread ("1\b2\r3\b4\t5", "%d", "whitespace", '\b\r\n\t'))

%!test
%! str =  "0.31 0.86 0.94\n 0.60 0.72 0.87";
%! fmt = "%f %f %f";
%! args = {"delimiter", " ", "endofline", "\n", "whitespace", " "};
%! [a, b, c] = strread (str, fmt, args {:});
%! assert (a, [0.31; 0.60], 0.01)
%! assert (b, [0.86; 0.72], 0.01)
%! assert (c, [0.94; 0.87], 0.01)

%!test
%! str =  "0.31,0.86,0.94\n0.60,0.72,0.87";
%! fmt = "%f %f %f";
%! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "};
%! [a, b, c] = strread (str, fmt, args {:});
%! assert (a, [0.31; 0.60], 0.01)
%! assert (b, [0.86; 0.72], 0.01)
%! assert (c, [0.94; 0.87], 0.01)

%!test
%! str =  "0.31 0.86 0.94\n 0.60 0.72 0.87";
%! fmt = "%f %f %f";
%! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "};
%! [a, b, c] = strread (str, fmt, args {:});
%! assert (a, [0.31; 0.60], 0.01)
%! assert (b, [0.86; 0.72], 0.01)
%! assert (c, [0.94; 0.87], 0.01)

%!test
%! str =  "0.31, 0.86, 0.94\n 0.60, 0.72, 0.87";
%! fmt = "%f %f %f";
%! args = {"delimiter", ",", "endofline", "\n", "whitespace", " "};
%! [a, b, c] = strread (str, fmt, args {:});
%! assert (a, [0.31; 0.60], 0.01)
%! assert (b, [0.86; 0.72], 0.01)
%! assert (c, [0.94; 0.87], 0.01)

%!test
%! [a, b] = strread (["Empty 1" char(10)], "Empty%s %f");
%! assert (a{1}, '1');
%! assert (b, NaN);

%!test
%! [a, b] = strread (["Empty" char(10)], "Empty%f %f");
%! assert (a, NaN);
%! assert (b, NaN);

%!test
%! ## Bug #35999
%! [a, b, c] = strread ("", "%f");
%! assert (isempty (a));
%! assert (isempty (b));
%! assert (isempty (c));

%!test
%! ## bug #37023
%! [a, b] = strread (" 1. 1 \n  2 3 \n", "%f %f", "endofline", "\n");
%! assert (a, [1; 2], 1e-15);
%! assert (b, [1; 3], 1e-15);

%% Test for no output arg (interactive use)
%!test
%! assert (strread (",2,,4\n5,,7,", "", "delimiter", ","), [NaN; 2; NaN; 4; 5; NaN; 7]);

%% Test #1 bug #42609
%!test
%! [a, b, c] = strread ("1 2 3\n4 5 6\n7 8 9\n", "%f %f %f\n");
%! assert (a, [1; 4; 7]);
%! assert (b, [2; 5; 8]);
%! assert (c, [3; 6; 9]);

%% Test #2 bug #42609
%!test
%! [a, b, c] = strread ("1 2\n3\n4 5\n6\n7 8\n9\n", "%f %f\n%f");
%! assert (a, [1;4;7]);
%! assert (b, [2; 5; 8]);
%! assert (c, [3; 6; 9]);

%% Test #3 bug #42609
%!test
%! [a, b, c] = strread ("1 2 3\n4 5 6\n7 8 9\n", '%f %f %f\n');
%! assert (a, [1; 4; 7]);
%! assert (b, [2; 5; 8]);
%! assert (c, [3; 6; 9]);

%% Test #3 bug #42609
%!test
%! [a, b, c] = strread ("1 2\n3\n4 5\n6\n7 8\n9\n", '%f %f\n%f');
%! assert (a, [1;4;7]);
%! assert (b, [2; 5; 8]);
%! assert (c, [3; 6; 9]);

%% Unsupported format specifiers
%!test
%!error <format specifiers are not supported> strread ("a", "%c")
%!error <format specifiers are not supported> strread ("a", "%*c %d")
%!error <format specifiers are not supported> strread ("a", "%q")
%!error <format specifiers are not supported> strread ("a", "%*q %d")
%!error <format specifiers are not supported> strread ("a", "%[a]")
%!error <format specifiers are not supported> strread ("a", "%*[a] %d")
%!error <format specifiers are not supported> strread ("a", "%[^a]")
%!error <format specifiers are not supported> strread ("a", "%*[^a] %d")
%!error <format specifiers are not supported> strread ("a", "%d8")
%!error <format specifiers are not supported> strread ("a", "%*d8 %s")
%!error <format specifiers are not supported> strread ("a", "%f64")
%!error <format specifiers are not supported> strread ("a", "%*f64 %s")
%!error <format specifiers are not supported> strread ("a", "%u32")
%!error <format specifiers are not supported> strread ("a", "%*u32 %d")

%% Illegal format specifiers
%!test
%!error <unknown format specifier> strread ("1.0", "%z")
author	John W. Eaton <jwe@octave.org>
date	Tue, 20 Jan 2015 10:05:42 -0500
parents	589354cf668f 446c46af4b42
children	0e1f5a750d00