Mercurial > jwe > octave
changeset 21467:3aa83b8ce94b
* textscan.h, textscan.cc: More style fixes.
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Thu, 17 Mar 2016 17:46:56 -0400 |
parents | d1421f6fd518 |
children | 7d06afd924be |
files | libinterp/corefcn/textscan.cc libinterp/corefcn/textscan.h |
diffstat | 2 files changed, 1413 insertions(+), 1407 deletions(-) [+] |
line wrap: on
line diff
--- a/libinterp/corefcn/textscan.cc Thu Mar 17 15:21:02 2016 -0700 +++ b/libinterp/corefcn/textscan.cc Thu Mar 17 17:46:56 2016 -0400 @@ -1,4 +1,5 @@ /* + Copyright (C) 2015-2016 Lachlan Andrew, Monash University This file is part of Octave. @@ -35,362 +36,1052 @@ #include "textscan.h" #include "utils.h" -DEFUN (textscan, args, , - "-*- texinfo -*-\n\ -@deftypefn {} {@var{C} =} textscan (@var{fid}, @var{format})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat}, @var{param}, @var{value}, @dots{})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{str}, @dots{})\n\ -@deftypefnx {} {[@var{C}, @var{position}] =} textscan (@dots{})\n\ -Read data from a text file or string.\n\ -\n\ -The string @var{str} or file associated with @var{fid} is read from and\n\ -parsed according to @var{format}.\n\ -The function is an extension of @code{strread} and @code{textread}.\n\ -Differences include: the ability to read from either a file or a string,\n\ -additional options, and additional format specifiers.\n\ -\n\ -The input is interpreted as a sequence of \"words\", delimiters\n\ -(such as whitespace) and literals.\n\ -The characters that form delimiters and whitespace are determined\n\ -by the options.\n\ -The format consists of format specifiers interspersed between literals.\n\ -In the format, whitespace forms a delimiter between consecutive literals,\n\ -but is otherwise ignored.\n\ -\n\ -The output @var{C} is a cell array whose second dimension is determined\n\ -by the number of format specifiers.\n\ -\n\ -The first word of the input is matched to the first specifier of the\n\ -format and placed in the first column of the output;\n\ -the second is matched to the second specifier and placed in the second column\n\ -and so forth.\n\ -If there are more words than specifiers, the 
process is repeated until all\n\ -words have been processed or the limit imposed by @var{repeat} has been met\n\ -(see below).\n\ -\n\ -The string @var{format} describes how the words in @var{str} should be\n\ -parsed.\n\ -As in @var{fscanf}, any (non-whitespace) text in the format that is\n\ -not one of these specifiers is considered a literal;\n\ -if there is a literal between two format specifiers then that same literal\n\ -must appear in the input stream between the matching words.\n\ -\n\ -The following specifiers are valid:\n\ -\n\ -@table @code\n\ -@item %f\n\ -@itemx %f64\n\ -@itemx %n\n\ -The word is parsed as a number and converted to double.\n\ -\n\ -@item %f32\n\ -The word is parsed as a number and converted to single (float).\n\ -\n\ -@item %d\n\ -@itemx %d8\n\ -@itemx %d16\n\ -@itemx %d32\n\ -@itemx %d64\n\ -The word is parsed as a number and converted to int8, int16, int32 or int64.\n\ -If not size is specified, int32 is used.\n\ -\n\ -@item %u\n\ -@itemx %u8\n\ -@itemx %u16\n\ -@itemx %u32\n\ -@itemx %u64\n\ -The word is parsed as a number and converted to uint8, uint16, uint32 or\n\ -uint64. 
If not size is specified, uint32 is used.\n\ -\n\ -@item %s\n\ -The word is parsed as a string, ending at the last character before\n\ -whitespace, an end-of-line or a delimiter specified in the options.\n\ -\n\ -@item %q\n\ -The word is parsed as a \"quoted string\".\n\ -If the first character of the string is a double quote (\") then the string\n\ -includes everything until a matching double quote, including whitespace,\n\ -delimiters and end of line characters.\n\ -If a pair of consecutive double quotes appears in the input,\n\ -it is replaced in the output by a single double quote.\n\ -That is, the input \"He said \"\"Hello\"\"\" would return the value\n\ -'He said \"Hello\"'.\n\ -\n\ -@item %c\n\ -The next character of the input is read.\n\ -This includes delimiters, whitespace and end of line characters.\n\ -\n\ -@item %[...]\n\ -@itemx %[^...]\n\ -In the first form, the word consists of the longest run consisting of only\n\ -characters between the brackets.\n\ -Ranges of characters can be specified by a hyphen;\n\ -for example, %[0-9a-zA-Z] matches all alphanumeric characters\n\ -(if the underlying character set is ASCII).\n\ -Since Matlab treats hyphens literally, this expansion only applies to\n\ -alphanumeric characters.\n\ -To include '-' in the set, it should appear first or last in the brackets;\n\ -to include ']', it should be the first character.\n\ -If the first character is '^' then the word consists of characters\n\ -NOT listed.\n\ -\n\ -@item %N...\n\ -For %s, %c %d, %f, %n, %u, an optional width can be specified as %Ns etc.\n\ -where N is an integer > 1.\n\ -For %c, this causes exactly the next N characters to be read instead of\n\ -a single character.\n\ -For the other specifiers, it is an upper bound on the\n\ -number of characters read;\n\ -normal delimiters can cause fewer characters to be read.\n\ -For complex numbers, this limit applies to the real and imaginary\n\ -components individually.\n\ -For %f and %n, format specifiers like %N.Mf 
are allowed, where M is an upper\n\ -bound on number of characters after the decimal point to be considered;\n\ -subsequent digits are skipped.\n\ -For example, the specifier %8.2f would read 12.345e6 as 1.234e7.\n\ -\n\ -@item %*...\n\ -The word specified by the remainder of the conversion specifier is skipped.\n\ -\n\ -@item literals\n\ -In addition the format may contain literal character strings;\n\ -these will be skipped during reading.\n\ -If the input string does not match this literal, the processing terminates,\n\ -unless \"ReturnOnError\" is set to \"continue\".\n\ -@end table\n\ -\n\ -Parsed words corresponding to the first specifier are returned in the first\n\ -output argument and likewise for the rest of the specifiers.\n\ -\n\ -By default, if there is only one input argument, @var{format} is @t{\"%f\"}.\n\ -This means that numbers are read from @var{str} into a single column vector.\n\ -If @var{format} is explicitly empty, \"\", then textscan will return data\n\ -in a number of columns matching the number of fields on the first data\n\ -line of the input.\n\ -Either of these is suitable only if @var{str} contains only numeric fields.\n\ -\n\ -For example, the string\n\ -\n\ -@example\n\ -@group\n\ -@var{str} = \"\\\n\ -Bunny Bugs 5.5\\n\\\n\ -Duck Daffy -7.5e-5\\n\\\n\ -Penguin Tux 6\"\n\ -@end group\n\ -@end example\n\ -\n\ -@noindent\n\ -can be read using\n\ -\n\ -@example\n\ -@var{a} = textscan (@var{str}, \"%s %s %f\");\n\ -@end example\n\ -\n\ -The optional numeric argument @var{repeat} can be used for limiting the\n\ -number of items read:\n\ -\n\ -@table @asis\n\ -@item -1\n\ -(default) read all of the string or file until the end.\n\ -\n\ -@item N\n\ -Read until the first of two conditions occurs: the format has been processed\n\ -N times, or N lines of the input have been processed.\n\ -Zero (0) is an acceptable value for @var{repeat}.\n\ -Currently, end-of-line characters inside %q, %c, and %[...]$ conversions\n\ -do not contribute to the 
line count.\n\ -This is incompatible with Matlab and may change in future.\n\ -@end table\n\ -\n\ -The behavior of @code{textscan} can be changed via property-value pairs.\n\ -The following properties are recognized:\n\ -\n\ -@table @asis\n\ -@item @qcode{\"BufSize\"}\n\ -This specifies the number of bytes to use for the internal buffer.\n\ -A modest speed improvement is obtained by setting this to a large value\n\ -when reading a large file, especially the input contains long strings.\n\ -The default is 4096, or a value dependent on @var{n} is that is specified.\n\ -\n\ -@item @qcode{\"CollectOutput\"}\n\ -A value of 1 or true instructs textscan to concatenate consecutive columns\n\ -of the same class in the output cell array.\n\ -A value of 0 or false (default) leaves output in distinct columns.\n\ -\n\ -@item @qcode{\"CommentStyle\"}\n\ -Parts of @var{str} are considered comments and will be skipped.\n\ -@var{value} is the comment style and can be either\n\ -(1) One string, or 1x1 cell string, to skip everything to the right of it;\n\ -(2) A cell array of two strings, to skip everything between the first and\n\ -second strings.\n\ -Comments are only parsed where whitespace is accepted, and do not act as\n\ -delimiters.\n\ -\n\ -@item @qcode{\"Delimiter\"}\n\ -If @var{value} is a string, any character in @var{value} will be used to\n\ -split @var{str} into words.\n\ -If @var{value} is a cell array of strings,\n\ -any string in the array will be used to split @var{str} into words.\n\ -(default value = any whitespace.)\n\ -\n\ -@item @qcode{\"EmptyValue\"}\n\ -Value to return for empty numeric values in non-whitespace delimited data.\n\ -The default is NaN@.\n\ -When the data type does not support NaN (int32 for example),\n\ -then default is zero.\n\ -\n\ -@item @qcode{\"EndOfLine\"}\n\ -@var{value} can be either a emtpy or one character specifying the\n\ -end of line character, or the pair\n\ -@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} (CRLF).\n\ -In the 
latter case, any of\n\ -@qcode{\"@xbackslashchar{}r\"}, @qcode{\"@xbackslashchar{}n\"} or\n\ -@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is counted as a (single)\n\ -newline.\n\ -If no value is given, @qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is\n\ -used.\n\ -@c If set to \"\" (empty string) EOLs are ignored as delimiters and added\n\ -@c to whitespace.\n\ -\n\ -@c When reading from a character string, optional input argument @var{n}\n\ -@c specifies the number of times @var{format} should be used (i.e., to limit\n\ -@c the amount of data read).\n\ -@c When reading from file, @var{n} specifies the number of data lines to read;\n\ -@c in this sense it differs slightly from the format repeat count in strread.\n\ -\n\ -@item @qcode{\"HeaderLines\"}\n\ -The first @var{value} number of lines of @var{fid} are skipped.\n\ -Note that this does not refer to the first non-comment lines, but the first\n\ -lines of any type.\n\ -\n\ -@item @qcode{\"MultipleDelimsAsOne\"}\n\ -If @var{value} is non-zero,\n\ -treat a series of consecutive delimiters, without whitespace in between,\n\ -as a single delimiter.\n\ -Consecutive delimiter series need not be vertically @qcode{\"aligned\"}.\n\ -Without this option, a single delimiter before the end of the line does\n\ -not cause the line to be considered to end with an empty value,\n\ -but a single delimiter at the start of a line causes the line\n\ -to be considered to start with an empty value.\n\ -\n\ -@item @qcode{\"TreatAsEmpty\"}\n\ -Treat single occurrences (surrounded by delimiters or whitespace) of the\n\ -string(s) in @var{value} as missing values.\n\ -\n\ -@item @qcode{\"ReturnOnError\"}\n\ -If set to numerical 1 or true, return normally as soon as an error\n\ -is encountered, such as trying to read a string using @qcode{%f}.\n\ -If set to 0 or false, return an error and no data.\n\ -If set to \"continue\" (default), textscan attempts to continue reading\n\ -beyond the location; however, this may cause the parsing 
to get out of sync.\n\ -\n\ -@item @qcode{\"Whitespace\"}\n\ -Any character in @var{value} will be interpreted as whitespace and trimmed;\n\ -The default value for whitespace is\n\ -@c Note: the next line specifically has a newline which generates a space\n\ -@c in the output of qcode, but keeps the next line < 80 characters.\n\ -@qcode{\"\n\ -@xbackslashchar{}b@xbackslashchar{}r@xbackslashchar{}n@xbackslashchar{}t\"}\n\ -(note the space). Unless whitespace is set to @qcode{\"\"} (empty) AND at\n\ -least one @qcode{\"%s\"} format conversion specifier is supplied, a space is\n\ -always part of whitespace.\n\ -\n\ -@end table\n\ -\n\ -When the number of words in @var{str} or @var{fid} doesn't match an exact\n\ -multiple of the number of format conversion specifiers,\n\ -textscan's behavior depends on\n\ -whether the last character of the string or file is\n\ -an end-of-line as specified by the EndOfLine option:\n\ -\n\ -@table @asis\n\ -@item last character = end-of-line\n\ -Data columns are padded with empty fields, NaN or 0 (for integer fields)\n\ -so that all columns have equal length\n\ -\n\ -@item last character is not end-of-line\n\ -Data columns are not padded; textscan returns columns of unequal length\n\ -@end table\n\ -\n\ -\n\ -The second output, @var{position}, provides the position, in characters\n\ -from the beginning of the file or string, at which the processing stopped.\n\ -\n\ -@seealso{dlmread, fscanf, load, strread, textread}\n\ -@end deftypefn") +// Create a delimited stream, reading from is, with delimiters delims, +// and allowing reading of up to tellg + longest_lookeahead. When is +// is at EOF, lookahead may be padded by ASCII nuls. 
+ +dstr::dstr (std::istream& is, const std::string& delimiters, + int longest_lookahead, octave_idx_type bsize) + : bufsize (bsize), i_stream (is), longest (longest_lookahead), + delims (delimiters), + flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 +{ + buf = new char[bufsize]; + eob = buf + bufsize; + idx = eob; // refresh_buf shouldn't try to copy old data + progress_marker = idx; + refresh_buf (); // load the first batch of data +} + +// Used to create a stream from a strstream from data read from a dstr. +// FIXME: Find a more efficient approach. Perhaps derived dstrstream +dstr::dstr (std::istream& is, const dstr& ds) + : bufsize (ds.bufsize), i_stream (is), longest (ds.longest), + delims (ds.delims), + flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 +{ + buf = new char[bufsize]; + eob = buf + bufsize; + idx = eob; // refresh_buf shouldn't try to copy old data + progress_marker = idx; + refresh_buf (); // load the first batch of data +} + +dstr::~dstr () +{ + // Seek to the correct position in i_stream. + if (!eof ()) + { + i_stream.clear (); + i_stream.seekg (buf_in_file); + i_stream.read (buf, idx - buf); + } + + delete [] buf; +} + +// Read a character from the buffer, refilling the buffer from the file +// if necessary. + +int +dstr::get_undelim () { - octave_value_list retval; - std::string format; - int params = 1; - - if (args.length () < 1) - print_usage (); - else if (args.length () == 1) - format = "%f"; // ommited format = %f. 
explicit "" = width from file - else if (args(1).is_string ()) + int retval; + if (eof ()) + { + setstate (std::ios_base::failbit); + return EOF; + } + + if (idx < eob) + retval = *idx++; + else { - format = args(1).string_value (); - if (args(1).is_sq_string ()) - format = do_string_escapes (format); - params++; + refresh_buf (); + if (eof ()) + { + setstate (std::ios_base::eofbit); + retval = EOF; + } + else + retval = *idx++; + } + if (idx >= last) + delimited = false; + return retval; +} + +// Return the next character to be read without incrementing the +// pointer, refilling the buffer from the file if necessary. + +int +dstr::peek_undelim () +{ + int retval = get_undelim (); + putback (); + + return retval; +} + +// Copy remaining unprocessed data to the start of the buffer and load +// new data to fill it. Return EOF if the file is at EOF before +// reading any data and all of the data that has been read has been +// processed. + +int +dstr::refresh_buf (void) +{ + if (eof ()) + return EOF; + + int retval; + int old_remaining = eob - idx; + + if (old_remaining < 0) + { + idx = eob; + old_remaining = 0; + } + + octave_quit (); // allow ctrl-C + + if (old_remaining > 0) + memmove (buf, idx, old_remaining); + + progress_marker -= idx - buf; // where original idx would have been + idx = buf; + + int gcount; // chars read + if (!i_stream.eof ()) + { + buf_in_file = i_stream.tellg (); // record for destructor + i_stream.read (buf + old_remaining, bufsize - old_remaining); + gcount = i_stream.gcount (); + } + else + gcount = 0; + + eob = buf + old_remaining + gcount; + last = eob; + if (gcount == 0) + { + delimited = false; + if (eob != buf) // no more data in file, but still some to go + retval = 0; + else + retval = EOF; // file and buffer are both done. 
} else - error ("textscan: FORMAT must be a string, not <%s>", - args(1).class_name ().c_str ()); - - octave_idx_type ntimes = -1; - textscan tscanner; - - if (args.length () >= 3) + { + delimited = true; + + for (last = eob - longest; last - buf >= 0; last--) + { + if (strchr (delims.c_str (), *last)) + break; + } + if (last - buf < 0) + delimited = false; + + retval = 0; + } + + if (retval == EOF) // Ensure fast peek doesn't give valid char + *idx = '\0'; // FIXME - check that no TreatAsEmpty etc starts w. \0? + + return retval; +} + +// Return a pointer to a block of data of size size, assuming that a +// sufficiently large buffer is available in buffer, if required. +// If called when delimited == true, and size is no greater than +// longest_lookahead then this will not call refresh_buf, so seekg +// still works. Otherwise, seekg may be invalidated. + +char * +dstr::read (char *buffer, int size, char* &prior_tell) +{ + char *retval; + if (eob - idx > size) + { + retval = idx; + idx += size; + if (idx > last) + delimited = false; + } + else { - if (args(2).is_numeric_type ()) + // If there was a tellg pointing to an earlier point than the current + // read position, try to keep it in the active buffer. + // In the current code, prior_tell==idx for each call, + // so this is not necessary, just a precaution. + if (eob - prior_tell + size < bufsize) + { + octave_idx_type gap = idx - prior_tell; + idx = prior_tell; + refresh_buf (); + idx += gap; + } + else // can't keep the tellg in range. May skip some data. 
+ { + refresh_buf (); + } + prior_tell = buf; + + if (eob - idx > size) { - ntimes = args(2).idx_type_value (); - if (ntimes < args(2).double_value ()) - error ("textscan: REPEAT = %g is too large", - args(2).double_value ()); - params = 3; + retval = idx; + idx += size; + if (idx > last) + delimited = false; + } + else + { + if (size <= bufsize) // small read, but reached EOF + { + retval = idx; + memset (eob, 0, size + (idx - buf)); + idx += size; + } + else // Reading more than the whole buf; return it in buffer + { + retval = buffer; + // FIXME -- read bufsize at a time + int i; + for (i = 0; i < size && !eof (); i++) + *buffer++ = get_undelim (); + if (eof ()) + memset (buffer, 0, size - i); + } } } - textscan_format_list fmt_list (format); - - tscanner.parse_options (args, params, fmt_list); - - if (args(0).is_string ()) + return retval; +} + +// Return in OUT an entire line, terminated by delim. On input, OUT +// must have length at least 1. + +int +dstr::getline (std::string& out, char delim) +{ + int len = out.length (), used = 0; + int ch; + while ((ch = get_undelim ()) != delim && ch != EOF) { - std::istringstream is (args(0).string_value ()); - octave_value tmp = tscanner.scan (&is, fmt_list, ntimes); - retval(0) = tmp; - - std::ios::iostate state = is.rdstate (); - is.clear (); - retval(1) = octave_value (static_cast<long>(is.tellg ())); - is.setstate (state); + out[used++] = ch; + if (used == len) + { + len <<= 1; + out.resize (len); + } + } + out.resize (used); + field_done (); + + return ch; +} + +textscan_format_list::textscan_format_list (const std::string& s) + : set_from_first (false), has_string (false), nconv (0), curr_idx (0), + list (dim_vector (16, 1)), buf (0) +{ + octave_idx_type num_elts = 0; + + size_t n = s.length (); + + size_t i = 0; + + unsigned int width = -1; // Unspecified width = max (except %c) + int prec = -1; + int bitwidth = 0; + bool discard = false; + char type = '\0'; + + bool have_more = true; + + if (s.length () == 0) + 
{ + buf = new std::ostringstream ("%f"); + bitwidth = 64; + type = 'f'; + add_elt_to_list (width, prec, bitwidth, octave_value (NDArray ()), + discard, type, num_elts); + have_more = false; + set_from_first = true; + nconv = 1; } else { - octave_stream os = octave_stream_list::lookup (args(0), "textscan"); - octave_value tmp = tscanner.scan (os.input_stream (), fmt_list, ntimes); - - retval(0) = tmp; - // FIXME -- warn if stream is not opened in binary mode? - std::ios::iostate state = os.input_stream ()->rdstate (); - os.input_stream ()->clear (); - retval(1) = os.tell (); - os.input_stream ()->setstate (state); + set_from_first = false; + while (i < n) + { + have_more = true; + + if (! buf) + buf = new std::ostringstream (); + + if (s[i] == '%' && (i+1 == n || s[i+1] != '%')) + { + // Process percent-escape conversion type. + + process_conversion (s, i, n, num_elts); + + have_more = (buf != 0); + } + else if (isspace (s[i])) + { + while (++i < n && isspace (s[i])) + /* skip whitespace */; + + have_more = false; + } + else + { + type = textscan_format_elt::literal_conversion; + + width = 0; + prec = -1; + bitwidth = 0; + discard = true; + + while (i < n && ! 
isspace (s[i]) + && (s[i] != '%' || (i+1 < n && s[i+1] == '%'))) + { + if (s[i] == '%') // if double %, skip one + i++; + *buf << s[i++]; + width++; + } + + add_elt_to_list (width, prec, bitwidth, octave_value (), discard, + type, num_elts); + + have_more = false; + } + + if (nconv < 0) + { + have_more = false; + break; + } + } + } + + if (have_more) + add_elt_to_list (width, prec, bitwidth, octave_value (), discard, type, num_elts); + + list.resize (dim_vector (num_elts, 1)); + + delete buf; +} + +textscan_format_list::~textscan_format_list (void) +{ + octave_idx_type n = list.numel (); + + for (octave_idx_type i = 0; i < n; i++) + { + textscan_format_elt *elt = list(i); + delete elt; + } +} + +void +textscan_format_list::add_elt_to_list (unsigned int width, int prec, + int bitwidth, octave_value val_type, + bool discard, char type, + octave_idx_type& num_elts, + const std::string& char_class) +{ + if (buf) + { + std::string text = buf->str (); + + if (! text.empty ()) + { + textscan_format_elt *elt + = new textscan_format_elt (text.c_str (), width, prec, bitwidth, + discard, type, char_class); + + if (num_elts == list.numel ()) + { + list.resize (dim_vector (2 * num_elts, 1)); + } + + if (!discard) + { + output_container.push_back (val_type); + } + list(num_elts++) = elt; + } + + delete buf; + buf = 0; + } +} + +void +textscan_format_list::process_conversion (const std::string& s, size_t& i, + size_t n, octave_idx_type& num_elts) +{ + unsigned width = 0; + int prec = -1; + int bitwidth = 0; + bool discard = false; + octave_value val_type; + char type = '\0'; + + *buf << s[i++]; + + bool have_width = false; + + while (i < n) + { + switch (s[i]) + { + case '*': + if (discard) + nconv = -1; + else + { + discard = true; + *buf << s[i++]; + } + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + if (have_width) + nconv = -1; + else + { + char c = s[i++]; + width = width * 10 + c - '0'; + have_width = true; 
+ *buf << c; + while (i < n && isdigit (s[i])) + { + c = s[i++]; + width = width * 10 + c - '0'; + *buf << c; + } + + if (i < n && s[i] == '.') + { + *buf << s[i++]; + prec = 0; + while (i < n && isdigit (s[i])) + { + c = s[i++]; + prec = prec * 10 + c - '0'; + *buf << c; + } + } + } + break; + + case 'd': case 'u': + { + bool done = true; + *buf << (type = s[i++]); + if (i < n) + { + if (s[i] == '8') + { + bitwidth = 8; + if (type == 'd') + val_type = octave_value (int8NDArray ()); + else + val_type = octave_value (uint8NDArray ()); + *buf << s[i++]; + } + else if (s[i] == '1' && i+1 < n && s[i+1] == '6') + { + bitwidth = 16; + if (type == 'd') + val_type = octave_value (int16NDArray ()); + else + val_type = octave_value (uint16NDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else if (s[i] == '3' && i+1 < n && s[i+1] == '2') + { + done = false; // use default size below + *buf << s[i++]; + *buf << s[i++]; + } + else if (s[i] == '6' && i+1 < n && s[i+1] == '4') + { + bitwidth = 64; + if (type == 'd') + val_type = octave_value (int64NDArray ()); + else + val_type = octave_value (uint64NDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else + done = false; + } + else + done = false; + + if (!done) + { + bitwidth = 32; + if (type == 'd') + val_type = octave_value (int32NDArray ()); + else + val_type = octave_value (uint32NDArray ()); + } + goto fini; + } + + case 'f': + *buf << (type = s[i++]); + bitwidth = 64; + if (i < n) + { + if (s[i] == '3' && i+1 < n && s[i+1] == '2') + { + bitwidth = 32; + val_type = octave_value (FloatNDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else if (s[i] == '6' && i+1 < n && s[i+1] == '4') + { + val_type = octave_value (NDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else + val_type = octave_value (NDArray ()); + } + else + val_type = octave_value (NDArray ()); + goto fini; + + case 'n': + *buf << (type = s[i++]); + bitwidth = 64; + val_type = octave_value (NDArray ()); + goto fini; + + case 's': case 'q': case 
'[': case 'c': + if (!discard) + val_type = octave_value (Cell ()); + *buf << (type = s[i++]); + has_string = true; + goto fini; + + fini: + { + if (!have_width) + { + if (type == 'c') // %c defaults to one character + width = 1; + else + width = static_cast<unsigned int> (-1); // others: unlimited + } + + if (finish_conversion (s, i, n, width, prec, bitwidth, val_type, + discard, type, num_elts) == 0) + return; + } + break; + + default: + error ("textscan: '%%%c' is not a valid format specifier", s[i]); + } + + if (nconv < 0) + break; + } + + nconv = -1; +} + +// Parse [...] and [^...] +// +// Matlab does not expand expressions like A-Z, but they are useful, and +// so we parse them "carefully". We treat '-' as a usual character +// unless both start and end characters are from the same class (upper +// case, lower case, numeric), or this is not the first '-' in the +// pattern. +// +// Keep both a running list of characters and a mask of which chars have +// occurred. The first is efficient for patterns with few characters. +// The latter is efficient for [^...] patterns. + +static std::string +textscan_char_class (const std::string& pattern) +{ + int len = pattern.length (); + if (len == 0) + return ""; + + std::string retval (256, '\0'); + std::string mask (256, '\0'); // number of times chr has been seen + + int in = 0, out = 0; + unsigned char ch, prev = 0; + bool flip = false; + + ch = pattern[in]; + if (ch == '^') + { + in++; + flip = true; + } + mask[pattern[in]] = '\1'; + retval[out++] = pattern[in++]; // even copy ']' if it is first + + bool prev_was_range = false; // disallow "a-m-z" as a pattern + bool prev_prev_was_range = false; + for (; in < len; in++) + { + bool was_range = false; + ch = pattern[in]; + if (ch == ']') + break; + + if (prev == '-' && in > 1 && isalnum (ch) && ! 
prev_prev_was_range) + { + unsigned char start_of_range = pattern[in-2]; + if (start_of_range < ch + && ((isupper (ch) && isupper (start_of_range)) + || (islower (ch) && islower (start_of_range)) + || (isdigit (ch) && isdigit (start_of_range)) + || mask['-'] > 1)) // not the first '-' + { + was_range = true; + out--; + mask['-']--; + for (int i = start_of_range; i <= ch; i++) + { + if (mask[i] == '\0') + { + mask[i] = '\1'; + retval[out++] = i; + } + } + } + } + if (!was_range) + { + if (mask[ch]++ == 0) + retval[out++] = ch; + else if (ch != '-') + warning_with_id ("octave:textscan-pattern", + "textscan: [...] contains two '%c's", ch); + if (prev == '-' && mask['-'] >= 2) + warning_with_id ("octave:textscan-pattern", + "textscan: [...] contains two '-'s " + "outside range expressions"); + } + prev = ch; + prev_prev_was_range = prev_was_range; + prev_was_range = was_range; + } + if (flip) // [^...] + { + out = 0; + for (int i = 0; i < 256; i++) + if (!mask[i]) + retval[out++] = i; + } + retval.resize (out); + + return retval; +} + +int +textscan_format_list::finish_conversion (const std::string& s, size_t& i, + size_t n, unsigned int& width, + int& prec, int& bitwidth, + octave_value& val_type, bool discard, + char& type, octave_idx_type& num_elts) +{ + int retval = 0; + + std::string char_class; + + size_t beg_idx = std::string::npos; + size_t end_idx = std::string::npos; + + if (type != '%') + { + nconv++; + if (type == '[') + { + if (i < n) + { + beg_idx = i; + + if (s[i] == '^') + { + type = '^'; + *buf << s[i++]; + + if (i < n) + { + beg_idx = i; + + if (s[i] == ']') + *buf << s[i++]; + } + } + else if (s[i] == ']') + *buf << s[i++]; + } + + while (i < n && s[i] != ']') + *buf << s[i++]; + + if (i < n && s[i] == ']') + { + end_idx = i-1; + *buf << s[i++]; + } + + if (s[i-1] != ']') + retval = nconv = -1; + } + } + + if (nconv >= 0) + { + if (beg_idx != std::string::npos && end_idx != std::string::npos) + char_class = textscan_char_class (s.substr (beg_idx, + 
end_idx - beg_idx + 1)); + + add_elt_to_list (width, prec, bitwidth, val_type, discard, type, + num_elts, char_class); } return retval; } +void +textscan_format_list::printme (void) const +{ + octave_idx_type n = list.numel (); + + for (octave_idx_type i = 0; i < n; i++) + { + textscan_format_elt *elt = list(i); + + std::cerr + << "width: " << elt->width << "\n" + << "digits " << elt->prec << "\n" + << "bitwidth: " << elt->bitwidth << "\n" + << "discard: " << elt->discard << "\n" + << "type: "; + + if (elt->type == textscan_format_elt::literal_conversion) + std::cerr << "literal text\n"; + else if (elt->type == textscan_format_elt::whitespace_conversion) + std::cerr << "whitespace\n"; + else + std::cerr << elt->type << "\n"; + + std::cerr + << "char_class: `" << undo_string_escapes (elt->char_class) << "'\n" + << "text: `" << undo_string_escapes (elt->text) << "'\n\n"; + } +} + +// If FORMAT is explicitly "", it is assumed to be "%f" repeated enough +// times to read the first row of the file. Set it now. 
+ +int +textscan_format_list::read_first_row (dstr& is, textscan& ts) +{ + // Read first line and strip end-of-line, which may be two characters + std::string first_line (20, ' '); + is.getline (first_line, static_cast<char> (ts.eol2)); + if (first_line.length () > 0 + && first_line[first_line.length () - 1] == ts.eol1) + first_line.resize (first_line.length () - 1); + + std::istringstream strstr (first_line); + dstr ds (strstr, is); + + dim_vector dv (1,1); // initial size of each output_container + Complex val; + octave_value val_type; + nconv = 0; + int max_empty = 1000; // failsafe, if ds fails but not with eof + int retval = 0; + + // read line, creating output_container as we go + while (!ds.eof ()) + { + bool already_skipped_delim = false; + ts.skip_whitespace (ds); + ds.progress_benchmark (); + bool progress = false; + ts.scan_complex (ds, *list(0), val); + if (ds.fail ()) + { + ds.clear (ds.rdstate () & ~std::ios::failbit); + + if (ds.eof ()) + break; + + // If we don't continue after a conversion error, then + // unless this was a missing value (i.e., followed by a delimiter), + // return with an error status. + if (ts.return_on_error < 2) + { + ts.skip_delim (ds); + if (ds.no_progress ()) + { + retval = 4; + break; + } + already_skipped_delim = true; + } + else // skip offending field + { + std::ios::iostate state = ds.rdstate (); + ds.clear (); // clear to allow read pointer to advance + + std::string dummy; + textscan_format_elt fe ("", first_line.length ()); + ts.scan_string (ds, fe, dummy); + + progress = (dummy.length ()); + ds.setstate (state); + } + + val = ts.empty_value.scalar_value (); + if (!--max_empty) + break; + } + if (val.imag () == 0) + val_type = octave_value (NDArray (dv, val.real ())); + else + val_type = octave_value (ComplexNDArray (dv, val)); + output_container.push_back (val_type); + if (! already_skipped_delim) + ts.skip_delim (ds); + if (! 
progress && ds.no_progress ()) + break; + nconv++; + } + output_container.pop_front (); // discard empty element from constructor + + //Create fmt now that the size is known + list.resize (dim_vector (nconv, 1)); + for (octave_idx_type i = 1; i < nconv; i++) + list(i) = new textscan_format_elt (*list(0)); + + return retval; // May have returned 4 above. +} + +// Perform actual textscan: read data from stream, and create cell array. + +octave_value +textscan::scan (std::istream *isp, textscan_format_list& fmt_list, + octave_idx_type ntimes) +{ + octave_value retval; + + if (!isp) + error ("internal error: textscan called with invalid istream"); + if (fmt_list.num_conversions () == -1) + error ("textscan: invalid format specified"); + if (fmt_list.num_conversions () == 0) + error ("textscan: no valid format conversion specifiers\n"); + + // skip the first header_lines + std::string dummy; + for (int i = 0; i < header_lines && *isp; i++) + getline (*isp, dummy, static_cast<char> (eol2)); + + // Create our own buffered stream, for fast get/putback/tell/seek. + + // First, see how far ahead it should let us look. + int max_lookahead = std::max (std::max (comment_len, treat_as_empty_len), + std::max (delim_len, 3)); // 3 for NaN and Inf + + // Next, choose a buffer size to avoid reading too much, or too often. + octave_idx_type buf_size; + if (buffer_size) + buf_size = buffer_size; + else if (ntimes > 0) + { + buf_size = 80 * ntimes; + if (buf_size < ntimes) // if overflow... + buf_size = ntimes; + buf_size = std::max (ntimes, std::min (buf_size, 4096)); + } + else + buf_size = 4096; + // Finally, create the stream. + dstr is (*isp, whitespace + delims, max_lookahead, buf_size); + + // Grow retval dynamically. "size" is half the initial size + // (FIXME -- Should we start smaller if ntimes is large?) + octave_idx_type size = ((ntimes < 8 && ntimes >= 0) ? 
ntimes : 1); + Array<octave_idx_type> row_idx (dim_vector (1,2)); + row_idx(1) = 0; + + int err = 0; + octave_idx_type row = 0; + + if (multiple_delims_as_one) // bug #44750? + skip_delim (is); + + int done_after; // Number of columns read when EOF seen. + + // If FORMAT explicitly "", read first line and see how many "%f" match + if (fmt_list.set_from_first) + { + err = fmt_list.read_first_row (is, *this); + lines = 1; + + done_after = fmt_list.numel () + 1; + if (!err) + row = 1; // the above puts the first line into fmt_list.out_buf () + } + else + done_after = fmt_list.out_buf ().size () + 1; + + std::list<octave_value> out = fmt_list.out_buf (); + + // We will later merge adjacent columns of the same type. + // Check now which columns to merge. + // Reals may become complex, and so we can't trust types + // after reading in data. + // If the format was "", that conversion may already have happened, + // so force all to be merged (as all are %f). + bool merge_with_prev[fmt_list.numel ()]; + int conv = 0; + if (collect_output) + { + int prev_type = -1; + for (std::list<octave_value>::iterator col = out.begin (); + col != out.end (); col++) + { + if (col->type_id () == prev_type + || (fmt_list.set_from_first && prev_type != -1)) + merge_with_prev [conv++] = true; + else + merge_with_prev [conv++] = false; + prev_type = col->type_id (); + } + } + + // This should be caught by earlier code, but this avoids a possible + // infinite loop below. + if (fmt_list.num_conversions () == 0) + error ("textscan: No conversions specified"); + + + // Read the data. This is the main loop. 
+ if (!err) + for (/* row set ~30 lines above */; row < ntimes || ntimes == -1; row++) + { + if (row == 0 || row >= size) + { + size += size+1; + for (std::list<octave_value>::iterator col = out.begin (); + col != out.end (); col++) + *col = (*col).resize (dim_vector (size, 1), 0); + } + row_idx(0) = row; + err = read_format_once (is, fmt_list, out, row_idx, done_after); + if (err > 0 || !is || (lines >= ntimes && ntimes > -1)) + break; + } + + if ((err & 4) && !return_on_error) + error ("textscan: Read error in field %d of row %d", + done_after + 1, row + 1); + + // If file does not end in EOL, do not pad columns with NaN. + bool uneven_columns = false; + if (isp->eof () || (err & 4)) + { + isp->clear (); + isp->seekg (-1, std::ios_base::end); + int last_char = isp->get (); + isp->setstate (isp->eofbit); + uneven_columns = (last_char != eol1 && last_char != eol2); + } + + // convert return value to Cell array + Array<octave_idx_type> ra_idx (dim_vector (1,2)); + + // (err & 1) means "error, and no columns read this row + // FIXME -- This may redundant now that done_after=0 says the same + if (err & 1) + done_after = out.size () + 1; + int valid_rows = (row == ntimes) ? ntimes : ((err & 1) ? 
row : row+1); + dim_vector dv (valid_rows, 1); + + ra_idx(0) = 0; + int i = 0; + if (!collect_output) + { + retval = Cell (dim_vector (1, out.size ())); + for (std::list<octave_value>::iterator col = out.begin (); + col != out.end (); col++, i++) + { + // trim last columns if that was requested + if (i == done_after && uneven_columns) + dv = dim_vector (std::max (valid_rows - 1, 0), 1); + + ra_idx(1) = i; + retval = do_cat_op (retval, octave_value (Cell (col->resize (dv,0))), + ra_idx); + } + } + else // group adjacent cells of the same type into a single cell + { + octave_value cur; // current cell, accumulating columns + octave_idx_type group_size = 0; // columns in this cell + int prev_type = -1; + + conv = 0; + retval = Cell (); + for (std::list<octave_value>::iterator col = out.begin (); + col != out.end (); col++) + { + if (!merge_with_prev [conv++]) // including first time + { + if (prev_type != -1) + { + ra_idx(1) = i++; + retval = do_cat_op (retval, octave_value (Cell(cur)), + ra_idx); + } + cur = octave_value (col->resize (dv,0)); + group_size = 1; + prev_type = col->type_id (); + } + else + { + ra_idx(1) = group_size++; + cur = do_cat_op (cur, octave_value (col->resize (dv,0)), + ra_idx); + } + } + ra_idx(1) = i; + retval = do_cat_op (retval, octave_value (Cell (cur)), ra_idx); + } + return retval; +} // Calculate x^n. 
Used for ...e+nn so that, for example, 1e2 is // exactly 100 and 5e-1 is 1/2 -double pown (double x, unsigned int n) +static double +pown (double x, unsigned int n) { double retval = 1; @@ -1610,1048 +2301,357 @@ return true; } - -textscan_format_list::textscan_format_list (const std::string& s) - : set_from_first (false), has_string (false), nconv (0), curr_idx (0), - list (dim_vector (16, 1)), buf (0) +DEFUN (textscan, args, , + "-*- texinfo -*-\n\ +@deftypefn {} {@var{C} =} textscan (@var{fid}, @var{format})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat}, @var{param}, @var{value}, @dots{})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{str}, @dots{})\n\ +@deftypefnx {} {[@var{C}, @var{position}] =} textscan (@dots{})\n\ +Read data from a text file or string.\n\ +\n\ +The string @var{str} or file associated with @var{fid} is read from and\n\ +parsed according to @var{format}.\n\ +The function is an extension of @code{strread} and @code{textread}.\n\ +Differences include: the ability to read from either a file or a string,\n\ +additional options, and additional format specifiers.\n\ +\n\ +The input is interpreted as a sequence of \"words\", delimiters\n\ +(such as whitespace) and literals.\n\ +The characters that form delimiters and whitespace are determined\n\ +by the options.\n\ +The format consists of format specifiers interspersed between literals.\n\ +In the format, whitespace forms a delimiter between consecutive literals,\n\ +but is otherwise ignored.\n\ +\n\ +The output @var{C} is a cell array whose second dimension is determined\n\ +by the number of format specifiers.\n\ +\n\ +The first word of the input is matched to the first specifier of the\n\ +format and placed in the first column of the output;\n\ +the second is matched to the second 
specifier and placed in the second column\n\ +and so forth.\n\ +If there are more words than specifiers, the process is repeated until all\n\ +words have been processed or the limit imposed by @var{repeat} has been met\n\ +(see below).\n\ +\n\ +The string @var{format} describes how the words in @var{str} should be\n\ +parsed.\n\ +As in @var{fscanf}, any (non-whitespace) text in the format that is\n\ +not one of these specifiers is considered a literal;\n\ +if there is a literal between two format specifiers then that same literal\n\ +must appear in the input stream between the matching words.\n\ +\n\ +The following specifiers are valid:\n\ +\n\ +@table @code\n\ +@item %f\n\ +@itemx %f64\n\ +@itemx %n\n\ +The word is parsed as a number and converted to double.\n\ +\n\ +@item %f32\n\ +The word is parsed as a number and converted to single (float).\n\ +\n\ +@item %d\n\ +@itemx %d8\n\ +@itemx %d16\n\ +@itemx %d32\n\ +@itemx %d64\n\ +The word is parsed as a number and converted to int8, int16, int32 or int64.\n\ +If no size is specified, int32 is used.\n\ +\n\ +@item %u\n\ +@itemx %u8\n\ +@itemx %u16\n\ +@itemx %u32\n\ +@itemx %u64\n\ +The word is parsed as a number and converted to uint8, uint16, uint32 or\n\ +uint64. 
If no size is specified, uint32 is used.\n\ +\n\ +@item %s\n\ +The word is parsed as a string, ending at the last character before\n\ +whitespace, an end-of-line or a delimiter specified in the options.\n\ +\n\ +@item %q\n\ +The word is parsed as a \"quoted string\".\n\ +If the first character of the string is a double quote (\") then the string\n\ +includes everything until a matching double quote, including whitespace,\n\ +delimiters and end of line characters.\n\ +If a pair of consecutive double quotes appears in the input,\n\ +it is replaced in the output by a single double quote.\n\ +That is, the input \"He said \"\"Hello\"\"\" would return the value\n\ +'He said \"Hello\"'.\n\ +\n\ +@item %c\n\ +The next character of the input is read.\n\ +This includes delimiters, whitespace and end of line characters.\n\ +\n\ +@item %[...]\n\ +@itemx %[^...]\n\ +In the first form, the word consists of the longest run consisting of only\n\ +characters between the brackets.\n\ +Ranges of characters can be specified by a hyphen;\n\ +for example, %[0-9a-zA-Z] matches all alphanumeric characters\n\ +(if the underlying character set is ASCII).\n\ +Since Matlab treats hyphens literally, this expansion only applies to\n\ +alphanumeric characters.\n\ +To include '-' in the set, it should appear first or last in the brackets;\n\ +to include ']', it should be the first character.\n\ +If the first character is '^' then the word consists of characters\n\ +NOT listed.\n\ +\n\ +@item %N...\n\ +For %s, %c, %d, %f, %n, %u, an optional width can be specified as %Ns etc.\n\ +where N is an integer > 1.\n\ +For %c, this causes exactly the next N characters to be read instead of\n\ +a single character.\n\ +For the other specifiers, it is an upper bound on the\n\ +number of characters read;\n\ +normal delimiters can cause fewer characters to be read.\n\ +For complex numbers, this limit applies to the real and imaginary\n\ +components individually.\n\ +For %f and %n, format specifiers like %N.Mf 
are allowed, where M is an upper\n\ +bound on number of characters after the decimal point to be considered;\n\ +subsequent digits are skipped.\n\ +For example, the specifier %8.2f would read 12.345e6 as 1.234e7.\n\ +\n\ +@item %*...\n\ +The word specified by the remainder of the conversion specifier is skipped.\n\ +\n\ +@item literals\n\ +In addition the format may contain literal character strings;\n\ +these will be skipped during reading.\n\ +If the input string does not match this literal, the processing terminates,\n\ +unless \"ReturnOnError\" is set to \"continue\".\n\ +@end table\n\ +\n\ +Parsed words corresponding to the first specifier are returned in the first\n\ +output argument and likewise for the rest of the specifiers.\n\ +\n\ +By default, if there is only one input argument, @var{format} is @t{\"%f\"}.\n\ +This means that numbers are read from @var{str} into a single column vector.\n\ +If @var{format} is explicitly empty, \"\", then textscan will return data\n\ +in a number of columns matching the number of fields on the first data\n\ +line of the input.\n\ +Either of these is suitable only if @var{str} contains only numeric fields.\n\ +\n\ +For example, the string\n\ +\n\ +@example\n\ +@group\n\ +@var{str} = \"\\\n\ +Bunny Bugs 5.5\\n\\\n\ +Duck Daffy -7.5e-5\\n\\\n\ +Penguin Tux 6\"\n\ +@end group\n\ +@end example\n\ +\n\ +@noindent\n\ +can be read using\n\ +\n\ +@example\n\ +@var{a} = textscan (@var{str}, \"%s %s %f\");\n\ +@end example\n\ +\n\ +The optional numeric argument @var{repeat} can be used for limiting the\n\ +number of items read:\n\ +\n\ +@table @asis\n\ +@item -1\n\ +(default) read all of the string or file until the end.\n\ +\n\ +@item N\n\ +Read until the first of two conditions occurs: the format has been processed\n\ +N times, or N lines of the input have been processed.\n\ +Zero (0) is an acceptable value for @var{repeat}.\n\ +Currently, end-of-line characters inside %q, %c, and %[...]$ conversions\n\ +do not contribute to the 
line count.\n\ +This is incompatible with Matlab and may change in the future.\n\ +@end table\n\ +\n\ +The behavior of @code{textscan} can be changed via property-value pairs.\n\ +The following properties are recognized:\n\ +\n\ +@table @asis\n\ +@item @qcode{\"BufSize\"}\n\ +This specifies the number of bytes to use for the internal buffer.\n\ +A modest speed improvement is obtained by setting this to a large value\n\ +when reading a large file, especially if the input contains long strings.\n\ +The default is 4096, or a value dependent on @var{n} if that is specified.\n\ +\n\ +@item @qcode{\"CollectOutput\"}\n\ +A value of 1 or true instructs textscan to concatenate consecutive columns\n\ +of the same class in the output cell array.\n\ +A value of 0 or false (default) leaves output in distinct columns.\n\ +\n\ +@item @qcode{\"CommentStyle\"}\n\ +Parts of @var{str} are considered comments and will be skipped.\n\ +@var{value} is the comment style and can be either\n\ +(1) One string, or 1x1 cell string, to skip everything to the right of it;\n\ +(2) A cell array of two strings, to skip everything between the first and\n\ +second strings.\n\ +Comments are only parsed where whitespace is accepted, and do not act as\n\ +delimiters.\n\ +\n\ +@item @qcode{\"Delimiter\"}\n\ +If @var{value} is a string, any character in @var{value} will be used to\n\ +split @var{str} into words.\n\ +If @var{value} is a cell array of strings,\n\ +any string in the array will be used to split @var{str} into words.\n\ +(default value = any whitespace.)\n\ +\n\ +@item @qcode{\"EmptyValue\"}\n\ +Value to return for empty numeric values in non-whitespace delimited data.\n\ +The default is NaN@.\n\ +When the data type does not support NaN (int32 for example),\n\ +then the default is zero.\n\ +\n\ +@item @qcode{\"EndOfLine\"}\n\ +@var{value} can be either an empty string or one character specifying the\n\ +end of line character, or the pair\n\ +@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} (CRLF).\n\ +In the 
latter case, any of\n\ +@qcode{\"@xbackslashchar{}r\"}, @qcode{\"@xbackslashchar{}n\"} or\n\ +@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is counted as a (single)\n\ +newline.\n\ +If no value is given, @qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is\n\ +used.\n\ +@c If set to \"\" (empty string) EOLs are ignored as delimiters and added\n\ +@c to whitespace.\n\ +\n\ +@c When reading from a character string, optional input argument @var{n}\n\ +@c specifies the number of times @var{format} should be used (i.e., to limit\n\ +@c the amount of data read).\n\ +@c When reading from file, @var{n} specifies the number of data lines to read;\n\ +@c in this sense it differs slightly from the format repeat count in strread.\n\ +\n\ +@item @qcode{\"HeaderLines\"}\n\ +The first @var{value} number of lines of @var{fid} are skipped.\n\ +Note that this does not refer to the first non-comment lines, but the first\n\ +lines of any type.\n\ +\n\ +@item @qcode{\"MultipleDelimsAsOne\"}\n\ +If @var{value} is non-zero,\n\ +treat a series of consecutive delimiters, without whitespace in between,\n\ +as a single delimiter.\n\ +Consecutive delimiter series need not be vertically @qcode{\"aligned\"}.\n\ +Without this option, a single delimiter before the end of the line does\n\ +not cause the line to be considered to end with an empty value,\n\ +but a single delimiter at the start of a line causes the line\n\ +to be considered to start with an empty value.\n\ +\n\ +@item @qcode{\"TreatAsEmpty\"}\n\ +Treat single occurrences (surrounded by delimiters or whitespace) of the\n\ +string(s) in @var{value} as missing values.\n\ +\n\ +@item @qcode{\"ReturnOnError\"}\n\ +If set to numerical 1 or true, return normally as soon as an error\n\ +is encountered, such as trying to read a string using @qcode{%f}.\n\ +If set to 0 or false, return an error and no data.\n\ +If set to \"continue\" (default), textscan attempts to continue reading\n\ +beyond the location; however, this may cause the parsing 
to get out of sync.\n\ +\n\ +@item @qcode{\"Whitespace\"}\n\ +Any character in @var{value} will be interpreted as whitespace and trimmed;\n\ +The default value for whitespace is\n\ +@c Note: the next line specifically has a newline which generates a space\n\ +@c in the output of qcode, but keeps the next line < 80 characters.\n\ +@qcode{\"\n\ +@xbackslashchar{}b@xbackslashchar{}r@xbackslashchar{}n@xbackslashchar{}t\"}\n\ +(note the space). Unless whitespace is set to @qcode{\"\"} (empty) AND at\n\ +least one @qcode{\"%s\"} format conversion specifier is supplied, a space is\n\ +always part of whitespace.\n\ +\n\ +@end table\n\ +\n\ +When the number of words in @var{str} or @var{fid} doesn't match an exact\n\ +multiple of the number of format conversion specifiers,\n\ +textscan's behavior depends on\n\ +whether the last character of the string or file is\n\ +an end-of-line as specified by the EndOfLine option:\n\ +\n\ +@table @asis\n\ +@item last character = end-of-line\n\ +Data columns are padded with empty fields, NaN or 0 (for integer fields)\n\ +so that all columns have equal length\n\ +\n\ +@item last character is not end-of-line\n\ +Data columns are not padded; textscan returns columns of unequal length\n\ +@end table\n\ +\n\ +\n\ +The second output, @var{position}, provides the position, in characters\n\ +from the beginning of the file or string, at which the processing stopped.\n\ +\n\ +@seealso{dlmread, fscanf, load, strread, textread}\n\ +@end deftypefn") { - octave_idx_type num_elts = 0; - - size_t n = s.length (); - - size_t i = 0; - - unsigned int width = -1; // Unspecified width = max (except %c) - int prec = -1; - int bitwidth = 0; - bool discard = false; - char type = '\0'; - - bool have_more = true; - - if (s.length () == 0) + octave_value_list retval; + std::string format; + int params = 1; + + if (args.length () < 1) + print_usage (); + else if (args.length () == 1) + format = "%f"; // ommited format = %f. 
explicit "" = width from file + else if (args(1).is_string ()) + { + format = args(1).string_value (); + if (args(1).is_sq_string ()) + format = do_string_escapes (format); + params++; + } + else + error ("textscan: FORMAT must be a string, not <%s>", + args(1).class_name ().c_str ()); + + octave_idx_type ntimes = -1; + textscan tscanner; + + if (args.length () >= 3) { - buf = new std::ostringstream ("%f"); - bitwidth = 64; - type = 'f'; - add_elt_to_list (width, prec, bitwidth, octave_value (NDArray ()), - discard, type, num_elts); - have_more = false; - set_from_first = true; - nconv = 1; + if (args(2).is_numeric_type ()) + { + ntimes = args(2).idx_type_value (); + if (ntimes < args(2).double_value ()) + error ("textscan: REPEAT = %g is too large", + args(2).double_value ()); + params = 3; + } + } + textscan_format_list fmt_list (format); + + tscanner.parse_options (args, params, fmt_list); + + if (args(0).is_string ()) + { + std::istringstream is (args(0).string_value ()); + octave_value tmp = tscanner.scan (&is, fmt_list, ntimes); + retval(0) = tmp; + + std::ios::iostate state = is.rdstate (); + is.clear (); + retval(1) = octave_value (static_cast<long>(is.tellg ())); + is.setstate (state); } else { - set_from_first = false; - while (i < n) - { - have_more = true; - - if (! buf) - buf = new std::ostringstream (); - - if (s[i] == '%' && (i+1 == n || s[i+1] != '%')) - { - // Process percent-escape conversion type. - - process_conversion (s, i, n, num_elts); - - have_more = (buf != 0); - } - else if (isspace (s[i])) - { - while (++i < n && isspace (s[i])) - /* skip whitespace */; - - have_more = false; - } - else - { - type = textscan_format_elt::literal_conversion; - - width = 0; - prec = -1; - bitwidth = 0; - discard = true; - - while (i < n && ! 
isspace (s[i]) - && (s[i] != '%' || (i+1 < n && s[i+1] == '%'))) - { - if (s[i] == '%') // if double %, skip one - i++; - *buf << s[i++]; - width++; - } - - add_elt_to_list (width, prec, bitwidth, octave_value (), discard, - type, num_elts); - - have_more = false; - } - - if (nconv < 0) - { - have_more = false; - break; - } - } - } - - if (have_more) - add_elt_to_list (width, prec, bitwidth, octave_value (), discard, type, num_elts); - - list.resize (dim_vector (num_elts, 1)); - - delete buf; -} - -textscan_format_list::~textscan_format_list (void) -{ - octave_idx_type n = list.numel (); - - for (octave_idx_type i = 0; i < n; i++) - { - textscan_format_elt *elt = list(i); - delete elt; - } -} - -void -textscan_format_list::add_elt_to_list (unsigned int width, int prec, - int bitwidth, octave_value val_type, - bool discard, char type, - octave_idx_type& num_elts, - const std::string& char_class) -{ - if (buf) - { - std::string text = buf->str (); - - if (! text.empty ()) - { - textscan_format_elt *elt - = new textscan_format_elt (text.c_str (), width, prec, bitwidth, - discard, type, char_class); - - if (num_elts == list.numel ()) - { - list.resize (dim_vector (2 * num_elts, 1)); - } - - if (!discard) - { - output_container.push_back (val_type); - } - list(num_elts++) = elt; - } - - delete buf; - buf = 0; - } -} - -void -textscan_format_list::process_conversion (const std::string& s, size_t& i, - size_t n, octave_idx_type& num_elts) -{ - unsigned width = 0; - int prec = -1; - int bitwidth = 0; - bool discard = false; - octave_value val_type; - char type = '\0'; - - *buf << s[i++]; - - bool have_width = false; - - while (i < n) - { - switch (s[i]) - { - case '*': - if (discard) - nconv = -1; - else - { - discard = true; - *buf << s[i++]; - } - break; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - if (have_width) - nconv = -1; - else - { - char c = s[i++]; - width = width * 10 + c - '0'; - have_width = true; 
- *buf << c; - while (i < n && isdigit (s[i])) - { - c = s[i++]; - width = width * 10 + c - '0'; - *buf << c; - } - - if (i < n && s[i] == '.') - { - *buf << s[i++]; - prec = 0; - while (i < n && isdigit (s[i])) - { - c = s[i++]; - prec = prec * 10 + c - '0'; - *buf << c; - } - } - } - break; - - case 'd': case 'u': - { - bool done = true; - *buf << (type = s[i++]); - if (i < n) - { - if (s[i] == '8') - { - bitwidth = 8; - if (type == 'd') - val_type = octave_value (int8NDArray ()); - else - val_type = octave_value (uint8NDArray ()); - *buf << s[i++]; - } - else if (s[i] == '1' && i+1 < n && s[i+1] == '6') - { - bitwidth = 16; - if (type == 'd') - val_type = octave_value (int16NDArray ()); - else - val_type = octave_value (uint16NDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else if (s[i] == '3' && i+1 < n && s[i+1] == '2') - { - done = false; // use default size below - *buf << s[i++]; - *buf << s[i++]; - } - else if (s[i] == '6' && i+1 < n && s[i+1] == '4') - { - bitwidth = 64; - if (type == 'd') - val_type = octave_value (int64NDArray ()); - else - val_type = octave_value (uint64NDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else - done = false; - } - else - done = false; - - if (!done) - { - bitwidth = 32; - if (type == 'd') - val_type = octave_value (int32NDArray ()); - else - val_type = octave_value (uint32NDArray ()); - } - goto fini; - } - - case 'f': - *buf << (type = s[i++]); - bitwidth = 64; - if (i < n) - { - if (s[i] == '3' && i+1 < n && s[i+1] == '2') - { - bitwidth = 32; - val_type = octave_value (FloatNDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else if (s[i] == '6' && i+1 < n && s[i+1] == '4') - { - val_type = octave_value (NDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else - val_type = octave_value (NDArray ()); - } - else - val_type = octave_value (NDArray ()); - goto fini; - - case 'n': - *buf << (type = s[i++]); - bitwidth = 64; - val_type = octave_value (NDArray ()); - goto fini; - - case 's': case 'q': case 
'[': case 'c': - if (!discard) - val_type = octave_value (Cell ()); - *buf << (type = s[i++]); - has_string = true; - goto fini; - - fini: - { - if (!have_width) - { - if (type == 'c') // %c defaults to one character - width = 1; - else - width = static_cast<unsigned int> (-1); // others: unlimited - } - - if (finish_conversion (s, i, n, width, prec, bitwidth, val_type, - discard, type, num_elts) == 0) - return; - } - break; - - default: - error ("textscan: '%%%c' is not a valid format specifier", s[i]); - } - - if (nconv < 0) - break; - } - - nconv = -1; -} - -// Parse [...] and [^...] -// -// Matlab does not expand expressions like A-Z, but they are useful, and -// so we parse them "carefully". We treat '-' as a usual character -// unless both start and end characters are from the same class (upper -// case, lower case, numeric), or this is not the first '-' in the -// pattern. -// -// Keep both a running list of characters and a mask of which chars have -// occurred. The first is efficient for patterns with few characters. -// The latter is efficient for [^...] patterns. - -static std::string -textscan_char_class (const std::string& pattern) -{ - int len = pattern.length (); - if (len == 0) - return ""; - - std::string retval (256, '\0'); - std::string mask (256, '\0'); // number of times chr has been seen - - int in = 0, out = 0; - unsigned char ch, prev = 0; - bool flip = false; - - ch = pattern[in]; - if (ch == '^') - { - in++; - flip = true; - } - mask[pattern[in]] = '\1'; - retval[out++] = pattern[in++]; // even copy ']' if it is first - - bool prev_was_range = false; // disallow "a-m-z" as a pattern - bool prev_prev_was_range = false; - for (; in < len; in++) - { - bool was_range = false; - ch = pattern[in]; - if (ch == ']') - break; - - if (prev == '-' && in > 1 && isalnum (ch) && ! 
prev_prev_was_range) - { - unsigned char start_of_range = pattern[in-2]; - if (start_of_range < ch - && ((isupper (ch) && isupper (start_of_range)) - || (islower (ch) && islower (start_of_range)) - || (isdigit (ch) && isdigit (start_of_range)) - || mask['-'] > 1)) // not the first '-' - { - was_range = true; - out--; - mask['-']--; - for (int i = start_of_range; i <= ch; i++) - { - if (mask[i] == '\0') - { - mask[i] = '\1'; - retval[out++] = i; - } - } - } - } - if (!was_range) - { - if (mask[ch]++ == 0) - retval[out++] = ch; - else if (ch != '-') - warning_with_id ("octave:textscan-pattern", - "textscan: [...] contains two '%c's", ch); - if (prev == '-' && mask['-'] >= 2) - warning_with_id ("octave:textscan-pattern", - "textscan: [...] contains two '-'s " - "outside range expressions"); - } - prev = ch; - prev_prev_was_range = prev_was_range; - prev_was_range = was_range; - } - if (flip) // [^...] - { - out = 0; - for (int i = 0; i < 256; i++) - if (!mask[i]) - retval[out++] = i; - } - retval.resize (out); - - return retval; -} - -int -textscan_format_list::finish_conversion (const std::string& s, size_t& i, - size_t n, unsigned int& width, - int& prec, int& bitwidth, - octave_value& val_type, bool discard, - char& type, octave_idx_type& num_elts) -{ - int retval = 0; - - std::string char_class; - - size_t beg_idx = std::string::npos; - size_t end_idx = std::string::npos; - - if (type != '%') - { - nconv++; - if (type == '[') - { - if (i < n) - { - beg_idx = i; - - if (s[i] == '^') - { - type = '^'; - *buf << s[i++]; - - if (i < n) - { - beg_idx = i; - - if (s[i] == ']') - *buf << s[i++]; - } - } - else if (s[i] == ']') - *buf << s[i++]; - } - - while (i < n && s[i] != ']') - *buf << s[i++]; - - if (i < n && s[i] == ']') - { - end_idx = i-1; - *buf << s[i++]; - } - - if (s[i-1] != ']') - retval = nconv = -1; - } - } - - if (nconv >= 0) - { - if (beg_idx != std::string::npos && end_idx != std::string::npos) - char_class = textscan_char_class (s.substr (beg_idx, - 
end_idx - beg_idx + 1)); - - add_elt_to_list (width, prec, bitwidth, val_type, discard, type, - num_elts, char_class); + octave_stream os = octave_stream_list::lookup (args(0), "textscan"); + octave_value tmp = tscanner.scan (os.input_stream (), fmt_list, ntimes); + + retval(0) = tmp; + // FIXME -- warn if stream is not opened in binary mode? + std::ios::iostate state = os.input_stream ()->rdstate (); + os.input_stream ()->clear (); + retval(1) = os.tell (); + os.input_stream ()->setstate (state); } return retval; } -void -textscan_format_list::printme (void) const -{ - octave_idx_type n = list.numel (); - - for (octave_idx_type i = 0; i < n; i++) - { - textscan_format_elt *elt = list(i); - - std::cerr - << "width: " << elt->width << "\n" - << "digits " << elt->prec << "\n" - << "bitwidth: " << elt->bitwidth << "\n" - << "discard: " << elt->discard << "\n" - << "type: "; - - if (elt->type == textscan_format_elt::literal_conversion) - std::cerr << "literal text\n"; - else if (elt->type == textscan_format_elt::whitespace_conversion) - std::cerr << "whitespace\n"; - else - std::cerr << elt->type << "\n"; - - std::cerr - << "char_class: `" << undo_string_escapes (elt->char_class) << "'\n" - << "text: `" << undo_string_escapes (elt->text) << "'\n\n"; - } -} - -// If FORMAT is explicitly "", it is assumed to be "%f" repeated enough -// times to read the first row of the file. Set it now. 
- -int -textscan_format_list::read_first_row (dstr& is, textscan& ts) -{ - // Read first line and strip end-of-line, which may be two characters - std::string first_line (20, ' '); - is.getline (first_line, static_cast<char> (ts.eol2)); - if (first_line.length () > 0 - && first_line[first_line.length () - 1] == ts.eol1) - first_line.resize (first_line.length () - 1); - - std::istringstream strstr (first_line); - dstr ds (strstr, is); - - dim_vector dv (1,1); // initial size of each output_container - Complex val; - octave_value val_type; - nconv = 0; - int max_empty = 1000; // failsafe, if ds fails but not with eof - int retval = 0; - - // read line, creating output_container as we go - while (!ds.eof ()) - { - bool already_skipped_delim = false; - ts.skip_whitespace (ds); - ds.progress_benchmark (); - bool progress = false; - ts.scan_complex (ds, *list(0), val); - if (ds.fail ()) - { - ds.clear (ds.rdstate () & ~std::ios::failbit); - - if (ds.eof ()) - break; - - // If we don't continue after a conversion error, then - // unless this was a missing value (i.e., followed by a delimiter), - // return with an error status. - if (ts.return_on_error < 2) - { - ts.skip_delim (ds); - if (ds.no_progress ()) - { - retval = 4; - break; - } - already_skipped_delim = true; - } - else // skip offending field - { - std::ios::iostate state = ds.rdstate (); - ds.clear (); // clear to allow read pointer to advance - - std::string dummy; - textscan_format_elt fe ("", first_line.length ()); - ts.scan_string (ds, fe, dummy); - - progress = (dummy.length ()); - ds.setstate (state); - } - - val = ts.empty_value.scalar_value (); - if (!--max_empty) - break; - } - if (val.imag () == 0) - val_type = octave_value (NDArray (dv, val.real ())); - else - val_type = octave_value (ComplexNDArray (dv, val)); - output_container.push_back (val_type); - if (! already_skipped_delim) - ts.skip_delim (ds); - if (! 
progress && ds.no_progress ()) - break; - nconv++; - } - output_container.pop_front (); // discard empty element from constructor - - //Create fmt now that the size is known - list.resize (dim_vector (nconv, 1)); - for (octave_idx_type i = 1; i < nconv; i++) - list(i) = new textscan_format_elt (*list(0)); - - return retval; // May have returned 4 above. -} - -// Perform actual textscan: read data from stream, and create cell array. - -octave_value -textscan::scan (std::istream *isp, textscan_format_list& fmt_list, - octave_idx_type ntimes) -{ - octave_value retval; - - if (!isp) - error ("internal error: textscan called with invalid istream"); - if (fmt_list.num_conversions () == -1) - error ("textscan: invalid format specified"); - if (fmt_list.num_conversions () == 0) - error ("textscan: no valid format conversion specifiers\n"); - - // skip the first header_lines - std::string dummy; - for (int i = 0; i < header_lines && *isp; i++) - getline (*isp, dummy, static_cast<char> (eol2)); - - // Create our own buffered stream, for fast get/putback/tell/seek. - - // First, see how far ahead it should let us look. - int max_lookahead = std::max (std::max (comment_len, treat_as_empty_len), - std::max (delim_len, 3)); // 3 for NaN and Inf - - // Next, choose a buffer size to avoid reading too much, or too often. - octave_idx_type buf_size; - if (buffer_size) - buf_size = buffer_size; - else if (ntimes > 0) - { - buf_size = 80 * ntimes; - if (buf_size < ntimes) // if overflow... - buf_size = ntimes; - buf_size = std::max (ntimes, std::min (buf_size, 4096)); - } - else - buf_size = 4096; - // Finally, create the stream. - dstr is (*isp, whitespace + delims, max_lookahead, buf_size); - - // Grow retval dynamically. "size" is half the initial size - // (FIXME -- Should we start smaller if ntimes is large?) - octave_idx_type size = ((ntimes < 8 && ntimes >= 0) ? 
ntimes : 1); - Array<octave_idx_type> row_idx (dim_vector (1,2)); - row_idx(1) = 0; - - int err = 0; - octave_idx_type row = 0; - - if (multiple_delims_as_one) // bug #44750? - skip_delim (is); - - int done_after; // Number of columns read when EOF seen. - - // If FORMAT explicitly "", read first line and see how many "%f" match - if (fmt_list.set_from_first) - { - err = fmt_list.read_first_row (is, *this); - lines = 1; - - done_after = fmt_list.numel () + 1; - if (!err) - row = 1; // the above puts the first line into fmt_list.out_buf () - } - else - done_after = fmt_list.out_buf ().size () + 1; - - std::list<octave_value> out = fmt_list.out_buf (); - - // We will later merge adjacent columns of the same type. - // Check now which columns to merge. - // Reals may become complex, and so we can't trust types - // after reading in data. - // If the format was "", that conversion may already have happened, - // so force all to be merged (as all are %f). - bool merge_with_prev[fmt_list.numel ()]; - int conv = 0; - if (collect_output) - { - int prev_type = -1; - for (std::list<octave_value>::iterator col = out.begin (); - col != out.end (); col++) - { - if (col->type_id () == prev_type - || (fmt_list.set_from_first && prev_type != -1)) - merge_with_prev [conv++] = true; - else - merge_with_prev [conv++] = false; - prev_type = col->type_id (); - } - } - - // This should be caught by earlier code, but this avoids a possible - // infinite loop below. - if (fmt_list.num_conversions () == 0) - error ("textscan: No conversions specified"); - - - // Read the data. This is the main loop. 
- if (!err) - for (/* row set ~30 lines above */; row < ntimes || ntimes == -1; row++) - { - if (row == 0 || row >= size) - { - size += size+1; - for (std::list<octave_value>::iterator col = out.begin (); - col != out.end (); col++) - *col = (*col).resize (dim_vector (size, 1), 0); - } - row_idx(0) = row; - err = read_format_once (is, fmt_list, out, row_idx, done_after); - if (err > 0 || !is || (lines >= ntimes && ntimes > -1)) - break; - } - - if ((err & 4) && !return_on_error) - error ("textscan: Read error in field %d of row %d", - done_after + 1, row + 1); - - // If file does not end in EOL, do not pad columns with NaN. - bool uneven_columns = false; - if (isp->eof () || (err & 4)) - { - isp->clear (); - isp->seekg (-1, std::ios_base::end); - int last_char = isp->get (); - isp->setstate (isp->eofbit); - uneven_columns = (last_char != eol1 && last_char != eol2); - } - - // convert return value to Cell array - Array<octave_idx_type> ra_idx (dim_vector (1,2)); - - // (err & 1) means "error, and no columns read this row - // FIXME -- This may redundant now that done_after=0 says the same - if (err & 1) - done_after = out.size () + 1; - int valid_rows = (row == ntimes) ? ntimes : ((err & 1) ? 
row : row+1); - dim_vector dv (valid_rows, 1); - - ra_idx(0) = 0; - int i = 0; - if (!collect_output) - { - retval = Cell (dim_vector (1, out.size ())); - for (std::list<octave_value>::iterator col = out.begin (); - col != out.end (); col++, i++) - { - // trim last columns if that was requested - if (i == done_after && uneven_columns) - dv = dim_vector (std::max (valid_rows - 1, 0), 1); - - ra_idx(1) = i; - retval = do_cat_op (retval, octave_value (Cell (col->resize (dv,0))), - ra_idx); - } - } - else // group adjacent cells of the same type into a single cell - { - octave_value cur; // current cell, accumulating columns - octave_idx_type group_size = 0; // columns in this cell - int prev_type = -1; - - conv = 0; - retval = Cell (); - for (std::list<octave_value>::iterator col = out.begin (); - col != out.end (); col++) - { - if (!merge_with_prev [conv++]) // including first time - { - if (prev_type != -1) - { - ra_idx(1) = i++; - retval = do_cat_op (retval, octave_value (Cell(cur)), - ra_idx); - } - cur = octave_value (col->resize (dv,0)); - group_size = 1; - prev_type = col->type_id (); - } - else - { - ra_idx(1) = group_size++; - cur = do_cat_op (cur, octave_value (col->resize (dv,0)), - ra_idx); - } - } - ra_idx(1) = i; - retval = do_cat_op (retval, octave_value (Cell (cur)), ra_idx); - } - return retval; -} - -// Create a delimited stream, reading from is, with delimiters delims, -// and allowing reading of up to tellg + longest_lookeahead. When is -// is at EOF, lookahead may be padded by ASCII nuls. 
- -dstr::dstr (std::istream& is, const std::string& delimiters, - int longest_lookahead, octave_idx_type bsize) - : bufsize (bsize), i_stream (is), longest (longest_lookahead), - delims (delimiters), - flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 -{ - buf = new char[bufsize]; - eob = buf + bufsize; - idx = eob; // refresh_buf shouldn't try to copy old data - progress_marker = idx; - refresh_buf (); // load the first batch of data -} - -// Used to create a stream from a strstream from data read from a dstr. -// FIXME: Find a more efficient approach. Perhaps derived dstrstream -dstr::dstr (std::istream& is, const dstr& ds) - : bufsize (ds.bufsize), i_stream (is), longest (ds.longest), - delims (ds.delims), - flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 -{ - buf = new char[bufsize]; - eob = buf + bufsize; - idx = eob; // refresh_buf shouldn't try to copy old data - progress_marker = idx; - refresh_buf (); // load the first batch of data -} - -dstr::~dstr () -{ - // Seek to the correct position in i_stream. - if (!eof ()) - { - i_stream.clear (); - i_stream.seekg (buf_in_file); - i_stream.read (buf, idx - buf); - } - - delete [] buf; -} - -// Read a character from the buffer, refilling the buffer from the file -// if necessary. - -int -dstr::get_undelim () -{ - int retval; - if (eof ()) - { - setstate (std::ios_base::failbit); - return EOF; - } - - if (idx < eob) - retval = *idx++; - else - { - refresh_buf (); - if (eof ()) - { - setstate (std::ios_base::eofbit); - retval = EOF; - } - else - retval = *idx++; - } - if (idx >= last) - delimited = false; - return retval; -} - -// Return the next character to be read without incrementing the -// pointer, refilling the buffer from the file if necessary. - -int -dstr::peek_undelim () -{ - int retval = get_undelim (); - putback (); - - return retval; -} - -// Copy remaining unprocessed data to the start of the buffer and load -// new data to fill it. 
Return EOF if the file is at EOF before -// reading any data and all of the data that has been read has been -// processed. - -int -dstr::refresh_buf (void) -{ - if (eof ()) - return EOF; - - int retval; - int old_remaining = eob - idx; - - if (old_remaining < 0) - { - idx = eob; - old_remaining = 0; - } - - octave_quit (); // allow ctrl-C - - if (old_remaining > 0) - memmove (buf, idx, old_remaining); - - progress_marker -= idx - buf; // where original idx would have been - idx = buf; - - int gcount; // chars read - if (!i_stream.eof ()) - { - buf_in_file = i_stream.tellg (); // record for destructor - i_stream.read (buf + old_remaining, bufsize - old_remaining); - gcount = i_stream.gcount (); - } - else - gcount = 0; - - eob = buf + old_remaining + gcount; - last = eob; - if (gcount == 0) - { - delimited = false; - if (eob != buf) // no more data in file, but still some to go - retval = 0; - else - retval = EOF; // file and buffer are both done. - } - else - { - delimited = true; - - for (last = eob - longest; last - buf >= 0; last--) - { - if (strchr (delims.c_str (), *last)) - break; - } - if (last - buf < 0) - delimited = false; - - retval = 0; - } - - if (retval == EOF) // Ensure fast peek doesn't give valid char - *idx = '\0'; // FIXME - check that no TreatAsEmpty etc starts w. \0? - - return retval; -} - -// Return a pointer to a block of data of size size, assuming that a -// sufficiently large buffer is available in buffer, if required. -// If called when delimited == true, and size is no greater than -// longest_lookahead then this will not call refresh_buf, so seekg -// still works. Otherwise, seekg may be invalidated. - -char * -dstr::read (char *buffer, int size, char* &prior_tell) -{ - char *retval; - if (eob - idx > size) - { - retval = idx; - idx += size; - if (idx > last) - delimited = false; - } - else - { - // If there was a tellg pointing to an earlier point than the current - // read position, try to keep it in the active buffer. 
- // In the current code, prior_tell==idx for each call, - // so this is not necessary, just a precaution. - if (eob - prior_tell + size < bufsize) - { - octave_idx_type gap = idx - prior_tell; - idx = prior_tell; - refresh_buf (); - idx += gap; - } - else // can't keep the tellg in range. May skip some data. - { - refresh_buf (); - } - prior_tell = buf; - - if (eob - idx > size) - { - retval = idx; - idx += size; - if (idx > last) - delimited = false; - } - else - { - if (size <= bufsize) // small read, but reached EOF - { - retval = idx; - memset (eob, 0, size + (idx - buf)); - idx += size; - } - else // Reading more than the whole buf; return it in buffer - { - retval = buffer; - // FIXME -- read bufsize at a time - int i; - for (i = 0; i < size && !eof (); i++) - *buffer++ = get_undelim (); - if (eof ()) - memset (buffer, 0, size - i); - } - } - } - return retval; -} - -// Return in OUT an entire line, terminated by delim. On input, OUT -// must have length at least 1. - -int -dstr::getline (std::string& out, char delim) -{ - int len = out.length (), used = 0; - int ch; - while ((ch = get_undelim ()) != delim && ch != EOF) - { - out[used++] = ch; - if (used == len) - { - len <<= 1; - out.resize (len); - } - } - out.resize (used); - field_done (); - - return ch; -} - /* %!test %! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12";
--- a/libinterp/corefcn/textscan.h Thu Mar 17 15:21:02 2016 -0700 +++ b/libinterp/corefcn/textscan.h Thu Mar 17 17:46:56 2016 -0400 @@ -1,4 +1,5 @@ /* + Copyright (C) 2015-2016 Lachlan Andrew, Monash University This file is part of Octave. @@ -44,22 +45,8 @@ class dstr { - int bufsize; // number of characters to read from the file at once - std::istream& i_stream; // stream to read from - char *buf; // temporary storage for a "chunk" of data - char *idx; // Current read pointer - char *last; // location of last delimiter in the buffer at buf - // (undefined if delimited is false) - char *eob; // Position after last character in buffer - bool delimited; // True if there is delimiter in the bufer after idx - int longest; // longest lookahead required - const std::string delims; // sequence of single-character delimiters +public: - std::streampos buf_in_file; // Position of start of buf in original stream - - char *progress_marker; // Marker to see if a read consumes any characters - -public: dstr (std::istream& is, const std::string& delimiters, int longest_lookahead, octave_idx_type bsize = 4096); @@ -124,17 +111,34 @@ // (get, read etc. 
not cancelled by putback or seekg) void progress_benchmark (void) { progress_marker = idx; } bool no_progress (void) { return progress_marker == idx; } + private: + + int bufsize; // number of characters to read from the file at once + std::istream& i_stream; // stream to read from + char *buf; // temporary storage for a "chunk" of data + char *idx; // Current read pointer + char *last; // location of last delimiter in the buffer at buf + // (undefined if delimited is false) + char *eob; // Position after last character in buffer + bool delimited; // True if there is delimiter in the bufer after idx + int longest; // longest lookahead required + const std::string delims; // sequence of single-character delimiters + + std::streampos buf_in_file; // Position of start of buf in original stream + + char *progress_marker; // Marker to see if a read consumes any characters + std::ios_base::iostate flags; - // No copying + // No copying! + dstr (const dstr&); dstr& operator = (const dstr&); }; -/** - * A single conversion specifier, such as %f or %c - */ +// A single conversion specifier, such as %f or %c. 
+ class OCTINTERP_API textscan_format_elt @@ -318,6 +322,29 @@ class textscan { +public: + + textscan (void) + : buf (""), delim_table (""), delims (), comment_len (0), comment_char(-2), + buffer_size (0), + empty_value (octave_NaN), exp_chars ("edED"), header_lines (0), + treat_as_empty_len (0), whitespace (" \b\t"), eol1('\r'), eol2('\n'), + return_on_error (2), collect_output (false), + multiple_delims_as_one (false), default_exp (true), + numeric_delim (false), lines (0) + { + inf_nan = Cell (dim_vector (1,2)); + inf_nan(0) = Cell (octave_value ("inf")); + inf_nan(1) = Cell (octave_value ("nan")); + }; + + octave_value scan (std::istream* isp, textscan_format_list& fmt_list, + octave_idx_type ntimes); + void parse_options (const octave_value_list& args, int first_param, + textscan_format_list& formats); + +private: + friend class textscan_format_list; std::string buf; @@ -396,27 +423,6 @@ bool isspace (unsigned int ch) const { return whitespace_table[ch & 0xff]; } // true if the only delimiter is whitespace bool whitespace_delim (void) const { return delim_table.length () == 0; } - - public: - - textscan (void) - : buf (""), delim_table (""), delims (), comment_len (0), comment_char(-2), - buffer_size (0), - empty_value (octave_NaN), exp_chars ("edED"), header_lines (0), - treat_as_empty_len (0), whitespace (" \b\t"), eol1('\r'), eol2('\n'), - return_on_error (2), collect_output (false), - multiple_delims_as_one (false), default_exp (true), - numeric_delim (false), lines (0) - { - inf_nan = Cell (dim_vector (1,2)); - inf_nan(0) = Cell (octave_value ("inf")); - inf_nan(1) = Cell (octave_value ("nan")); - }; - - octave_value scan (std::istream* isp, textscan_format_list& fmt_list, - octave_idx_type ntimes); - void parse_options (const octave_value_list& args, int first_param, - textscan_format_list& formats); }; #endif