# HG changeset patch # User John W. Eaton # Date 1458400905 14400 # Node ID 7a19c5678f91200c8d88f0ab1dc1d0d5835370ea # Parent 24aab5c342bf87007fd6e46fd5ab568eaddd11ba move textscan class to oct-stream.cc and textscan function to file-io.cc * file-io.cc (Ftextscan): Move here. * textxcan.cc: From here. * oct-stream.h, oct-stream.cc (delimited_stream, textscan_format_elt, textscan_format_list, textscan): Move here. * textscan.h, textscan.cc: From here. * textscan.h, textscan.cc: Delete. * libinterp/corefcn/module.mk: Update. diff -r 24aab5c342bf -r 7a19c5678f91 libinterp/corefcn/file-io.cc --- a/libinterp/corefcn/file-io.cc Sat Mar 19 09:20:05 2016 -0400 +++ b/libinterp/corefcn/file-io.cc Sat Mar 19 11:21:45 2016 -0400 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -68,10 +69,11 @@ #include "oct-fstrm.h" #include "oct-iostrm.h" #include "oct-map.h" -#include "ovl.h" #include "oct-prcstrm.h" #include "oct-stream.h" #include "oct-strstrm.h" +#include "ov.h" +#include "ovl.h" #include "pager.h" #include "sysdep.h" #include "utils.h" @@ -1166,6 +1168,1035 @@ return Ffscanf (tmp_args, nargout); } +DEFUN (textscan, args, , + "-*- texinfo -*-\n\ +@deftypefn {} {@var{C} =} textscan (@var{fid}, @var{format})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat}, @var{param}, @var{value}, @dots{})\n\ +@deftypefnx {} {@var{C} =} textscan (@var{str}, @dots{})\n\ +@deftypefnx {} {[@var{C}, @var{position}, @var{errmsg}] =} textscan (@dots{})\n\ +Read data from a text file or string.\n\ +\n\ +The string @var{str} or file associated with @var{fid} is read from and\n\ +parsed according to @var{format}.\n\ +The function is an extension of @code{strread} and @code{textread}.\n\ +Differences include: the ability to read from either a file or a string,\n\ +additional options, and additional format specifiers.\n\ +\n\ +The input is interpreted as a sequence of \"words\", delimiters\n\ +(such as whitespace) and literals.\n\ +The characters that form delimiters and whitespace are determined\n\ +by the options.\n\ +The format consists of format specifiers interspersed between literals.\n\ +In the format, whitespace forms a delimiter between consecutive literals,\n\ +but is otherwise ignored.\n\ +\n\ +The output @var{C} is a cell array whose second dimension is determined\n\ +by the number of format specifiers.\n\ +\n\ +The first word of the input is matched to the first specifier of the\n\ +format and placed in the first column of the output;\n\ +the second is matched to the second specifier and placed in the second column\n\ +and so forth.\n\ +If there are more words than specifiers, the process is repeated until all\n\ +words have been processed or the limit imposed by @var{repeat} has been met\n\ +(see below).\n\ +\n\ +The string @var{format} describes how the words in @var{str} should be\n\ +parsed.\n\ +As in @var{fscanf}, any (non-whitespace) text in the format that is\n\ +not one of these specifiers is considered a literal;\n\ +if there is a literal between two format specifiers then that same literal\n\ +must appear in the input stream between the matching words.\n\ +\n\ +The following specifiers are valid:\n\ +\n\ +@table @code\n\ +@item %f\n\ +@itemx %f64\n\ +@itemx %n\n\ +The word is parsed as a number and converted to double.\n\ +\n\ +@item %f32\n\ +The word is parsed as a number and converted to single (float).\n\ +\n\ +@item %d\n\ +@itemx %d8\n\ +@itemx %d16\n\ +@itemx %d32\n\ +@itemx %d64\n\ +The word is parsed as a number and converted to int8, int16, int32 or int64.\n\ +If not size is specified, int32 is used.\n\ +\n\ +@item %u\n\ +@itemx %u8\n\ +@itemx %u16\n\ +@itemx %u32\n\ +@itemx %u64\n\ +The word is parsed as a number and converted to uint8, uint16, uint32 or\n\ +uint64. If not size is specified, uint32 is used.\n\ +\n\ +@item %s\n\ +The word is parsed as a string, ending at the last character before\n\ +whitespace, an end-of-line or a delimiter specified in the options.\n\ +\n\ +@item %q\n\ +The word is parsed as a \"quoted string\".\n\ +If the first character of the string is a double quote (\") then the string\n\ +includes everything until a matching double quote, including whitespace,\n\ +delimiters and end of line characters.\n\ +If a pair of consecutive double quotes appears in the input,\n\ +it is replaced in the output by a single double quote.\n\ +That is, the input \"He said \"\"Hello\"\"\" would return the value\n\ +'He said \"Hello\"'.\n\ +\n\ +@item %c\n\ +The next character of the input is read.\n\ +This includes delimiters, whitespace and end of line characters.\n\ +\n\ +@item %[...]\n\ +@itemx %[^...]\n\ +In the first form, the word consists of the longest run consisting of only\n\ +characters between the brackets.\n\ +Ranges of characters can be specified by a hyphen;\n\ +for example, %[0-9a-zA-Z] matches all alphanumeric characters\n\ +(if the underlying character set is ASCII).\n\ +Since Matlab treats hyphens literally, this expansion only applies to\n\ +alphanumeric characters.\n\ +To include '-' in the set, it should appear first or last in the brackets;\n\ +to include ']', it should be the first character.\n\ +If the first character is '^' then the word consists of characters\n\ +NOT listed.\n\ +\n\ +@item %N...\n\ +For %s, %c %d, %f, %n, %u, an optional width can be specified as %Ns etc.\n\ +where N is an integer > 1.\n\ +For %c, this causes exactly the next N characters to be read instead of\n\ +a single character.\n\ +For the other specifiers, it is an upper bound on the\n\ +number of characters read;\n\ +normal delimiters can cause fewer characters to be read.\n\ +For complex numbers, this limit applies to the real and imaginary\n\ +components individually.\n\ +For %f and %n, format specifiers like %N.Mf are allowed, where M is an upper\n\ +bound on number of characters after the decimal point to be considered;\n\ +subsequent digits are skipped.\n\ +For example, the specifier %8.2f would read 12.345e6 as 1.234e7.\n\ +\n\ +@item %*...\n\ +The word specified by the remainder of the conversion specifier is skipped.\n\ +\n\ +@item literals\n\ +In addition the format may contain literal character strings;\n\ +these will be skipped during reading.\n\ +If the input string does not match this literal, the processing terminates,\n\ +unless \"ReturnOnError\" is set to \"continue\".\n\ +@end table\n\ +\n\ +Parsed words corresponding to the first specifier are returned in the first\n\ +output argument and likewise for the rest of the specifiers.\n\ +\n\ +By default, if there is only one input argument, @var{format} is @t{\"%f\"}.\n\ +This means that numbers are read from @var{str} into a single column vector.\n\ +If @var{format} is explicitly empty, \"\", then textscan will return data\n\ +in a number of columns matching the number of fields on the first data\n\ +line of the input.\n\ +Either of these is suitable only if @var{str} contains only numeric fields.\n\ +\n\ +For example, the string\n\ +\n\ +@example\n\ +@group\n\ +@var{str} = \"\\\n\ +Bunny Bugs 5.5\\n\\\n\ +Duck Daffy -7.5e-5\\n\\\n\ +Penguin Tux 6\"\n\ +@end group\n\ +@end example\n\ +\n\ +@noindent\n\ +can be read using\n\ +\n\ +@example\n\ +@var{a} = textscan (@var{str}, \"%s %s %f\");\n\ +@end example\n\ +\n\ +The optional numeric argument @var{repeat} can be used for limiting the\n\ +number of items read:\n\ +\n\ +@table @asis\n\ +@item -1\n\ +(default) read all of the string or file until the end.\n\ +\n\ +@item N\n\ +Read until the first of two conditions occurs: the format has been processed\n\ +N times, or N lines of the input have been processed.\n\ +Zero (0) is an acceptable value for @var{repeat}.\n\ +Currently, end-of-line characters inside %q, %c, and %[...]$ conversions\n\ +do not contribute to the line count.\n\ +This is incompatible with Matlab and may change in future.\n\ +@end table\n\ +\n\ +The behavior of @code{textscan} can be changed via property-value pairs.\n\ +The following properties are recognized:\n\ +\n\ +@table @asis\n\ +@item @qcode{\"BufSize\"}\n\ +This specifies the number of bytes to use for the internal buffer.\n\ +A modest speed improvement is obtained by setting this to a large value\n\ +when reading a large file, especially the input contains long strings.\n\ +The default is 4096, or a value dependent on @var{n} is that is specified.\n\ +\n\ +@item @qcode{\"CollectOutput\"}\n\ +A value of 1 or true instructs textscan to concatenate consecutive columns\n\ +of the same class in the output cell array.\n\ +A value of 0 or false (default) leaves output in distinct columns.\n\ +\n\ +@item @qcode{\"CommentStyle\"}\n\ +Parts of @var{str} are considered comments and will be skipped.\n\ +@var{value} is the comment style and can be either\n\ +(1) One string, or 1x1 cell string, to skip everything to the right of it;\n\ +(2) A cell array of two strings, to skip everything between the first and\n\ +second strings.\n\ +Comments are only parsed where whitespace is accepted, and do not act as\n\ +delimiters.\n\ +\n\ +@item @qcode{\"Delimiter\"}\n\ +If @var{value} is a string, any character in @var{value} will be used to\n\ +split @var{str} into words.\n\ +If @var{value} is a cell array of strings,\n\ +any string in the array will be used to split @var{str} into words.\n\ +(default value = any whitespace.)\n\ +\n\ +@item @qcode{\"EmptyValue\"}\n\ +Value to return for empty numeric values in non-whitespace delimited data.\n\ +The default is NaN@.\n\ +When the data type does not support NaN (int32 for example),\n\ +then default is zero.\n\ +\n\ +@item @qcode{\"EndOfLine\"}\n\ +@var{value} can be either a emtpy or one character specifying the\n\ +end of line character, or the pair\n\ +@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} (CRLF).\n\ +In the latter case, any of\n\ +@qcode{\"@xbackslashchar{}r\"}, @qcode{\"@xbackslashchar{}n\"} or\n\ +@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is counted as a (single)\n\ +newline.\n\ +If no value is given, @qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is\n\ +used.\n\ +@c If set to \"\" (empty string) EOLs are ignored as delimiters and added\n\ +@c to whitespace.\n\ +\n\ +@c When reading from a character string, optional input argument @var{n}\n\ +@c specifies the number of times @var{format} should be used (i.e., to limit\n\ +@c the amount of data read).\n\ +@c When reading from file, @var{n} specifies the number of data lines to read;\n\ +@c in this sense it differs slightly from the format repeat count in strread.\n\ +\n\ +@item @qcode{\"HeaderLines\"}\n\ +The first @var{value} number of lines of @var{fid} are skipped.\n\ +Note that this does not refer to the first non-comment lines, but the first\n\ +lines of any type.\n\ +\n\ +@item @qcode{\"MultipleDelimsAsOne\"}\n\ +If @var{value} is non-zero,\n\ +treat a series of consecutive delimiters, without whitespace in between,\n\ +as a single delimiter.\n\ +Consecutive delimiter series need not be vertically @qcode{\"aligned\"}.\n\ +Without this option, a single delimiter before the end of the line does\n\ +not cause the line to be considered to end with an empty value,\n\ +but a single delimiter at the start of a line causes the line\n\ +to be considered to start with an empty value.\n\ +\n\ +@item @qcode{\"TreatAsEmpty\"}\n\ +Treat single occurrences (surrounded by delimiters or whitespace) of the\n\ +string(s) in @var{value} as missing values.\n\ +\n\ +@item @qcode{\"ReturnOnError\"}\n\ +If set to numerical 1 or true, return normally as soon as an error\n\ +is encountered, such as trying to read a string using @qcode{%f}.\n\ +If set to 0 or false, return an error and no data.\n\ +If set to \"continue\" (default), textscan attempts to continue reading\n\ +beyond the location; however, this may cause the parsing to get out of sync.\n\ +\n\ +@item @qcode{\"Whitespace\"}\n\ +Any character in @var{value} will be interpreted as whitespace and trimmed;\n\ +The default value for whitespace is\n\ +@c Note: the next line specifically has a newline which generates a space\n\ +@c in the output of qcode, but keeps the next line < 80 characters.\n\ +@qcode{\"\n\ +@xbackslashchar{}b@xbackslashchar{}r@xbackslashchar{}n@xbackslashchar{}t\"}\n\ +(note the space). Unless whitespace is set to @qcode{\"\"} (empty) AND at\n\ +least one @qcode{\"%s\"} format conversion specifier is supplied, a space is\n\ +always part of whitespace.\n\ +\n\ +@end table\n\ +\n\ +When the number of words in @var{str} or @var{fid} doesn't match an exact\n\ +multiple of the number of format conversion specifiers,\n\ +textscan's behavior depends on\n\ +whether the last character of the string or file is\n\ +an end-of-line as specified by the EndOfLine option:\n\ +\n\ +@table @asis\n\ +@item last character = end-of-line\n\ +Data columns are padded with empty fields, NaN or 0 (for integer fields)\n\ +so that all columns have equal length\n\ +\n\ +@item last character is not end-of-line\n\ +Data columns are not padded; textscan returns columns of unequal length\n\ +@end table\n\ +\n\ +\n\ +The second output, @var{position}, provides the position, in characters\n\ +from the beginning of the file or string, at which the processing stopped.\n\ +\n\ +@seealso{dlmread, fscanf, load, strread, textread}\n\ +@end deftypefn") +{ + static std::string who = "textscan"; + + octave_value_list retval; + + if (args.length () < 1) + print_usage (); + + octave_value_list tmp_args = args.splice (0, 1); + + octave_idx_type count = 0; + + textscan tscanner; + + if (args(0).is_string ()) + { + std::string data = args(0).string_value (); + + octave_stream os = octave_istrstream::create (data); + + if (! os.is_valid ()) + error ("%s: unable to create temporary input buffer", who.c_str ()); + + std::istream *isp = os.input_stream (); + + octave_value result = tscanner.scan (*isp, tmp_args, count); + + std::string errmsg = os.error (); + + return ovl (result, count, errmsg); + } + else + { + octave_stream os = octave_stream_list::lookup (args(0), who); + + std::istream *isp = os.input_stream (); + + if (! isp) + error ("internal error: textscan called with invalid istream"); + + octave_value result = tscanner.scan (*isp, tmp_args, count); + + std::string errmsg = os.error (); + + return ovl (result, count, errmsg); + } + + return retval; +} + +/* +%!test +%! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12"; +%! fmtstr = "%f %d %f %s"; +%! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf); +%! assert (isequal (c{1}, [1;5])); +%! assert (length (c{1}), 2); +%! assert (iscellstr (c{4})); +%! assert (isequal (c{3}, [3; -Inf])); + +%!test +%! b = [10:10:100]; +%! b = [b; 8*b/5]; +%! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b); +%! fmt = "%f miles/hr = %f kilometers/hr"; +%! c = textscan (str, fmt); +%! assert (c{1}, b(1,:)', 1e-5); +%! assert (c{2}, b(2,:)', 1e-5); + +%!test +%! str = "13, -, NA, str1, -25\r\n// Middle line\r\n36, na, 05, str3, 6"; +%! a = textscan (str, "%d %n %f %s %n", "delimiter", ",", +%! "treatAsEmpty", {"NA", "na", "-"},"commentStyle", "//"); +%! assert (a{1}, int32 ([13; 36])); +%! assert (a{2}, [NaN; NaN]); +%! assert (a{3}, [NaN; 5]); +%! assert (a{4}, {"str1"; "str3"}); +%! assert (a{5}, [-25; 6]); + +%!test +%! str = "Km:10 = hhhBjjj miles16hour\r\n"; +%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"]; +%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"]; +%! str = [str "Km:25 = hhhZ\r\n"]; +%! fmt = "Km:%d = hhh%1sjjj miles%dhour"; +%! a = textscan (str, fmt, "delimiter", " "); +%! assert (a{1}', int32 ([10 15 2 25])); +%! assert (a{2}', {'B' 'J' 'R' 'Z'}); +%! assert (a{3}', int32 ([16 241 3 0])); + +## Test with default endofline parameter +%!test +%! c = textscan ("L1\nL2", "%s"); +%! assert (c{:}, {"L1"; "L2"}); + +## Test with endofline parameter set to "" (empty) - newline should be in word +%!test +%! c = textscan ("L1\nL2", "%s", "endofline", ""); +%! assert (int8 ([c{:}{:}]), int8 ([ 76, 49, 10, 76, 50 ])); + +### Matlab fails this test. A literal after a conversion is not a delimiter +#%!test +#%! ## No delimiters at all besides EOL. Skip fields, even empty fields +#%! str = "Text1Text2Text\nTextText4Text\nText57Text"; +#%! c = textscan (str, "Text%*dText%dText"); +#%! assert (c{1}, int32 ([2; 4; 0])); + +## CollectOutput test +%!test +%! b = [10:10:100]; +%! b = [b; 8*b/5; 8*b*1000/5]; +%! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b); +%! fmt = "%f miles%s %s %f (%f) kilometers %*s"; +%! c = textscan (str, fmt, "collectoutput", 1); +%! assert (size (c{3}), [10, 2]); +%! assert (size (c{2}), [10, 2]); + +## CollectOutput test with uneven column length files +%!test +%! b = [10:10:100]; +%! b = [b; 8*b/5; 8*b*1000/5]; +%! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b); +%! str = [str "110 miles/hr"]; +%! fmt = "%f miles%s %s %f (%f) kilometers %*s"; +%! c = textscan (str, fmt, "collectoutput", 1); +%! assert (size (c{1}), [11, 1]); +%! assert (size (c{3}), [11, 2]); +%! assert (size (c{2}), [11, 2]); +%! assert (c{3}(end), NaN); +%! assert (c{2}{11, 1}, "/hr"); +%! assert (isempty (c{2}{11, 2}), true); + +## Double quoted string +%!test +%! str = 'First "the second called ""the middle""" third'; +%! fmt = "%q"; +%! c = textscan (str, fmt); +%! assert (c{1}, {"First"; 'the second called "the middle"'; "third"}); + +## Arbitrary character +%!test +%! c = textscan ("a first, \n second, third", "%s %c %11c", 'delimiter', ' ,'); +%! assert (c{1}, {"a"; "ond"}); +%! assert (c{2}, {"f"; "t"}); +%! assert (c{3}, {"irst, \n sec"; "hird"}); + +## Field width and non-standard delimiters +%!test +%! str = "12;34;123456789;7"; +%! c = textscan (str, "%4d %4d", "delimiter", ";", "collectOutput", 1); +%! assert (c, {[12 34; 1234 5678; 9 7]}); + +## Field width and non-standard delimiters +%!test +%! str = "12;34;123456789;7"; +%! c = textscan (str, "%4f %f", "delimiter", ";", "collectOutput", 1); +%! assert (c, {[12 34; 1234 56789; 7 NaN]}); + +## Ignore trailing delimiter, but use leading one +%!test +%! str = "12.234e+2,34, \n12345.789-9876j,78\n,10|3"; +%! c = textscan (str, "%10.2f %f", "delimiter", ",", "collectOutput", 1, +%! "expChars", "e|"); +%! assert (c, {[1223 34; 12345.79-9876j 78; NaN 10000]}, 1e-6); + +%!test +%! ## Multi-character delimiter +%! str = "99end2 space88gap 4564"; +%! c = textscan (str, "%d %s", "delimiter", {"end", "gap", "space"}); +%! assert (c{1}, int32 ([99; 88])); +%! assert (c{2}, {"2 "; "4564"}); + +### Delimiters as part of literals, and following literals +#%!test +#%! str = "12 R&D & 7"; +#%! c = textscan (str, "%f R&D %f", "delimiter", "&", "collectOutput", 1, "EmptyValue", -99); +#%! assert (c, {[12 -99; 7 -99]}); +# +### Delimiters as part of literals, and before literals +#%!test +#%! str = "12 & R&D 7"; +#%! c = textscan (str, "%f R&D %f", "delimiter", "&", "collectOutput", 1); +#%! assert (c, {[12 7]}); + +%!test +%! ## Check number of lines read, not number of passes through format string +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, "1\n2\n3\n4\n5\n6"); +%! fseek (fid, 0, "bof"); +%! c = textscan (fid, "%f %f", 2); +%! E = feof (fid); +%! fclose (fid); +%! unlink (f); +%! assert (c, {1, 2}); +%! assert (!E); + +%!test +%! ## Check number of lines read, not number of passes through format string +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, "1\r\n2\r3\n4\r\n5\n6"); +%! fseek (fid, 0, "bof"); +%! c = textscan (fid, "%f %f", 4); +%! fclose (fid); +%! unlink (f); +%! assert (c, {[1;3], [2;4]}) + +%!test +%! ## Check number of lines read, with multiple delimiters +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, "1-\r\n-2\r3-\n-4\r\n5\n6"); +%! fseek (fid, 0, "bof"); +%! c = textscan (fid, "%f %f", 4, "delimiter", "-", "multipleDelimsAsOne", 1); +%! fclose (fid); +%! unlink (f); +%! assert (c, {[1;3], [2;4]}) + +%!test +%! ## Check ReturnOnError +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! str = "1 2 3\n4 s 6"; +%! fprintf (fid, str); +%! fseek (fid, 0, "bof"); +%! c = textscan (fid, "%f %f %f"); +%! fseek (fid, 0, "bof"); +%! d = textscan (fid, "%f %f %f", "ReturnOnError", 1); +%! fseek (fid, 0, "bof"); +%! fclose (fid); +%! unlink (f); +%! u = textscan (str, "%f %f %f"); +%! v = textscan (str, "%f %f %f", "ReturnOnError", 1); +%! assert (c, {[1;4], [2;NaN], [3;6]}) +%! assert (d, {[1;4], [2], [3]}) +%! assert (u, {[1;4], [2;NaN], [3;6]}) +%! assert (v, {[1;4], [2], [3]}) + +%!test +%! ## Check ReturnOnError +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! str = "1 2 3\n4 s 6\n"; +%! fprintf (fid, str); +%! fseek (fid, 0, "bof"); +%! c = textscan (fid, "%f %f %f", "ReturnOnError", 1); +%! fseek (fid, 0, "bof"); +%! fclose (fid); +%! unlink (f); +%! u = textscan (str, "%f %f %f", "ReturnOnError", 1); +%! assert (c, {[1;4], 2, 3}) +%! assert (u, {[1;4], 2, 3}) + +%!error textscan ("1 2 3\n4 s 6", "%f %f %f", "ReturnOnError", 0); + +%!test +%! ## Check ReturnOnError +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, "1 s 3\n4 5 6"); +%! fseek (fid, 0, "bof"); +%! c = textscan (fid, ""); +%! fseek (fid, 0, "bof"); +%! d = textscan (fid, "", "ReturnOnError", 1); +%! fseek (fid, 0, "bof"); +%! fclose (fid); +%! unlink (f); +%! assert (c, {[1;4], [NaN;5], [3;6]}) +%! assert (d, {1}) + +%!test +%! ## Check ReturnOnError with empty fields +%! c = textscan ("1,,3\n4,5,6", "", "Delimiter", ",", "ReturnOnError", 1); +%! assert (c, {[1;4], [NaN;5], [3;6]}) + +%!test +%! ## Check ReturnOnError with empty fields +%! c = textscan ("1,,3\n4,5,6", "%f %f %f", "Delimiter", ",", "ReturnOnError", 1); +%! assert (c, {[1;4], [NaN;5], [3;6]}) + +%!test +%! ## Check ReturnOnError in first column +%! c = textscan ("1 2 3\ns 5 6", "", "ReturnOnError", 1); +%! assert (c, {1, 2, 3}) + +## Test input validation +%!error textscan () +%!error textscan (single (40)) +%!error textscan ({40}) +%!error textscan ("Hello World", 2) +#%!error [C, pos] = textscan ("Hello World") +%!error textscan ("Hello World", '%s', 'EndOfLine', 3) +%!error <'%z' is not a valid format specifier> textscan ("1.0", "%z"); +%!error textscan ("1.0", "foo"); + +## Test incomplete first data line +%! R = textscan (['Empty1' char(10)], 'Empty%d %f'); +%! assert (R{1}, int32 (1)); +%! assert (isempty (R{2}), true); + +## bug #37023 +%!test +%! data = textscan (" 1. 1 \n 2 3\n", '%f %f'); +%! assert (data{1}, [1; 2], 1e-15); +%! assert (data{2}, [1; 3], 1e-15); + +## Whitespace test (bug #37333) using delimiter ";" +%!test +%! tc = []; +%! tc{1, 1} = "C:/code;"; +%! tc{1, end+1} = "C:/code/meas;"; +%! tc{1, end+1} = " C:/code/sim;"; +%! tc{1, end+1} = "C:/code/utils;"; +%! string = [tc{:}]; +%! c = textscan (string, "%s", "delimiter", ";"); +%! for k = 1:max (numel (c{1}), numel (tc)) +%! lh = c{1}{k}; +%! rh = tc{k}; +%! rh(rh == ";") = ""; +%! rh = strtrim (rh); +%! assert (strcmp (lh, rh)); +%! end + +## Whitespace test (bug #37333), adding multipleDelimsAsOne true arg +%!test +%! tc = []; +%! tc{1, 1} = "C:/code;"; +%! tc{1, end+1} = " C:/code/meas;"; +%! tc{1, end+1} = "C:/code/sim;;"; +%! tc{1, end+1} = "C:/code/utils;"; +%! string = [tc{:}]; +%! c = textscan (string, "%s", "delimiter", ";", "multipleDelimsAsOne", 1); +%! for k = 1:max (numel (c{1}), numel (tc)) +%! lh = c{1}{k}; +%! rh = tc{k}; +%! rh(rh == ";") = ""; +%! rh = strtrim (rh); +%! assert (strcmp (lh, rh)); +%! end + +## Whitespace test (bug #37333), adding multipleDelimsAsOne false arg +%!test +%! tc = []; +%! tc{1, 1} = "C:/code;"; +%! tc{1, end+1} = " C:/code/meas;"; +%! tc{1, end+1} = "C:/code/sim;;"; +%! tc{1, end+1} = ""; +%! tc{1, end+1} = "C:/code/utils;"; +%! string = [tc{:}]; +%! c = textscan (string, "%s", "delimiter", ";", "multipleDelimsAsOne", 0); +%! for k = 1:max (numel (c{1}), numel (tc)) +%! lh = c{1}{k}; +%! rh = tc{k}; +%! rh(rh == ";") = ""; +%! rh = strtrim (rh); +%! assert (strcmp (lh, rh)); +%! end + +## Whitespace test (bug #37333) whitespace "" arg +%!test +%! tc = []; +%! tc{1, 1} = "C:/code;"; +%! tc{1, end+1} = " C:/code/meas;"; +%! tc{1, end+1} = "C:/code/sim;"; +%! tc{1, end+1} = "C:/code/utils;"; +%! string = [tc{:}]; +%! c = textscan (string, "%s", "delimiter", ";", "whitespace", ""); +%! for k = 1:max (numel (c{1}), numel (tc)) +%! lh = c{1}{k}; +%! rh = tc{k}; +%! rh(rh == ";") = ""; +%! assert (strcmp (lh, rh)); +%! end + +## Whitespace test (bug #37333), whitespace " " arg +%!test +%! tc = []; +%! tc{1, 1} = "C:/code;"; +%! tc{1, end+1} = " C:/code/meas;"; +%! tc{1, end+1} = "C:/code/sim;"; +%! tc{1, end+1} = "C:/code/utils;"; +%! string = [tc{:}]; +%! c = textscan (string, "%s", "delimiter", ";", "whitespace", " "); +%! for k = 1:max (numel (c{1}), numel (tc)) +%! lh = c{1}{k}; +%! rh = tc{k}; +%! rh(rh == ";") = ""; +%! rh = strtrim (rh); +%! assert (strcmp (lh, rh)); +%! end + +## Tests reading with empty format, should return proper nr of columns +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, " 1 2 3 4\n5 6 7 8"); +%! fseek (fid, 0, "bof"); +%! A = textscan (fid, ""); +%! E = feof (fid); +%! fclose (fid); +%! unlink (f); +%! assert (A{1}, [1 ; 5], 1e-6); +%! assert (A{2}, [2 ; 6], 1e-6); +%! assert (A{3}, [3 ; 7], 1e-6); +%! assert (A{4}, [4 ; 8], 1e-6); +%! assert (E); + +## Tests reading with empty format; empty fields & incomplete lower row +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, " ,2,,4\n5,6"); +%! fseek (fid, 0, "bof"); +%! A = textscan (fid, "", "delimiter", ",", "EmptyValue", 999, "CollectOutput" , 1); +%! fclose (fid); +%! unlink (f); +%! assert (A{1}, [999, 2, 999, 4; 5, 6, 999, 999], 1e-6); + +## Error message tests + +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! msg1 = "textscan: 1 parameters given, but only 0 values"; +%! try +%! A = textscan (fid, "", "headerlines"); +%! end_try_catch; +%! assert (!feof (fid)); +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! msg1 = "textscan: HeaderLines must be numeric"; +%! try +%! A = textscan (fid, "", "headerlines", "hh"); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +%!test +%! ## Skip headerlines +%! A = textscan ("field 1 field2\n 1 2\n3 4", "", "headerlines", 1, "collectOutput", 1); +%! assert (A, {[1 2; 3 4]}); + +%!test +%! ## Skip headerlines with non-default EOL +%! A = textscan ("field 1 field2\r 1 2\r3 4", "", "headerlines", 2, "collectOutput", 1, "EndOfLine", '\r'); +%! assert (A, {[3 4]}); + +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"some_string"); +%! fseek (fid, 0, "bof"); +%! msg1 = "textscan: EndOfLine must be at most one character or '\\r\\n'"; +%! try +%! A = textscan (fid, "%f", "EndOfLine", "\n\r"); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"some_string"); +%! fseek (fid, 0, "bof"); +%! msg1 = "textscan: EndOfLine must be at most one character or '\\r\\n'"; +%! try +%! A = textscan (fid, "%f", "EndOfLine", 33); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +## Bug #41824 +%!test +%! assert (textscan ("123", "", "whitespace", " "){:}, 123); + +## Bug #42343-1, just test supplied emptyvalue +%!test +%! assert (textscan (",NaN", "", "delimiter", "," ,"emptyValue" ,Inf), {Inf, NaN}); + +## Bug #42343-2, test padding with supplied emptyvalue +%!test +%! a = textscan (",1,,4\nInf, ,NaN\n", "", "delimiter", ",", "emptyvalue", -10); +%! assert (cell2mat (a), [-10, 1, -10, 4; Inf, -10, NaN, -10]); + +## Bug #42528 +%!test +%! assert (textscan ("1i", ""){1}, 0+1i); +%! assert (cell2mat (textscan ("3, 2-4i, NaN\n -i, 1, 23.4+2.2i\n 1+1 1+1j", "", "delimiter", ",")), [3+0i, 2-4i, NaN+0i; 0-i, 1+0i, 23.4+2.2i; 1 1 1+1i]); + +%!test +%! ## TreatAsEmpty +%! C = textscan ("1,2,3,NN,5,6\n", "%d%d%d%f", "delimiter", ",", "TreatAsEmpty", "NN"); +%! assert (C{3}(1), int32 (3)); +%! assert (C{4}(1), NaN); + +## MultipleDelimsAsOne +%!test +%! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35\n"; +%! C = textscan (str, "%f %f %f %f", "delimiter", ",", "multipledelimsasone", 1, "endofline", "\n"); +%! assert (C{1}', [11, 21, 33]); +%! assert (C{2}', [12, 23, 34]); +%! assert (C{3}', [13, 24, 35]); +%! assert (C{4}', [15, 25, NaN]); + +## Bug #44750 +%!test +%! assert (textscan ("/home/foo/", "%s", "delimiter", "/", "MultipleDelimsAsOne", 1){1}, ... +%! {"home"; "foo"}); + +### Allow cuddling %sliteral but warn it is ambiguous +#%!test +#%! C = textscan ("abcxyz51\nxyz83\n##xyz101", "%s xyz %d"); +#%! assert (C{1}([1 3]), {"abc"; "##"}); +#%! assert (isempty (C{1}{2}), true); +#%! assert (C{2}, int32 ([51; 83; 101])); +### Literals are not delimiters. + +## Test for false positives in check for non-supported format specifiers +%!test +%! assert (textscan ("Total: 32.5 % (of cm values)", "Total: %f %% (of cm values)"){1}, 32.5, 1e-5); + +## Test various forms of string format specifiers (bug #45712) +%!test +%! str = "14 :1 z:2 z:3 z:5 z:11"; +%! C = textscan (str, "%f %s %*s %3s %*3s %f", "delimiter", ":"); +%! assert (C, {14, {"1 z"}, {"3 z"}, 11}); + +%% Bit width, fixed width conv. specifiers +%!test +%! str2 = "123456789012345 "; +%! str2 = [str2 str2 str2 str2 str2 str2 str2 str2]; +%! str2 = [str2 "123456789.01234 1234567890.1234 12345.678901234 12345.678901234"]; +%! pttrn = "%3u8%*s %5u16%*s %10u32%*s %15u64 %3d8%*s %5d16%*s %10d32%*s %15d64 %9f32%*s %14f64%*s %10.2f32%*s %12.2f64%*s"; +%! C = textscan (str2, pttrn, "delimiter", " "); +%! assert (C{1}, uint8 (123)); +%! assert (C{2}, uint16 (12345)); +%! assert (C{3}, uint32 (1234567890)); +%! assert (C{4}, uint64 (123456789012345)); +%! assert (C{5}, int8 (123)); +%! assert (C{6}, int16 (12345)); +%! assert (C{7}, int32 (1234567890)); +%! assert (C{8}, int64 (123456789012345)); +%! assert (C{9}, single (123456789), 1e-12); +%! assert (C{10}, double (1234567890.123), 1e-15); +%! assert (C{11}, single (12345.68), 1e-5); +%! assert (C{12}, double (12345.68), 1e-11); + +%% Bit width, fixed width conv. specifiers -- check the right amount is left +%!test +%! str2 = "123456789012345 "; +%! str2 = [str2 str2 "123456789.01234"]; +%! pttrn = "%3u8 %5u16 %10u32 %3d8 %5d16 %10d32 %9f32 %9f"; +%! C = textscan (str2, pttrn, "delimiter", " "); +%! assert (C{1}, uint8 (123)); +%! assert (C{2}, uint16 (45678)); +%! assert (C{3}, uint32 (9012345)); +%! assert (C{4}, int8 (123)); +%! assert (C{5}, int16 (45678)); +%! assert (C{6}, int32 (9012345)); +%! assert (C{7}, single (123456789), 1e-12); +%! assert (C{8}, double (0.01234), 1e-12); + +%!test +%! C = textscan ("123.123", "%2f %3f %3f"); +%! assert (C{1}, 12); +%! assert (C{2}, 3.1, 1e-11); +%! assert (C{3}, 23); + +%!test +%! C = textscan ("123.123", "%3f %3f %3f"); +%! assert (C{1}, 123); +%! assert (C{2}, 0.12, 1e-11); +%! assert (C{3}, 3); + +%!test +%! C = textscan ("123.123", "%4f %3f"); +%! assert (C{1}, 123); +%! assert (C{2}, 123); + +%% field width interrupts exponent. (Matlab incorrectly gives [12, 2e12]) +%!test +%! assert (textscan ("12e12", "%4f"), {[120; 2]}); +%! assert (textscan ("12e+12", "%5f"), {[120; 2]}); +%! assert (textscan ("125e-12","%6f"), {[12.5; 2]}); + +%% %[] tests +%% Plain [..] and *[..] +%!test +%! ar = "abcdefguvwxAny\nacegxyzTrailing\nJunk"; +%! C = textscan (ar, "%[abcdefg] %*[uvwxyz] %s"); +%! assert (C{1}, {"abcdefg"; "aceg"; ""}); +%! assert (C{2}, {"Any"; "Trailing"; "Junk"}); + +%!test +%! assert (textscan ("A2 B2 C3", "%*[ABC]%d", 3), {int32([2; 2; 3])}); + +%% [^..] and *[^..] +%!test +%! br = "abcdefguvwx1Any\nacegxyz2Trailing\n3Junk"; +%! C = textscan (br, "%[abcdefg] %*[^0123456789] %s"); +%! assert (C{1}, {"abcdefg"; "aceg"; ""}); +%! assert (C{2}, {"1Any"; "2Trailing"; "3Junk"}); + +%% [..] and [^..] containing delimiters +%!test +%! cr = "ab cd efguv wx1Any\na ce gx yz2Trailing\n 3Junk"; +%! C = textscan (cr, "%[ abcdefg] %*[^0123456789] %s", "delimiter", " \n", "whitespace", ""); +%! assert (C{1}, {"ab cd efg"; "a ce g"; " "}); +%! assert (C{2}, {"1Any"; "2Trailing"; "3Junk"}); + +%% Bug #36464 +%!test +%! assert (textscan ('1 2 3 4 5 6', "%*n%n%*[^\n]"){1}, 2); + +%% test %[]] and %[^]] +%!test +%! assert (textscan ('345]', "%*[123456]%[]]"){1}{1}, "]"); +%! assert (textscan ('345]', "%*[^]]%s"){1}{1}, "]"); + +%% Test that "-i" checks the next two characters +%!test +%! C = textscan ("-i -in -inf -infinity", "%f %f%s %f %f %s"); +%! assert (C, {-i, -i, {"n"}, -Inf, -Inf, {"inity"}}); + +%% Again for "+i", this time with custom parser +%!test +%! C = textscan ("+i +in +inf +infinity", "%f %f%s %f %f %s", "ExpChars", "eE"); +%! assert (C, {i, i, {"n"}, Inf, Inf, {"inity"}}); + +%% Check single quoted format interprets control sequences +%!test +%! C = textscan ("1 2\t3 4", '%f %[^\t] %f %f'); +%! assert (C, {1, {"2"}, 3, 4}); + +%% Check overflow and underflow of integer types +%!test +%! a = "-1e90 "; +%! b = "1e90 "; +%! fmt = "%d8 %d16 %d32 %d64 %u8 %u16 %u32 %u64 "; +%! C = textscan ([a a a a a a a a b b b b b b b b], fmt); +%! assert (C{1}, int8 ([-128; 127])); +%! assert (C{2}, int16 ([-32768; 32767])); +%! assert (C{3}, int32 ([-2147483648; 2147483647])); +%! assert (C{4}, int64 ([-9223372036854775808; 9223372036854775807])); +%! assert (C{5}, uint8 ([0; 255])); +%! assert (C{6}, uint16 ([0; 65535])); +%! assert (C{7}, uint32 ([0; 4294967295])); +%! assert (C{8}, uint64 ([0; 18446744073709551615])); + +%% Tests from Matlab (does The MathWorks have any copyright over the input?) +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"09/12/2005 Level1 12.34 45 1.23e10 inf Nan Yes 5.1+3i\n"); +%! fprintf (fid,"10/12/2005 Level2 23.54 60 9e19 -inf 0.001 No 2.2-.5i\n"); +%! fprintf (fid,"11/12/2005 Level3 34.90 12 2e5 10 100 No 3.1+.1i\n"); +%! fseek (fid, 0, "bof"); +%! C = textscan (fid,"%s %s %f32 %d8 %u %f %f %s %f"); +%! %assert (C{1}, {"09/12/2005";"10/12/2005";"11/12/2005"}); +%! assert (C{2}, {"Level1";"Level2";"Level3"}); +%! assert (C{3}, [single(12.34);single(23.54);single(34.90)]); +%! assert (C{4}, [int8(45);int8(60);int8(12)]); +%! assert (C{5}, [uint32(4294967295);uint32(4294967295);uint32(200000)]); +%! assert (C{6}, [inf;-inf;10]); +%! assert (C{7}, [NaN;0.001;100], eps); +%! assert (C{8}, {"Yes";"No";"No"}); +%! assert (C{9}, [5.1+3i;2.2-0.5i;3.1+0.1i]); +%! fseek (fid, 0, "bof"); +%! C = textscan (fid,"%s Level%d %f32 %d8 %u %f %f %s %f"); +%! assert (C{2}, [int32(1);int32(2);int32(3)]); +%! assert (C{3}, [single(12.34);single(23.54);single(34.90)]); +%! fseek (fid, 0, "bof"); +%! C = textscan (fid,'%s %*[^\n]'); +%! fclose (fid); +%! unlink (f); +%! assert (C, {{"09/12/2005";"10/12/2005";"11/12/2005"}}); + +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"1, 2, 3, 4, , 6\n"); +%! fprintf (fid,"7, 8, 9, , 11, 12\n"); +%! fseek (fid, 0, "bof"); +%! C = textscan (fid,"%f %f %f %f %u8 %f", "Delimiter",",","EmptyValue",-Inf); +%! fclose (fid); +%! unlink (f); +%! assert (C{4}, [4; -Inf]); +%! assert (C{5}, uint8 ([0; 11])); + +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"abc, 2, NA, 3, 4\n"); +%! fprintf (fid,"// Comment Here\n"); +%! fprintf (fid,"def, na, 5, 6, 7\n"); +%! fseek (fid, 0, "bof"); +%! C = textscan (fid,"%s %n %n %n %n","Delimiter",",","TreatAsEmpty",{"NA","na"},"CommentStyle","//"); +%! fclose (fid); +%! unlink (f); +%! assert (C{1}, {"abc";"def"}); +%! assert (C{2}, [2; NaN]); +%! assert (C{3}, [NaN; 5]); +%! assert (C{4}, [3; 6]); +%! assert (C{5}, [4; 7]); + +%!test +%!## Test start of comment as string +%! c = textscan ("1 / 2 // 3", "%n %s %u8", "CommentStyle", {"//"}); +%! assert (c, {1, "/", 2}); +*/ + +// These tests have end-comment sequences, so can't just be in a comment +#if 0 +%!test +%!## Test unfinished comment +%! c = textscan ("1 2 /* half comment", "%n %u8", "CommentStyle", {"/*", "*/"}); +%! assert (c, {1, 2}); + +## Test reading from a real file +%!test +%! f = tempname (); +%! fid = fopen (f, "w+"); +%! d = rand (1, 4); +%! fprintf (fid, " %f %f /* comment */ %f %f ", d); +%! fseek (fid, 0, "bof"); +%! A = textscan (fid, "%f %f", "CommentStyle", {"/*", "*/"}); +%! E = feof (fid); +%! fclose (fid); +%! unlink (f); +%! assert (A{1}, [d(1); d(3)], 1e-6); +%! assert (A{2}, [d(2); d(4)], 1e-6); +%! assert (E); +#endif + static octave_value do_fread (octave_stream& os, const octave_value& size_arg, const octave_value& prec_arg, const octave_value& skip_arg, diff -r 24aab5c342bf -r 7a19c5678f91 libinterp/corefcn/module.mk --- a/libinterp/corefcn/module.mk Sat Mar 19 09:20:05 2016 -0400 +++ b/libinterp/corefcn/module.mk Sat Mar 19 11:21:45 2016 -0400 @@ -85,7 +85,6 @@ libinterp/corefcn/symtab.h \ libinterp/corefcn/sysdep.h \ libinterp/corefcn/text-renderer.h \ - libinterp/corefcn/textscan.h \ libinterp/corefcn/toplev.h \ libinterp/corefcn/txt-eng.h \ libinterp/corefcn/utils.h \ @@ -244,7 +243,6 @@ libinterp/corefcn/sysdep.cc \ libinterp/corefcn/time.cc \ libinterp/corefcn/text-renderer.cc \ - libinterp/corefcn/textscan.cc \ libinterp/corefcn/toplev.cc \ libinterp/corefcn/tril.cc \ libinterp/corefcn/tsearch.cc \ diff -r 24aab5c342bf -r 7a19c5678f91 libinterp/corefcn/oct-stream.cc --- a/libinterp/corefcn/oct-stream.cc Sat Mar 19 09:20:05 2016 -0400 +++ b/libinterp/corefcn/oct-stream.cc Sat Mar 19 11:21:45 2016 -0400 @@ -1,6 +1,7 @@ /* Copyright (C) 1996-2015 John W. Eaton +Copyright (C) 2015-2016 Lachlan Andrew, Monash University This file is part of Octave. @@ -28,13 +29,15 @@ #include #include +#include +#include #include #include -#include #include #include #include "Array.h" +#include "Cell.h" #include "byte-swap.h" #include "lo-ieee.h" #include "lo-mappers.h" @@ -49,6 +52,7 @@ #include "input.h" #include "oct-stdstrm.h" #include "oct-stream.h" +#include "ov.h" #include "ovl.h" #include "toplev.h" #include "utils.h" @@ -1170,6 +1174,2726 @@ } } +// Delimited stream, optimised to read strings of characters separated +// by single-character delimiters. +// +// The reason behind this class is that octstream doesn't provide +// seek/tell, but the opportunity has been taken to optimise for the +// textscan workload. +// +// The function reads chunks into a 4kiB buffer, and marks where the +// last delimiter occurs. Reads up to this delimiter can be fast. +// After that last delimiter, the remaining text is moved to the front +// of the buffer and the buffer is refilled. This also allows cheap +// seek and tell operations within a "fast read" block. + +class +delimited_stream +{ +public: + + delimited_stream (std::istream& is, const std::string& delimiters, + int longest_lookahead, octave_idx_type bsize = 4096); + + delimited_stream (std::istream& is, const delimited_stream& ds); + + ~delimited_stream (void); + + // Called when optimised sequence of get is finished. Ensures that + // there is a remaining delimiter in buf, or loads more data in. + void field_done (void) + { + if (idx >= last) + refresh_buf (); + } + + // Load new data into buffer, and set eob, last, idx. + // Return EOF at end of file, 0 otherwise. + int refresh_buf (void); + + // Get a character, relying on caller to call field_done if + // a delimiter has been reached. + int get (void) + { + if (delimited) + return eof () ? std::istream::traits_type::eof () : *idx++; + else + return get_undelim (); + } + + // Get a character, checking for underrun of the buffer. + int get_undelim (void); + + // Read character that will be got by the next get. + int peek (void) { return eof () ? std::istream::traits_type::eof () : *idx; } + + // Read character that will be got by the next get. + int peek_undelim (void); + + // Undo a 'get' or 'get_undelim'. It is the caller's responsibility + // to avoid overflow by calling putbacks only for a character got by + // get() or get_undelim(), with no intervening + // get, get_delim, field_done, refresh_buf, getline, read or seekg. + void putback (char /*ch*/ = 0) { if (! eof ()) --idx; } + + int getline (std::string& dest, char delim); + + // int skipline (char delim); + + char *read (char *buffer, int size, char* &new_start); + + // Return a position suitable to "seekg", valid only within this + // block between calls to field_done. + char *tellg (void) { return idx; } + + void seekg (char *old_idx) { idx = old_idx; } + + bool eof (void) + { + return (eob == buf && i_stream.eof ()) || (flags & std::ios_base::eofbit); + } + + operator const void* (void) { return (! eof () && ! flags) ? this : 0; } + + bool fail (void) { return flags & std::ios_base::failbit; } + + std::ios_base::iostate rdstate (void) { return flags; } + + void setstate (std::ios_base::iostate m) { flags = flags | m; } + + void clear (std::ios_base::iostate m + = (std::ios_base::eofbit & ~std::ios_base::eofbit)) + { + flags = flags & m; + } + + // Report if any characters have been consumed. + // (get, read etc. not cancelled by putback or seekg) + + void progress_benchmark (void) { progress_marker = idx; } + + bool no_progress (void) { return progress_marker == idx; } + +private: + + // Number of characters to read from the file at once. + int bufsize; + + // Stream to read from. + std::istream& i_stream; + + // Temporary storage for a "chunk" of data. + char *buf; + + // Current read pointer. + char *idx; + + // Location of last delimiter in the buffer at buf (undefined if + // delimited is false). + char *last; + + // Position after last character in buffer. + char *eob; + + // True if there is delimiter in the bufer after idx. + bool delimited; + + // Longest lookahead required. + int longest; + + // Sequence of single-character delimiters. + const std::string delims; + + // Position of start of buf in original stream. + std::streampos buf_in_file; + + // Marker to see if a read consumes any characters. + char *progress_marker; + + std::ios_base::iostate flags; + + // No copying! + + delimited_stream (const delimited_stream&); + + delimited_stream& operator = (const delimited_stream&); +}; + +// Create a delimited stream, reading from is, with delimiters delims, +// and allowing reading of up to tellg + longest_lookeahead. When is +// is at EOF, lookahead may be padded by ASCII nuls. + +delimited_stream::delimited_stream (std::istream& is, + const std::string& delimiters, + int longest_lookahead, + octave_idx_type bsize) + : bufsize (bsize), i_stream (is), longest (longest_lookahead), + delims (delimiters), + flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 +{ + buf = new char[bufsize]; + eob = buf + bufsize; + idx = eob; // refresh_buf shouldn't try to copy old data + progress_marker = idx; + refresh_buf (); // load the first batch of data +} + +// Used to create a stream from a strstream from data read from a dstr. +// FIXME: Find a more efficient approach. Perhaps derived dstr +delimited_stream::delimited_stream (std::istream& is, + const delimited_stream& ds) + : bufsize (ds.bufsize), i_stream (is), longest (ds.longest), + delims (ds.delims), + flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 +{ + buf = new char[bufsize]; + eob = buf + bufsize; + idx = eob; // refresh_buf shouldn't try to copy old data + progress_marker = idx; + refresh_buf (); // load the first batch of data +} + +delimited_stream::~delimited_stream (void) +{ + // Seek to the correct position in i_stream. + if (! eof ()) + { + i_stream.clear (); + i_stream.seekg (buf_in_file); + i_stream.read (buf, idx - buf); + } + + delete [] buf; +} + +// Read a character from the buffer, refilling the buffer from the file +// if necessary. + +int +delimited_stream::get_undelim (void) +{ + int retval; + if (eof ()) + { + setstate (std::ios_base::failbit); + return std::istream::traits_type::eof (); + } + + if (idx < eob) + retval = *idx++; + else + { + refresh_buf (); + + if (eof ()) + { + setstate (std::ios_base::eofbit); + retval = std::istream::traits_type::eof (); + } + else + retval = *idx++; + } + + if (idx >= last) + delimited = false; + + return retval; +} + +// Return the next character to be read without incrementing the +// pointer, refilling the buffer from the file if necessary. + +int +delimited_stream::peek_undelim (void) +{ + int retval = get_undelim (); + putback (); + + return retval; +} + +// Copy remaining unprocessed data to the start of the buffer and load +// new data to fill it. Return EOF if the file is at EOF before +// reading any data and all of the data that has been read has been +// processed. + +int +delimited_stream::refresh_buf (void) +{ + if (eof ()) + return std::istream::traits_type::eof (); + + int retval; + + if (eob < idx) + idx = eob; + + size_t old_remaining = eob - idx; + + octave_quit (); // allow ctrl-C + + if (old_remaining > 0) + memmove (buf, idx, old_remaining); + + progress_marker -= idx - buf; // where original idx would have been + idx = buf; + + int gcount; // chars read + if (! i_stream.eof ()) + { + buf_in_file = i_stream.tellg (); // record for destructor + i_stream.read (buf + old_remaining, bufsize - old_remaining); + gcount = i_stream.gcount (); + } + else + gcount = 0; + + eob = buf + old_remaining + gcount; + last = eob; + if (gcount == 0) + { + delimited = false; + + if (eob != buf) // no more data in file, but still some to go + retval = 0; + else + // file and buffer are both done. + retval = std::istream::traits_type::eof (); + } + else + { + delimited = true; + + for (last = eob - longest; last - buf >= 0; last--) + { + if (delims.find (*last) != std::string::npos) + break; + } + + if (last < buf) + delimited = false; + + retval = 0; + } + + // Ensure fast peek doesn't give valid char + if (retval == std::istream::traits_type::eof ()) + *idx = '\0'; // FIXME - check that no TreatAsEmpty etc starts w. \0? + + return retval; +} + +// Return a pointer to a block of data of size size, assuming that a +// sufficiently large buffer is available in buffer, if required. +// If called when delimited == true, and size is no greater than +// longest_lookahead then this will not call refresh_buf, so seekg +// still works. Otherwise, seekg may be invalidated. + +char * +delimited_stream::read (char *buffer, int size, char* &prior_tell) +{ + char *retval; + + if (eob - idx > size) + { + retval = idx; + idx += size; + if (idx > last) + delimited = false; + } + else + { + // If there was a tellg pointing to an earlier point than the current + // read position, try to keep it in the active buffer. + // In the current code, prior_tell==idx for each call, + // so this is not necessary, just a precaution. + + if (eob - prior_tell + size < bufsize) + { + octave_idx_type gap = idx - prior_tell; + idx = prior_tell; + refresh_buf (); + idx += gap; + } + else // can't keep the tellg in range. May skip some data. + { + refresh_buf (); + } + + prior_tell = buf; + + if (eob - idx > size) + { + retval = idx; + idx += size; + if (idx > last) + delimited = false; + } + else + { + if (size <= bufsize) // small read, but reached EOF + { + retval = idx; + memset (eob, 0, size + (idx - buf)); + idx += size; + } + else // Reading more than the whole buf; return it in buffer + { + retval = buffer; + // FIXME -- read bufsize at a time + int i; + for (i = 0; i < size && ! eof (); i++) + *buffer++ = get_undelim (); + if (eof ()) + memset (buffer, 0, size - i); + } + } + } + + return retval; +} + +// Return in OUT an entire line, terminated by delim. On input, OUT +// must have length at least 1. + +int +delimited_stream::getline (std::string& out, char delim) +{ + int len = out.length (), used = 0; + int ch; + while ((ch = get_undelim ()) != delim + && ch != std::istream::traits_type::eof ()) + { + out[used++] = ch; + if (used == len) + { + len <<= 1; + out.resize (len); + } + } + out.resize (used); + field_done (); + + return ch; +} + +// A single conversion specifier, such as %f or %c. + +class +textscan_format_elt +{ +public: + + enum special_conversion + { + whitespace_conversion = 1, + literal_conversion = 2 + }; + + textscan_format_elt (const std::string& txt, int w = 0, int p = -1, + int bw = 0, bool dis = false, char typ = '\0', + const std::string& ch_class = std::string ()) + : text (txt), width (w), prec (p), bitwidth (bw), + char_class (ch_class), type (typ), discard (dis), + numeric (typ == 'd' || typ == 'u' || type == 'f' || type == 'n') + { } + + textscan_format_elt (const textscan_format_elt& e) + : text (e.text), width (e.width), prec (e.prec), + bitwidth (e.bitwidth), char_class (e.char_class), type (e.type), + discard (e.discard), numeric (e.numeric) + { } + + textscan_format_elt& operator = (const textscan_format_elt& e) + { + if (this != &e) + { + text = e.text; + width = e.width; + prec = e.prec; + bitwidth = e.bitwidth; + discard = e.discard; + type = e.type; + numeric = e.numeric; + char_class = e.char_class; + } + + return *this; + } + + // The C-style format string. + std::string text; + + // The maximum field width. + unsigned int width; + + // The maximum number of digits to read after the decimal in a + // floating point conversion. + int prec; + + // The size of the result. For integers, bitwidth may be 8, 16, 34, + // or 64. For floating point values, bitwidth may be 32 or 64. + int bitwidth; + + // The class of characters in a `[' or `^' format. + std::string char_class; + + // Type of conversion + // -- `d', `u', `f', `n', `s', `q', `c', `%', `C', `D', `[' or `^'. + char type; + + // TRUE if we are not storing the result of this conversion. + bool discard; + + // TRUE if the type is 'd', 'u', 'f', 'n' + bool numeric; +}; + +class textscan; + +// The (parsed) sequence of format specifiers. + +class +textscan_format_list +{ +public: + + textscan_format_list (const std::string& fmt = std::string ()); + + ~textscan_format_list (void); + + octave_idx_type num_conversions (void) const { return nconv; } + + // The length can be different than the number of conversions. + // For example, "x %d y %d z" has 2 conversions but the length of + // the list is 3 because of the characters that appear after the + // last conversion. + + size_t numel (void) const { return fmt_elts.size (); } + + const textscan_format_elt *first (void) + { + curr_idx = 0; + return current (); + } + + const textscan_format_elt *current (void) const + { + return numel () > 0 ? fmt_elts[curr_idx] : 0; + } + + const textscan_format_elt *next (bool cycle = true) + { + curr_idx++; + + if (curr_idx >= numel ()) + { + if (cycle) + curr_idx = 0; + else + return 0; + } + + return current (); + } + + void printme (void) const; + + bool ok (void) const { return (nconv >= 0); } + + operator const void* (void) const { return ok () ? this : 0; } + + // True if number of %f to be set from data file. + bool set_from_first; + + // At least one conversion specifier is s,q,c, or [...]. + bool has_string; + + int read_first_row (delimited_stream& is, textscan& ts); + + std::list out_buf (void) const { return (output_container); } + +private: + + // Number of conversions specified by this format string, or -1 if + // invalid conversions have been found. + octave_idx_type nconv; + + // Index to current element; + size_t curr_idx; + + // List of format elements. + std::deque fmt_elts; + + // list holding column arrays of types specified by conversions + std::list output_container; + + // Temporary buffer. + std::ostringstream *buf; + + void add_elt_to_list (unsigned int width, int prec, int bitwidth, + octave_value val_type, bool discard, + char type, + const std::string& char_class = std::string ()); + + void process_conversion (const std::string& s, size_t& i, size_t n); + + int finish_conversion (const std::string& s, size_t& i, size_t n, + unsigned int& width, int& prec, int& bitwidth, + octave_value& val_type, + bool discard, char& type); + // No copying! + + textscan_format_list (const textscan_format_list&); + + textscan_format_list& operator = (const textscan_format_list&); +}; + + +textscan_format_list::textscan_format_list (const std::string& s) + : set_from_first (false), has_string (false), nconv (0), curr_idx (0), + fmt_elts (), buf (0) +{ + size_t n = s.length (); + + size_t i = 0; + + unsigned int width = -1; // Unspecified width = max (except %c) + int prec = -1; + int bitwidth = 0; + bool discard = false; + char type = '\0'; + + bool have_more = true; + + if (s.empty ()) + { + buf = new std::ostringstream ("%f"); + bitwidth = 64; + type = 'f'; + add_elt_to_list (width, prec, bitwidth, octave_value (NDArray ()), + discard, type); + have_more = false; + set_from_first = true; + nconv = 1; + } + else + { + set_from_first = false; + + while (i < n) + { + have_more = true; + + if (! buf) + buf = new std::ostringstream (); + + if (s[i] == '%' && (i+1 == n || s[i+1] != '%')) + { + // Process percent-escape conversion type. + + process_conversion (s, i, n); + + have_more = (buf != 0); + } + else if (isspace (s[i])) + { + while (++i < n && isspace (s[i])) + /* skip whitespace */; + + have_more = false; + } + else + { + type = textscan_format_elt::literal_conversion; + + width = 0; + prec = -1; + bitwidth = 0; + discard = true; + + while (i < n && ! isspace (s[i]) + && (s[i] != '%' || (i+1 < n && s[i+1] == '%'))) + { + if (s[i] == '%') // if double %, skip one + i++; + *buf << s[i++]; + width++; + } + + add_elt_to_list (width, prec, bitwidth, octave_value (), + discard, type); + + have_more = false; + } + + if (nconv < 0) + { + have_more = false; + break; + } + } + } + + if (have_more) + add_elt_to_list (width, prec, bitwidth, octave_value (), discard, type); + + delete buf; +} + +textscan_format_list::~textscan_format_list (void) +{ + size_t n = numel (); + + for (size_t i = 0; i < n; i++) + { + textscan_format_elt *elt = fmt_elts[i]; + delete elt; + } +} + +void +textscan_format_list::add_elt_to_list (unsigned int width, int prec, + int bitwidth, octave_value val_type, + bool discard, char type, + const std::string& char_class) +{ + if (buf) + { + std::string text = buf->str (); + + if (! text.empty ()) + { + textscan_format_elt *elt + = new textscan_format_elt (text, width, prec, bitwidth, discard, type, char_class); + + if (! discard) + output_container.push_back (val_type); + + fmt_elts.push_back (elt); + } + + delete buf; + buf = 0; + } +} + +void +textscan_format_list::process_conversion (const std::string& s, size_t& i, + size_t n) +{ + unsigned width = 0; + int prec = -1; + int bitwidth = 0; + bool discard = false; + octave_value val_type; + char type = '\0'; + + *buf << s[i++]; + + bool have_width = false; + + while (i < n) + { + switch (s[i]) + { + case '*': + if (discard) + nconv = -1; + else + { + discard = true; + *buf << s[i++]; + } + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + if (have_width) + nconv = -1; + else + { + char c = s[i++]; + width = width * 10 + c - '0'; + have_width = true; + *buf << c; + while (i < n && isdigit (s[i])) + { + c = s[i++]; + width = width * 10 + c - '0'; + *buf << c; + } + + if (i < n && s[i] == '.') + { + *buf << s[i++]; + prec = 0; + while (i < n && isdigit (s[i])) + { + c = s[i++]; + prec = prec * 10 + c - '0'; + *buf << c; + } + } + } + break; + + case 'd': case 'u': + { + bool done = true; + *buf << (type = s[i++]); + if (i < n) + { + if (s[i] == '8') + { + bitwidth = 8; + if (type == 'd') + val_type = octave_value (int8NDArray ()); + else + val_type = octave_value (uint8NDArray ()); + *buf << s[i++]; + } + else if (s[i] == '1' && i+1 < n && s[i+1] == '6') + { + bitwidth = 16; + if (type == 'd') + val_type = octave_value (int16NDArray ()); + else + val_type = octave_value (uint16NDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else if (s[i] == '3' && i+1 < n && s[i+1] == '2') + { + done = false; // use default size below + *buf << s[i++]; + *buf << s[i++]; + } + else if (s[i] == '6' && i+1 < n && s[i+1] == '4') + { + bitwidth = 64; + if (type == 'd') + val_type = octave_value (int64NDArray ()); + else + val_type = octave_value (uint64NDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else + done = false; + } + else + done = false; + + if (! done) + { + bitwidth = 32; + if (type == 'd') + val_type = octave_value (int32NDArray ()); + else + val_type = octave_value (uint32NDArray ()); + } + goto fini; + } + + case 'f': + *buf << (type = s[i++]); + bitwidth = 64; + if (i < n) + { + if (s[i] == '3' && i+1 < n && s[i+1] == '2') + { + bitwidth = 32; + val_type = octave_value (FloatNDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else if (s[i] == '6' && i+1 < n && s[i+1] == '4') + { + val_type = octave_value (NDArray ()); + *buf << s[i++]; + *buf << s[i++]; + } + else + val_type = octave_value (NDArray ()); + } + else + val_type = octave_value (NDArray ()); + goto fini; + + case 'n': + *buf << (type = s[i++]); + bitwidth = 64; + val_type = octave_value (NDArray ()); + goto fini; + + case 's': case 'q': case '[': case 'c': + if (! discard) + val_type = octave_value (Cell ()); + *buf << (type = s[i++]); + has_string = true; + goto fini; + + fini: + { + if (! have_width) + { + if (type == 'c') // %c defaults to one character + width = 1; + else + width = static_cast (-1); // others: unlimited + } + + if (finish_conversion (s, i, n, width, prec, bitwidth, val_type, + discard, type) == 0) + return; + } + break; + + default: + error ("textscan: '%%%c' is not a valid format specifier", s[i]); + } + + if (nconv < 0) + break; + } + + nconv = -1; +} + +// Parse [...] and [^...] +// +// Matlab does not expand expressions like A-Z, but they are useful, and +// so we parse them "carefully". We treat '-' as a usual character +// unless both start and end characters are from the same class (upper +// case, lower case, numeric), or this is not the first '-' in the +// pattern. +// +// Keep both a running list of characters and a mask of which chars have +// occurred. The first is efficient for patterns with few characters. +// The latter is efficient for [^...] patterns. + +static std::string +textscan_char_class (const std::string& pattern) +{ + int len = pattern.length (); + if (len == 0) + return ""; + + std::string retval (256, '\0'); + std::string mask (256, '\0'); // number of times chr has been seen + + int in = 0, out = 0; + unsigned char ch, prev = 0; + bool flip = false; + + ch = pattern[in]; + if (ch == '^') + { + in++; + flip = true; + } + mask[pattern[in]] = '\1'; + retval[out++] = pattern[in++]; // even copy ']' if it is first + + bool prev_was_range = false; // disallow "a-m-z" as a pattern + bool prev_prev_was_range = false; + for (; in < len; in++) + { + bool was_range = false; + ch = pattern[in]; + if (ch == ']') + break; + + if (prev == '-' && in > 1 && isalnum (ch) && ! prev_prev_was_range) + { + unsigned char start_of_range = pattern[in-2]; + if (start_of_range < ch + && ((isupper (ch) && isupper (start_of_range)) + || (islower (ch) && islower (start_of_range)) + || (isdigit (ch) && isdigit (start_of_range)) + || mask['-'] > 1)) // not the first '-' + { + was_range = true; + out--; + mask['-']--; + for (int i = start_of_range; i <= ch; i++) + { + if (mask[i] == '\0') + { + mask[i] = '\1'; + retval[out++] = i; + } + } + } + } + if (! was_range) + { + if (mask[ch]++ == 0) + retval[out++] = ch; + else if (ch != '-') + warning_with_id ("octave:textscan-pattern", + "textscan: [...] contains two '%c's", ch); + + if (prev == '-' && mask['-'] >= 2) + warning_with_id ("octave:textscan-pattern", + "textscan: [...] contains two '-'s " + "outside range expressions"); + } + prev = ch; + prev_prev_was_range = prev_was_range; + prev_was_range = was_range; + } + + if (flip) // [^...] + { + out = 0; + for (int i = 0; i < 256; i++) + if (! mask[i]) + retval[out++] = i; + } + + retval.resize (out); + + return retval; +} + +int +textscan_format_list::finish_conversion (const std::string& s, size_t& i, + size_t n, unsigned int& width, + int& prec, int& bitwidth, + octave_value& val_type, bool discard, + char& type) +{ + int retval = 0; + + std::string char_class; + + size_t beg_idx = std::string::npos; + size_t end_idx = std::string::npos; + + if (type != '%') + { + nconv++; + if (type == '[') + { + if (i < n) + { + beg_idx = i; + + if (s[i] == '^') + { + type = '^'; + *buf << s[i++]; + + if (i < n) + { + beg_idx = i; + + if (s[i] == ']') + *buf << s[i++]; + } + } + else if (s[i] == ']') + *buf << s[i++]; + } + + while (i < n && s[i] != ']') + *buf << s[i++]; + + if (i < n && s[i] == ']') + { + end_idx = i-1; + *buf << s[i++]; + } + + if (s[i-1] != ']') + retval = nconv = -1; + } + } + + if (nconv >= 0) + { + if (beg_idx != std::string::npos && end_idx != std::string::npos) + char_class = textscan_char_class (s.substr (beg_idx, + end_idx - beg_idx + 1)); + + add_elt_to_list (width, prec, bitwidth, val_type, discard, type, + char_class); + } + + return retval; +} + +void +textscan_format_list::printme (void) const +{ + size_t n = numel (); + + for (size_t i = 0; i < n; i++) + { + textscan_format_elt *elt = fmt_elts[i]; + + std::cerr + << "width: " << elt->width << "\n" + << "digits " << elt->prec << "\n" + << "bitwidth: " << elt->bitwidth << "\n" + << "discard: " << elt->discard << "\n" + << "type: "; + + if (elt->type == textscan_format_elt::literal_conversion) + std::cerr << "literal text\n"; + else if (elt->type == textscan_format_elt::whitespace_conversion) + std::cerr << "whitespace\n"; + else + std::cerr << elt->type << "\n"; + + std::cerr + << "char_class: `" << undo_string_escapes (elt->char_class) << "'\n" + << "text: `" << undo_string_escapes (elt->text) << "'\n\n"; + } +} + +// If FORMAT is explicitly "", it is assumed to be "%f" repeated enough +// times to read the first row of the file. Set it now. + +int +textscan_format_list::read_first_row (delimited_stream& is, textscan& ts) +{ + // Read first line and strip end-of-line, which may be two characters + std::string first_line (20, ' '); + + is.getline (first_line, static_cast (ts.eol2)); + + if (! first_line.empty () + && first_line[first_line.length () - 1] == ts.eol1) + first_line.resize (first_line.length () - 1); + + std::istringstream strstr (first_line); + delimited_stream ds (strstr, is); + + dim_vector dv (1,1); // initial size of each output_container + Complex val; + octave_value val_type; + nconv = 0; + int max_empty = 1000; // failsafe, if ds fails but not with eof + int retval = 0; + + // read line, creating output_container as we go + while (! ds.eof ()) + { + bool already_skipped_delim = false; + ts.skip_whitespace (ds); + ds.progress_benchmark (); + bool progress = false; + ts.scan_complex (ds, *fmt_elts[0], val); + if (ds.fail ()) + { + ds.clear (ds.rdstate () & ~std::ios::failbit); + + if (ds.eof ()) + break; + + // If we don't continue after a conversion error, then + // unless this was a missing value (i.e., followed by a delimiter), + // return with an error status. + if (ts.return_on_error < 2) + { + ts.skip_delim (ds); + if (ds.no_progress ()) + { + retval = 4; + break; + } + already_skipped_delim = true; + } + else // skip offending field + { + std::ios::iostate state = ds.rdstate (); + ds.clear (); // clear to allow read pointer to advance + + std::string dummy; + textscan_format_elt fe ("", first_line.length ()); + ts.scan_string (ds, fe, dummy); + + progress = (dummy.length ()); + ds.setstate (state); + } + + val = ts.empty_value.scalar_value (); + + if (! --max_empty) + break; + } + + if (val.imag () == 0) + val_type = octave_value (NDArray (dv, val.real ())); + else + val_type = octave_value (ComplexNDArray (dv, val)); + + output_container.push_back (val_type); + + if (! already_skipped_delim) + ts.skip_delim (ds); + + if (! progress && ds.no_progress ()) + break; + + nconv++; + } + + output_container.pop_front (); // discard empty element from constructor + + // Create fmt_list now that the size is known + for (octave_idx_type i = 1; i < nconv; i++) + fmt_elts.push_back (new textscan_format_elt (*fmt_elts[0])); + + return retval; // May have returned 4 above. +} + +// Perform actual textscan: read data from stream, and create cell array. + +static Cell +init_inf_nan (void) +{ + Cell retval (dim_vector (1, 2)); + + retval(0) = Cell (octave_value ("inf")); + retval(1) = Cell (octave_value ("nan")); + + return retval; +} + +textscan::textscan (void) + : buf (), whitespace_table (), delim_table (), delims (), + comment_style (), comment_len (0), comment_char (-2), + buffer_size (0), date_locale (), inf_nan (init_inf_nan ()), + empty_value (octave_NaN), exp_chars ("edED"), + header_lines (0), treat_as_empty (), treat_as_empty_len (0), + whitespace (" \b\t"), eol1 ('\r'), eol2 ('\n'), + return_on_error (2), collect_output (false), + multiple_delims_as_one (false), default_exp (true), + numeric_delim (false), lines (0) +{ } + +octave_value +textscan::scan (std::istream& isp, const octave_value_list& args) +{ + octave_idx_type count = 0; + + return scan (isp, args, count); +} + +octave_value +textscan::scan (std::istream& isp, const octave_value_list& args, + octave_idx_type& count) +{ + std::string format; + int params = 0; + + if (args.length () == 0) + format = "%f"; // ommited format = %f. explicit "" = width from file + else if (args(0).is_string ()) + { + format = args(0).string_value (); + + if (args(0).is_sq_string ()) + format = do_string_escapes (format); + + params++; + } + else + error ("textscan: FORMAT must be a string, not <%s>", + args(0).class_name ().c_str ()); + + octave_idx_type ntimes = -1; + + if (args.length () > 1) + { + if (args(1).is_numeric_type ()) + { + ntimes = args(1).idx_type_value (); + + if (ntimes < args(1).double_value ()) + error ("textscan: REPEAT = %g is too large", + args(1).double_value ()); + + params++; + } + } + + octave_value_list tmp_args = args.splice (0, params); + + textscan_format_list fmt_list (format); + + parse_options (tmp_args, fmt_list); + + octave_value result = do_scan (isp, fmt_list, ntimes); + + // FIXME: this is probably not the best way to get count. The + // position could easily be larger than octave_idx_type when using + // 32-bit indexing. + + std::ios::iostate state = isp.rdstate (); + isp.clear (); + count = static_cast (isp.tellg ()); + isp.setstate (state); + + return result; +} + +octave_value +textscan::do_scan (std::istream& isp, textscan_format_list& fmt_list, + octave_idx_type ntimes) +{ + octave_value retval; + + if (fmt_list.num_conversions () == -1) + error ("textscan: invalid format specified"); + + if (fmt_list.num_conversions () == 0) + error ("textscan: no valid format conversion specifiers\n"); + + // skip the first header_lines + std::string dummy; + for (int i = 0; i < header_lines && isp; i++) + getline (isp, dummy, static_cast (eol2)); + + // Create our own buffered stream, for fast get/putback/tell/seek. + + // First, see how far ahead it should let us look. + int max_lookahead = std::max (std::max (comment_len, treat_as_empty_len), + std::max (delim_len, 3)); // 3 for NaN and Inf + + // Next, choose a buffer size to avoid reading too much, or too often. + octave_idx_type buf_size = 4096; + if (buffer_size) + buf_size = buffer_size; + else if (ntimes > 0) + { + // Avoid overflow of 80*ntimes... + buf_size = std::min (buf_size, std::max (ntimes, 80 * ntimes)); + buf_size = std::max (buf_size, ntimes); + } + // Finally, create the stream. + delimited_stream is (isp, whitespace + delims, max_lookahead, buf_size); + + // Grow retval dynamically. "size" is half the initial size + // (FIXME -- Should we start smaller if ntimes is large?) + octave_idx_type size = ((ntimes < 8 && ntimes >= 0) ? ntimes : 1); + Array row_idx (dim_vector (1,2)); + row_idx(1) = 0; + + int err = 0; + octave_idx_type row = 0; + + if (multiple_delims_as_one) // bug #44750? + skip_delim (is); + + int done_after; // Number of columns read when EOF seen. + + // If FORMAT explicitly "", read first line and see how many "%f" match + if (fmt_list.set_from_first) + { + err = fmt_list.read_first_row (is, *this); + lines = 1; + + done_after = fmt_list.numel () + 1; + if (! err) + row = 1; // the above puts the first line into fmt_list.out_buf () + } + else + done_after = fmt_list.out_buf ().size () + 1; + + std::list out = fmt_list.out_buf (); + + // We will later merge adjacent columns of the same type. + // Check now which columns to merge. + // Reals may become complex, and so we can't trust types + // after reading in data. + // If the format was "", that conversion may already have happened, + // so force all to be merged (as all are %f). + bool merge_with_prev[fmt_list.numel ()]; + int conv = 0; + if (collect_output) + { + int prev_type = -1; + for (std::list::iterator col = out.begin (); + col != out.end (); col++) + { + if (col->type_id () == prev_type + || (fmt_list.set_from_first && prev_type != -1)) + merge_with_prev [conv++] = true; + else + merge_with_prev [conv++] = false; + + prev_type = col->type_id (); + } + } + + // This should be caught by earlier code, but this avoids a possible + // infinite loop below. + if (fmt_list.num_conversions () == 0) + error ("textscan: No conversions specified"); + + // Read the data. This is the main loop. + if (! err) + { + for (/* row set ~30 lines above */; row < ntimes || ntimes == -1; row++) + { + if (row == 0 || row >= size) + { + size += size+1; + for (std::list::iterator col = out.begin (); + col != out.end (); col++) + *col = (*col).resize (dim_vector (size, 1), 0); + } + + row_idx(0) = row; + err = read_format_once (is, fmt_list, out, row_idx, done_after); + + if (err > 0 || ! is || (lines >= ntimes && ntimes > -1)) + break; + } + } + + if ((err & 4) && ! return_on_error) + error ("textscan: Read error in field %d of row %d", + done_after + 1, row + 1); + + // If file does not end in EOL, do not pad columns with NaN. + bool uneven_columns = false; + if (err & 4) + uneven_columns = true; + else if (isp.eof ()) + { + isp.clear (); + isp.seekg (-1, std::ios_base::end); + int last_char = isp.get (); + isp.setstate (isp.eofbit); + uneven_columns = (last_char != eol1 && last_char != eol2); + } + + // convert return value to Cell array + Array ra_idx (dim_vector (1,2)); + + // (err & 1) means "error, and no columns read this row + // FIXME -- This may redundant now that done_after=0 says the same + if (err & 1) + done_after = out.size () + 1; + + int valid_rows = (row == ntimes) ? ntimes : ((err & 1) ? row : row+1); + dim_vector dv (valid_rows, 1); + + ra_idx(0) = 0; + int i = 0; + if (! collect_output) + { + retval = Cell (dim_vector (1, out.size ())); + for (std::list::iterator col = out.begin (); + col != out.end (); col++, i++) + { + // trim last columns if that was requested + if (i == done_after && uneven_columns) + dv = dim_vector (std::max (valid_rows - 1, 0), 1); + + ra_idx(1) = i; + retval = do_cat_op (retval, octave_value (Cell (col->resize (dv,0))), + ra_idx); + } + } + else // group adjacent cells of the same type into a single cell + { + octave_value cur; // current cell, accumulating columns + octave_idx_type group_size = 0; // columns in this cell + int prev_type = -1; + + conv = 0; + retval = Cell (); + for (std::list::iterator col = out.begin (); + col != out.end (); col++) + { + if (! merge_with_prev [conv++]) // including first time + { + if (prev_type != -1) + { + ra_idx(1) = i++; + retval = do_cat_op (retval, octave_value (Cell (cur)), + ra_idx); + } + cur = octave_value (col->resize (dv,0)); + group_size = 1; + prev_type = col->type_id (); + } + else + { + ra_idx(1) = group_size++; + cur = do_cat_op (cur, octave_value (col->resize (dv,0)), + ra_idx); + } + } + ra_idx(1) = i; + retval = do_cat_op (retval, octave_value (Cell (cur)), ra_idx); + } + + return retval; +} + +// Calculate x^n. Used for ...e+nn so that, for example, 1e2 is +// exactly 100 and 5e-1 is 1/2 + +static double +pown (double x, unsigned int n) +{ + double retval = 1; + + for (unsigned int d = n; d; d >>= 1) + { + if (d & 1) + retval *= x; + x *= x; + } + + return retval; +} + +// Read a double considering the "precision" field of FMT and the +// EXP_CHARS option of OPTIONS. + +double +textscan::read_double (delimited_stream& is, + const textscan_format_elt& fmt) const +{ + int sign = 1; + unsigned int width_left = fmt.width; + double retval = 0; + bool valid = false; // syntactically correct double? + + int ch = is.peek (); + + if (ch == '+') + { + is.get (); + ch = is.peek (); + if (width_left) + width_left--; + } + else if (ch == '-') + { + sign = -1; + is.get (); + ch = is.peek (); + if (width_left) + width_left--; + } + + // Read integer part + if (ch != '.') + { + if (ch >= '0' && ch <= '9') // valid if at least one digit + valid = true; + while (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') + retval = retval * 10 + (ch - '0'); + width_left++; + } + + // Read fractional part, up to specified precision + if (ch == '.' && width_left) + { + double multiplier = 1; + int precision = fmt.prec; + int i; + + if (width_left) + width_left--; // Consider width of '.' + + if (precision == -1) + precision = 1<<30; // FIXME Should be MAXINT + + if (! valid) // if there was nothing before '.'... + is.get (); // ...ch was a "peek", not "get". + + for (i = 0; i < precision; i++) + { + if (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') + retval += (ch - '0') * (multiplier *= 0.1); + else + { + width_left++; + break; + } + } + + // round up if we truncated and the next digit is >= 5 + if ((i == precision || ! width_left) && (ch = is.get ()) >= '5' + && ch <= '9') + retval += multiplier; + + if (i > 0) + valid = true; // valid if at least one digit after '.' + + // skip remainder after '.', to field width, to look for exponent + if (i == precision) + while (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') + ; // discard + + width_left++; + } + + // look for exponent part in, e.g., 6.023E+23 + bool used_exp = false; + if (valid && width_left > 1 && exp_chars.find (ch) != std::string::npos) + { + int ch1 = is.peek (); + if (ch1 == '-' || ch1 == '+' || (ch1 >= '0' && ch1 <= '9')) + { // if 1.0e+$ or some such, this will set failbit, as we want + width_left--; // count "E" + int exp = 0; + int exp_sign = 1; + if (ch1 == '+') + { + if (width_left) + width_left--; + is.get (); + } + else if (ch1 == '-') + { + exp_sign = -1; + is.get (); + if (width_left) + width_left--; + } + valid = false; + while (width_left-- && is && (ch = is.get ()) >= '0' && ch1 <= '9') + { + exp = exp*10 + ch - '0'; + valid = true; + } + width_left++; + if (ch != std::istream::traits_type::eof () && width_left) + is.putback (ch); + + double multiplier = pown (10, exp); + if (exp_sign > 0) + retval *= multiplier; + else + retval /= multiplier; + + used_exp = true; + } + } + is.clear (); + if (! used_exp && ch != std::istream::traits_type::eof () && width_left) + is.putback (ch); + + // Check for +/- inf and NaN + if (! valid && width_left >= 3) + { + int i = lookahead (is, inf_nan, 3, false); // false -> case insensitive + if (i == 0) + { + retval = octave_Inf; + valid = true; + } + else if (i == 1) + { + retval = octave_NaN; + valid = true; + } + } + + // Check for +/- inf and NaN + if (! valid && width_left >= 3) + { + int i = lookahead (is, inf_nan, 3, false); // false -> case insensitive + if (i == 0) + { + retval = octave_Inf; + valid = true; + } + else if (i == 1) + { + retval = octave_NaN; + valid = true; + } + } + + if (! valid) + is.setstate (std::ios::failbit); + else + is.setstate (is.rdstate () & ~std::ios::failbit); + + return retval * sign; +} + +// Read a single number: real, complex, inf, NaN, possibly with limited +// precision. Calls to this should be preceded by skip_whitespace. +// Calling that inside scan_complex would violate its const declaration. + +void +textscan::scan_complex (delimited_stream& is, const textscan_format_elt& fmt, + Complex& val) const +{ + double im = 0; + double re = 0; + bool as_empty = false; // did we fail but match a "treat_as_empty" string? + bool inf = false; + + int ch = is.peek (); + if (ch == '+' || ch == '-') // check for [+-][ij] with no coefficients + { + ch = is.get (); + int ch2 = is.peek (); + if (ch2 == 'i' || ch2 == 'j') + { + double value = 1; + is.get (); + // Check not -inf + if (is.peek () == 'n') + { + char *pos = is.tellg (); + std::ios::iostate state = is.rdstate (); + + is.get (); + ch2 = is.get (); + if (ch2 == 'f') + { + inf = true; + re = (ch == '+') ? octave_Inf : -octave_Inf; + value = 0; + } + else + { + is.clear (state); + is.seekg (pos); // reset to position before look-ahead + } + } + + im = (ch == '+') ? value : -value; + } + else + is.putback (ch); + } + + if (! im && ! inf) // if not [+-][ij] or [+-]inf, read real normally + { + char *pos = is.tellg (); + std::ios::iostate state = is.rdstate (); + //re = octave_read_value (is); + re = read_double (is, fmt); + + // check for "treat as empty" string + if (treat_as_empty.numel () + && (is.fail () || octave_is_NaN_or_NA (Complex (re)) + || re == octave_Inf)) + { + + for (int i = 0; i < treat_as_empty.numel (); i++) + { + if (ch == treat_as_empty (i).string_value ()[0]) + { + as_empty = true; // first char matches, so read the lot + break; + } + } + if (as_empty) // if first char matched... + { + as_empty = false; // ...look for the whole string + + is.clear (state); // treat_as_empty "-" causes partial read + is.seekg (pos); // reset to position before failed read + + // treat_as_empty strings may be different sizes. + // Read ahead longest, put it all back, then re-read the string + // that matches. + std::string look_buf (treat_as_empty_len, '\0'); + char *look = is.read (&look_buf[0], look_buf.size (), pos); + + is.clear (state); + is.seekg (pos); // reset to position before look-ahead + // FIXME -- is.read could invalidate pos + + for (int i = 0; i < treat_as_empty.numel (); i++) + { + std::string s = treat_as_empty (i).string_value (); + if (! strncmp (s.c_str (), look, s.size ())) + { + as_empty = true; + // read just the right amount + is.read (&look_buf[0], s.size (), pos); + break; + } + } + } + } + + if (! is.eof () && ! as_empty) + { + state = is.rdstate (); // before tellg, since that fails at EOF + pos = is.tellg (); + ch = is.peek (); // ch == EOF if read failed; no need to chk fail + if (ch == 'i' || ch == 'j') // pure imaginary + { + is.get (); + im = re; + re = 0; + } + else if (ch == '+' || ch == '-') // see if it is real+imag[ij] + { + // save stream state in case we have to restore it + pos = is.tellg (); + state = is.rdstate (); + + //im = octave_read_value (is); + im = read_double (is, fmt); + if (is.fail ()) + im = 1; + + if (is.peek () == 'i' || is.peek () == 'j') + is.get (); + else + { + im = 0; // no valid imaginary part. Restore state + is.clear (state); // eof shouldn't cause fail. + is.seekg (pos); + } + } + else if (is.eof ()) // we've read enough to be a "valid" read + is.clear (state); // failed peek shouldn't cause fail + } + } + if (as_empty) + val = empty_value.scalar_value (); + else + val = Complex (re, im); +} + +// Return in VAL the run of characters from IS NOT contained in PATTERN. + +int +textscan::scan_caret (delimited_stream& is, const std::string& pattern, + std::string& val) const +{ + int c1 = std::istream::traits_type::eof (); + std::ostringstream obuf; // Is this optimised for growing? + + while (is && ((c1 = (is && ! is.eof ()) + ? is.get_undelim () + : std::istream::traits_type::eof ()) + != std::istream::traits_type::eof ()) + && pattern.find (c1) == std::string::npos) + obuf << static_cast (c1); + + val = obuf.str (); + + if (c1 != std::istream::traits_type::eof ()) + is.putback (c1); + + return c1; +} + +// Read until one of the strings in DELIMITERS is found. For +// efficiency, ENDS is a list of the last character of each delimiter. + +std::string +textscan::read_until (delimited_stream& is, const Cell& delimiters, + const std::string& ends) const +{ + std::string retval (""); + bool done = false; + do + { // find sequence ending with an ending char + std::string next; + scan_caret (is, ends.c_str (), next); + retval = retval + next; // FIXME -- could use repeated doubling of size + + int last = (! is.eof () + ? is.get_undelim () : std::istream::traits_type::eof ()); + + if (last != std::istream::traits_type::eof ()) + { + retval = retval + static_cast (last); + for (int i = 0; i < delimiters.numel (); i++) + { + std::string delim = delimiters(i).string_value (); + size_t start = (retval.length () > delim.length () + ? retval.length () - delim.length () + : 0); + std::string may_match = retval.substr (start); + if (may_match == delim) + { + done = true; + retval = retval.substr (0, start); + break; + } + } + } + } + while (! done && is && ! is.eof ()); + + return retval; +} + + +// Read stream until either fmt.width chars have been read, or +// options.delimiter has been found. Does *not* rely on fmt being 's'. +// Used by formats like %6f to limit to 6. + +void +textscan::scan_string (delimited_stream& is, const textscan_format_elt& fmt, + std::string& val) const +{ + if (delim_list.numel () == 0) + { + unsigned int i = 0; + unsigned int width = fmt.width; + + for (i = 0; i < width; i++) + { + if (i+1 > val.length ()) + val = val + val + ' '; // grow even if empty + int ch = is.get (); + if (is_delim (ch) || ch == std::istream::traits_type::eof ()) + { + is.putback (ch); + break; + } + else + val[i] = ch; + } + val = val.substr (0, i); // trim pre-allocation + } + else // Cell array of multi-character delimiters + { + std::string ends (""); + for (int i = 0; i < delim_list.numel (); i++) + { + std::string tmp = delim_list(i).string_value (); + ends += tmp.substr (tmp.length () - 1); + } + val = textscan::read_until (is, delim_list, ends); + } +} + +// Return in VAL the run of characters from IS contained in PATTERN. + +int +textscan::scan_bracket (delimited_stream& is, const std::string& pattern, + std::string& val) const +{ + int c1 = std::istream::traits_type::eof (); + std::ostringstream obuf; // Is this optimised for growing? + + while (is && pattern.find (c1 = is.get_undelim ()) != std::string::npos) + obuf << static_cast (c1); + + val = obuf.str (); + if (c1 != std::istream::traits_type::eof ()) + is.putback (c1); + return c1; +} + +// Return in VAL a string, either delimited by whitespace/delimiters, or +// enclosed in a pair of double quotes ("..."). Enclosing quotes are +// removed. A consecutive pair "" is inserted into VAL as a single ". + +void +textscan::scan_qstring (delimited_stream& is, const textscan_format_elt& fmt, + std::string& val) +{ + skip_whitespace (is); + + if (is.peek () != '\"') + scan_string (is, fmt, val); + else + { + is.get (); + scan_caret (is, "\"", val); // read everything until " + is.get (); // swallow " + + while (is && is.peek () == '\"') // if double ", insert one in stream, + { // and keep looking for single " + is.get (); + std::string val1; + scan_caret (is, "\"", val1); + val = val + "\"" + val1; + is.get_undelim (); + } + } +} + +// Read from IS into VAL a string of the next fmt.width characters, +// including any whitespace or delimiters. + +void +textscan::scan_cstring (delimited_stream& is, const textscan_format_elt& fmt, + std::string& val) const +{ + val.resize (fmt.width); + + for (unsigned int i = 0; is && i < fmt.width; i++) + { + int ch = is.get_undelim (); + if (ch != std::istream::traits_type::eof ()) + val[i] = ch; + else + { + val.resize (i); + break; + } + } +} + + +// Read a single '%...' conversion and place it in position ROW of OV. + +void +textscan::scan_one (delimited_stream& is, const textscan_format_elt& fmt, + octave_value& ov, Array row) +{ + skip_whitespace (is); + + is.clear (); + + octave_value val; + if (fmt.numeric) + { + if (fmt.type == 'f' || fmt.type == 'n') + { + Complex v; + skip_whitespace (is); + scan_complex (is, fmt, v); + + if (! fmt.discard && ! is.fail ()) + { + if (fmt.bitwidth == 64) + { + if (ov.is_real_type () && v.imag () == 0) + ov.internal_rep ()->fast_elem_insert (row(0), v.real ()); + else + { + if (ov.is_real_type ()) // cat does type conversion + ov = do_cat_op (ov, octave_value (v), row); + else + ov.internal_rep ()->fast_elem_insert (row(0), v); + } + } + else + { + if (ov.is_real_type () && v.imag () == 0) + ov.internal_rep ()->fast_elem_insert (row(0), + float (v.real ())); + else + { + if (ov.is_real_type ()) // cat does type conversion + ov = do_cat_op (ov, octave_value (v), row); + else + ov.internal_rep ()->fast_elem_insert (row(0), + FloatComplex (v)); + } + } + } + } + else + { + double v; // Matlab docs say 1e30 etc should be valid for %d and + // 1000 as a %d8 should be 127, so read as double. + // Some loss of precision for d64 and u64. + skip_whitespace (is); + v = read_double (is, fmt); + if (! fmt.discard && ! is.fail ()) + switch (fmt.bitwidth) + { + case 64: + switch (fmt.type) + { + case 'd': + { + octave_int64 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + break; + + case 'u': + { + octave_uint64 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + break; + } + break; + + case 32: + switch (fmt.type) + { + case 'd': + { + octave_int32 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + break; + + case 'u': + { + octave_uint32 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + break; + } + break; + + case 16: + if (fmt.type == 'd') + { + octave_int16 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + else + { + octave_uint16 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + break; + + case 8: + if (fmt.type == 'd') + { + octave_int8 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + else + { + octave_uint8 vv = v; + ov.internal_rep ()->fast_elem_insert (row(0), vv); + } + break; + } + } + + if (is.fail ()) + { + if (! fmt.discard) + ov = do_cat_op (ov, empty_value, row); + + // If we are continuing after errors, skip over this field + if (return_on_error == 2) + { + std::ios::iostate state = is.rdstate (); + is.clear (); // clear to allow read pointer to advance + + std::string dummy; + scan_string (is, fmt, dummy); + + is.setstate (state); + } + } + + } + else + { + std::string vv (" "); // initial buffer. Grows as needed + switch (fmt.type) + { + case 's': + scan_string (is, fmt, vv); + break; + + case 'q': + scan_qstring (is, fmt, vv); + break; + + case 'c': + scan_cstring (is, fmt, vv); + break; + + case '[': + scan_bracket (is, fmt.char_class.c_str (), vv); + break; + + case '^': + scan_caret (is, fmt.char_class.c_str (), vv); + break; + } + + if (! fmt.discard) + ov.internal_rep ()->fast_elem_insert (row (0), + Cell (octave_value (vv))); + + // FIXME -- why does failbit get set at EOF, instead of eofbit? + if (! vv.empty ()) + is.clear (is.rdstate () & ~std::ios_base::failbit); + } + + is.field_done (); +} + +// Read data corresponding to the entire format string once, placing the +// values in row ROW of retval. + +int +textscan::read_format_once (delimited_stream& is, + textscan_format_list& fmt_list, + std::list& retval, + Array row, int& done_after) +{ + const textscan_format_elt *elem = fmt_list.first (); + std::list::iterator out = retval.begin (); + bool no_conversions = true; + bool done = false; + bool conversion_failed = false; // Record for ReturnOnError + + octave_quit (); + + for (size_t i = 0; i < fmt_list.numel (); i++) + { + bool this_conversion_failed = false; + + // Clear fail of previous numeric conversions. + is.clear (); + + switch (elem->type) + { + case 'C': + case 'D': + std::cerr << "textscan: Conversion %" << elem->type + << " not yet implemented\n"; + break; + + case 'u': + case 'd': + case 'f': + case 'n': + case 's': + case '[': + case '^': + case 'q': + case 'c': + scan_one (is, *elem, *out, row); + break; + + case textscan_format_elt::literal_conversion : + match_literal (is, *elem); + break; + + default: + error ("Unknown format element '%c'", elem->type); + } + + if (! is.fail ()) + { + if (! elem->discard) + no_conversions = false; + } + else + { + if (return_on_error < 2) + this_conversion_failed = true; + + is.clear (is.rdstate () & ~std::ios::failbit); + } + + if (! elem->discard) + out++; + + elem = fmt_list.next (); + char *pos = is.tellg (); + + // FIXME -- these conversions "ignore delimiters". Should they include + // delimiters at the start of the conversion, or can those be skipped? + if (elem->type != textscan_format_elt::literal_conversion + // && elem->type != '[' && elem->type != '^' && elem->type != 'c' + ) + skip_delim (is); + + if (this_conversion_failed) + { + if (is.tellg () == pos && ! conversion_failed) + { // done_after = first failure + done_after = i; // note fail, but parse others to get empty_val + conversion_failed = true; + } + else + this_conversion_failed = false; + } + + if (is.eof ()) + { + if (! done) + done_after = i+1; + + // note EOF, but process others to get empty_val. + done = true; + } + } + + if (done) + is.setstate (std::ios::eofbit); + + // Returning -3 means "error, and no columns read this row". + if (is.eof ()) + return (2 + no_conversions); + + if (conversion_failed) + return (4 + no_conversions); + + return 0; +} + +void +textscan::parse_options (const octave_value_list& args, + textscan_format_list& fmt_list) +{ + int last = args.length (); + int n = last; + + if (n & 1) + error ("textscan: %d parameters given, but only %d values", n-n/2, n/2); + + delim_len = 1; + bool have_delims = false; + for (int i = 0; i < last; i += 2) + { + if (! args(i).is_string ()) + error ("textscan: Invalid paramter type <%s> for parameter %d", + args(i).type_name ().c_str (), i/2 + 1); + + std::string param = args(i).string_value (); + std::transform (param.begin (), param.end (), + param.begin (), ::tolower); + if (param == "delimiter") + { + bool invalid = true; + if (args(i+1).is_string ()) + { + invalid = false; + have_delims = true; + delims = args(i+1).string_value (); + } + else if (args(i+1).is_cell ()) + { + invalid = false; + delim_list = args(i+1).cell_value (); + delim_table = " "; // non-empty, to flag non-default delim + + // Check that all elements are strings, and find max length + for (int j = 0; j < delim_list.numel (); j++) + { + if (! delim_list(j).is_string ()) + invalid = true; + else + { + octave_idx_type len = delim_list(j).string_value () + .length (); + delim_len = std::max (static_cast (len), delim_len); + } + } + } + if (invalid) + error ("textscan: Delimiters must be either a string or " + "cell array of strings"); + } + else if (param == "commentstyle") + { + if (args(i+1).is_string ()) + { // check here for names like "C++", "C", "shell", ...? + comment_style = Cell (args(i+1)); + } + else if (args(i+1).is_cell ()) + { + comment_style = args(i+1).cell_value (); + int len = comment_style.numel (); + if ((len >= 1 && ! comment_style (0).is_string ()) + || (len >= 2 && ! comment_style (1).is_string ()) + || (len >= 3)) + error ("textscan: CommentStyle must be either a string or " + "cell array of one or two strings"); + } + else + error ("textscan: CommentStyle must be either a string" + " or cell array of one or two strings, not <%s>", + args(i+1).class_name ().c_str ()); + + // How far ahead do we need to look to detect an open comment + // and which character do we look for? + if (comment_style.numel () >= 1) + { + comment_len = comment_style (0).string_value ().size (); + comment_char = comment_style (0).string_value ()[0]; + } + } + else if (param == "treatasempty") + { + bool invalid = false; + if (args(i+1).is_string ()) + { + treat_as_empty = Cell (args(i+1)); + treat_as_empty_len = args(i+1).string_value ().size (); + } + else if (args(i+1).is_cell ()) + { + treat_as_empty = args(i+1).cell_value (); + for (int j = 0; j < treat_as_empty.numel (); j++) + if (! treat_as_empty (j).is_string ()) + invalid = true; + else + { + int k = treat_as_empty (j).string_value ().size (); + if (k > treat_as_empty_len) + treat_as_empty_len = k; + } + } + if (invalid) + error ("textscan: TreatAsEmpty must be either a string or " + "cell array of one or two strings"); + + // FIXME Ensure none is a prefix of a later one. Sort by length? + } + else if (param == "collectoutput") + { + if (args(i+1).is_numeric_type ()) + collect_output = args(i+1).bool_value (); + else + error ("textscan: CollectOutput must be logical or numeric"); + } + else if (param == "emptyvalue") + { + if (args(i+1).is_numeric_type ()) + empty_value = args(i+1).scalar_value (); + else + error ("textscan: EmptyValue must be numeric, not <%s>", + args(i+1).class_name ().c_str ()); + } + else if (param == "headerlines") + { + if (args(i+1).is_numeric_type ()) + header_lines = args(i+1).scalar_value (); + else + error ("textscan: HeaderLines must be numeric"); + } + else if (param == "bufsize") + { + if (args(i+1).is_numeric_type ()) + buffer_size = args(i+1).scalar_value (); + else + error ("textscan: BufSize must be numeric"); + } + else if (param == "multipledelimsasone") + { + if (args(i+1).is_numeric_type ()) + { + if (args(i+1).bool_value ()) + multiple_delims_as_one = true; + } + else + error ("textscan: MultipleDimsAsOne must be logical or numeric"); + } + else if (param == "returnonerror") + { + if (args(i+1).is_numeric_type ()) + return_on_error = args(i+1).bool_value (); + else if (args(i+1).is_string () + && args(i+1).string_value () == "continue") + return_on_error = 2; + else + error ("textscan: ReturnOnError must be logical or " + "numeric, or 'continue'"); + } + else if (param == "whitespace") + { + if (args(i+1).is_string ()) + whitespace = args(i+1).string_value (); + else + error ("textscan: Whitespace must be a character string"); + } + else if (param == "expchars") + { + if (args(i+1).is_string ()) + { + exp_chars = args(i+1).string_value (); + default_exp = false; + } + else + error ("textscan: ExpChars must be a character string"); + } + else if (param == "endofline") + { + bool valid = true; + if (args(i+1).is_string ()) + { + std::string s = args(i+1).string_value (); + if (args(i+1).is_sq_string ()) + s = do_string_escapes (s); + int l = s.length (); + if (l == 0) + eol1 = eol2 = -2; + else if (l == 1) + eol1 = eol2 = s.c_str ()[0]; + else if (l == 2) + { + eol1 = s.c_str ()[0]; + eol2 = s.c_str ()[1]; + if (eol1 != '\r' || eol2 != '\n') // Why limit it? + valid = false; + } + else + valid = false; + } + else + valid = false; + if (! valid) + error ("textscan: EndOfLine must be at most one character " + "or '\\r\\n'"); + } + else + error ("textscan: Unrecognised option '%s'", param.c_str ()); + } + + whitespace_table = std::string (256, '\0'); + for (unsigned int i = 0; i < whitespace.length (); i++) + whitespace_table[whitespace[i]] = '1'; + + // For Matlab compatibility, add 0x20 to whitespace, unless + // whitespace is explicitly ignored. + if (! (whitespace.empty () && fmt_list.has_string)) + whitespace_table[' '] = '1'; + + // Create look-up table of delimiters, based on 'delimiter' + delim_table = std::string (256, '\0'); + if (eol1 >= 0 && eol1 < 256) + delim_table[eol1] = '1'; // EOL is always a delimiter + if (eol2 >= 0 && eol2 < 256) + delim_table[eol2] = '1'; // EOL is always a delimiter + if (! have_delims) + for (unsigned int i = 0; i < 256; i++) + { + if (isspace (i)) + delim_table[i] = '1'; + } + else + for (unsigned int i = 0; i < delims.length (); i++) + delim_table[delims[i]] = '1'; +} + +// Skip comments, and characters specified by the "Whitespace" option. +// If EOLstop == true, don't skip end of line. + +int +textscan::skip_whitespace (delimited_stream& is, bool EOLstop) +{ + int c1 = std::istream::traits_type::eof (); + bool found_comment = false; + + do + { + found_comment = false; + int prev = -1; + while (is && (c1 = is.get_undelim ()) != std::istream::traits_type::eof () + && ( ( (c1 == eol1 || c1 == eol2) && ++lines && ! EOLstop) + || isspace (c1))) + { + if (prev == eol1 && eol1 != eol2 && c1 == eol2) + lines--; + prev = c1; + } + + if (c1 == comment_char) // see if we match an open comment + { + // save stream state in case we have to restore it + char *pos = is.tellg (); + std::ios::iostate state = is.rdstate (); + + std::string tmp (comment_len, '\0'); + char *look = is.read (&tmp[0], comment_len-1, pos); // already read first char + if (is && ! strncmp (comment_style(0).string_value ().substr (1) + .c_str (), look, comment_len-1)) + { + found_comment = true; + + std::string dummy; + if (comment_style.numel () == 1) // skip to end of line + { + std::string eol (3, '\0'); + eol[0] = eol1; + eol[1] = eol2; + + scan_caret (is, eol, dummy); + c1 = is.get_undelim (); + if (c1 == eol1 && eol1 != eol2 && is.peek_undelim () == eol2) + is.get_undelim (); + lines++; + } + else // matching pair + { + std::string end_c = comment_style(1).string_value (); + // last char of end-comment sequence + std::string last = end_c.substr (end_c.size () - 1); + std::string may_match (""); + do + { // find sequence ending with last char + scan_caret (is, last, dummy); + is.get_undelim (); // (read LAST itself) + + may_match = may_match + dummy + last; + if (may_match.length () > end_c.length ()) + { + size_t start = may_match.length () - end_c.length (); + may_match = may_match.substr (start); + } + } + while (may_match != end_c && is && ! is.eof ()); + } + } + else // wasn't really a comment; restore state + { + is.clear (state); + is.seekg (pos); + } + } + } + while (found_comment); + + if (c1 != std::istream::traits_type::eof ()) + is.putback (c1); + return c1; +} + +// See if the next few characters match one of the strings in target. +// For efficiency, MAX_LEN is the cached longest length of any target. +// Return -1 if none is found, or the index of the match. + +int +textscan::lookahead (delimited_stream& is, const Cell& targets, int max_len, + bool case_sensitive) const +{ + // target strings may be different sizes. + // Read ahead longest, put it all back, then re-read the string + // that matches. + + char *pos = is.tellg (); + + std::string tmp (max_len, '\0'); + char *look = is.read (&tmp[0], tmp.size (), pos); + + is.clear (); + is.seekg (pos); // reset to position before look-ahead + // FIXME pos may be corrupted by is.read + + int i; + int (*compare)(const char *, const char *, size_t); + compare = case_sensitive ? strncmp : strncasecmp; + + for (i = 0; i < targets.numel (); i++) + { + std::string s = targets (i).string_value (); + if (! (*compare) (s.c_str (), look, s.size ())) + { + is.read (&tmp[0], s.size (), pos); // read just the right amount + break; + } + } + + if (i == targets.numel ()) + i = -1; + + return i; +} + +// Skip delimiters -- multiple if MultipleDelimsAsOne specified. +int +textscan::skip_delim (delimited_stream& is) +{ + int c1 = skip_whitespace (is, true); // 'true': stop once EOL is read + if (delim_list.numel () == 0) // single character delimiter + { + if (is_delim (c1) || c1 == eol1 || c1 == eol2) + { + is.get (); + if (c1 == eol1 && is.peek_undelim () == eol2) + is.get_undelim (); // if \r\n, skip the \n too. + + if (multiple_delims_as_one) + { + int prev = -1; + // skip multiple delims. + // Increment lines for each end-of-line seen; for \r\n, decrement + while (is && ((c1 = is.get_undelim ()) + != std::istream::traits_type::eof ()) + && (((c1 == eol1 || c1 == eol2) && ++lines) + || isspace (c1) || is_delim (c1))) + { + if (prev == eol1 && eol1 != eol2 && c1 == eol2) + lines--; + prev = c1; + } + if (c1 != std::istream::traits_type::eof ()) + is.putback (c1); + } + } + } + else // multi-character delimiter + { + int first_match; + + if (c1 == eol1 || c1 == eol2 + || (-1 != (first_match = lookahead (is, delim_list, delim_len)))) + { + if (c1 == eol1) + { + is.get_undelim (); + if (is.peek_undelim () == eol2) + is.get_undelim (); + } + else if (c1 == eol2) + { + is.get_undelim (); + } + + if (multiple_delims_as_one) + { + int prev = -1; + // skip multiple delims. + // Increment lines for each end-of-line seen; for \r\n, decrement + while (is && ((c1 = skip_whitespace (is, true)) + != std::istream::traits_type::eof ()) + && (((c1 == eol1 || c1 == eol2) && ++lines) + || -1 != lookahead (is, delim_list, delim_len))) + { + if (prev == eol1 && eol1 != eol2 && c1 == eol2) + lines--; + prev = c1; + } + } + } + } + + return c1; +} + +// Read in as much of the input as coincides with the literal in the +// format string. Return "true" if the entire literal is matched, else +// false (and set failbit). + +bool +textscan::match_literal (delimited_stream& is, const textscan_format_elt& fmt) +{ + // "false" -> treat EOL as normal space + // since a delimiter at the start of a line is a mismatch, not empty field + skip_whitespace (is, false); + + for (unsigned int i = 0; i < fmt.width; i++) + { + int ch = is.get_undelim (); + if (ch != fmt.text[i]) + { + if (ch != std::istream::traits_type::eof ()) + is.putback (ch); + is.setstate (std::ios::failbit); + return false; + } + } + return true; +} + void octave_base_stream::error (const std::string& msg) { diff -r 24aab5c342bf -r 7a19c5678f91 libinterp/corefcn/oct-stream.h --- a/libinterp/corefcn/oct-stream.h Sat Mar 19 09:20:05 2016 -0400 +++ b/libinterp/corefcn/oct-stream.h Sat Mar 19 11:21:45 2016 -0400 @@ -25,11 +25,9 @@ #include "octave-config.h" -#include #include #include #include -#include #include // These are only needed as arguments to private functions, so they @@ -37,20 +35,177 @@ class scanf_format_elt; class scanf_format_list; + class printf_format_elt; class printf_format_list; +class delimited_stream; +class textscan_format_elt; +class textscan_format_list; + // These only appear as reference arguments or return values. template class Array; +class octave_value_list; class string_vector; -class octave_value; -class octave_value_list; #include "data-conv.h" #include "mach-info.h" #include "oct-refcount.h" +#include "Cell.h" +#include "ov.h" + +// Main class to implement textscan. Read data and parse it +// according to a format. +// +// The calling sequence is +// +// textscan scanner (); +// scanner.scan (...); + +class +OCTINTERP_API +textscan +{ +public: + + textscan (void); + + ~textscan (void) { } + + octave_value scan (std::istream& isp, const octave_value_list& args); + + octave_value scan (std::istream& isp, const octave_value_list& args, + octave_idx_type& count); + +private: + + friend class textscan_format_list; + + std::string buf; + + // Three cases for delim_table and delim_list + // 1. delim_table empty, delim_list empty: whitespace delimiters + // 2. delim_table = look-up table of delim chars, delim_list empty. + // 3. delim_table non-empty, delim_list = Cell array of delim strings + + std::string whitespace_table; + + // delim_table[i] == '\0' if i is not a delimiter. + std::string delim_table; + + // String of delimiter characters. + std::string delims; + + Cell comment_style; + + // How far ahead to look to detect an open comment. + int comment_len; + + // First character of open comment. + int comment_char; + + octave_idx_type buffer_size; + + std::string date_locale; + + // 'inf' and 'nan' for formatted_double. + Cell inf_nan; + + // Array of strings of delimiters. + Cell delim_list; + + // Longest delimiter. + int delim_len; + + octave_value empty_value; + std::string exp_chars; + int header_lines; + Cell treat_as_empty; + + // Longest string to treat as "N/A". + int treat_as_empty_len; + + std::string whitespace; + + short eol1; + short eol2; + short return_on_error; + + bool collect_output; + bool multiple_delims_as_one; + bool default_exp; + bool numeric_delim; + + octave_idx_type lines; + + octave_value do_scan (std::istream& isp, textscan_format_list& fmt_list, + octave_idx_type ntimes); + + void parse_options (const octave_value_list& args, + textscan_format_list& fmt_list); + + int read_format_once (delimited_stream& isp, textscan_format_list& fmt_list, + std::list& retval, + Array row, int& done_after); + + void scan_one (delimited_stream& is, const textscan_format_elt& fmt, + octave_value& ov, Array row); + + // Methods to process a particular conversion specifier. + double read_double (delimited_stream& is, + const textscan_format_elt& fmt) const; + + void scan_complex (delimited_stream& is, const textscan_format_elt& fmt, + Complex& val) const; + + int scan_bracket (delimited_stream& is, const std::string& pattern, + std::string& val) const; + + int scan_caret (delimited_stream& is, const std::string& pattern, + std::string& val) const; + + void scan_string (delimited_stream& is, const textscan_format_elt& fmt, + std::string& val) const; + + void scan_cstring (delimited_stream& is, const textscan_format_elt& fmt, + std::string& val) const; + + void scan_qstring (delimited_stream& is, const textscan_format_elt& fmt, + std::string& val); + + // Helper methods. + std::string read_until (delimited_stream& is, const Cell& delimiters, + const std::string& ends) const; + + int lookahead (delimited_stream& is, const Cell& targets, int max_len, + bool case_sensitive = true) const; + + bool match_literal (delimited_stream& isp, const textscan_format_elt& elem); + + int skip_whitespace (delimited_stream& is, bool EOLstop = false); + + int skip_delim (delimited_stream& is); + + bool is_delim (unsigned char ch) const + { + return ((delim_table.empty () && (isspace (ch) || ch == eol1 || ch == eol2)) + || delim_table[ch] != '\0'); + } + + bool isspace (unsigned int ch) const { return whitespace_table[ch & 0xff]; } + + // True if the only delimiter is whitespace. + bool whitespace_delim (void) const { return delim_table.empty (); } + + // No copying! + + textscan (const textscan&); + + textscan& operator = (const textscan&); +}; + // Provide an interface for Octave streams. class diff -r 24aab5c342bf -r 7a19c5678f91 libinterp/corefcn/textscan.cc --- a/libinterp/corefcn/textscan.cc Sat Mar 19 09:20:05 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3786 +0,0 @@ -/* - -Copyright (C) 2015-2016 Lachlan Andrew, Monash University - -This file is part of Octave. - -Octave is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -Octave is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with Octave; see the file COPYING. If not, see -. - -*/ - -#ifdef HAVE_CONFIG_H -# include -#endif - -#include -#include - -#include "Cell.h" -#include "defun.h" -#include "oct-stream.h" -#include "oct-strstrm.h" -#include "ov.h" -#include "ovl.h" -#include "textscan.h" -#include "utils.h" - -// Delimited stream, optimised to read strings of characters separated -// by single-character delimiters. -// -// The reason behind this class is that octstream doesn't provide -// seek/tell, but the opportunity has been taken to optimise for the -// textscan workload. -// -// The function reads chunks into a 4kiB buffer, and marks where the -// last delimiter occurs. Reads up to this delimiter can be fast. -// After that last delimiter, the remaining text is moved to the front -// of the buffer and the buffer is refilled. This also allows cheap -// seek and tell operations within a "fast read" block. - -class -delimited_stream -{ -public: - - delimited_stream (std::istream& is, const std::string& delimiters, - int longest_lookahead, octave_idx_type bsize = 4096); - - delimited_stream (std::istream& is, const delimited_stream& ds); - - ~delimited_stream (void); - - // Called when optimised sequence of get is finished. Ensures that - // there is a remaining delimiter in buf, or loads more data in. - void field_done (void) - { - if (idx >= last) - refresh_buf (); - } - - // Load new data into buffer, and set eob, last, idx. - // Return EOF at end of file, 0 otherwise. - int refresh_buf (void); - - // Get a character, relying on caller to call field_done if - // a delimiter has been reached. - int get (void) - { - if (delimited) - return eof () ? std::istream::traits_type::eof () : *idx++; - else - return get_undelim (); - } - - // Get a character, checking for underrun of the buffer. - int get_undelim (void); - - // Read character that will be got by the next get. - int peek (void) { return eof () ? std::istream::traits_type::eof () : *idx; } - - // Read character that will be got by the next get. - int peek_undelim (void); - - // Undo a 'get' or 'get_undelim'. It is the caller's responsibility - // to avoid overflow by calling putbacks only for a character got by - // get() or get_undelim(), with no intervening - // get, get_delim, field_done, refresh_buf, getline, read or seekg. - void putback (char /*ch*/ = 0) { if (! eof ()) --idx; } - - int getline (std::string& dest, char delim); - - // int skipline (char delim); - - char *read (char *buffer, int size, char* &new_start); - - // Return a position suitable to "seekg", valid only within this - // block between calls to field_done. - char *tellg (void) { return idx; } - - void seekg (char *old_idx) { idx = old_idx; } - - bool eof (void) - { - return (eob == buf && i_stream.eof ()) || (flags & std::ios_base::eofbit); - } - - operator const void* (void) { return (! eof () && ! flags) ? this : 0; } - - bool fail (void) { return flags & std::ios_base::failbit; } - - std::ios_base::iostate rdstate (void) { return flags; } - - void setstate (std::ios_base::iostate m) { flags = flags | m; } - - void clear (std::ios_base::iostate m - = (std::ios_base::eofbit & ~std::ios_base::eofbit)) - { - flags = flags & m; - } - - // Report if any characters have been consumed. - // (get, read etc. not cancelled by putback or seekg) - - void progress_benchmark (void) { progress_marker = idx; } - - bool no_progress (void) { return progress_marker == idx; } - -private: - - // Number of characters to read from the file at once. - int bufsize; - - // Stream to read from. - std::istream& i_stream; - - // Temporary storage for a "chunk" of data. - char *buf; - - // Current read pointer. - char *idx; - - // Location of last delimiter in the buffer at buf (undefined if - // delimited is false). - char *last; - - // Position after last character in buffer. - char *eob; - - // True if there is delimiter in the bufer after idx. - bool delimited; - - // Longest lookahead required. - int longest; - - // Sequence of single-character delimiters. - const std::string delims; - - // Position of start of buf in original stream. - std::streampos buf_in_file; - - // Marker to see if a read consumes any characters. - char *progress_marker; - - std::ios_base::iostate flags; - - // No copying! - - delimited_stream (const delimited_stream&); - - delimited_stream& operator = (const delimited_stream&); -}; - -// Create a delimited stream, reading from is, with delimiters delims, -// and allowing reading of up to tellg + longest_lookeahead. When is -// is at EOF, lookahead may be padded by ASCII nuls. - -delimited_stream::delimited_stream (std::istream& is, - const std::string& delimiters, - int longest_lookahead, - octave_idx_type bsize) - : bufsize (bsize), i_stream (is), longest (longest_lookahead), - delims (delimiters), - flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 -{ - buf = new char[bufsize]; - eob = buf + bufsize; - idx = eob; // refresh_buf shouldn't try to copy old data - progress_marker = idx; - refresh_buf (); // load the first batch of data -} - -// Used to create a stream from a strstream from data read from a dstr. -// FIXME: Find a more efficient approach. Perhaps derived dstr -delimited_stream::delimited_stream (std::istream& is, - const delimited_stream& ds) - : bufsize (ds.bufsize), i_stream (is), longest (ds.longest), - delims (ds.delims), - flags (std::ios::failbit & ~std::ios::failbit) // can't cast 0 -{ - buf = new char[bufsize]; - eob = buf + bufsize; - idx = eob; // refresh_buf shouldn't try to copy old data - progress_marker = idx; - refresh_buf (); // load the first batch of data -} - -delimited_stream::~delimited_stream (void) -{ - // Seek to the correct position in i_stream. - if (! eof ()) - { - i_stream.clear (); - i_stream.seekg (buf_in_file); - i_stream.read (buf, idx - buf); - } - - delete [] buf; -} - -// Read a character from the buffer, refilling the buffer from the file -// if necessary. - -int -delimited_stream::get_undelim (void) -{ - int retval; - if (eof ()) - { - setstate (std::ios_base::failbit); - return std::istream::traits_type::eof (); - } - - if (idx < eob) - retval = *idx++; - else - { - refresh_buf (); - - if (eof ()) - { - setstate (std::ios_base::eofbit); - retval = std::istream::traits_type::eof (); - } - else - retval = *idx++; - } - - if (idx >= last) - delimited = false; - - return retval; -} - -// Return the next character to be read without incrementing the -// pointer, refilling the buffer from the file if necessary. - -int -delimited_stream::peek_undelim (void) -{ - int retval = get_undelim (); - putback (); - - return retval; -} - -// Copy remaining unprocessed data to the start of the buffer and load -// new data to fill it. Return EOF if the file is at EOF before -// reading any data and all of the data that has been read has been -// processed. - -int -delimited_stream::refresh_buf (void) -{ - if (eof ()) - return std::istream::traits_type::eof (); - - int retval; - - if (eob < idx) - idx = eob; - - size_t old_remaining = eob - idx; - - octave_quit (); // allow ctrl-C - - if (old_remaining > 0) - memmove (buf, idx, old_remaining); - - progress_marker -= idx - buf; // where original idx would have been - idx = buf; - - int gcount; // chars read - if (! i_stream.eof ()) - { - buf_in_file = i_stream.tellg (); // record for destructor - i_stream.read (buf + old_remaining, bufsize - old_remaining); - gcount = i_stream.gcount (); - } - else - gcount = 0; - - eob = buf + old_remaining + gcount; - last = eob; - if (gcount == 0) - { - delimited = false; - - if (eob != buf) // no more data in file, but still some to go - retval = 0; - else - // file and buffer are both done. - retval = std::istream::traits_type::eof (); - } - else - { - delimited = true; - - for (last = eob - longest; last - buf >= 0; last--) - { - if (delims.find (*last) != std::string::npos) - break; - } - - if (last < buf) - delimited = false; - - retval = 0; - } - - // Ensure fast peek doesn't give valid char - if (retval == std::istream::traits_type::eof ()) - *idx = '\0'; // FIXME - check that no TreatAsEmpty etc starts w. \0? - - return retval; -} - -// Return a pointer to a block of data of size size, assuming that a -// sufficiently large buffer is available in buffer, if required. -// If called when delimited == true, and size is no greater than -// longest_lookahead then this will not call refresh_buf, so seekg -// still works. Otherwise, seekg may be invalidated. - -char * -delimited_stream::read (char *buffer, int size, char* &prior_tell) -{ - char *retval; - - if (eob - idx > size) - { - retval = idx; - idx += size; - if (idx > last) - delimited = false; - } - else - { - // If there was a tellg pointing to an earlier point than the current - // read position, try to keep it in the active buffer. - // In the current code, prior_tell==idx for each call, - // so this is not necessary, just a precaution. - - if (eob - prior_tell + size < bufsize) - { - octave_idx_type gap = idx - prior_tell; - idx = prior_tell; - refresh_buf (); - idx += gap; - } - else // can't keep the tellg in range. May skip some data. - { - refresh_buf (); - } - - prior_tell = buf; - - if (eob - idx > size) - { - retval = idx; - idx += size; - if (idx > last) - delimited = false; - } - else - { - if (size <= bufsize) // small read, but reached EOF - { - retval = idx; - memset (eob, 0, size + (idx - buf)); - idx += size; - } - else // Reading more than the whole buf; return it in buffer - { - retval = buffer; - // FIXME -- read bufsize at a time - int i; - for (i = 0; i < size && ! eof (); i++) - *buffer++ = get_undelim (); - if (eof ()) - memset (buffer, 0, size - i); - } - } - } - - return retval; -} - -// Return in OUT an entire line, terminated by delim. On input, OUT -// must have length at least 1. - -int -delimited_stream::getline (std::string& out, char delim) -{ - int len = out.length (), used = 0; - int ch; - while ((ch = get_undelim ()) != delim - && ch != std::istream::traits_type::eof ()) - { - out[used++] = ch; - if (used == len) - { - len <<= 1; - out.resize (len); - } - } - out.resize (used); - field_done (); - - return ch; -} - -// A single conversion specifier, such as %f or %c. - -class -textscan_format_elt -{ -public: - - enum special_conversion - { - whitespace_conversion = 1, - literal_conversion = 2 - }; - - textscan_format_elt (const std::string& txt, int w = 0, int p = -1, - int bw = 0, bool dis = false, char typ = '\0', - const std::string& ch_class = std::string ()) - : text (txt), width (w), prec (p), bitwidth (bw), - char_class (ch_class), type (typ), discard (dis), - numeric (typ == 'd' || typ == 'u' || type == 'f' || type == 'n') - { } - - textscan_format_elt (const textscan_format_elt& e) - : text (e.text), width (e.width), prec (e.prec), - bitwidth (e.bitwidth), char_class (e.char_class), type (e.type), - discard (e.discard), numeric (e.numeric) - { } - - textscan_format_elt& operator = (const textscan_format_elt& e) - { - if (this != &e) - { - text = e.text; - width = e.width; - prec = e.prec; - bitwidth = e.bitwidth; - discard = e.discard; - type = e.type; - numeric = e.numeric; - char_class = e.char_class; - } - - return *this; - } - - // The C-style format string. - std::string text; - - // The maximum field width. - unsigned int width; - - // The maximum number of digits to read after the decimal in a - // floating point conversion. - int prec; - - // The size of the result. For integers, bitwidth may be 8, 16, 34, - // or 64. For floating point values, bitwidth may be 32 or 64. - int bitwidth; - - // The class of characters in a `[' or `^' format. - std::string char_class; - - // Type of conversion - // -- `d', `u', `f', `n', `s', `q', `c', `%', `C', `D', `[' or `^'. - char type; - - // TRUE if we are not storing the result of this conversion. - bool discard; - - // TRUE if the type is 'd', 'u', 'f', 'n' - bool numeric; -}; - -class textscan; - -// The (parsed) sequence of format specifiers. - -class -textscan_format_list -{ -public: - - textscan_format_list (const std::string& fmt = std::string ()); - - ~textscan_format_list (void); - - octave_idx_type num_conversions (void) const { return nconv; } - - // The length can be different than the number of conversions. - // For example, "x %d y %d z" has 2 conversions but the length of - // the list is 3 because of the characters that appear after the - // last conversion. - - size_t numel (void) const { return fmt_elts.size (); } - - const textscan_format_elt *first (void) - { - curr_idx = 0; - return current (); - } - - const textscan_format_elt *current (void) const - { - return numel () > 0 ? fmt_elts[curr_idx] : 0; - } - - const textscan_format_elt *next (bool cycle = true) - { - curr_idx++; - - if (curr_idx >= numel ()) - { - if (cycle) - curr_idx = 0; - else - return 0; - } - - return current (); - } - - void printme (void) const; - - bool ok (void) const { return (nconv >= 0); } - - operator const void* (void) const { return ok () ? this : 0; } - - // True if number of %f to be set from data file. - bool set_from_first; - - // At least one conversion specifier is s,q,c, or [...]. - bool has_string; - - int read_first_row (delimited_stream& is, textscan& ts); - - std::list out_buf (void) const { return (output_container); } - -private: - - // Number of conversions specified by this format string, or -1 if - // invalid conversions have been found. - octave_idx_type nconv; - - // Index to current element; - size_t curr_idx; - - // List of format elements. - std::deque fmt_elts; - - // list holding column arrays of types specified by conversions - std::list output_container; - - // Temporary buffer. - std::ostringstream *buf; - - void add_elt_to_list (unsigned int width, int prec, int bitwidth, - octave_value val_type, bool discard, - char type, - const std::string& char_class = std::string ()); - - void process_conversion (const std::string& s, size_t& i, size_t n); - - int finish_conversion (const std::string& s, size_t& i, size_t n, - unsigned int& width, int& prec, int& bitwidth, - octave_value& val_type, - bool discard, char& type); - // No copying! - - textscan_format_list (const textscan_format_list&); - - textscan_format_list& operator = (const textscan_format_list&); -}; - - -textscan_format_list::textscan_format_list (const std::string& s) - : set_from_first (false), has_string (false), nconv (0), curr_idx (0), - fmt_elts (), buf (0) -{ - size_t n = s.length (); - - size_t i = 0; - - unsigned int width = -1; // Unspecified width = max (except %c) - int prec = -1; - int bitwidth = 0; - bool discard = false; - char type = '\0'; - - bool have_more = true; - - if (s.empty ()) - { - buf = new std::ostringstream ("%f"); - bitwidth = 64; - type = 'f'; - add_elt_to_list (width, prec, bitwidth, octave_value (NDArray ()), - discard, type); - have_more = false; - set_from_first = true; - nconv = 1; - } - else - { - set_from_first = false; - - while (i < n) - { - have_more = true; - - if (! buf) - buf = new std::ostringstream (); - - if (s[i] == '%' && (i+1 == n || s[i+1] != '%')) - { - // Process percent-escape conversion type. - - process_conversion (s, i, n); - - have_more = (buf != 0); - } - else if (isspace (s[i])) - { - while (++i < n && isspace (s[i])) - /* skip whitespace */; - - have_more = false; - } - else - { - type = textscan_format_elt::literal_conversion; - - width = 0; - prec = -1; - bitwidth = 0; - discard = true; - - while (i < n && ! isspace (s[i]) - && (s[i] != '%' || (i+1 < n && s[i+1] == '%'))) - { - if (s[i] == '%') // if double %, skip one - i++; - *buf << s[i++]; - width++; - } - - add_elt_to_list (width, prec, bitwidth, octave_value (), - discard, type); - - have_more = false; - } - - if (nconv < 0) - { - have_more = false; - break; - } - } - } - - if (have_more) - add_elt_to_list (width, prec, bitwidth, octave_value (), discard, type); - - delete buf; -} - -textscan_format_list::~textscan_format_list (void) -{ - size_t n = numel (); - - for (size_t i = 0; i < n; i++) - { - textscan_format_elt *elt = fmt_elts[i]; - delete elt; - } -} - -void -textscan_format_list::add_elt_to_list (unsigned int width, int prec, - int bitwidth, octave_value val_type, - bool discard, char type, - const std::string& char_class) -{ - if (buf) - { - std::string text = buf->str (); - - if (! text.empty ()) - { - textscan_format_elt *elt - = new textscan_format_elt (text, width, prec, bitwidth, discard, type, char_class); - - if (! discard) - output_container.push_back (val_type); - - fmt_elts.push_back (elt); - } - - delete buf; - buf = 0; - } -} - -void -textscan_format_list::process_conversion (const std::string& s, size_t& i, - size_t n) -{ - unsigned width = 0; - int prec = -1; - int bitwidth = 0; - bool discard = false; - octave_value val_type; - char type = '\0'; - - *buf << s[i++]; - - bool have_width = false; - - while (i < n) - { - switch (s[i]) - { - case '*': - if (discard) - nconv = -1; - else - { - discard = true; - *buf << s[i++]; - } - break; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - if (have_width) - nconv = -1; - else - { - char c = s[i++]; - width = width * 10 + c - '0'; - have_width = true; - *buf << c; - while (i < n && isdigit (s[i])) - { - c = s[i++]; - width = width * 10 + c - '0'; - *buf << c; - } - - if (i < n && s[i] == '.') - { - *buf << s[i++]; - prec = 0; - while (i < n && isdigit (s[i])) - { - c = s[i++]; - prec = prec * 10 + c - '0'; - *buf << c; - } - } - } - break; - - case 'd': case 'u': - { - bool done = true; - *buf << (type = s[i++]); - if (i < n) - { - if (s[i] == '8') - { - bitwidth = 8; - if (type == 'd') - val_type = octave_value (int8NDArray ()); - else - val_type = octave_value (uint8NDArray ()); - *buf << s[i++]; - } - else if (s[i] == '1' && i+1 < n && s[i+1] == '6') - { - bitwidth = 16; - if (type == 'd') - val_type = octave_value (int16NDArray ()); - else - val_type = octave_value (uint16NDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else if (s[i] == '3' && i+1 < n && s[i+1] == '2') - { - done = false; // use default size below - *buf << s[i++]; - *buf << s[i++]; - } - else if (s[i] == '6' && i+1 < n && s[i+1] == '4') - { - bitwidth = 64; - if (type == 'd') - val_type = octave_value (int64NDArray ()); - else - val_type = octave_value (uint64NDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else - done = false; - } - else - done = false; - - if (! done) - { - bitwidth = 32; - if (type == 'd') - val_type = octave_value (int32NDArray ()); - else - val_type = octave_value (uint32NDArray ()); - } - goto fini; - } - - case 'f': - *buf << (type = s[i++]); - bitwidth = 64; - if (i < n) - { - if (s[i] == '3' && i+1 < n && s[i+1] == '2') - { - bitwidth = 32; - val_type = octave_value (FloatNDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else if (s[i] == '6' && i+1 < n && s[i+1] == '4') - { - val_type = octave_value (NDArray ()); - *buf << s[i++]; - *buf << s[i++]; - } - else - val_type = octave_value (NDArray ()); - } - else - val_type = octave_value (NDArray ()); - goto fini; - - case 'n': - *buf << (type = s[i++]); - bitwidth = 64; - val_type = octave_value (NDArray ()); - goto fini; - - case 's': case 'q': case '[': case 'c': - if (! discard) - val_type = octave_value (Cell ()); - *buf << (type = s[i++]); - has_string = true; - goto fini; - - fini: - { - if (! have_width) - { - if (type == 'c') // %c defaults to one character - width = 1; - else - width = static_cast (-1); // others: unlimited - } - - if (finish_conversion (s, i, n, width, prec, bitwidth, val_type, - discard, type) == 0) - return; - } - break; - - default: - error ("textscan: '%%%c' is not a valid format specifier", s[i]); - } - - if (nconv < 0) - break; - } - - nconv = -1; -} - -// Parse [...] and [^...] -// -// Matlab does not expand expressions like A-Z, but they are useful, and -// so we parse them "carefully". We treat '-' as a usual character -// unless both start and end characters are from the same class (upper -// case, lower case, numeric), or this is not the first '-' in the -// pattern. -// -// Keep both a running list of characters and a mask of which chars have -// occurred. The first is efficient for patterns with few characters. -// The latter is efficient for [^...] patterns. - -static std::string -textscan_char_class (const std::string& pattern) -{ - int len = pattern.length (); - if (len == 0) - return ""; - - std::string retval (256, '\0'); - std::string mask (256, '\0'); // number of times chr has been seen - - int in = 0, out = 0; - unsigned char ch, prev = 0; - bool flip = false; - - ch = pattern[in]; - if (ch == '^') - { - in++; - flip = true; - } - mask[pattern[in]] = '\1'; - retval[out++] = pattern[in++]; // even copy ']' if it is first - - bool prev_was_range = false; // disallow "a-m-z" as a pattern - bool prev_prev_was_range = false; - for (; in < len; in++) - { - bool was_range = false; - ch = pattern[in]; - if (ch == ']') - break; - - if (prev == '-' && in > 1 && isalnum (ch) && ! prev_prev_was_range) - { - unsigned char start_of_range = pattern[in-2]; - if (start_of_range < ch - && ((isupper (ch) && isupper (start_of_range)) - || (islower (ch) && islower (start_of_range)) - || (isdigit (ch) && isdigit (start_of_range)) - || mask['-'] > 1)) // not the first '-' - { - was_range = true; - out--; - mask['-']--; - for (int i = start_of_range; i <= ch; i++) - { - if (mask[i] == '\0') - { - mask[i] = '\1'; - retval[out++] = i; - } - } - } - } - if (! was_range) - { - if (mask[ch]++ == 0) - retval[out++] = ch; - else if (ch != '-') - warning_with_id ("octave:textscan-pattern", - "textscan: [...] contains two '%c's", ch); - - if (prev == '-' && mask['-'] >= 2) - warning_with_id ("octave:textscan-pattern", - "textscan: [...] contains two '-'s " - "outside range expressions"); - } - prev = ch; - prev_prev_was_range = prev_was_range; - prev_was_range = was_range; - } - - if (flip) // [^...] - { - out = 0; - for (int i = 0; i < 256; i++) - if (! mask[i]) - retval[out++] = i; - } - - retval.resize (out); - - return retval; -} - -int -textscan_format_list::finish_conversion (const std::string& s, size_t& i, - size_t n, unsigned int& width, - int& prec, int& bitwidth, - octave_value& val_type, bool discard, - char& type) -{ - int retval = 0; - - std::string char_class; - - size_t beg_idx = std::string::npos; - size_t end_idx = std::string::npos; - - if (type != '%') - { - nconv++; - if (type == '[') - { - if (i < n) - { - beg_idx = i; - - if (s[i] == '^') - { - type = '^'; - *buf << s[i++]; - - if (i < n) - { - beg_idx = i; - - if (s[i] == ']') - *buf << s[i++]; - } - } - else if (s[i] == ']') - *buf << s[i++]; - } - - while (i < n && s[i] != ']') - *buf << s[i++]; - - if (i < n && s[i] == ']') - { - end_idx = i-1; - *buf << s[i++]; - } - - if (s[i-1] != ']') - retval = nconv = -1; - } - } - - if (nconv >= 0) - { - if (beg_idx != std::string::npos && end_idx != std::string::npos) - char_class = textscan_char_class (s.substr (beg_idx, - end_idx - beg_idx + 1)); - - add_elt_to_list (width, prec, bitwidth, val_type, discard, type, - char_class); - } - - return retval; -} - -void -textscan_format_list::printme (void) const -{ - size_t n = numel (); - - for (size_t i = 0; i < n; i++) - { - textscan_format_elt *elt = fmt_elts[i]; - - std::cerr - << "width: " << elt->width << "\n" - << "digits " << elt->prec << "\n" - << "bitwidth: " << elt->bitwidth << "\n" - << "discard: " << elt->discard << "\n" - << "type: "; - - if (elt->type == textscan_format_elt::literal_conversion) - std::cerr << "literal text\n"; - else if (elt->type == textscan_format_elt::whitespace_conversion) - std::cerr << "whitespace\n"; - else - std::cerr << elt->type << "\n"; - - std::cerr - << "char_class: `" << undo_string_escapes (elt->char_class) << "'\n" - << "text: `" << undo_string_escapes (elt->text) << "'\n\n"; - } -} - -// If FORMAT is explicitly "", it is assumed to be "%f" repeated enough -// times to read the first row of the file. Set it now. - -int -textscan_format_list::read_first_row (delimited_stream& is, textscan& ts) -{ - // Read first line and strip end-of-line, which may be two characters - std::string first_line (20, ' '); - - is.getline (first_line, static_cast (ts.eol2)); - - if (! first_line.empty () - && first_line[first_line.length () - 1] == ts.eol1) - first_line.resize (first_line.length () - 1); - - std::istringstream strstr (first_line); - delimited_stream ds (strstr, is); - - dim_vector dv (1,1); // initial size of each output_container - Complex val; - octave_value val_type; - nconv = 0; - int max_empty = 1000; // failsafe, if ds fails but not with eof - int retval = 0; - - // read line, creating output_container as we go - while (! ds.eof ()) - { - bool already_skipped_delim = false; - ts.skip_whitespace (ds); - ds.progress_benchmark (); - bool progress = false; - ts.scan_complex (ds, *fmt_elts[0], val); - if (ds.fail ()) - { - ds.clear (ds.rdstate () & ~std::ios::failbit); - - if (ds.eof ()) - break; - - // If we don't continue after a conversion error, then - // unless this was a missing value (i.e., followed by a delimiter), - // return with an error status. - if (ts.return_on_error < 2) - { - ts.skip_delim (ds); - if (ds.no_progress ()) - { - retval = 4; - break; - } - already_skipped_delim = true; - } - else // skip offending field - { - std::ios::iostate state = ds.rdstate (); - ds.clear (); // clear to allow read pointer to advance - - std::string dummy; - textscan_format_elt fe ("", first_line.length ()); - ts.scan_string (ds, fe, dummy); - - progress = (dummy.length ()); - ds.setstate (state); - } - - val = ts.empty_value.scalar_value (); - - if (! --max_empty) - break; - } - - if (val.imag () == 0) - val_type = octave_value (NDArray (dv, val.real ())); - else - val_type = octave_value (ComplexNDArray (dv, val)); - - output_container.push_back (val_type); - - if (! already_skipped_delim) - ts.skip_delim (ds); - - if (! progress && ds.no_progress ()) - break; - - nconv++; - } - - output_container.pop_front (); // discard empty element from constructor - - // Create fmt_list now that the size is known - for (octave_idx_type i = 1; i < nconv; i++) - fmt_elts.push_back (new textscan_format_elt (*fmt_elts[0])); - - return retval; // May have returned 4 above. -} - -// Perform actual textscan: read data from stream, and create cell array. - -static Cell -init_inf_nan (void) -{ - Cell retval (dim_vector (1, 2)); - - retval(0) = Cell (octave_value ("inf")); - retval(1) = Cell (octave_value ("nan")); - - return retval; -} - -textscan::textscan (void) - : buf (), whitespace_table (), delim_table (), delims (), - comment_style (), comment_len (0), comment_char (-2), - buffer_size (0), date_locale (), inf_nan (init_inf_nan ()), - empty_value (octave_NaN), exp_chars ("edED"), - header_lines (0), treat_as_empty (), treat_as_empty_len (0), - whitespace (" \b\t"), eol1 ('\r'), eol2 ('\n'), - return_on_error (2), collect_output (false), - multiple_delims_as_one (false), default_exp (true), - numeric_delim (false), lines (0) -{ } - -octave_value -textscan::scan (std::istream& isp, const octave_value_list& args) -{ - octave_idx_type count = 0; - - return scan (isp, args, count); -} - -octave_value -textscan::scan (std::istream& isp, const octave_value_list& args, - octave_idx_type& count) -{ - std::string format; - int params = 0; - - if (args.length () == 0) - format = "%f"; // ommited format = %f. explicit "" = width from file - else if (args(0).is_string ()) - { - format = args(0).string_value (); - - if (args(0).is_sq_string ()) - format = do_string_escapes (format); - - params++; - } - else - error ("textscan: FORMAT must be a string, not <%s>", - args(0).class_name ().c_str ()); - - octave_idx_type ntimes = -1; - - if (args.length () > 1) - { - if (args(1).is_numeric_type ()) - { - ntimes = args(1).idx_type_value (); - - if (ntimes < args(1).double_value ()) - error ("textscan: REPEAT = %g is too large", - args(1).double_value ()); - - params++; - } - } - - octave_value_list tmp_args = args.splice (0, params); - - textscan_format_list fmt_list (format); - - parse_options (tmp_args, fmt_list); - - octave_value result = do_scan (isp, fmt_list, ntimes); - - // FIXME: this is probably not the best way to get count. The - // position could easily be larger than octave_idx_type when using - // 32-bit indexing. - - std::ios::iostate state = isp.rdstate (); - isp.clear (); - count = static_cast (isp.tellg ()); - isp.setstate (state); - - return result; -} - -octave_value -textscan::do_scan (std::istream& isp, textscan_format_list& fmt_list, - octave_idx_type ntimes) -{ - octave_value retval; - - if (fmt_list.num_conversions () == -1) - error ("textscan: invalid format specified"); - - if (fmt_list.num_conversions () == 0) - error ("textscan: no valid format conversion specifiers\n"); - - // skip the first header_lines - std::string dummy; - for (int i = 0; i < header_lines && isp; i++) - getline (isp, dummy, static_cast (eol2)); - - // Create our own buffered stream, for fast get/putback/tell/seek. - - // First, see how far ahead it should let us look. - int max_lookahead = std::max (std::max (comment_len, treat_as_empty_len), - std::max (delim_len, 3)); // 3 for NaN and Inf - - // Next, choose a buffer size to avoid reading too much, or too often. - octave_idx_type buf_size = 4096; - if (buffer_size) - buf_size = buffer_size; - else if (ntimes > 0) - { - // Avoid overflow of 80*ntimes... - buf_size = std::min (buf_size, std::max (ntimes, 80 * ntimes)); - buf_size = std::max (buf_size, ntimes); - } - // Finally, create the stream. - delimited_stream is (isp, whitespace + delims, max_lookahead, buf_size); - - // Grow retval dynamically. "size" is half the initial size - // (FIXME -- Should we start smaller if ntimes is large?) - octave_idx_type size = ((ntimes < 8 && ntimes >= 0) ? ntimes : 1); - Array row_idx (dim_vector (1,2)); - row_idx(1) = 0; - - int err = 0; - octave_idx_type row = 0; - - if (multiple_delims_as_one) // bug #44750? - skip_delim (is); - - int done_after; // Number of columns read when EOF seen. - - // If FORMAT explicitly "", read first line and see how many "%f" match - if (fmt_list.set_from_first) - { - err = fmt_list.read_first_row (is, *this); - lines = 1; - - done_after = fmt_list.numel () + 1; - if (! err) - row = 1; // the above puts the first line into fmt_list.out_buf () - } - else - done_after = fmt_list.out_buf ().size () + 1; - - std::list out = fmt_list.out_buf (); - - // We will later merge adjacent columns of the same type. - // Check now which columns to merge. - // Reals may become complex, and so we can't trust types - // after reading in data. - // If the format was "", that conversion may already have happened, - // so force all to be merged (as all are %f). - bool merge_with_prev[fmt_list.numel ()]; - int conv = 0; - if (collect_output) - { - int prev_type = -1; - for (std::list::iterator col = out.begin (); - col != out.end (); col++) - { - if (col->type_id () == prev_type - || (fmt_list.set_from_first && prev_type != -1)) - merge_with_prev [conv++] = true; - else - merge_with_prev [conv++] = false; - - prev_type = col->type_id (); - } - } - - // This should be caught by earlier code, but this avoids a possible - // infinite loop below. - if (fmt_list.num_conversions () == 0) - error ("textscan: No conversions specified"); - - // Read the data. This is the main loop. - if (! err) - { - for (/* row set ~30 lines above */; row < ntimes || ntimes == -1; row++) - { - if (row == 0 || row >= size) - { - size += size+1; - for (std::list::iterator col = out.begin (); - col != out.end (); col++) - *col = (*col).resize (dim_vector (size, 1), 0); - } - - row_idx(0) = row; - err = read_format_once (is, fmt_list, out, row_idx, done_after); - - if (err > 0 || ! is || (lines >= ntimes && ntimes > -1)) - break; - } - } - - if ((err & 4) && ! return_on_error) - error ("textscan: Read error in field %d of row %d", - done_after + 1, row + 1); - - // If file does not end in EOL, do not pad columns with NaN. - bool uneven_columns = false; - if (err & 4) - uneven_columns = true; - else if (isp.eof ()) - { - isp.clear (); - isp.seekg (-1, std::ios_base::end); - int last_char = isp.get (); - isp.setstate (isp.eofbit); - uneven_columns = (last_char != eol1 && last_char != eol2); - } - - // convert return value to Cell array - Array ra_idx (dim_vector (1,2)); - - // (err & 1) means "error, and no columns read this row - // FIXME -- This may redundant now that done_after=0 says the same - if (err & 1) - done_after = out.size () + 1; - - int valid_rows = (row == ntimes) ? ntimes : ((err & 1) ? row : row+1); - dim_vector dv (valid_rows, 1); - - ra_idx(0) = 0; - int i = 0; - if (! collect_output) - { - retval = Cell (dim_vector (1, out.size ())); - for (std::list::iterator col = out.begin (); - col != out.end (); col++, i++) - { - // trim last columns if that was requested - if (i == done_after && uneven_columns) - dv = dim_vector (std::max (valid_rows - 1, 0), 1); - - ra_idx(1) = i; - retval = do_cat_op (retval, octave_value (Cell (col->resize (dv,0))), - ra_idx); - } - } - else // group adjacent cells of the same type into a single cell - { - octave_value cur; // current cell, accumulating columns - octave_idx_type group_size = 0; // columns in this cell - int prev_type = -1; - - conv = 0; - retval = Cell (); - for (std::list::iterator col = out.begin (); - col != out.end (); col++) - { - if (! merge_with_prev [conv++]) // including first time - { - if (prev_type != -1) - { - ra_idx(1) = i++; - retval = do_cat_op (retval, octave_value (Cell (cur)), - ra_idx); - } - cur = octave_value (col->resize (dv,0)); - group_size = 1; - prev_type = col->type_id (); - } - else - { - ra_idx(1) = group_size++; - cur = do_cat_op (cur, octave_value (col->resize (dv,0)), - ra_idx); - } - } - ra_idx(1) = i; - retval = do_cat_op (retval, octave_value (Cell (cur)), ra_idx); - } - - return retval; -} - -// Calculate x^n. Used for ...e+nn so that, for example, 1e2 is -// exactly 100 and 5e-1 is 1/2 - -static double -pown (double x, unsigned int n) -{ - double retval = 1; - - for (unsigned int d = n; d; d >>= 1) - { - if (d & 1) - retval *= x; - x *= x; - } - - return retval; -} - -// Read a double considering the "precision" field of FMT and the -// EXP_CHARS option of OPTIONS. - -double -textscan::read_double (delimited_stream& is, - const textscan_format_elt& fmt) const -{ - int sign = 1; - unsigned int width_left = fmt.width; - double retval = 0; - bool valid = false; // syntactically correct double? - - int ch = is.peek (); - - if (ch == '+') - { - is.get (); - ch = is.peek (); - if (width_left) - width_left--; - } - else if (ch == '-') - { - sign = -1; - is.get (); - ch = is.peek (); - if (width_left) - width_left--; - } - - // Read integer part - if (ch != '.') - { - if (ch >= '0' && ch <= '9') // valid if at least one digit - valid = true; - while (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') - retval = retval * 10 + (ch - '0'); - width_left++; - } - - // Read fractional part, up to specified precision - if (ch == '.' && width_left) - { - double multiplier = 1; - int precision = fmt.prec; - int i; - - if (width_left) - width_left--; // Consider width of '.' - - if (precision == -1) - precision = 1<<30; // FIXME Should be MAXINT - - if (! valid) // if there was nothing before '.'... - is.get (); // ...ch was a "peek", not "get". - - for (i = 0; i < precision; i++) - { - if (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') - retval += (ch - '0') * (multiplier *= 0.1); - else - { - width_left++; - break; - } - } - - // round up if we truncated and the next digit is >= 5 - if ((i == precision || ! width_left) && (ch = is.get ()) >= '5' - && ch <= '9') - retval += multiplier; - - if (i > 0) - valid = true; // valid if at least one digit after '.' - - // skip remainder after '.', to field width, to look for exponent - if (i == precision) - while (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') - ; // discard - - width_left++; - } - - // look for exponent part in, e.g., 6.023E+23 - bool used_exp = false; - if (valid && width_left > 1 && exp_chars.find (ch) != std::string::npos) - { - int ch1 = is.peek (); - if (ch1 == '-' || ch1 == '+' || (ch1 >= '0' && ch1 <= '9')) - { // if 1.0e+$ or some such, this will set failbit, as we want - width_left--; // count "E" - int exp = 0; - int exp_sign = 1; - if (ch1 == '+') - { - if (width_left) - width_left--; - is.get (); - } - else if (ch1 == '-') - { - exp_sign = -1; - is.get (); - if (width_left) - width_left--; - } - valid = false; - while (width_left-- && is && (ch = is.get ()) >= '0' && ch1 <= '9') - { - exp = exp*10 + ch - '0'; - valid = true; - } - width_left++; - if (ch != std::istream::traits_type::eof () && width_left) - is.putback (ch); - - double multiplier = pown (10, exp); - if (exp_sign > 0) - retval *= multiplier; - else - retval /= multiplier; - - used_exp = true; - } - } - is.clear (); - if (! used_exp && ch != std::istream::traits_type::eof () && width_left) - is.putback (ch); - - // Check for +/- inf and NaN - if (! valid && width_left >= 3) - { - int i = lookahead (is, inf_nan, 3, false); // false -> case insensitive - if (i == 0) - { - retval = octave_Inf; - valid = true; - } - else if (i == 1) - { - retval = octave_NaN; - valid = true; - } - } - - // Check for +/- inf and NaN - if (! valid && width_left >= 3) - { - int i = lookahead (is, inf_nan, 3, false); // false -> case insensitive - if (i == 0) - { - retval = octave_Inf; - valid = true; - } - else if (i == 1) - { - retval = octave_NaN; - valid = true; - } - } - - if (! valid) - is.setstate (std::ios::failbit); - else - is.setstate (is.rdstate () & ~std::ios::failbit); - - return retval * sign; -} - -// Read a single number: real, complex, inf, NaN, possibly with limited -// precision. Calls to this should be preceded by skip_whitespace. -// Calling that inside scan_complex would violate its const declaration. - -void -textscan::scan_complex (delimited_stream& is, const textscan_format_elt& fmt, - Complex& val) const -{ - double im = 0; - double re = 0; - bool as_empty = false; // did we fail but match a "treat_as_empty" string? - bool inf = false; - - int ch = is.peek (); - if (ch == '+' || ch == '-') // check for [+-][ij] with no coefficients - { - ch = is.get (); - int ch2 = is.peek (); - if (ch2 == 'i' || ch2 == 'j') - { - double value = 1; - is.get (); - // Check not -inf - if (is.peek () == 'n') - { - char *pos = is.tellg (); - std::ios::iostate state = is.rdstate (); - - is.get (); - ch2 = is.get (); - if (ch2 == 'f') - { - inf = true; - re = (ch == '+') ? octave_Inf : -octave_Inf; - value = 0; - } - else - { - is.clear (state); - is.seekg (pos); // reset to position before look-ahead - } - } - - im = (ch == '+') ? value : -value; - } - else - is.putback (ch); - } - - if (! im && ! inf) // if not [+-][ij] or [+-]inf, read real normally - { - char *pos = is.tellg (); - std::ios::iostate state = is.rdstate (); - //re = octave_read_value (is); - re = read_double (is, fmt); - - // check for "treat as empty" string - if (treat_as_empty.numel () - && (is.fail () || octave_is_NaN_or_NA (Complex (re)) - || re == octave_Inf)) - { - - for (int i = 0; i < treat_as_empty.numel (); i++) - { - if (ch == treat_as_empty (i).string_value ()[0]) - { - as_empty = true; // first char matches, so read the lot - break; - } - } - if (as_empty) // if first char matched... - { - as_empty = false; // ...look for the whole string - - is.clear (state); // treat_as_empty "-" causes partial read - is.seekg (pos); // reset to position before failed read - - // treat_as_empty strings may be different sizes. - // Read ahead longest, put it all back, then re-read the string - // that matches. - std::string look_buf (treat_as_empty_len, '\0'); - char *look = is.read (&look_buf[0], look_buf.size (), pos); - - is.clear (state); - is.seekg (pos); // reset to position before look-ahead - // FIXME -- is.read could invalidate pos - - for (int i = 0; i < treat_as_empty.numel (); i++) - { - std::string s = treat_as_empty (i).string_value (); - if (! strncmp (s.c_str (), look, s.size ())) - { - as_empty = true; - // read just the right amount - is.read (&look_buf[0], s.size (), pos); - break; - } - } - } - } - - if (! is.eof () && ! as_empty) - { - state = is.rdstate (); // before tellg, since that fails at EOF - pos = is.tellg (); - ch = is.peek (); // ch == EOF if read failed; no need to chk fail - if (ch == 'i' || ch == 'j') // pure imaginary - { - is.get (); - im = re; - re = 0; - } - else if (ch == '+' || ch == '-') // see if it is real+imag[ij] - { - // save stream state in case we have to restore it - pos = is.tellg (); - state = is.rdstate (); - - //im = octave_read_value (is); - im = read_double (is, fmt); - if (is.fail ()) - im = 1; - - if (is.peek () == 'i' || is.peek () == 'j') - is.get (); - else - { - im = 0; // no valid imaginary part. Restore state - is.clear (state); // eof shouldn't cause fail. - is.seekg (pos); - } - } - else if (is.eof ()) // we've read enough to be a "valid" read - is.clear (state); // failed peek shouldn't cause fail - } - } - if (as_empty) - val = empty_value.scalar_value (); - else - val = Complex (re, im); -} - -// Return in VAL the run of characters from IS NOT contained in PATTERN. - -int -textscan::scan_caret (delimited_stream& is, const std::string& pattern, - std::string& val) const -{ - int c1 = std::istream::traits_type::eof (); - std::ostringstream obuf; // Is this optimised for growing? - - while (is && ((c1 = (is && ! is.eof ()) - ? is.get_undelim () - : std::istream::traits_type::eof ()) - != std::istream::traits_type::eof ()) - && pattern.find (c1) == std::string::npos) - obuf << static_cast (c1); - - val = obuf.str (); - - if (c1 != std::istream::traits_type::eof ()) - is.putback (c1); - - return c1; -} - -// Read until one of the strings in DELIMITERS is found. For -// efficiency, ENDS is a list of the last character of each delimiter. - -std::string -textscan::read_until (delimited_stream& is, const Cell& delimiters, - const std::string& ends) const -{ - std::string retval (""); - bool done = false; - do - { // find sequence ending with an ending char - std::string next; - scan_caret (is, ends.c_str (), next); - retval = retval + next; // FIXME -- could use repeated doubling of size - - int last = (! is.eof () - ? is.get_undelim () : std::istream::traits_type::eof ()); - - if (last != std::istream::traits_type::eof ()) - { - retval = retval + static_cast (last); - for (int i = 0; i < delimiters.numel (); i++) - { - std::string delim = delimiters(i).string_value (); - size_t start = (retval.length () > delim.length () - ? retval.length () - delim.length () - : 0); - std::string may_match = retval.substr (start); - if (may_match == delim) - { - done = true; - retval = retval.substr (0, start); - break; - } - } - } - } - while (! done && is && ! is.eof ()); - - return retval; -} - - -// Read stream until either fmt.width chars have been read, or -// options.delimiter has been found. Does *not* rely on fmt being 's'. -// Used by formats like %6f to limit to 6. - -void -textscan::scan_string (delimited_stream& is, const textscan_format_elt& fmt, - std::string& val) const -{ - if (delim_list.numel () == 0) - { - unsigned int i = 0; - unsigned int width = fmt.width; - - for (i = 0; i < width; i++) - { - if (i+1 > val.length ()) - val = val + val + ' '; // grow even if empty - int ch = is.get (); - if (is_delim (ch) || ch == std::istream::traits_type::eof ()) - { - is.putback (ch); - break; - } - else - val[i] = ch; - } - val = val.substr (0, i); // trim pre-allocation - } - else // Cell array of multi-character delimiters - { - std::string ends (""); - for (int i = 0; i < delim_list.numel (); i++) - { - std::string tmp = delim_list(i).string_value (); - ends += tmp.substr (tmp.length () - 1); - } - val = textscan::read_until (is, delim_list, ends); - } -} - -// Return in VAL the run of characters from IS contained in PATTERN. - -int -textscan::scan_bracket (delimited_stream& is, const std::string& pattern, - std::string& val) const -{ - int c1 = std::istream::traits_type::eof (); - std::ostringstream obuf; // Is this optimised for growing? - - while (is && pattern.find (c1 = is.get_undelim ()) != std::string::npos) - obuf << static_cast (c1); - - val = obuf.str (); - if (c1 != std::istream::traits_type::eof ()) - is.putback (c1); - return c1; -} - -// Return in VAL a string, either delimited by whitespace/delimiters, or -// enclosed in a pair of double quotes ("..."). Enclosing quotes are -// removed. A consecutive pair "" is inserted into VAL as a single ". - -void -textscan::scan_qstring (delimited_stream& is, const textscan_format_elt& fmt, - std::string& val) -{ - skip_whitespace (is); - - if (is.peek () != '\"') - scan_string (is, fmt, val); - else - { - is.get (); - scan_caret (is, "\"", val); // read everything until " - is.get (); // swallow " - - while (is && is.peek () == '\"') // if double ", insert one in stream, - { // and keep looking for single " - is.get (); - std::string val1; - scan_caret (is, "\"", val1); - val = val + "\"" + val1; - is.get_undelim (); - } - } -} - -// Read from IS into VAL a string of the next fmt.width characters, -// including any whitespace or delimiters. - -void -textscan::scan_cstring (delimited_stream& is, const textscan_format_elt& fmt, - std::string& val) const -{ - val.resize (fmt.width); - - for (unsigned int i = 0; is && i < fmt.width; i++) - { - int ch = is.get_undelim (); - if (ch != std::istream::traits_type::eof ()) - val[i] = ch; - else - { - val.resize (i); - break; - } - } -} - - -// Read a single '%...' conversion and place it in position ROW of OV. - -void -textscan::scan_one (delimited_stream& is, const textscan_format_elt& fmt, - octave_value& ov, Array row) -{ - skip_whitespace (is); - - is.clear (); - - octave_value val; - if (fmt.numeric) - { - if (fmt.type == 'f' || fmt.type == 'n') - { - Complex v; - skip_whitespace (is); - scan_complex (is, fmt, v); - - if (! fmt.discard && ! is.fail ()) - { - if (fmt.bitwidth == 64) - { - if (ov.is_real_type () && v.imag () == 0) - ov.internal_rep ()->fast_elem_insert (row(0), v.real ()); - else - { - if (ov.is_real_type ()) // cat does type conversion - ov = do_cat_op (ov, octave_value (v), row); - else - ov.internal_rep ()->fast_elem_insert (row(0), v); - } - } - else - { - if (ov.is_real_type () && v.imag () == 0) - ov.internal_rep ()->fast_elem_insert (row(0), - float (v.real ())); - else - { - if (ov.is_real_type ()) // cat does type conversion - ov = do_cat_op (ov, octave_value (v), row); - else - ov.internal_rep ()->fast_elem_insert (row(0), - FloatComplex (v)); - } - } - } - } - else - { - double v; // Matlab docs say 1e30 etc should be valid for %d and - // 1000 as a %d8 should be 127, so read as double. - // Some loss of precision for d64 and u64. - skip_whitespace (is); - v = read_double (is, fmt); - if (! fmt.discard && ! is.fail ()) - switch (fmt.bitwidth) - { - case 64: - switch (fmt.type) - { - case 'd': - { - octave_int64 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - break; - - case 'u': - { - octave_uint64 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - break; - } - break; - - case 32: - switch (fmt.type) - { - case 'd': - { - octave_int32 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - break; - - case 'u': - { - octave_uint32 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - break; - } - break; - - case 16: - if (fmt.type == 'd') - { - octave_int16 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - else - { - octave_uint16 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - break; - - case 8: - if (fmt.type == 'd') - { - octave_int8 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - else - { - octave_uint8 vv = v; - ov.internal_rep ()->fast_elem_insert (row(0), vv); - } - break; - } - } - - if (is.fail ()) - { - if (! fmt.discard) - ov = do_cat_op (ov, empty_value, row); - - // If we are continuing after errors, skip over this field - if (return_on_error == 2) - { - std::ios::iostate state = is.rdstate (); - is.clear (); // clear to allow read pointer to advance - - std::string dummy; - scan_string (is, fmt, dummy); - - is.setstate (state); - } - } - - } - else - { - std::string vv (" "); // initial buffer. Grows as needed - switch (fmt.type) - { - case 's': - scan_string (is, fmt, vv); - break; - - case 'q': - scan_qstring (is, fmt, vv); - break; - - case 'c': - scan_cstring (is, fmt, vv); - break; - - case '[': - scan_bracket (is, fmt.char_class.c_str (), vv); - break; - - case '^': - scan_caret (is, fmt.char_class.c_str (), vv); - break; - } - - if (! fmt.discard) - ov.internal_rep ()->fast_elem_insert (row (0), - Cell (octave_value (vv))); - - // FIXME -- why does failbit get set at EOF, instead of eofbit? - if (! vv.empty ()) - is.clear (is.rdstate () & ~std::ios_base::failbit); - } - - is.field_done (); -} - -// Read data corresponding to the entire format string once, placing the -// values in row ROW of retval. - -int -textscan::read_format_once (delimited_stream& is, - textscan_format_list& fmt_list, - std::list& retval, - Array row, int& done_after) -{ - const textscan_format_elt *elem = fmt_list.first (); - std::list::iterator out = retval.begin (); - bool no_conversions = true; - bool done = false; - bool conversion_failed = false; // Record for ReturnOnError - - octave_quit (); - - for (size_t i = 0; i < fmt_list.numel (); i++) - { - bool this_conversion_failed = false; - - // Clear fail of previous numeric conversions. - is.clear (); - - switch (elem->type) - { - case 'C': - case 'D': - std::cerr << "textscan: Conversion %" << elem->type - << " not yet implemented\n"; - break; - - case 'u': - case 'd': - case 'f': - case 'n': - case 's': - case '[': - case '^': - case 'q': - case 'c': - scan_one (is, *elem, *out, row); - break; - - case textscan_format_elt::literal_conversion : - match_literal (is, *elem); - break; - - default: - error ("Unknown format element '%c'", elem->type); - } - - if (! is.fail ()) - { - if (! elem->discard) - no_conversions = false; - } - else - { - if (return_on_error < 2) - this_conversion_failed = true; - - is.clear (is.rdstate () & ~std::ios::failbit); - } - - if (! elem->discard) - out++; - - elem = fmt_list.next (); - char *pos = is.tellg (); - - // FIXME -- these conversions "ignore delimiters". Should they include - // delimiters at the start of the conversion, or can those be skipped? - if (elem->type != textscan_format_elt::literal_conversion - // && elem->type != '[' && elem->type != '^' && elem->type != 'c' - ) - skip_delim (is); - - if (this_conversion_failed) - { - if (is.tellg () == pos && ! conversion_failed) - { // done_after = first failure - done_after = i; // note fail, but parse others to get empty_val - conversion_failed = true; - } - else - this_conversion_failed = false; - } - - if (is.eof ()) - { - if (! done) - done_after = i+1; - - // note EOF, but process others to get empty_val. - done = true; - } - } - - if (done) - is.setstate (std::ios::eofbit); - - // Returning -3 means "error, and no columns read this row". - if (is.eof ()) - return (2 + no_conversions); - - if (conversion_failed) - return (4 + no_conversions); - - return 0; -} - -void -textscan::parse_options (const octave_value_list& args, - textscan_format_list& fmt_list) -{ - int last = args.length (); - int n = last; - - if (n & 1) - error ("textscan: %d parameters given, but only %d values", n-n/2, n/2); - - delim_len = 1; - bool have_delims = false; - for (int i = 0; i < last; i += 2) - { - if (! args(i).is_string ()) - error ("textscan: Invalid paramter type <%s> for parameter %d", - args(i).type_name ().c_str (), i/2 + 1); - - std::string param = args(i).string_value (); - std::transform (param.begin (), param.end (), - param.begin (), ::tolower); - if (param == "delimiter") - { - bool invalid = true; - if (args(i+1).is_string ()) - { - invalid = false; - have_delims = true; - delims = args(i+1).string_value (); - } - else if (args(i+1).is_cell ()) - { - invalid = false; - delim_list = args(i+1).cell_value (); - delim_table = " "; // non-empty, to flag non-default delim - - // Check that all elements are strings, and find max length - for (int j = 0; j < delim_list.numel (); j++) - { - if (! delim_list(j).is_string ()) - invalid = true; - else - { - octave_idx_type len = delim_list(j).string_value () - .length (); - delim_len = std::max (static_cast (len), delim_len); - } - } - } - if (invalid) - error ("textscan: Delimiters must be either a string or " - "cell array of strings"); - } - else if (param == "commentstyle") - { - if (args(i+1).is_string ()) - { // check here for names like "C++", "C", "shell", ...? - comment_style = Cell (args(i+1)); - } - else if (args(i+1).is_cell ()) - { - comment_style = args(i+1).cell_value (); - int len = comment_style.numel (); - if ((len >= 1 && ! comment_style (0).is_string ()) - || (len >= 2 && ! comment_style (1).is_string ()) - || (len >= 3)) - error ("textscan: CommentStyle must be either a string or " - "cell array of one or two strings"); - } - else - error ("textscan: CommentStyle must be either a string" - " or cell array of one or two strings, not <%s>", - args(i+1).class_name ().c_str ()); - - // How far ahead do we need to look to detect an open comment - // and which character do we look for? - if (comment_style.numel () >= 1) - { - comment_len = comment_style (0).string_value ().size (); - comment_char = comment_style (0).string_value ()[0]; - } - } - else if (param == "treatasempty") - { - bool invalid = false; - if (args(i+1).is_string ()) - { - treat_as_empty = Cell (args(i+1)); - treat_as_empty_len = args(i+1).string_value ().size (); - } - else if (args(i+1).is_cell ()) - { - treat_as_empty = args(i+1).cell_value (); - for (int j = 0; j < treat_as_empty.numel (); j++) - if (! treat_as_empty (j).is_string ()) - invalid = true; - else - { - int k = treat_as_empty (j).string_value ().size (); - if (k > treat_as_empty_len) - treat_as_empty_len = k; - } - } - if (invalid) - error ("textscan: TreatAsEmpty must be either a string or " - "cell array of one or two strings"); - - // FIXME Ensure none is a prefix of a later one. Sort by length? - } - else if (param == "collectoutput") - { - if (args(i+1).is_numeric_type ()) - collect_output = args(i+1).bool_value (); - else - error ("textscan: CollectOutput must be logical or numeric"); - } - else if (param == "emptyvalue") - { - if (args(i+1).is_numeric_type ()) - empty_value = args(i+1).scalar_value (); - else - error ("textscan: EmptyValue must be numeric, not <%s>", - args(i+1).class_name ().c_str ()); - } - else if (param == "headerlines") - { - if (args(i+1).is_numeric_type ()) - header_lines = args(i+1).scalar_value (); - else - error ("textscan: HeaderLines must be numeric"); - } - else if (param == "bufsize") - { - if (args(i+1).is_numeric_type ()) - buffer_size = args(i+1).scalar_value (); - else - error ("textscan: BufSize must be numeric"); - } - else if (param == "multipledelimsasone") - { - if (args(i+1).is_numeric_type ()) - { - if (args(i+1).bool_value ()) - multiple_delims_as_one = true; - } - else - error ("textscan: MultipleDimsAsOne must be logical or numeric"); - } - else if (param == "returnonerror") - { - if (args(i+1).is_numeric_type ()) - return_on_error = args(i+1).bool_value (); - else if (args(i+1).is_string () - && args(i+1).string_value () == "continue") - return_on_error = 2; - else - error ("textscan: ReturnOnError must be logical or " - "numeric, or 'continue'"); - } - else if (param == "whitespace") - { - if (args(i+1).is_string ()) - whitespace = args(i+1).string_value (); - else - error ("textscan: Whitespace must be a character string"); - } - else if (param == "expchars") - { - if (args(i+1).is_string ()) - { - exp_chars = args(i+1).string_value (); - default_exp = false; - } - else - error ("textscan: ExpChars must be a character string"); - } - else if (param == "endofline") - { - bool valid = true; - if (args(i+1).is_string ()) - { - std::string s = args(i+1).string_value (); - if (args(i+1).is_sq_string ()) - s = do_string_escapes (s); - int l = s.length (); - if (l == 0) - eol1 = eol2 = -2; - else if (l == 1) - eol1 = eol2 = s.c_str ()[0]; - else if (l == 2) - { - eol1 = s.c_str ()[0]; - eol2 = s.c_str ()[1]; - if (eol1 != '\r' || eol2 != '\n') // Why limit it? - valid = false; - } - else - valid = false; - } - else - valid = false; - if (! valid) - error ("textscan: EndOfLine must be at most one character " - "or '\\r\\n'"); - } - else - error ("textscan: Unrecognised option '%s'", param.c_str ()); - } - - whitespace_table = std::string (256, '\0'); - for (unsigned int i = 0; i < whitespace.length (); i++) - whitespace_table[whitespace[i]] = '1'; - - // For Matlab compatibility, add 0x20 to whitespace, unless - // whitespace is explicitly ignored. - if (! (whitespace.empty () && fmt_list.has_string)) - whitespace_table[' '] = '1'; - - // Create look-up table of delimiters, based on 'delimiter' - delim_table = std::string (256, '\0'); - if (eol1 >= 0 && eol1 < 256) - delim_table[eol1] = '1'; // EOL is always a delimiter - if (eol2 >= 0 && eol2 < 256) - delim_table[eol2] = '1'; // EOL is always a delimiter - if (! have_delims) - for (unsigned int i = 0; i < 256; i++) - { - if (isspace (i)) - delim_table[i] = '1'; - } - else - for (unsigned int i = 0; i < delims.length (); i++) - delim_table[delims[i]] = '1'; -} - -// Skip comments, and characters specified by the "Whitespace" option. -// If EOLstop == true, don't skip end of line. - -int -textscan::skip_whitespace (delimited_stream& is, bool EOLstop) -{ - int c1 = std::istream::traits_type::eof (); - bool found_comment = false; - - do - { - found_comment = false; - int prev = -1; - while (is && (c1 = is.get_undelim ()) != std::istream::traits_type::eof () - && ( ( (c1 == eol1 || c1 == eol2) && ++lines && ! EOLstop) - || isspace (c1))) - { - if (prev == eol1 && eol1 != eol2 && c1 == eol2) - lines--; - prev = c1; - } - - if (c1 == comment_char) // see if we match an open comment - { - // save stream state in case we have to restore it - char *pos = is.tellg (); - std::ios::iostate state = is.rdstate (); - - std::string tmp (comment_len, '\0'); - char *look = is.read (&tmp[0], comment_len-1, pos); // already read first char - if (is && ! strncmp (comment_style(0).string_value ().substr (1) - .c_str (), look, comment_len-1)) - { - found_comment = true; - - std::string dummy; - if (comment_style.numel () == 1) // skip to end of line - { - std::string eol (3, '\0'); - eol[0] = eol1; - eol[1] = eol2; - - scan_caret (is, eol, dummy); - c1 = is.get_undelim (); - if (c1 == eol1 && eol1 != eol2 && is.peek_undelim () == eol2) - is.get_undelim (); - lines++; - } - else // matching pair - { - std::string end_c = comment_style(1).string_value (); - // last char of end-comment sequence - std::string last = end_c.substr (end_c.size () - 1); - std::string may_match (""); - do - { // find sequence ending with last char - scan_caret (is, last, dummy); - is.get_undelim (); // (read LAST itself) - - may_match = may_match + dummy + last; - if (may_match.length () > end_c.length ()) - { - size_t start = may_match.length () - end_c.length (); - may_match = may_match.substr (start); - } - } - while (may_match != end_c && is && ! is.eof ()); - } - } - else // wasn't really a comment; restore state - { - is.clear (state); - is.seekg (pos); - } - } - } - while (found_comment); - - if (c1 != std::istream::traits_type::eof ()) - is.putback (c1); - return c1; -} - -// See if the next few characters match one of the strings in target. -// For efficiency, MAX_LEN is the cached longest length of any target. -// Return -1 if none is found, or the index of the match. - -int -textscan::lookahead (delimited_stream& is, const Cell& targets, int max_len, - bool case_sensitive) const -{ - // target strings may be different sizes. - // Read ahead longest, put it all back, then re-read the string - // that matches. - - char *pos = is.tellg (); - - std::string tmp (max_len, '\0'); - char *look = is.read (&tmp[0], tmp.size (), pos); - - is.clear (); - is.seekg (pos); // reset to position before look-ahead - // FIXME pos may be corrupted by is.read - - int i; - int (*compare)(const char *, const char *, size_t); - compare = case_sensitive ? strncmp : strncasecmp; - - for (i = 0; i < targets.numel (); i++) - { - std::string s = targets (i).string_value (); - if (! (*compare) (s.c_str (), look, s.size ())) - { - is.read (&tmp[0], s.size (), pos); // read just the right amount - break; - } - } - - if (i == targets.numel ()) - i = -1; - - return i; -} - -// Skip delimiters -- multiple if MultipleDelimsAsOne specified. -int -textscan::skip_delim (delimited_stream& is) -{ - int c1 = skip_whitespace (is, true); // 'true': stop once EOL is read - if (delim_list.numel () == 0) // single character delimiter - { - if (is_delim (c1) || c1 == eol1 || c1 == eol2) - { - is.get (); - if (c1 == eol1 && is.peek_undelim () == eol2) - is.get_undelim (); // if \r\n, skip the \n too. - - if (multiple_delims_as_one) - { - int prev = -1; - // skip multiple delims. - // Increment lines for each end-of-line seen; for \r\n, decrement - while (is && ((c1 = is.get_undelim ()) - != std::istream::traits_type::eof ()) - && (((c1 == eol1 || c1 == eol2) && ++lines) - || isspace (c1) || is_delim (c1))) - { - if (prev == eol1 && eol1 != eol2 && c1 == eol2) - lines--; - prev = c1; - } - if (c1 != std::istream::traits_type::eof ()) - is.putback (c1); - } - } - } - else // multi-character delimiter - { - int first_match; - - if (c1 == eol1 || c1 == eol2 - || (-1 != (first_match = lookahead (is, delim_list, delim_len)))) - { - if (c1 == eol1) - { - is.get_undelim (); - if (is.peek_undelim () == eol2) - is.get_undelim (); - } - else if (c1 == eol2) - { - is.get_undelim (); - } - - if (multiple_delims_as_one) - { - int prev = -1; - // skip multiple delims. - // Increment lines for each end-of-line seen; for \r\n, decrement - while (is && ((c1 = skip_whitespace (is, true)) - != std::istream::traits_type::eof ()) - && (((c1 == eol1 || c1 == eol2) && ++lines) - || -1 != lookahead (is, delim_list, delim_len))) - { - if (prev == eol1 && eol1 != eol2 && c1 == eol2) - lines--; - prev = c1; - } - } - } - } - - return c1; -} - -// Read in as much of the input as coincides with the literal in the -// format string. Return "true" if the entire literal is matched, else -// false (and set failbit). - -bool -textscan::match_literal (delimited_stream& is, const textscan_format_elt& fmt) -{ - // "false" -> treat EOL as normal space - // since a delimiter at the start of a line is a mismatch, not empty field - skip_whitespace (is, false); - - for (unsigned int i = 0; i < fmt.width; i++) - { - int ch = is.get_undelim (); - if (ch != fmt.text[i]) - { - if (ch != std::istream::traits_type::eof ()) - is.putback (ch); - is.setstate (std::ios::failbit); - return false; - } - } - return true; -} - -DEFUN (textscan, args, , - "-*- texinfo -*-\n\ -@deftypefn {} {@var{C} =} textscan (@var{fid}, @var{format})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{fid}, @var{format}, @var{repeat}, @var{param}, @var{value}, @dots{})\n\ -@deftypefnx {} {@var{C} =} textscan (@var{str}, @dots{})\n\ -@deftypefnx {} {[@var{C}, @var{position}, @var{errmsg}] =} textscan (@dots{})\n\ -Read data from a text file or string.\n\ -\n\ -The string @var{str} or file associated with @var{fid} is read from and\n\ -parsed according to @var{format}.\n\ -The function is an extension of @code{strread} and @code{textread}.\n\ -Differences include: the ability to read from either a file or a string,\n\ -additional options, and additional format specifiers.\n\ -\n\ -The input is interpreted as a sequence of \"words\", delimiters\n\ -(such as whitespace) and literals.\n\ -The characters that form delimiters and whitespace are determined\n\ -by the options.\n\ -The format consists of format specifiers interspersed between literals.\n\ -In the format, whitespace forms a delimiter between consecutive literals,\n\ -but is otherwise ignored.\n\ -\n\ -The output @var{C} is a cell array whose second dimension is determined\n\ -by the number of format specifiers.\n\ -\n\ -The first word of the input is matched to the first specifier of the\n\ -format and placed in the first column of the output;\n\ -the second is matched to the second specifier and placed in the second column\n\ -and so forth.\n\ -If there are more words than specifiers, the process is repeated until all\n\ -words have been processed or the limit imposed by @var{repeat} has been met\n\ -(see below).\n\ -\n\ -The string @var{format} describes how the words in @var{str} should be\n\ -parsed.\n\ -As in @var{fscanf}, any (non-whitespace) text in the format that is\n\ -not one of these specifiers is considered a literal;\n\ -if there is a literal between two format specifiers then that same literal\n\ -must appear in the input stream between the matching words.\n\ -\n\ -The following specifiers are valid:\n\ -\n\ -@table @code\n\ -@item %f\n\ -@itemx %f64\n\ -@itemx %n\n\ -The word is parsed as a number and converted to double.\n\ -\n\ -@item %f32\n\ -The word is parsed as a number and converted to single (float).\n\ -\n\ -@item %d\n\ -@itemx %d8\n\ -@itemx %d16\n\ -@itemx %d32\n\ -@itemx %d64\n\ -The word is parsed as a number and converted to int8, int16, int32 or int64.\n\ -If not size is specified, int32 is used.\n\ -\n\ -@item %u\n\ -@itemx %u8\n\ -@itemx %u16\n\ -@itemx %u32\n\ -@itemx %u64\n\ -The word is parsed as a number and converted to uint8, uint16, uint32 or\n\ -uint64. If not size is specified, uint32 is used.\n\ -\n\ -@item %s\n\ -The word is parsed as a string, ending at the last character before\n\ -whitespace, an end-of-line or a delimiter specified in the options.\n\ -\n\ -@item %q\n\ -The word is parsed as a \"quoted string\".\n\ -If the first character of the string is a double quote (\") then the string\n\ -includes everything until a matching double quote, including whitespace,\n\ -delimiters and end of line characters.\n\ -If a pair of consecutive double quotes appears in the input,\n\ -it is replaced in the output by a single double quote.\n\ -That is, the input \"He said \"\"Hello\"\"\" would return the value\n\ -'He said \"Hello\"'.\n\ -\n\ -@item %c\n\ -The next character of the input is read.\n\ -This includes delimiters, whitespace and end of line characters.\n\ -\n\ -@item %[...]\n\ -@itemx %[^...]\n\ -In the first form, the word consists of the longest run consisting of only\n\ -characters between the brackets.\n\ -Ranges of characters can be specified by a hyphen;\n\ -for example, %[0-9a-zA-Z] matches all alphanumeric characters\n\ -(if the underlying character set is ASCII).\n\ -Since Matlab treats hyphens literally, this expansion only applies to\n\ -alphanumeric characters.\n\ -To include '-' in the set, it should appear first or last in the brackets;\n\ -to include ']', it should be the first character.\n\ -If the first character is '^' then the word consists of characters\n\ -NOT listed.\n\ -\n\ -@item %N...\n\ -For %s, %c %d, %f, %n, %u, an optional width can be specified as %Ns etc.\n\ -where N is an integer > 1.\n\ -For %c, this causes exactly the next N characters to be read instead of\n\ -a single character.\n\ -For the other specifiers, it is an upper bound on the\n\ -number of characters read;\n\ -normal delimiters can cause fewer characters to be read.\n\ -For complex numbers, this limit applies to the real and imaginary\n\ -components individually.\n\ -For %f and %n, format specifiers like %N.Mf are allowed, where M is an upper\n\ -bound on number of characters after the decimal point to be considered;\n\ -subsequent digits are skipped.\n\ -For example, the specifier %8.2f would read 12.345e6 as 1.234e7.\n\ -\n\ -@item %*...\n\ -The word specified by the remainder of the conversion specifier is skipped.\n\ -\n\ -@item literals\n\ -In addition the format may contain literal character strings;\n\ -these will be skipped during reading.\n\ -If the input string does not match this literal, the processing terminates,\n\ -unless \"ReturnOnError\" is set to \"continue\".\n\ -@end table\n\ -\n\ -Parsed words corresponding to the first specifier are returned in the first\n\ -output argument and likewise for the rest of the specifiers.\n\ -\n\ -By default, if there is only one input argument, @var{format} is @t{\"%f\"}.\n\ -This means that numbers are read from @var{str} into a single column vector.\n\ -If @var{format} is explicitly empty, \"\", then textscan will return data\n\ -in a number of columns matching the number of fields on the first data\n\ -line of the input.\n\ -Either of these is suitable only if @var{str} contains only numeric fields.\n\ -\n\ -For example, the string\n\ -\n\ -@example\n\ -@group\n\ -@var{str} = \"\\\n\ -Bunny Bugs 5.5\\n\\\n\ -Duck Daffy -7.5e-5\\n\\\n\ -Penguin Tux 6\"\n\ -@end group\n\ -@end example\n\ -\n\ -@noindent\n\ -can be read using\n\ -\n\ -@example\n\ -@var{a} = textscan (@var{str}, \"%s %s %f\");\n\ -@end example\n\ -\n\ -The optional numeric argument @var{repeat} can be used for limiting the\n\ -number of items read:\n\ -\n\ -@table @asis\n\ -@item -1\n\ -(default) read all of the string or file until the end.\n\ -\n\ -@item N\n\ -Read until the first of two conditions occurs: the format has been processed\n\ -N times, or N lines of the input have been processed.\n\ -Zero (0) is an acceptable value for @var{repeat}.\n\ -Currently, end-of-line characters inside %q, %c, and %[...]$ conversions\n\ -do not contribute to the line count.\n\ -This is incompatible with Matlab and may change in future.\n\ -@end table\n\ -\n\ -The behavior of @code{textscan} can be changed via property-value pairs.\n\ -The following properties are recognized:\n\ -\n\ -@table @asis\n\ -@item @qcode{\"BufSize\"}\n\ -This specifies the number of bytes to use for the internal buffer.\n\ -A modest speed improvement is obtained by setting this to a large value\n\ -when reading a large file, especially the input contains long strings.\n\ -The default is 4096, or a value dependent on @var{n} is that is specified.\n\ -\n\ -@item @qcode{\"CollectOutput\"}\n\ -A value of 1 or true instructs textscan to concatenate consecutive columns\n\ -of the same class in the output cell array.\n\ -A value of 0 or false (default) leaves output in distinct columns.\n\ -\n\ -@item @qcode{\"CommentStyle\"}\n\ -Parts of @var{str} are considered comments and will be skipped.\n\ -@var{value} is the comment style and can be either\n\ -(1) One string, or 1x1 cell string, to skip everything to the right of it;\n\ -(2) A cell array of two strings, to skip everything between the first and\n\ -second strings.\n\ -Comments are only parsed where whitespace is accepted, and do not act as\n\ -delimiters.\n\ -\n\ -@item @qcode{\"Delimiter\"}\n\ -If @var{value} is a string, any character in @var{value} will be used to\n\ -split @var{str} into words.\n\ -If @var{value} is a cell array of strings,\n\ -any string in the array will be used to split @var{str} into words.\n\ -(default value = any whitespace.)\n\ -\n\ -@item @qcode{\"EmptyValue\"}\n\ -Value to return for empty numeric values in non-whitespace delimited data.\n\ -The default is NaN@.\n\ -When the data type does not support NaN (int32 for example),\n\ -then default is zero.\n\ -\n\ -@item @qcode{\"EndOfLine\"}\n\ -@var{value} can be either a emtpy or one character specifying the\n\ -end of line character, or the pair\n\ -@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} (CRLF).\n\ -In the latter case, any of\n\ -@qcode{\"@xbackslashchar{}r\"}, @qcode{\"@xbackslashchar{}n\"} or\n\ -@qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is counted as a (single)\n\ -newline.\n\ -If no value is given, @qcode{\"@xbackslashchar{}r@xbackslashchar{}n\"} is\n\ -used.\n\ -@c If set to \"\" (empty string) EOLs are ignored as delimiters and added\n\ -@c to whitespace.\n\ -\n\ -@c When reading from a character string, optional input argument @var{n}\n\ -@c specifies the number of times @var{format} should be used (i.e., to limit\n\ -@c the amount of data read).\n\ -@c When reading from file, @var{n} specifies the number of data lines to read;\n\ -@c in this sense it differs slightly from the format repeat count in strread.\n\ -\n\ -@item @qcode{\"HeaderLines\"}\n\ -The first @var{value} number of lines of @var{fid} are skipped.\n\ -Note that this does not refer to the first non-comment lines, but the first\n\ -lines of any type.\n\ -\n\ -@item @qcode{\"MultipleDelimsAsOne\"}\n\ -If @var{value} is non-zero,\n\ -treat a series of consecutive delimiters, without whitespace in between,\n\ -as a single delimiter.\n\ -Consecutive delimiter series need not be vertically @qcode{\"aligned\"}.\n\ -Without this option, a single delimiter before the end of the line does\n\ -not cause the line to be considered to end with an empty value,\n\ -but a single delimiter at the start of a line causes the line\n\ -to be considered to start with an empty value.\n\ -\n\ -@item @qcode{\"TreatAsEmpty\"}\n\ -Treat single occurrences (surrounded by delimiters or whitespace) of the\n\ -string(s) in @var{value} as missing values.\n\ -\n\ -@item @qcode{\"ReturnOnError\"}\n\ -If set to numerical 1 or true, return normally as soon as an error\n\ -is encountered, such as trying to read a string using @qcode{%f}.\n\ -If set to 0 or false, return an error and no data.\n\ -If set to \"continue\" (default), textscan attempts to continue reading\n\ -beyond the location; however, this may cause the parsing to get out of sync.\n\ -\n\ -@item @qcode{\"Whitespace\"}\n\ -Any character in @var{value} will be interpreted as whitespace and trimmed;\n\ -The default value for whitespace is\n\ -@c Note: the next line specifically has a newline which generates a space\n\ -@c in the output of qcode, but keeps the next line < 80 characters.\n\ -@qcode{\"\n\ -@xbackslashchar{}b@xbackslashchar{}r@xbackslashchar{}n@xbackslashchar{}t\"}\n\ -(note the space). Unless whitespace is set to @qcode{\"\"} (empty) AND at\n\ -least one @qcode{\"%s\"} format conversion specifier is supplied, a space is\n\ -always part of whitespace.\n\ -\n\ -@end table\n\ -\n\ -When the number of words in @var{str} or @var{fid} doesn't match an exact\n\ -multiple of the number of format conversion specifiers,\n\ -textscan's behavior depends on\n\ -whether the last character of the string or file is\n\ -an end-of-line as specified by the EndOfLine option:\n\ -\n\ -@table @asis\n\ -@item last character = end-of-line\n\ -Data columns are padded with empty fields, NaN or 0 (for integer fields)\n\ -so that all columns have equal length\n\ -\n\ -@item last character is not end-of-line\n\ -Data columns are not padded; textscan returns columns of unequal length\n\ -@end table\n\ -\n\ -\n\ -The second output, @var{position}, provides the position, in characters\n\ -from the beginning of the file or string, at which the processing stopped.\n\ -\n\ -@seealso{dlmread, fscanf, load, strread, textread}\n\ -@end deftypefn") -{ - static std::string who = "textscan"; - - octave_value_list retval; - - if (args.length () < 1) - print_usage (); - - octave_value_list tmp_args = args.splice (0, 1); - - octave_idx_type count = 0; - - textscan tscanner; - - if (args(0).is_string ()) - { - std::string data = args(0).string_value (); - - octave_stream os = octave_istrstream::create (data); - - if (! os.is_valid ()) - error ("%s: unable to create temporary input buffer", who.c_str ()); - - std::istream *isp = os.input_stream (); - - octave_value result = tscanner.scan (*isp, tmp_args, count); - - std::string errmsg = os.error (); - - return ovl (result, count, errmsg); - } - else - { - octave_stream os = octave_stream_list::lookup (args(0), who); - - std::istream *isp = os.input_stream (); - - if (! isp) - error ("internal error: textscan called with invalid istream"); - - octave_value result = tscanner.scan (*isp, tmp_args, count); - - std::string errmsg = os.error (); - - return ovl (result, count, errmsg); - } - - return retval; -} - -/* -%!test -%! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12"; -%! fmtstr = "%f %d %f %s"; -%! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf); -%! assert (isequal (c{1}, [1;5])); -%! assert (length (c{1}), 2); -%! assert (iscellstr (c{4})); -%! assert (isequal (c{3}, [3; -Inf])); - -%!test -%! b = [10:10:100]; -%! b = [b; 8*b/5]; -%! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b); -%! fmt = "%f miles/hr = %f kilometers/hr"; -%! c = textscan (str, fmt); -%! assert (c{1}, b(1,:)', 1e-5); -%! assert (c{2}, b(2,:)', 1e-5); - -%!test -%! str = "13, -, NA, str1, -25\r\n// Middle line\r\n36, na, 05, str3, 6"; -%! a = textscan (str, "%d %n %f %s %n", "delimiter", ",", -%! "treatAsEmpty", {"NA", "na", "-"},"commentStyle", "//"); -%! assert (a{1}, int32 ([13; 36])); -%! assert (a{2}, [NaN; NaN]); -%! assert (a{3}, [NaN; 5]); -%! assert (a{4}, {"str1"; "str3"}); -%! assert (a{5}, [-25; 6]); - -%!test -%! str = "Km:10 = hhhBjjj miles16hour\r\n"; -%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"]; -%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"]; -%! str = [str "Km:25 = hhhZ\r\n"]; -%! fmt = "Km:%d = hhh%1sjjj miles%dhour"; -%! a = textscan (str, fmt, "delimiter", " "); -%! assert (a{1}', int32 ([10 15 2 25])); -%! assert (a{2}', {'B' 'J' 'R' 'Z'}); -%! assert (a{3}', int32 ([16 241 3 0])); - -## Test with default endofline parameter -%!test -%! c = textscan ("L1\nL2", "%s"); -%! assert (c{:}, {"L1"; "L2"}); - -## Test with endofline parameter set to "" (empty) - newline should be in word -%!test -%! c = textscan ("L1\nL2", "%s", "endofline", ""); -%! assert (int8 ([c{:}{:}]), int8 ([ 76, 49, 10, 76, 50 ])); - -### Matlab fails this test. A literal after a conversion is not a delimiter -#%!test -#%! ## No delimiters at all besides EOL. Skip fields, even empty fields -#%! str = "Text1Text2Text\nTextText4Text\nText57Text"; -#%! c = textscan (str, "Text%*dText%dText"); -#%! assert (c{1}, int32 ([2; 4; 0])); - -## CollectOutput test -%!test -%! b = [10:10:100]; -%! b = [b; 8*b/5; 8*b*1000/5]; -%! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b); -%! fmt = "%f miles%s %s %f (%f) kilometers %*s"; -%! c = textscan (str, fmt, "collectoutput", 1); -%! assert (size (c{3}), [10, 2]); -%! assert (size (c{2}), [10, 2]); - -## CollectOutput test with uneven column length files -%!test -%! b = [10:10:100]; -%! b = [b; 8*b/5; 8*b*1000/5]; -%! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b); -%! str = [str "110 miles/hr"]; -%! fmt = "%f miles%s %s %f (%f) kilometers %*s"; -%! c = textscan (str, fmt, "collectoutput", 1); -%! assert (size (c{1}), [11, 1]); -%! assert (size (c{3}), [11, 2]); -%! assert (size (c{2}), [11, 2]); -%! assert (c{3}(end), NaN); -%! assert (c{2}{11, 1}, "/hr"); -%! assert (isempty (c{2}{11, 2}), true); - -## Double quoted string -%!test -%! str = 'First "the second called ""the middle""" third'; -%! fmt = "%q"; -%! c = textscan (str, fmt); -%! assert (c{1}, {"First"; 'the second called "the middle"'; "third"}); - -## Arbitrary character -%!test -%! c = textscan ("a first, \n second, third", "%s %c %11c", 'delimiter', ' ,'); -%! assert (c{1}, {"a"; "ond"}); -%! assert (c{2}, {"f"; "t"}); -%! assert (c{3}, {"irst, \n sec"; "hird"}); - -## Field width and non-standard delimiters -%!test -%! str = "12;34;123456789;7"; -%! c = textscan (str, "%4d %4d", "delimiter", ";", "collectOutput", 1); -%! assert (c, {[12 34; 1234 5678; 9 7]}); - -## Field width and non-standard delimiters -%!test -%! str = "12;34;123456789;7"; -%! c = textscan (str, "%4f %f", "delimiter", ";", "collectOutput", 1); -%! assert (c, {[12 34; 1234 56789; 7 NaN]}); - -## Ignore trailing delimiter, but use leading one -%!test -%! str = "12.234e+2,34, \n12345.789-9876j,78\n,10|3"; -%! c = textscan (str, "%10.2f %f", "delimiter", ",", "collectOutput", 1, -%! "expChars", "e|"); -%! assert (c, {[1223 34; 12345.79-9876j 78; NaN 10000]}, 1e-6); - -%!test -%! ## Multi-character delimiter -%! str = "99end2 space88gap 4564"; -%! c = textscan (str, "%d %s", "delimiter", {"end", "gap", "space"}); -%! assert (c{1}, int32 ([99; 88])); -%! assert (c{2}, {"2 "; "4564"}); - -### Delimiters as part of literals, and following literals -#%!test -#%! str = "12 R&D & 7"; -#%! c = textscan (str, "%f R&D %f", "delimiter", "&", "collectOutput", 1, "EmptyValue", -99); -#%! assert (c, {[12 -99; 7 -99]}); -# -### Delimiters as part of literals, and before literals -#%!test -#%! str = "12 & R&D 7"; -#%! c = textscan (str, "%f R&D %f", "delimiter", "&", "collectOutput", 1); -#%! assert (c, {[12 7]}); - -%!test -%! ## Check number of lines read, not number of passes through format string -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid, "1\n2\n3\n4\n5\n6"); -%! fseek (fid, 0, "bof"); -%! c = textscan (fid, "%f %f", 2); -%! E = feof (fid); -%! fclose (fid); -%! unlink (f); -%! assert (c, {1, 2}); -%! assert (!E); - -%!test -%! ## Check number of lines read, not number of passes through format string -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid, "1\r\n2\r3\n4\r\n5\n6"); -%! fseek (fid, 0, "bof"); -%! c = textscan (fid, "%f %f", 4); -%! fclose (fid); -%! unlink (f); -%! assert (c, {[1;3], [2;4]}) - -%!test -%! ## Check number of lines read, with multiple delimiters -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid, "1-\r\n-2\r3-\n-4\r\n5\n6"); -%! fseek (fid, 0, "bof"); -%! c = textscan (fid, "%f %f", 4, "delimiter", "-", "multipleDelimsAsOne", 1); -%! fclose (fid); -%! unlink (f); -%! assert (c, {[1;3], [2;4]}) - -%!test -%! ## Check ReturnOnError -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! str = "1 2 3\n4 s 6"; -%! fprintf (fid, str); -%! fseek (fid, 0, "bof"); -%! c = textscan (fid, "%f %f %f"); -%! fseek (fid, 0, "bof"); -%! d = textscan (fid, "%f %f %f", "ReturnOnError", 1); -%! fseek (fid, 0, "bof"); -%! fclose (fid); -%! unlink (f); -%! u = textscan (str, "%f %f %f"); -%! v = textscan (str, "%f %f %f", "ReturnOnError", 1); -%! assert (c, {[1;4], [2;NaN], [3;6]}) -%! assert (d, {[1;4], [2], [3]}) -%! assert (u, {[1;4], [2;NaN], [3;6]}) -%! assert (v, {[1;4], [2], [3]}) - -%!test -%! ## Check ReturnOnError -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! str = "1 2 3\n4 s 6\n"; -%! fprintf (fid, str); -%! fseek (fid, 0, "bof"); -%! c = textscan (fid, "%f %f %f", "ReturnOnError", 1); -%! fseek (fid, 0, "bof"); -%! fclose (fid); -%! unlink (f); -%! u = textscan (str, "%f %f %f", "ReturnOnError", 1); -%! assert (c, {[1;4], 2, 3}) -%! assert (u, {[1;4], 2, 3}) - -%!error textscan ("1 2 3\n4 s 6", "%f %f %f", "ReturnOnError", 0); - -%!test -%! ## Check ReturnOnError -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid, "1 s 3\n4 5 6"); -%! fseek (fid, 0, "bof"); -%! c = textscan (fid, ""); -%! fseek (fid, 0, "bof"); -%! d = textscan (fid, "", "ReturnOnError", 1); -%! fseek (fid, 0, "bof"); -%! fclose (fid); -%! unlink (f); -%! assert (c, {[1;4], [NaN;5], [3;6]}) -%! assert (d, {1}) - -%!test -%! ## Check ReturnOnError with empty fields -%! c = textscan ("1,,3\n4,5,6", "", "Delimiter", ",", "ReturnOnError", 1); -%! assert (c, {[1;4], [NaN;5], [3;6]}) - -%!test -%! ## Check ReturnOnError with empty fields -%! c = textscan ("1,,3\n4,5,6", "%f %f %f", "Delimiter", ",", "ReturnOnError", 1); -%! assert (c, {[1;4], [NaN;5], [3;6]}) - -%!test -%! ## Check ReturnOnError in first column -%! c = textscan ("1 2 3\ns 5 6", "", "ReturnOnError", 1); -%! assert (c, {1, 2, 3}) - -## Test input validation -%!error textscan () -%!error textscan (single (40)) -%!error textscan ({40}) -%!error textscan ("Hello World", 2) -#%!error [C, pos] = textscan ("Hello World") -%!error textscan ("Hello World", '%s', 'EndOfLine', 3) -%!error <'%z' is not a valid format specifier> textscan ("1.0", "%z"); -%!error textscan ("1.0", "foo"); - -## Test incomplete first data line -%! R = textscan (['Empty1' char(10)], 'Empty%d %f'); -%! assert (R{1}, int32 (1)); -%! assert (isempty (R{2}), true); - -## bug #37023 -%!test -%! data = textscan (" 1. 1 \n 2 3\n", '%f %f'); -%! assert (data{1}, [1; 2], 1e-15); -%! assert (data{2}, [1; 3], 1e-15); - -## Whitespace test (bug #37333) using delimiter ";" -%!test -%! tc = []; -%! tc{1, 1} = "C:/code;"; -%! tc{1, end+1} = "C:/code/meas;"; -%! tc{1, end+1} = " C:/code/sim;"; -%! tc{1, end+1} = "C:/code/utils;"; -%! string = [tc{:}]; -%! c = textscan (string, "%s", "delimiter", ";"); -%! for k = 1:max (numel (c{1}), numel (tc)) -%! lh = c{1}{k}; -%! rh = tc{k}; -%! rh(rh == ";") = ""; -%! rh = strtrim (rh); -%! assert (strcmp (lh, rh)); -%! end - -## Whitespace test (bug #37333), adding multipleDelimsAsOne true arg -%!test -%! tc = []; -%! tc{1, 1} = "C:/code;"; -%! tc{1, end+1} = " C:/code/meas;"; -%! tc{1, end+1} = "C:/code/sim;;"; -%! tc{1, end+1} = "C:/code/utils;"; -%! string = [tc{:}]; -%! c = textscan (string, "%s", "delimiter", ";", "multipleDelimsAsOne", 1); -%! for k = 1:max (numel (c{1}), numel (tc)) -%! lh = c{1}{k}; -%! rh = tc{k}; -%! rh(rh == ";") = ""; -%! rh = strtrim (rh); -%! assert (strcmp (lh, rh)); -%! end - -## Whitespace test (bug #37333), adding multipleDelimsAsOne false arg -%!test -%! tc = []; -%! tc{1, 1} = "C:/code;"; -%! tc{1, end+1} = " C:/code/meas;"; -%! tc{1, end+1} = "C:/code/sim;;"; -%! tc{1, end+1} = ""; -%! tc{1, end+1} = "C:/code/utils;"; -%! string = [tc{:}]; -%! c = textscan (string, "%s", "delimiter", ";", "multipleDelimsAsOne", 0); -%! for k = 1:max (numel (c{1}), numel (tc)) -%! lh = c{1}{k}; -%! rh = tc{k}; -%! rh(rh == ";") = ""; -%! rh = strtrim (rh); -%! assert (strcmp (lh, rh)); -%! end - -## Whitespace test (bug #37333) whitespace "" arg -%!test -%! tc = []; -%! tc{1, 1} = "C:/code;"; -%! tc{1, end+1} = " C:/code/meas;"; -%! tc{1, end+1} = "C:/code/sim;"; -%! tc{1, end+1} = "C:/code/utils;"; -%! string = [tc{:}]; -%! c = textscan (string, "%s", "delimiter", ";", "whitespace", ""); -%! for k = 1:max (numel (c{1}), numel (tc)) -%! lh = c{1}{k}; -%! rh = tc{k}; -%! rh(rh == ";") = ""; -%! assert (strcmp (lh, rh)); -%! end - -## Whitespace test (bug #37333), whitespace " " arg -%!test -%! tc = []; -%! tc{1, 1} = "C:/code;"; -%! tc{1, end+1} = " C:/code/meas;"; -%! tc{1, end+1} = "C:/code/sim;"; -%! tc{1, end+1} = "C:/code/utils;"; -%! string = [tc{:}]; -%! c = textscan (string, "%s", "delimiter", ";", "whitespace", " "); -%! for k = 1:max (numel (c{1}), numel (tc)) -%! lh = c{1}{k}; -%! rh = tc{k}; -%! rh(rh == ";") = ""; -%! rh = strtrim (rh); -%! assert (strcmp (lh, rh)); -%! end - -## Tests reading with empty format, should return proper nr of columns -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid, " 1 2 3 4\n5 6 7 8"); -%! fseek (fid, 0, "bof"); -%! A = textscan (fid, ""); -%! E = feof (fid); -%! fclose (fid); -%! unlink (f); -%! assert (A{1}, [1 ; 5], 1e-6); -%! assert (A{2}, [2 ; 6], 1e-6); -%! assert (A{3}, [3 ; 7], 1e-6); -%! assert (A{4}, [4 ; 8], 1e-6); -%! assert (E); - -## Tests reading with empty format; empty fields & incomplete lower row -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid, " ,2,,4\n5,6"); -%! fseek (fid, 0, "bof"); -%! A = textscan (fid, "", "delimiter", ",", "EmptyValue", 999, "CollectOutput" , 1); -%! fclose (fid); -%! unlink (f); -%! assert (A{1}, [999, 2, 999, 4; 5, 6, 999, 999], 1e-6); - -## Error message tests - -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! msg1 = "textscan: 1 parameters given, but only 0 values"; -%! try -%! A = textscan (fid, "", "headerlines"); -%! end_try_catch; -%! assert (!feof (fid)); -%! fclose (fid); -%! unlink (f); -%! assert (msg1, lasterr); - -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! msg1 = "textscan: HeaderLines must be numeric"; -%! try -%! A = textscan (fid, "", "headerlines", "hh"); -%! end_try_catch; -%! fclose (fid); -%! unlink (f); -%! assert (msg1, lasterr); - -%!test -%! ## Skip headerlines -%! A = textscan ("field 1 field2\n 1 2\n3 4", "", "headerlines", 1, "collectOutput", 1); -%! assert (A, {[1 2; 3 4]}); - -%!test -%! ## Skip headerlines with non-default EOL -%! A = textscan ("field 1 field2\r 1 2\r3 4", "", "headerlines", 2, "collectOutput", 1, "EndOfLine", '\r'); -%! assert (A, {[3 4]}); - -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid,"some_string"); -%! fseek (fid, 0, "bof"); -%! msg1 = "textscan: EndOfLine must be at most one character or '\\r\\n'"; -%! try -%! A = textscan (fid, "%f", "EndOfLine", "\n\r"); -%! end_try_catch; -%! fclose (fid); -%! unlink (f); -%! assert (msg1, lasterr); - -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid,"some_string"); -%! fseek (fid, 0, "bof"); -%! msg1 = "textscan: EndOfLine must be at most one character or '\\r\\n'"; -%! try -%! A = textscan (fid, "%f", "EndOfLine", 33); -%! end_try_catch; -%! fclose (fid); -%! unlink (f); -%! assert (msg1, lasterr); - -## Bug #41824 -%!test -%! assert (textscan ("123", "", "whitespace", " "){:}, 123); - -## Bug #42343-1, just test supplied emptyvalue -%!test -%! assert (textscan (",NaN", "", "delimiter", "," ,"emptyValue" ,Inf), {Inf, NaN}); - -## Bug #42343-2, test padding with supplied emptyvalue -%!test -%! a = textscan (",1,,4\nInf, ,NaN\n", "", "delimiter", ",", "emptyvalue", -10); -%! assert (cell2mat (a), [-10, 1, -10, 4; Inf, -10, NaN, -10]); - -## Bug #42528 -%!test -%! assert (textscan ("1i", ""){1}, 0+1i); -%! assert (cell2mat (textscan ("3, 2-4i, NaN\n -i, 1, 23.4+2.2i\n 1+1 1+1j", "", "delimiter", ",")), [3+0i, 2-4i, NaN+0i; 0-i, 1+0i, 23.4+2.2i; 1 1 1+1i]); - -%!test -%! ## TreatAsEmpty -%! C = textscan ("1,2,3,NN,5,6\n", "%d%d%d%f", "delimiter", ",", "TreatAsEmpty", "NN"); -%! assert (C{3}(1), int32 (3)); -%! assert (C{4}(1), NaN); - -## MultipleDelimsAsOne -%!test -%! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35\n"; -%! C = textscan (str, "%f %f %f %f", "delimiter", ",", "multipledelimsasone", 1, "endofline", "\n"); -%! assert (C{1}', [11, 21, 33]); -%! assert (C{2}', [12, 23, 34]); -%! assert (C{3}', [13, 24, 35]); -%! assert (C{4}', [15, 25, NaN]); - -## Bug #44750 -%!test -%! assert (textscan ("/home/foo/", "%s", "delimiter", "/", "MultipleDelimsAsOne", 1){1}, ... -%! {"home"; "foo"}); - -### Allow cuddling %sliteral but warn it is ambiguous -#%!test -#%! C = textscan ("abcxyz51\nxyz83\n##xyz101", "%s xyz %d"); -#%! assert (C{1}([1 3]), {"abc"; "##"}); -#%! assert (isempty (C{1}{2}), true); -#%! assert (C{2}, int32 ([51; 83; 101])); -### Literals are not delimiters. - -## Test for false positives in check for non-supported format specifiers -%!test -%! assert (textscan ("Total: 32.5 % (of cm values)", "Total: %f %% (of cm values)"){1}, 32.5, 1e-5); - -## Test various forms of string format specifiers (bug #45712) -%!test -%! str = "14 :1 z:2 z:3 z:5 z:11"; -%! C = textscan (str, "%f %s %*s %3s %*3s %f", "delimiter", ":"); -%! assert (C, {14, {"1 z"}, {"3 z"}, 11}); - -%% Bit width, fixed width conv. specifiers -%!test -%! str2 = "123456789012345 "; -%! str2 = [str2 str2 str2 str2 str2 str2 str2 str2]; -%! str2 = [str2 "123456789.01234 1234567890.1234 12345.678901234 12345.678901234"]; -%! pttrn = "%3u8%*s %5u16%*s %10u32%*s %15u64 %3d8%*s %5d16%*s %10d32%*s %15d64 %9f32%*s %14f64%*s %10.2f32%*s %12.2f64%*s"; -%! C = textscan (str2, pttrn, "delimiter", " "); -%! assert (C{1}, uint8 (123)); -%! assert (C{2}, uint16 (12345)); -%! assert (C{3}, uint32 (1234567890)); -%! assert (C{4}, uint64 (123456789012345)); -%! assert (C{5}, int8 (123)); -%! assert (C{6}, int16 (12345)); -%! assert (C{7}, int32 (1234567890)); -%! assert (C{8}, int64 (123456789012345)); -%! assert (C{9}, single (123456789), 1e-12); -%! assert (C{10}, double (1234567890.123), 1e-15); -%! assert (C{11}, single (12345.68), 1e-5); -%! assert (C{12}, double (12345.68), 1e-11); - -%% Bit width, fixed width conv. specifiers -- check the right amount is left -%!test -%! str2 = "123456789012345 "; -%! str2 = [str2 str2 "123456789.01234"]; -%! pttrn = "%3u8 %5u16 %10u32 %3d8 %5d16 %10d32 %9f32 %9f"; -%! C = textscan (str2, pttrn, "delimiter", " "); -%! assert (C{1}, uint8 (123)); -%! assert (C{2}, uint16 (45678)); -%! assert (C{3}, uint32 (9012345)); -%! assert (C{4}, int8 (123)); -%! assert (C{5}, int16 (45678)); -%! assert (C{6}, int32 (9012345)); -%! assert (C{7}, single (123456789), 1e-12); -%! assert (C{8}, double (0.01234), 1e-12); - -%!test -%! C = textscan ("123.123", "%2f %3f %3f"); -%! assert (C{1}, 12); -%! assert (C{2}, 3.1, 1e-11); -%! assert (C{3}, 23); - -%!test -%! C = textscan ("123.123", "%3f %3f %3f"); -%! assert (C{1}, 123); -%! assert (C{2}, 0.12, 1e-11); -%! assert (C{3}, 3); - -%!test -%! C = textscan ("123.123", "%4f %3f"); -%! assert (C{1}, 123); -%! assert (C{2}, 123); - -%% field width interrupts exponent. (Matlab incorrectly gives [12, 2e12]) -%!test -%! assert (textscan ("12e12", "%4f"), {[120; 2]}); -%! assert (textscan ("12e+12", "%5f"), {[120; 2]}); -%! assert (textscan ("125e-12","%6f"), {[12.5; 2]}); - -%% %[] tests -%% Plain [..] and *[..] -%!test -%! ar = "abcdefguvwxAny\nacegxyzTrailing\nJunk"; -%! C = textscan (ar, "%[abcdefg] %*[uvwxyz] %s"); -%! assert (C{1}, {"abcdefg"; "aceg"; ""}); -%! assert (C{2}, {"Any"; "Trailing"; "Junk"}); - -%!test -%! assert (textscan ("A2 B2 C3", "%*[ABC]%d", 3), {int32([2; 2; 3])}); - -%% [^..] and *[^..] -%!test -%! br = "abcdefguvwx1Any\nacegxyz2Trailing\n3Junk"; -%! C = textscan (br, "%[abcdefg] %*[^0123456789] %s"); -%! assert (C{1}, {"abcdefg"; "aceg"; ""}); -%! assert (C{2}, {"1Any"; "2Trailing"; "3Junk"}); - -%% [..] and [^..] containing delimiters -%!test -%! cr = "ab cd efguv wx1Any\na ce gx yz2Trailing\n 3Junk"; -%! C = textscan (cr, "%[ abcdefg] %*[^0123456789] %s", "delimiter", " \n", "whitespace", ""); -%! assert (C{1}, {"ab cd efg"; "a ce g"; " "}); -%! assert (C{2}, {"1Any"; "2Trailing"; "3Junk"}); - -%% Bug #36464 -%!test -%! assert (textscan ('1 2 3 4 5 6', "%*n%n%*[^\n]"){1}, 2); - -%% test %[]] and %[^]] -%!test -%! assert (textscan ('345]', "%*[123456]%[]]"){1}{1}, "]"); -%! assert (textscan ('345]', "%*[^]]%s"){1}{1}, "]"); - -%% Test that "-i" checks the next two characters -%!test -%! C = textscan ("-i -in -inf -infinity", "%f %f%s %f %f %s"); -%! assert (C, {-i, -i, {"n"}, -Inf, -Inf, {"inity"}}); - -%% Again for "+i", this time with custom parser -%!test -%! C = textscan ("+i +in +inf +infinity", "%f %f%s %f %f %s", "ExpChars", "eE"); -%! assert (C, {i, i, {"n"}, Inf, Inf, {"inity"}}); - -%% Check single quoted format interprets control sequences -%!test -%! C = textscan ("1 2\t3 4", '%f %[^\t] %f %f'); -%! assert (C, {1, {"2"}, 3, 4}); - -%% Check overflow and underflow of integer types -%!test -%! a = "-1e90 "; -%! b = "1e90 "; -%! fmt = "%d8 %d16 %d32 %d64 %u8 %u16 %u32 %u64 "; -%! C = textscan ([a a a a a a a a b b b b b b b b], fmt); -%! assert (C{1}, int8 ([-128; 127])); -%! assert (C{2}, int16 ([-32768; 32767])); -%! assert (C{3}, int32 ([-2147483648; 2147483647])); -%! assert (C{4}, int64 ([-9223372036854775808; 9223372036854775807])); -%! assert (C{5}, uint8 ([0; 255])); -%! assert (C{6}, uint16 ([0; 65535])); -%! assert (C{7}, uint32 ([0; 4294967295])); -%! assert (C{8}, uint64 ([0; 18446744073709551615])); - -%% Tests from Matlab (does The MathWorks have any copyright over the input?) -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid,"09/12/2005 Level1 12.34 45 1.23e10 inf Nan Yes 5.1+3i\n"); -%! fprintf (fid,"10/12/2005 Level2 23.54 60 9e19 -inf 0.001 No 2.2-.5i\n"); -%! fprintf (fid,"11/12/2005 Level3 34.90 12 2e5 10 100 No 3.1+.1i\n"); -%! fseek (fid, 0, "bof"); -%! C = textscan (fid,"%s %s %f32 %d8 %u %f %f %s %f"); -%! %assert (C{1}, {"09/12/2005";"10/12/2005";"11/12/2005"}); -%! assert (C{2}, {"Level1";"Level2";"Level3"}); -%! assert (C{3}, [single(12.34);single(23.54);single(34.90)]); -%! assert (C{4}, [int8(45);int8(60);int8(12)]); -%! assert (C{5}, [uint32(4294967295);uint32(4294967295);uint32(200000)]); -%! assert (C{6}, [inf;-inf;10]); -%! assert (C{7}, [NaN;0.001;100], eps); -%! assert (C{8}, {"Yes";"No";"No"}); -%! assert (C{9}, [5.1+3i;2.2-0.5i;3.1+0.1i]); -%! fseek (fid, 0, "bof"); -%! C = textscan (fid,"%s Level%d %f32 %d8 %u %f %f %s %f"); -%! assert (C{2}, [int32(1);int32(2);int32(3)]); -%! assert (C{3}, [single(12.34);single(23.54);single(34.90)]); -%! fseek (fid, 0, "bof"); -%! C = textscan (fid,'%s %*[^\n]'); -%! fclose (fid); -%! unlink (f); -%! assert (C, {{"09/12/2005";"10/12/2005";"11/12/2005"}}); - -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid,"1, 2, 3, 4, , 6\n"); -%! fprintf (fid,"7, 8, 9, , 11, 12\n"); -%! fseek (fid, 0, "bof"); -%! C = textscan (fid,"%f %f %f %f %u8 %f", "Delimiter",",","EmptyValue",-Inf); -%! fclose (fid); -%! unlink (f); -%! assert (C{4}, [4; -Inf]); -%! assert (C{5}, uint8 ([0; 11])); - -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! fprintf (fid,"abc, 2, NA, 3, 4\n"); -%! fprintf (fid,"// Comment Here\n"); -%! fprintf (fid,"def, na, 5, 6, 7\n"); -%! fseek (fid, 0, "bof"); -%! C = textscan (fid,"%s %n %n %n %n","Delimiter",",","TreatAsEmpty",{"NA","na"},"CommentStyle","//"); -%! fclose (fid); -%! unlink (f); -%! assert (C{1}, {"abc";"def"}); -%! assert (C{2}, [2; NaN]); -%! assert (C{3}, [NaN; 5]); -%! assert (C{4}, [3; 6]); -%! assert (C{5}, [4; 7]); - -%!test -%!## Test start of comment as string -%! c = textscan ("1 / 2 // 3", "%n %s %u8", "CommentStyle", {"//"}); -%! assert (c, {1, "/", 2}); -*/ - -// These tests have end-comment sequences, so can't just be in a comment -#if 0 -%!test -%!## Test unfinished comment -%! c = textscan ("1 2 /* half comment", "%n %u8", "CommentStyle", {"/*", "*/"}); -%! assert (c, {1, 2}); - -## Test reading from a real file -%!test -%! f = tempname (); -%! fid = fopen (f, "w+"); -%! d = rand (1, 4); -%! fprintf (fid, " %f %f /* comment */ %f %f ", d); -%! fseek (fid, 0, "bof"); -%! A = textscan (fid, "%f %f", "CommentStyle", {"/*", "*/"}); -%! E = feof (fid); -%! fclose (fid); -%! unlink (f); -%! assert (A{1}, [d(1); d(3)], 1e-6); -%! assert (A{2}, [d(2); d(4)], 1e-6); -%! assert (E); -#endif diff -r 24aab5c342bf -r 7a19c5678f91 libinterp/corefcn/textscan.h --- a/libinterp/corefcn/textscan.h Sat Mar 19 09:20:05 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,196 +0,0 @@ -/* - -Copyright (C) 2015-2016 Lachlan Andrew, Monash University - -This file is part of Octave. - -Octave is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the -Free Software Foundation; either version 3 of the License, or (at your -option) any later version. - -Octave is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with Octave; see the file COPYING. If not, see -. - -*/ - -// @file -// Implementation of textscan, a versatile text parser. - -#if ! defined (octave_textscan_h) -#define octave_textscan_h 1 - -#include "octave-config.h" - -#include -#include -#include - -// These are only needed as arguments to private functions, so they -// are also treated as private. - -class delimited_stream; -class textscan_format_elt; -class textscan_format_list; - -#include "Cell.h" -#include "ov.h" -#include "ovl.h" - -// Main class to implement textscan. Read data and parse it -// according to a format. -// -// The calling sequence is -// -// textscan scanner (); -// scanner.scan (...); - -class -OCTINTERP_API -textscan -{ -public: - - textscan (void); - - ~textscan (void) { } - - octave_value scan (std::istream& isp, const octave_value_list& args); - - octave_value scan (std::istream& isp, const octave_value_list& args, - octave_idx_type& count); - -private: - - friend class textscan_format_list; - - std::string buf; - - // Three cases for delim_table and delim_list - // 1. delim_table empty, delim_list empty: whitespace delimiters - // 2. delim_table = look-up table of delim chars, delim_list empty. - // 3. delim_table non-empty, delim_list = Cell array of delim strings - - std::string whitespace_table; - - // delim_table[i] == '\0' if i is not a delimiter. - std::string delim_table; - - // String of delimiter characters. - std::string delims; - - Cell comment_style; - - // How far ahead to look to detect an open comment. - int comment_len; - - // First character of open comment. - int comment_char; - - octave_idx_type buffer_size; - - std::string date_locale; - - // 'inf' and 'nan' for formatted_double. - Cell inf_nan; - - // Array of strings of delimiters. - Cell delim_list; - - // Longest delimiter. - int delim_len; - - octave_value empty_value; - std::string exp_chars; - int header_lines; - Cell treat_as_empty; - - // Longest string to treat as "N/A". - int treat_as_empty_len; - - std::string whitespace; - - short eol1; - short eol2; - short return_on_error; - - bool collect_output; - bool multiple_delims_as_one; - bool default_exp; - bool numeric_delim; - - octave_idx_type lines; - - octave_value do_scan (std::istream& isp, textscan_format_list& fmt_list, - octave_idx_type ntimes); - - void parse_options (const octave_value_list& args, - textscan_format_list& fmt_list); - - int read_format_once (delimited_stream& isp, textscan_format_list& fmt_list, - std::list& retval, - Array row, int& done_after); - - void scan_one (delimited_stream& is, const textscan_format_elt& fmt, - octave_value& ov, Array row); - - // Methods to process a particular conversion specifier. - double read_double (delimited_stream& is, - const textscan_format_elt& fmt) const; - - void scan_complex (delimited_stream& is, const textscan_format_elt& fmt, - Complex& val) const; - - int scan_bracket (delimited_stream& is, const std::string& pattern, - std::string& val) const; - - int scan_caret (delimited_stream& is, const std::string& pattern, - std::string& val) const; - - void scan_string (delimited_stream& is, const textscan_format_elt& fmt, - std::string& val) const; - - void scan_cstring (delimited_stream& is, const textscan_format_elt& fmt, - std::string& val) const; - - void scan_qstring (delimited_stream& is, const textscan_format_elt& fmt, - std::string& val); - - // Helper methods. - std::string read_until (delimited_stream& is, const Cell& delimiters, - const std::string& ends) const; - - int lookahead (delimited_stream& is, const Cell& targets, int max_len, - bool case_sensitive = true) const; - - bool match_literal (delimited_stream& isp, const textscan_format_elt& elem); - - int skip_whitespace (delimited_stream& is, bool EOLstop = false); - - int skip_delim (delimited_stream& is); - - bool is_delim (unsigned char ch) const - { - return ((delim_table.empty () && (isspace (ch) || ch == eol1 || ch == eol2)) - || delim_table[ch] != '\0'); - } - - bool isspace (unsigned int ch) const { return whitespace_table[ch & 0xff]; } - - // True if the only delimiter is whitespace. - bool whitespace_delim (void) const { return delim_table.empty (); } - - // No copying! - - textscan (const textscan&); - - textscan& operator = (const textscan&); -}; - -#endif