Mercurial > jwe > octave
view libinterp/corefcn/textscan.h @ 21462:b7d1e93c0702
initial implementation of textscan in C++
* textscan.h, textscan.cc: New files.
* libinterp/corefcn/module.mk: Update.
* textscan.m: Delete.
* scripts/io/module.mk: Update.
author | Lachlan Andrew <lachlanbis@gmail.com> |
---|---|
date | Tue, 15 Mar 2016 19:36:52 +1100 |
parents | |
children | bca9aaef907a |
line wrap: on
line source
/* Copyright (C) 2015-2016 Lachlan Andrew, Monash University This file is part of Octave. Octave is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Octave; see the file COPYING. If not, see <http://www.gnu.org/licenses/>. */ /** @file * Implementation of textscan, a versatile text parser. */ #if !defined (octave_textscan_h) #define octave_textscan_h 1 // For Inf and NaN #include "lo-ieee.h" /** Delimited stream, optimised to read strings of characters separated * by single-character delimiters. * * The reason behind this class is that octstream doesn't provide seek/tell, * but the opportunity has been taken to optimise for the textscan workload. * * The function reads chunks into a 4kiB buffer, and marks where the last * delimiter occurs. Reads up to this delimiter can be fast. After that * last delimiter, the remaining text is moved to the front of the buffer * and the buffer is refilled. This also allows cheap seek and tell * operations within a "fast read" block. */ class dstr { int bufsize; // number of characters to read from the file at once std::istream& i_stream; // stream to read from char *buf; // temporary storage for a "chunk" of data char *idx; // Current read pointer char *last; // location of last delimiter in the buffer at buf // (undefined if delimited is false) char *eob; // Position after last character in buffer bool delimited; // True if there is delimiter in the bufer after idx int longest; // longest lookahead required const std::string delims; // sequence of single-character delimiters std::streampos buf_in_file; // Position of start of buf in original stream char *progress_marker; // Marker to see if a read consumes any characters public: dstr (std::istream& is, const std::string& delimiters, int longest_lookahead, octave_idx_type bsize = 4096); dstr (std::istream& is, const dstr& ds); ~dstr (); // Called when optimised sequence of get() is finished. Ensures that // there is a remaining delimiter in buf, or loads more data in. void field_done (void) { if (idx >= last) refresh_buf (); } // Load new data into buffer, and set eob, last, idx. // Return EOF at end of file, 0 otherwise. int refresh_buf (void); // get a character, relying on caller to call field_done () if // a delimiter has been reached. int get (void) { return delimited ? *idx++ : get_undelim (); } // get a character, checking for underrun of the buffer int get_undelim (void); // Read character that will be got by the next get(). int peek (void) { return *idx; } // Read character that will be got by the next get(). int peek_undelim (void); // Undo a 'get' or 'get_undelim'. It is the caller's responsibility // to avoid overflow by calling putbacks only for a character got by // get() or get_undelim(), with no intervening // get, get_delim, field_done, refresh_buf, getline, read or seekg. void putback (char /*ch*/ = 0) { --idx; } int getline (std::string& dest, char delim); //int skipline (char delim); char *read (char *buffer, int size, char* &new_start); // return a position suitable to "seekg", valid only within this // block between calls to field_done (). char *tellg (void) { return idx; } void seekg (char *old_idx) { idx = old_idx; } bool eof (void) { return (eob == buf && i_stream.eof ()) || (flags & std::ios_base::eofbit); } operator const void* (void) { return (!eof () && !flags) ? this : 0; } bool fail (void) { return flags & std::ios_base::failbit; } std::ios_base::iostate rdstate (void) { return flags; } void setstate (std::ios_base::iostate m) { flags = flags | m; } void clear (std::ios_base::iostate m = (std::ios_base::eofbit & ~std::ios_base::eofbit)) { flags = flags & m; } // Report if any characters have been consumed. // (get, read etc. not cancelled by putback or seekg) void progress_benchmark (void) { progress_marker = idx; } bool no_progress (void) { return progress_marker == idx; } private: std::ios_base::iostate flags; // No copying dstr (const dstr&); dstr& operator = (const dstr&); }; /** * A single conversion specifier, such as %f or %c */ class OCTINTERP_API textscan_format_elt { public: enum special_conversion { whitespace_conversion = 1, literal_conversion = 2 }; textscan_format_elt (const char *txt = 0, int w = 0, int p = -1, int bw = 0, bool dis = false, char typ = '\0', const std::string& ch_class = std::string ()) : text (strsave (txt)), width (w), prec (p), bitwidth (bw), char_class (ch_class), type (typ), discard (dis), numeric(typ == 'd' || typ == 'u' || type == 'f' || type == 'n') { } textscan_format_elt (const textscan_format_elt& e) : text (strsave (e.text)), width (e.width), prec (e.prec), bitwidth (e.bitwidth), char_class (e.char_class), type (e.type), discard (e.discard), numeric (e.numeric) { } textscan_format_elt& operator = (const textscan_format_elt& e) { if (this != &e) { text = strsave (e.text); width = e.width; prec = e.prec; bitwidth = e.bitwidth; discard = e.discard; type = e.type; numeric = e.numeric; char_class = e.char_class; } return *this; } ~textscan_format_elt (void) { delete [] text; } // The C-style format string. const char *text; // The maximum field width. unsigned int width; // The maximum number of digits to read after the decimal in a // floating point conversion. int prec; // The size of the result. For integers, bitwidth may be 8, 16, 34, // or 64. For floating point values, bitwidth may be 32 or 64. int bitwidth; // The class of characters in a `[' or `^' format. std::string char_class; // Type of conversion // -- `d', `u', `f', `n', `s', `q', `c', `%', `C', `D', `[' or `^'. char type; // TRUE if we are not storing the result of this conversion. bool discard; // TRUE if the type is 'd', 'u', 'f', 'n' bool numeric; }; class textscan; /** * The (parsed) sequence of format specifiers. */ class OCTINTERP_API textscan_format_list { public: textscan_format_list (const std::string& fmt = std::string ()); ~textscan_format_list (void); octave_idx_type num_conversions (void) const { return nconv; } // The length can be different than the number of conversions. // For example, "x %d y %d z" has 2 conversions but the length of // the list is 3 because of the characters that appear after the // last conversion. octave_idx_type numel (void) const { return list.numel (); } const textscan_format_elt *first (void) { curr_idx = 0; return current (); } const textscan_format_elt *current (void) const { return list.numel () > 0 ? list.elem (curr_idx) : 0; } const textscan_format_elt *next (bool cycle = true) { curr_idx++; if (curr_idx >= list.numel ()) { if (cycle) curr_idx = 0; else return 0; } return current (); } void printme (void) const; bool ok (void) const { return (nconv >= 0); } operator const void* (void) const { return ok () ? this : 0; } bool set_from_first; // true if number of %f to be set from data file bool has_string; // at least one conversion specifier is s,q,c, or [...] int read_first_row (dstr& is, textscan& ts); std::list<octave_value> out_buf (void) const { return (output_container); } private: // Number of conversions specified by this format string, or -1 if // invalid conversions have been found. octave_idx_type nconv; // Index to current element; octave_idx_type curr_idx; // FIXME -- maybe LIST should be a std::list object? // List of format elements. Array<textscan_format_elt*> list; // list holding column arrays of types specified by conversions std::list<octave_value > output_container; // Temporary buffer. std::ostringstream *buf; void add_elt_to_list (unsigned int width, int prec, int bitwidth, octave_value val_type, bool discard, char type, octave_idx_type& num_elts, const std::string& char_class = std::string ()); void process_conversion (const std::string& s, size_t& i, size_t n, octave_idx_type& num_elts); int finish_conversion (const std::string& s, size_t& i, size_t n, unsigned int& width, int& prec, int& bitwidth, octave_value& val_type, bool discard, char& type, octave_idx_type& num_elts); // No copying! textscan_format_list (const textscan_format_list&); textscan_format_list& operator = (const textscan_format_list&); }; /** * Main class to implement textscan. * Reads data and parses it according to a textscan_format_list. * The calling sequence is * textscan (); * parse_options (...); * scan (...); */ class textscan { friend class textscan_format_list; std::string buf; // Three cases for delim_table and delim_list // 1. delim_table empty, delim_list empty: whitespace delimiters // 2. delim_table = look-up table of delim chars, delim_list empty. // 3. delim_table non-empty, delim_list = Cell array of delim strings std::string whitespace_table; std::string delim_table; // delim_table[i]=='\0' if i is not a delimiter, std::string delims; // string of delimiter characters Cell comment_style; int comment_len; // How far ahead to look to detect an open comment int comment_char; // first character of open comment octave_idx_type buffer_size; std::string date_locale; Cell inf_nan; // 'inf' and 'nan' for formatted_double Cell delim_list; // Array of strings of delimiters int delim_len; // Longest delimiter octave_value empty_value; std::string exp_chars; int header_lines; Cell treat_as_empty; int treat_as_empty_len; // longest string to treat as "N/A" std::string whitespace; short eol1, eol2; short return_on_error; bool collect_output; bool multiple_delims_as_one; bool default_exp; bool numeric_delim; octave_idx_type lines; int read_format_once (dstr &isp, textscan_format_list& fmt_list, std::list<octave_value> & retval, Array<octave_idx_type> row, int& done_after); void scan_one (dstr& is, const textscan_format_elt& fmt, octave_value& ov, Array<octave_idx_type> row); // Methods to process a particular conversion specifier double read_double (dstr& is, const textscan_format_elt& fmt) const; void scan_complex (dstr& is, const textscan_format_elt& fmt, Complex& val) const; int scan_bracket (dstr& is, const char *pattern, std::string& val) const; int scan_caret (dstr& is, const char *, std::string& val) const; void scan_string (dstr& is, const textscan_format_elt& fmt, std::string& val) const; void scan_cstring (dstr& is, const textscan_format_elt& fmt, std::string& val) const; void scan_qstring (dstr& is, const textscan_format_elt& fmt, std::string& val); // helper methods std::string read_until (dstr& is, const Cell& delimiters, const std::string& ends) const; int lookahead (dstr& is, const Cell& targets, int max_len, bool case_sensitive = true) const; char *get_field (dstr& isp, unsigned int width) const; bool match_literal (dstr& isp, const textscan_format_elt& elem); int skip_whitespace (dstr& is, bool EOLstop = false); int skip_delim (dstr& is); bool is_delim (unsigned char ch) const { return (delim_table.length () == 0 && (isspace (ch) || ch == eol1 || ch == eol2)) || delim_table[ch] != '\0'; } bool isspace (unsigned int ch) const { return whitespace_table[ch & 0xff]; } // true if the only delimiter is whitespace bool whitespace_delim (void) const { return delim_table.length () == 0; } public: textscan (void) : buf (""), delim_table (""), delims (), comment_len (0), comment_char(-2), buffer_size (0), empty_value (octave_NaN), exp_chars ("edED"), header_lines (0), treat_as_empty_len (0), whitespace (" \b\t"), eol1('\r'), eol2('\n'), return_on_error (2), collect_output (false), multiple_delims_as_one (false), default_exp (true), numeric_delim (false), lines (0) { inf_nan = Cell (dim_vector (1,2)); inf_nan(0) = Cell (octave_value ("inf")); inf_nan(1) = Cell (octave_value ("nan")); }; octave_value scan (std::istream* isp, textscan_format_list& fmt_list, octave_idx_type ntimes); void parse_options (const octave_value_list& args, int first_param, textscan_format_list& formats); }; #endif