# HG changeset patch # User John W. Eaton # Date 1372984382 14400 # Node ID 5314734810844711df65a5aa8d333ac4205e3c46 # Parent 21d5e76891feb1c4e364a0c363c018de891dbcd1 rewrite string parsing to avoid unlimited lookahead * NEWS: Mention change. * lex.h, lex.ll (lexical_feedback::string_text, lexical_feedback::string_line, lexical_feedback::string_column): New data members (lexical_feedback::lexical_feedback): Initialize them. (lexical_feedback::reset): Initialize them. (octave_base_lexer::begin_string): New function. (\", "'", [\"\']): Use begin_string to set start state * instead of calling handle_string to parse string. (DQ_STRING_START, SQ_STRING_START): New exclusive start states. (\"\", \", {NL}, \\[0-7]{1,3}, "\\a", "\\b", "\\f", "\\n", "\\r", "\\t", "\\v", \\{ANY_INCLUDING_NL}, [^\\\n\"]+, [^\'\n\r]*\', {NL}): New rules for parsing character strings. (octave_base_lexer::have_continuation, octave_base_lexer::have_ellipsis_continuation, octave_base_lexer::handle_string): Delete. diff -r 21d5e76891fe -r 531473481084 NEWS --- a/NEWS Thu Jul 04 19:05:59 2013 -0400 +++ b/NEWS Thu Jul 04 20:33:02 2013 -0400 @@ -77,6 +77,17 @@ c,... ] = deal (1,2,3) + ** Line continuations inside character strings have changed. + + The sequence '...' is no longer recognized as a line continuation + inside character strings. A backslash followed by a newline + character is no longer recognized as a line continuation inside + single-quoted character strings. Inside double-quoted character + strings, a backslash followed by a newline character is still + recognized as a line continuation but the backslash character must + be followed immediately by the newline character. No whitespace or + end-of-line comment may appear between them.
+ ** Warning IDs renamed: Octave:array-as-scalar => Octave:array-to-scalar diff -r 21d5e76891fe -r 531473481084 libinterp/parse-tree/lex.h --- a/libinterp/parse-tree/lex.h Thu Jul 04 19:05:59 2013 -0400 +++ b/libinterp/parse-tree/lex.h Thu Jul 04 20:33:02 2013 -0400 @@ -280,6 +280,7 @@ looping (0), defining_func (0), looking_at_function_handle (0), block_comment_nesting_level (0), token_count (0), current_input_line (), comment_text (), help_text (), + string_text (), string_line (0), string_column (0), fcn_file_name (), fcn_file_full_name (), looking_at_object_index (), parsed_function_name (), pending_local_variables (), symtab_context (), nesting_level (), tokens () @@ -411,6 +412,13 @@ // The current help text. std::string help_text; + // The current character string text. + std::string string_text; + + // The position of the beginning of the current character string. + int string_line; + int string_column; + // Simple name of function file we are reading. std::string fcn_file_name; @@ -501,6 +509,8 @@ void prep_for_file (void); + void begin_string (int state); + virtual int fill_flex_buffer (char *buf, unsigned int max_size) = 0; bool at_end_of_buffer (void) const { return input_buf.empty (); } @@ -535,12 +545,6 @@ void finish_comment (octave_comment_elt::comment_type typ); - bool have_continuation (bool trailing_comments_ok = true); - - bool have_ellipsis_continuation (bool trailing_comments_ok = true); - - int handle_string (char delim); - int handle_close_bracket (int bracket_type); bool looks_like_command_arg (void); diff -r 21d5e76891fe -r 531473481084 libinterp/parse-tree/lex.ll --- a/libinterp/parse-tree/lex.ll Thu Jul 04 19:05:59 2013 -0400 +++ b/libinterp/parse-tree/lex.ll Thu Jul 04 20:33:02 2013 -0400 @@ -51,6 +51,9 @@ %x BLOCK_COMMENT_START %x LINE_COMMENT_START +%x DQ_STRING_START +%x SQ_STRING_START + %{ #include @@ -290,9 +293,9 @@ curr_lexer->at_beginning_of_statement = false; curr_lexer->current_input_column++; - int tok = 
curr_lexer->handle_string (yytext[0]); - - return curr_lexer->count_token_internal (tok); + + curr_lexer->begin_string (yytext[0] == '"' + ? DQ_STRING_START : SQ_STRING_START); } [^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* { @@ -622,6 +625,106 @@ } %{ +// Double-quoted character strings. +%} + +\"\" { + curr_lexer->current_input_column += yyleng; + curr_lexer->string_text += '"'; + } + +\" { + + curr_lexer->pop_start_state (); + + curr_lexer->looking_for_object_index = true; + curr_lexer->at_beginning_of_statement = false; + + curr_lexer->push_token (new token (DQ_STRING, + curr_lexer->string_text, + curr_lexer->string_line, + curr_lexer->string_column)); + + curr_lexer->string_text = ""; + + return curr_lexer->count_token_internal (DQ_STRING); + } + +{NL} { + error ("unterminated character string constant"); + return LEXICAL_ERROR; + } + +\\[0-7]{1,3} { + int result; + sscanf (yytext+1, "%o", &result); + + if (result > 0xff) + error ("invalid octal escape sequence in character string"); + else + curr_lexer->string_text += static_cast (result); + } + +"\\a" { curr_lexer->string_text += '\a'; } +"\\b" { curr_lexer->string_text += '\b'; } +"\\f" { curr_lexer->string_text += '\f'; } +"\\n" { curr_lexer->string_text += '\n'; } +"\\r" { curr_lexer->string_text += '\r'; } +"\\t" { curr_lexer->string_text += '\t'; } +"\\v" { curr_lexer->string_text += '\v'; } + +\\{ANY_INCLUDING_NL} { + curr_lexer->string_text += yytext[1]; + } + +[^\\\n\"]+ { + curr_lexer->string_text += yytext; + } + +%{ +// Single-quoted character strings. 
+%} + +[^\'\n\r]*\' { + yytext[yyleng-1] = 0; + curr_lexer->string_text += yytext; + + curr_lexer->current_input_column += yyleng; + + int c = curr_lexer->text_yyinput (); + + if (c == '\'') + { + curr_lexer->string_text += c; + + curr_lexer->current_input_column++; + } + else + { + curr_lexer->xunput (c); + + curr_lexer->pop_start_state (); + + curr_lexer->looking_for_object_index = true; + curr_lexer->at_beginning_of_statement = false; + + curr_lexer->push_token (new token (SQ_STRING, + curr_lexer->string_text, + curr_lexer->string_line, + curr_lexer->string_column)); + + curr_lexer->string_text = ""; + + return curr_lexer->count_token_internal (SQ_STRING); + } + } + +{NL} { + error ("unterminated character string constant"); + return LEXICAL_ERROR; + } + +%{ // Imaginary numbers. %} @@ -867,14 +970,14 @@ if (curr_lexer->previous_token_may_be_command () && curr_lexer->space_follows_previous_token ()) { - yyless (0); + curr_lexer->current_input_column++; curr_lexer->push_start_state (COMMAND_START); + curr_lexer->begin_string (SQ_STRING_START); } else if (curr_lexer->at_beginning_of_statement) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else { @@ -888,8 +991,7 @@ || curr_lexer->previous_token_is_binop ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else { @@ -906,8 +1008,7 @@ || curr_lexer->previous_token_is_keyword ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else return curr_lexer->count_token (HERMITIAN); @@ -920,8 +1021,7 @@ || curr_lexer->previous_token_is_keyword ()) { curr_lexer->current_input_column++; - int retval = 
curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else return curr_lexer->count_token (HERMITIAN); @@ -939,8 +1039,9 @@ if (curr_lexer->previous_token_may_be_command () && curr_lexer->space_follows_previous_token ()) { - yyless (0); + curr_lexer->current_input_column++; curr_lexer->push_start_state (COMMAND_START); + curr_lexer->begin_string (DQ_STRING_START); } else { @@ -954,8 +1055,7 @@ || curr_lexer->previous_token_is_binop ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('"'); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (DQ_STRING_START); } else { @@ -968,15 +1068,13 @@ else { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('"'); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (DQ_STRING_START); } } else { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('"'); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (DQ_STRING_START); } } } @@ -1555,6 +1653,9 @@ current_input_line = ""; comment_text = ""; help_text = ""; + string_text = ""; + string_line = 0; + string_column = 0; fcn_file_name = ""; fcn_file_full_name = ""; looking_at_object_index.clear (); @@ -1769,6 +1870,15 @@ push_start_state (INPUT_FILE_START); } +void +octave_base_lexer::begin_string (int state) +{ + string_line = input_line_number; + string_column = current_input_column; + + push_start_state (state); +} + int octave_base_lexer::handle_end_of_input (void) { @@ -2257,218 +2367,6 @@ at_beginning_of_statement = true; } -// We have seen a backslash and need to find out if it should be -// treated as a continuation character. If so, this eats it, up to -// and including the new line character. -// -// Match whitespace only, followed by a comment character or newline. 
-// Once a comment character is found, discard all input until newline. -// If non-whitespace characters are found before comment -// characters, return 0. Otherwise, return 1. - -// FIXME -- we need to handle block comments here. - -bool -octave_base_lexer::have_continuation (bool trailing_comments_ok) -{ - std::ostringstream buf; - - std::string comment_buf; - - bool in_comment = false; - bool beginning_of_comment = false; - - int c = 0; - - while ((c = text_yyinput ()) != EOF) - { - buf << static_cast (c); - - switch (c) - { - case ' ': - case '\t': - if (in_comment) - { - comment_buf += static_cast (c); - beginning_of_comment = false; - } - break; - - case '%': - case '#': - if (trailing_comments_ok) - { - if (in_comment) - { - if (! beginning_of_comment) - comment_buf += static_cast (c); - } - else - { - maybe_gripe_matlab_incompatible_comment (c); - in_comment = true; - beginning_of_comment = true; - } - } - else - goto cleanup; - break; - - case '\n': - if (in_comment) - { - comment_buf += static_cast (c); - octave_comment_buffer::append (comment_buf); - } - current_input_column = 0; - decrement_promptflag (); - gripe_matlab_incompatible_continuation (); - return true; - - default: - if (in_comment) - { - comment_buf += static_cast (c); - beginning_of_comment = false; - } - else - goto cleanup; - break; - } - } - - xunput (c); - return false; - -cleanup: - - std::string s = buf.str (); - - int len = s.length (); - while (len--) - xunput (s[len]); - - return false; -} - -// We have seen a '.' and need to see if it is the start of a -// continuation. If so, this eats it, up to and including the new -// line character. - -bool -octave_base_lexer::have_ellipsis_continuation (bool trailing_comments_ok) -{ - char c1 = text_yyinput (); - if (c1 == '.') - { - char c2 = text_yyinput (); - if (c2 == '.' 
&& have_continuation (trailing_comments_ok)) - return true; - else - { - xunput (c2); - xunput (c1); - } - } - else - xunput (c1); - - return false; -} - -int -octave_base_lexer::handle_string (char delim) -{ - std::ostringstream buf; - - int bos_line = input_line_number; - int bos_col = current_input_column; - - int c; - int escape_pending = 0; - - while ((c = text_yyinput ()) != EOF) - { - current_input_column++; - - if (c == '\\') - { - if (delim == '\'' || escape_pending) - { - buf << static_cast (c); - escape_pending = 0; - } - else - { - if (have_continuation (false)) - escape_pending = 0; - else - { - buf << static_cast (c); - escape_pending = 1; - } - } - continue; - } - else if (c == '.') - { - if (delim == '\'' || ! have_ellipsis_continuation (false)) - buf << static_cast (c); - } - else if (c == '\n') - { - error ("unterminated string constant"); - break; - } - else if (c == delim) - { - if (escape_pending) - buf << static_cast (c); - else - { - c = text_yyinput (); - if (c == delim) - { - buf << static_cast (c); - } - else - { - std::string s; - xunput (c); - - if (delim == '\'') - s = buf.str (); - else - s = do_string_escapes (buf.str ()); - - if (delim == '"') - gripe_matlab_incompatible ("\" used as string delimiter"); - else if (delim == '\'') - gripe_single_quote_string (); - - looking_for_object_index = true; - at_beginning_of_statement = false; - - int tok = delim == '"' ? DQ_STRING : SQ_STRING; - - push_token (new token (tok, s, bos_line, bos_col)); - - return tok; - } - } - } - else - { - buf << static_cast (c); - } - - escape_pending = 0; - } - - return LEXICAL_ERROR; -} - int octave_base_lexer::handle_close_bracket (int bracket_type) {