octave-nkf: libinterp/parse-tree/lex.ll comparison

comparison libinterp/parse-tree/lex.ll @ 16898:531473481084

rewrite string parsing to avoid unlimited lookahead * NEWS: Mention change. * lex.h, lex.ll (lexical_feedback::string_text, lexical_feedback::string_line, lexical_feedback::string_column): New data members. (lexical_feedback::lexical_feedback): Initialize them. (lexical_feedback::reset): Initialize them. (octave_base_lexer::begin_string): New function. (\", "'", <COMMAND_START>[\"\']): Use begin_string to set start state instead of calling handle_string to parse string. (DQ_STRING_START, SQ_STRING_START): New exclusive start states. (<DQ_STRING_START>\"\", <DQ_STRING_START>\", <DQ_STRING_START>{NL}, <DQ_STRING_START>\\[0-7]{1,3}, <DQ_STRING_START>"\\a", <DQ_STRING_START>"\\b", <DQ_STRING_START>"\\f", <DQ_STRING_START>"\\n", <DQ_STRING_START>"\\r", <DQ_STRING_START>"\\t", <DQ_STRING_START>"\\v", <DQ_STRING_START>\\{ANY_INCLUDING_NL}, <DQ_STRING_START>[^\\\n\"]+, <SQ_STRING_START>[^\'\n\r]*\', <SQ_STRING_START>{NL}): New rules for parsing character strings. (octave_base_lexer::have_continuation, octave_base_lexer::have_ellipsis_continuation, octave_base_lexer::handle_string): Delete.

author	John W. Eaton <jwe@octave.org>
date	Thu, 04 Jul 2013 20:33:02 -0400
parents	f89de736eecd
children	f21194531877

comparison

equal deleted inserted replaced

-:21d5e76891fe
+:531473481084
 %x INPUT_FILE_START
 %x BLOCK_COMMENT_START
 %x LINE_COMMENT_START
+%x DQ_STRING_START
+%x SQ_STRING_START
 %{
 #include <cctype>
 #include <cstring>
 curr_lexer->lexer_debug ("<COMMAND_START>[\\\"\\']");
 curr_lexer->at_beginning_of_statement = false;
 curr_lexer->current_input_column++;
-int tok = curr_lexer->handle_string (yytext[0]);
+curr_lexer->begin_string (yytext[0] == '"'
-return curr_lexer->count_token_internal (tok);
+? DQ_STRING_START : SQ_STRING_START);
 }
 <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* {
 curr_lexer->lexer_debug ("<COMMAND_START>[^#% \\t\\r\\n\\;\\,\\\"\\'][^ \\t\\r\\n\\;\\,]*{S}*");
 curr_lexer->xunput (yytext[0]);
 curr_lexer->finish_comment (octave_comment_elt::full_line);
 curr_lexer->pop_start_state ();
+}
+%{
+// Double-quoted character strings.
+%}
+<DQ_STRING_START>\"\" {
+// A doubled quote inside the string contributes one literal '"'.
+curr_lexer->current_input_column += yyleng;
+curr_lexer->string_text += '"';
+}
+<DQ_STRING_START>\" {
+// Closing quote: count it, leave the string start state and emit the
+// accumulated text as a DQ_STRING token positioned at the line/column
+// recorded when the string began.
+curr_lexer->current_input_column++;
+curr_lexer->pop_start_state ();
+curr_lexer->looking_for_object_index = true;
+curr_lexer->at_beginning_of_statement = false;
+curr_lexer->push_token (new token (DQ_STRING,
+curr_lexer->string_text,
+curr_lexer->string_line,
+curr_lexer->string_column));
+// Clear the buffer so the next string starts empty.
+curr_lexer->string_text = "";
+return curr_lexer->count_token_internal (DQ_STRING);
+}
+<DQ_STRING_START>{NL} {
+// A newline before the closing quote means the string was never closed.
+error ("unterminated character string constant");
+return LEXICAL_ERROR;
+}
+<DQ_STRING_START>\\[0-7]{1,3} {
+// Octal escape: one to three octal digits after the backslash.
+curr_lexer->current_input_column += yyleng;
+int result = 0;
+sscanf (yytext+1, "%o", &result);
+// Three octal digits can encode up to 0777, which does not fit in a byte.
+if (result > 0xff)
+error ("invalid octal escape sequence in character string");
+else
+curr_lexer->string_text += static_cast<unsigned char> (result);
+}
+<DQ_STRING_START>"\\a" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\a'; }
+<DQ_STRING_START>"\\b" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\b'; }
+<DQ_STRING_START>"\\f" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\f'; }
+<DQ_STRING_START>"\\n" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\n'; }
+<DQ_STRING_START>"\\r" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\r'; }
+<DQ_STRING_START>"\\t" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\t'; }
+<DQ_STRING_START>"\\v" { curr_lexer->current_input_column += yyleng; curr_lexer->string_text += '\v'; }
+<DQ_STRING_START>\\{ANY_INCLUDING_NL} {
+// Any other escaped character is taken literally (the backslash is dropped).
+// NOTE(review): when the escaped character is a newline the column is
+// still advanced here; line/column reset for that case is not handled
+// by this rule -- confirm whether it needs to be.
+curr_lexer->current_input_column += yyleng;
+curr_lexer->string_text += yytext[1];
+}
+<DQ_STRING_START>[^\\\n\"]+ {
+// Run of ordinary characters: copy verbatim.
+curr_lexer->current_input_column += yyleng;
+curr_lexer->string_text += yytext;
+}
+%{
+// Single-quoted character strings.
+%}
+<SQ_STRING_START>\'\' {
+// A doubled quote inside the string contributes one literal "'".
+// Flex prefers this two-character match over the single-quote rule
+// below, so no manual lookahead (text_yyinput/xunput) is needed.
+curr_lexer->current_input_column += yyleng;
+curr_lexer->string_text += '\'';
+}
+<SQ_STRING_START>\' {
+// Closing quote: count it, leave the string start state and emit the
+// accumulated text as an SQ_STRING token positioned at the line/column
+// recorded when the string began.
+curr_lexer->current_input_column++;
+curr_lexer->pop_start_state ();
+curr_lexer->looking_for_object_index = true;
+curr_lexer->at_beginning_of_statement = false;
+curr_lexer->push_token (new token (SQ_STRING,
+curr_lexer->string_text,
+curr_lexer->string_line,
+curr_lexer->string_column));
+// Clear the buffer so the next string starts empty.
+curr_lexer->string_text = "";
+return curr_lexer->count_token_internal (SQ_STRING);
+}
+<SQ_STRING_START>[^\'\n\r]+ {
+// Run of ordinary characters: copy verbatim.  Matching the text
+// separately from the closing quote means an unterminated string is
+// still consumed by this rule and then reported by the {NL} rule,
+// instead of falling through to the scanner's default rule.
+curr_lexer->current_input_column += yyleng;
+curr_lexer->string_text += yytext;
+}
+<SQ_STRING_START>{NL} {
+error ("unterminated character string constant");
+return LEXICAL_ERROR;
 }
 %{
 // Imaginary numbers.
 %}
 curr_lexer->lexer_debug ("'");
 if (curr_lexer->previous_token_may_be_command ()
 &&  curr_lexer->space_follows_previous_token ())
 {
-yyless (0);
+curr_lexer->current_input_column++;
 curr_lexer->push_start_state (COMMAND_START);
+curr_lexer->begin_string (SQ_STRING_START);
 }
 else if (curr_lexer->at_beginning_of_statement)
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('\'');
+curr_lexer->begin_string (SQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 else
 {
 int tok = curr_lexer->previous_token_value ();
 {
 if (tok == '[' || tok == '{'
 || curr_lexer->previous_token_is_binop ())
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('\'');
+curr_lexer->begin_string (SQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 else
 {
 yyless (0);
 curr_lexer->xunput (',');
 if (tok == '[' || tok == '{'
 || curr_lexer->previous_token_is_binop ()
 || curr_lexer->previous_token_is_keyword ())
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('\'');
+curr_lexer->begin_string (SQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 else
 return curr_lexer->count_token (HERMITIAN);
 }
 }
 if (! tok || tok == '[' || tok == '{' || tok == '('
 || curr_lexer->previous_token_is_binop ()
 || curr_lexer->previous_token_is_keyword ())
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('\'');
+curr_lexer->begin_string (SQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 else
 return curr_lexer->count_token (HERMITIAN);
 }
 }
 curr_lexer->lexer_debug ("\"");
 if (curr_lexer->previous_token_may_be_command ()
 &&  curr_lexer->space_follows_previous_token ())
 {
-yyless (0);
+curr_lexer->current_input_column++;
 curr_lexer->push_start_state (COMMAND_START);
+curr_lexer->begin_string (DQ_STRING_START);
 }
 else
 {
 int tok = curr_lexer->previous_token_value ();
 {
 if (tok == '[' || tok == '{'
 || curr_lexer->previous_token_is_binop ())
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('"');
+curr_lexer->begin_string (DQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 else
 {
 yyless (0);
 curr_lexer->xunput (',');
 }
 }
 else
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('"');
+curr_lexer->begin_string (DQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 }
 else
 {
 curr_lexer->current_input_column++;
-int retval = curr_lexer->handle_string ('"');
+curr_lexer->begin_string (DQ_STRING_START);
-return curr_lexer->count_token_internal (retval);
 }
 }
 }
 %{
 block_comment_nesting_level = 0;
 token_count = 0;
 current_input_line = "";
 comment_text = "";
 help_text = "";
+string_text = "";
+string_line = 0;
+string_column = 0;
 fcn_file_name = "";
 fcn_file_full_name = "";
 looking_at_object_index.clear ();
 looking_at_object_index.push_front (false);
 reading_script_file = true;
 push_start_state (INPUT_FILE_START);
 }
+// Start lexing a character string: remember the current input position
+// (used later as the position of the string token) and switch the
+// scanner to the given start state (DQ_STRING_START or SQ_STRING_START
+// at the visible call sites).
+void
+octave_base_lexer::begin_string (int state)
+{
+// Capture the position before any of the string body is consumed.
+string_column = current_input_column;
+string_line = input_line_number;
+
+push_start_state (state);
+}
 int
 octave_base_lexer::handle_end_of_input (void)
 {
 lexer_debug ("<<EOF>>");
 octave_comment_buffer::append (comment_text, typ);
 comment_text = "";
 at_beginning_of_statement = true;
-}
-// We have seen a backslash and need to find out if it should be
-// treated as a continuation character.  If so, this eats it, up to
-// and including the new line character.
-//
-// Match whitespace only, followed by a comment character or newline.
-// Once a comment character is found, discard all input until newline.
-// If non-whitespace characters are found before comment
-// characters, return 0.  Otherwise, return 1.
-// FIXME -- we need to handle block comments here.
-bool
-octave_base_lexer::have_continuation (bool trailing_comments_ok)
-{
-std::ostringstream buf;
-std::string comment_buf;
-bool in_comment = false;
-bool beginning_of_comment = false;
-int c = 0;
-while ((c = text_yyinput ()) != EOF)
-{
-buf << static_cast<char> (c);
-switch (c)
-{
-case ' ':
-case '\t':
-if (in_comment)
-{
-comment_buf += static_cast<char> (c);
-beginning_of_comment = false;
-}
-break;
-case '%':
-case '#':
-if (trailing_comments_ok)
-{
-if (in_comment)
-{
-if (! beginning_of_comment)
-comment_buf += static_cast<char> (c);
-}
-else
-{
-maybe_gripe_matlab_incompatible_comment (c);
-in_comment = true;
-beginning_of_comment = true;
-}
-}
-else
-goto cleanup;
-break;
-case '\n':
-if (in_comment)
-{
-comment_buf += static_cast<char> (c);
-octave_comment_buffer::append (comment_buf);
-}
-current_input_column = 0;
-decrement_promptflag ();
-gripe_matlab_incompatible_continuation ();
-return true;
-default:
-if (in_comment)
-{
-comment_buf += static_cast<char> (c);
-beginning_of_comment = false;
-}
-else
-goto cleanup;
-break;
-}
-}
-xunput (c);
-return false;
-cleanup:
-std::string s = buf.str ();
-int len = s.length ();
-while (len--)
-xunput (s[len]);
-return false;
-}
-// We have seen a '.' and need to see if it is the start of a
-// continuation.  If so, this eats it, up to and including the new
-// line character.
-bool
-octave_base_lexer::have_ellipsis_continuation (bool trailing_comments_ok)
-{
-char c1 = text_yyinput ();
-if (c1 == '.')
-{
-char c2 = text_yyinput ();
-if (c2 == '.' && have_continuation (trailing_comments_ok))
-return true;
-else
-{
-xunput (c2);
-xunput (c1);
-}
-}
-else
-xunput (c1);
-return false;
-}
-int
-octave_base_lexer::handle_string (char delim)
-{
-std::ostringstream buf;
-int bos_line = input_line_number;
-int bos_col = current_input_column;
-int c;
-int escape_pending = 0;
-while ((c = text_yyinput ()) != EOF)
-{
-current_input_column++;
-if (c == '\\')
-{
-if (delim == '\'' || escape_pending)
-{
-buf << static_cast<char> (c);
-escape_pending = 0;
-}
-else
-{
-if (have_continuation (false))
-escape_pending = 0;
-else
-{
-buf << static_cast<char> (c);
-escape_pending = 1;
-}
-}
-continue;
-}
-else if (c == '.')
-{
-if (delim == '\'' || ! have_ellipsis_continuation (false))
-buf << static_cast<char> (c);
-}
-else if (c == '\n')
-{
-error ("unterminated string constant");
-break;
-}
-else if (c == delim)
-{
-if (escape_pending)
-buf << static_cast<char> (c);
-else
-{
-c = text_yyinput ();
-if (c == delim)
-{
-buf << static_cast<char> (c);
-}
-else
-{
-std::string s;
-xunput (c);
-if (delim == '\'')
-s = buf.str ();
-else
-s = do_string_escapes (buf.str ());
-if (delim == '"')
-gripe_matlab_incompatible ("\" used as string delimiter");
-else if (delim == '\'')
-gripe_single_quote_string ();
-looking_for_object_index = true;
-at_beginning_of_statement = false;
-int tok = delim == '"' ? DQ_STRING : SQ_STRING;
-push_token (new token (tok, s, bos_line, bos_col));
-return tok;
-}
-}
-}
-else
-{
-buf << static_cast<char> (c);
-}
-escape_pending = 0;
-}
-return LEXICAL_ERROR;
 }
 int
 octave_base_lexer::handle_close_bracket (int bracket_type)
 {

Mercurial > octave-nkf

comparison libinterp/parse-tree/lex.ll @ 16898:531473481084