changeset 21064:a9f2c2d72892

handle lexical errors as parser errors (bug #46877)

* lex.ll: Store error message info in LEXICAL_ERROR tokens and return token instead of immediately throwing errors in the lexer.
* oct-parse.in.yy (LEXICAL_ERROR): Declare type as tok_val instead of dummy_type.
(parse_error): Call parser.bison_error to report error.
author John W. Eaton <jwe@octave.org>
date Thu, 14 Jan 2016 05:28:27 -0500
parents 202cfd2b4514
children e1ee2203efe0
files libinterp/parse-tree/lex.ll libinterp/parse-tree/oct-parse.in.yy
diffstat 2 files changed, 115 insertions(+), 56 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/parse-tree/lex.ll	Thu Jan 14 05:35:36 2016 -0500
+++ b/libinterp/parse-tree/lex.ll	Thu Jan 14 05:28:27 2016 -0500
@@ -880,7 +880,17 @@
     sscanf (yytext+1, "%o", &result);
 
     if (result > 0xff)
-      error ("invalid octal escape sequence in character string");
+      {
+        token *tok
+          = new token (LEXICAL_ERROR,
+                       "invalid octal escape sequence in character string",
+                       curr_lexer->input_line_number,
+                       curr_lexer->current_input_column);
+
+        curr_lexer->push_token (tok);
+
+        return curr_lexer->count_token_internal (LEXICAL_ERROR);
+      }
     else
       curr_lexer->string_text += static_cast<unsigned char> (result);
   }
@@ -1013,13 +1023,17 @@
 <DQ_STRING_START>{NL} {
     curr_lexer->lexer_debug ("<DQ_STRING_START>{NL}");
 
+    token *tok = new token (LEXICAL_ERROR,
+                            "unterminated character string constant",
+                            curr_lexer->input_line_number,
+                            curr_lexer->current_input_column);
+
+    curr_lexer->push_token (tok);
+
     curr_lexer->input_line_number++;
     curr_lexer->current_input_column = 1;
 
-    error ("unterminated character string constant");
-
-    // FIXME: This is no longer reachable now that error is exception based.
-    return LEXICAL_ERROR;
+    return curr_lexer->count_token_internal (LEXICAL_ERROR);
   }
 
 %{
@@ -1066,13 +1080,17 @@
 <SQ_STRING_START>{NL} {
     curr_lexer->lexer_debug ("<SQ_STRING_START>{NL}");
 
+    token *tok = new token (LEXICAL_ERROR,
+                            "unterminated character string constant",
+                            curr_lexer->input_line_number,
+                            curr_lexer->current_input_column);
+
+    curr_lexer->push_token (tok);
+
     curr_lexer->input_line_number++;
     curr_lexer->current_input_column = 1;
 
-    error ("unterminated character string constant");
-
-    // FIXME: This is no longer reachable now that error is exception based.
-    return LEXICAL_ERROR;
+    return curr_lexer->count_token_internal (LEXICAL_ERROR);
   }
 
 %{
@@ -1332,11 +1350,11 @@
 {NL} {
     curr_lexer->lexer_debug ("{NL}");
 
-    curr_lexer->input_line_number++;
-    curr_lexer->current_input_column = 1;
-
     if (curr_lexer->nesting_level.is_paren ())
       {
+        curr_lexer->input_line_number++;
+        curr_lexer->current_input_column = 1;
+
         curr_lexer->at_beginning_of_statement = false;
         curr_lexer->gripe_language_extension
           ("bare newline inside parentheses");
@@ -1344,11 +1362,27 @@
     else if (curr_lexer->nesting_level.none ()
         || curr_lexer->nesting_level.is_anon_fcn_body ())
       {
+        curr_lexer->input_line_number++;
+        curr_lexer->current_input_column = 1;
+
         curr_lexer->at_beginning_of_statement = true;
+
         return curr_lexer->count_token ('\n');
       }
     else if (curr_lexer->nesting_level.is_bracket_or_brace ())
-      return LEXICAL_ERROR;
+      {
+        token *tok = new token (LEXICAL_ERROR,
+                                "unexpected internal lexer error",
+                                curr_lexer->input_line_number,
+                                curr_lexer->current_input_column);
+
+        curr_lexer->push_token (tok);
+
+        curr_lexer->input_line_number++;
+        curr_lexer->current_input_column = 1;
+
+        return curr_lexer->count_token_internal (LEXICAL_ERROR);
+      }
   }
 
 %{
@@ -1724,14 +1758,21 @@
       return curr_lexer->handle_end_of_input ();
     else
       {
+        std::ostringstream buf;
+
+        buf << "invalid character '"
+            << undo_string_escape (static_cast<char> (c))
+            << "' (ASCII " << c << ")";
+
+        token *tok = new token (LEXICAL_ERROR, buf.str (),
+                                curr_lexer->input_line_number,
+                                curr_lexer->current_input_column);
+
+        curr_lexer->push_token (tok);
+
         curr_lexer->current_input_column++;
 
-        error ("invalid character '%s' (ASCII %d) near line %d, column %d",
-               undo_string_escape (static_cast<char> (c)), c,
-               curr_lexer->input_line_number, curr_lexer->current_input_column);
-
-        // FIXME: This is no longer reachable now that error is exception based.
-        return LEXICAL_ERROR;
+        return curr_lexer->count_token_internal (LEXICAL_ERROR);
       }
   }
 
@@ -2904,9 +2945,14 @@
 
   if (kw_token)
     {
-      error ("method, class, and package names may not be keywords");
-      // FIXME: This is no longer reachable now that error is exception based.
-      return LEXICAL_ERROR;
+      token *tok
+        = new token (LEXICAL_ERROR,
+                     "method, class, and package names may not be keywords",
+                     input_line_number, current_input_column);
+
+      push_token (tok);
+
+      return count_token_internal (LEXICAL_ERROR);
     }
 
   push_token (new token (SUPERCLASSREF, meth, cls,
@@ -2924,9 +2970,12 @@
 
   if (fq_identifier_contains_keyword (cls))
     {
-      error ("class and package names may not be keywords");
-      // FIXME: This is no longer reachable now that error is exception based.
-      return LEXICAL_ERROR;
+      token *tok = new token (LEXICAL_ERROR,
+                              "class and package names may not be keywords",
+                              input_line_number, current_input_column);
+      push_token (tok);
+
+      return count_token_internal (LEXICAL_ERROR);
     }
 
   push_token (new token (METAQUERY, cls, input_line_number,
@@ -2940,16 +2989,21 @@
 int
 octave_base_lexer::handle_fq_identifier (void)
 {
-  std::string tok = flex_yytext ();
-
-  if (fq_identifier_contains_keyword (tok))
+  std::string fq_id = flex_yytext ();
+
+  if (fq_identifier_contains_keyword (fq_id))
     {
-      error ("function, method, class, and package names may not be keywords");
-      // FIXME: This is no longer reachable now that error is exception based.
-      return LEXICAL_ERROR;
+      token *tok
+        = new token (LEXICAL_ERROR,
+                     "function, method, class, and package names may not be keywords",
+                     input_line_number, current_input_column);
+
+      push_token (tok);
+
+      return count_token_internal (LEXICAL_ERROR);
     }
 
-  push_token (new token (FQ_IDENT, tok, input_line_number,
+  push_token (new token (FQ_IDENT, fq_id, input_line_number,
                          current_input_column));
 
   current_input_column += flex_yyleng ();
@@ -2964,9 +3018,7 @@
 int
 octave_base_lexer::handle_identifier (void)
 {
-  char *yytxt = flex_yytext ();
-
-  std::string tok = yytxt;
+  std::string ident = flex_yytext ();
 
   // If we are expecting a structure element, avoid recognizing
   // keywords and other special names and return STRUCT_ELT, which is
@@ -2974,7 +3026,7 @@
 
   if (looking_at_indirect_ref)
     {
-      push_token (new token (STRUCT_ELT, tok, input_line_number,
+      push_token (new token (STRUCT_ELT, ident, input_line_number,
                              current_input_column));
 
       looking_for_object_index = true;
@@ -2984,23 +3036,28 @@
       return STRUCT_ELT;
     }
 
-  // If tok is a keyword token, then is_keyword_token will set
-  // at_beginning_of_statement.  For example, if tok is an IF
+  // If ident is a keyword token, then is_keyword_token will set
+  // at_beginning_of_statement.  For example, if ident is an IF
   // token, then at_beginning_of_statement will be false.
 
-  int kw_token = is_keyword_token (tok);
+  int kw_token = is_keyword_token (ident);
 
   if (looking_at_function_handle)
     {
       if (kw_token)
         {
-          error ("function handles may not refer to keywords");
-          // FIXME: This is no longer reachable now that error is exception based.
-          return LEXICAL_ERROR;
+          token *tok
+            = new token (LEXICAL_ERROR,
+                         "function handles may not refer to keywords",
+                         input_line_number, current_input_column);
+
+          push_token (tok);
+
+          return count_token_internal (LEXICAL_ERROR);
         }
       else
         {
-          push_token (new token (FCN_HANDLE, tok, input_line_number,
+          push_token (new token (FCN_HANDLE, ident, input_line_number,
                                  current_input_column));
 
           current_input_column += flex_yyleng ();
@@ -3032,8 +3089,8 @@
 
   symbol_table::scope_id sid = symtab_context.curr_scope ();
 
-  token *tok_val = new token (NAME, &(symbol_table::insert (tok, sid)),
-                              input_line_number, current_input_column);
+  token *tok = new token (NAME, &(symbol_table::insert (ident, sid)),
+                          input_line_number, current_input_column);
 
   // The following symbols are handled specially so that things like
   //
@@ -3043,21 +3100,21 @@
   // function call with the argument "+1".
 
   if (at_beginning_of_statement
-      && (! (is_variable (tok)
-             || tok == "e" || tok == "pi"
-             || tok == "I" || tok == "i"
-             || tok == "J" || tok == "j"
-             || tok == "Inf" || tok == "inf"
-             || tok == "NaN" || tok == "nan")))
-    tok_val->mark_may_be_command ();
-
-  push_token (tok_val);
+      && (! (is_variable (ident)
+             || ident == "e" || ident == "pi"
+             || ident == "I" || ident == "i"
+             || ident == "J" || ident == "j"
+             || ident == "Inf" || ident == "inf"
+             || ident == "NaN" || ident == "nan")))
+    tok->mark_may_be_command ();
+
+  push_token (tok);
 
   current_input_column += flex_yyleng ();
 
   // The magic end index can't be indexed.
 
-  if (tok != "end")
+  if (ident != "end")
     looking_for_object_index = true;
 
   at_beginning_of_statement = false;
--- a/libinterp/parse-tree/oct-parse.in.yy	Thu Jan 14 05:35:36 2016 -0500
+++ b/libinterp/parse-tree/oct-parse.in.yy	Thu Jan 14 05:28:27 2016 -0500
@@ -245,9 +245,10 @@
 %token <tok_val> FQ_IDENT
 %token <tok_val> GET SET
 %token <tok_val> FCN
+%token <tok_val> LEXICAL_ERROR
 
 // Other tokens.
-%token<dummy_type> END_OF_INPUT LEXICAL_ERROR
+%token<dummy_type> END_OF_INPUT
 %token<dummy_type> INPUT_FILE
 // %token VARARGIN VARARGOUT
 
@@ -1958,7 +1959,8 @@
 parse_error     : LEXICAL_ERROR
                   {
                     $$ = 0;
-                    parser.bison_error ("parse error");
+                    std::string msg = $1->text ();
+                    parser.bison_error (msg.c_str ());
                   }
                 | error
                   { $$ = 0; }