changeset 16898:531473481084

rewrite string parsing to avoid unlimited lookahead * NEWS: Mention change. * lex.h, lex.ll (lexical_feedback::string_text, lexical_feedback::string_line, lexical_feedback::string_column): New data members (lexical_feedback::lexical_feedback): Initialize them. (lexical_feedback::reset): Initialize them. (octave_base_lexer::begin_string): New function. (\", "'", <COMMAND_START>[\"\']): Use begin_string to set start state * instead of calling handle_string to parse string. (DQ_STRING_START, SQ_STRING_START): New exclusive start states. (<DQ_STRING_START>\"\", <DQ_STRING_START>\", <DQ_STRING_START>{NL}, <DQ_STRING_START>\\[0-7]{1,3}, <DQ_STRING_START>"\\a", <DQ_STRING_START>"\\b", <DQ_STRING_START>"\\f", <DQ_STRING_START>"\\n", <DQ_STRING_START>"\\r", <DQ_STRING_START>"\\t", <DQ_STRING_START>"\\v", <DQ_STRING_START>\\{ANY_INCLUDING_NL}, <DQ_STRING_START>[^\\\n\"]+, <SQ_STRING_START>[^\'\n\r]*\', <SQ_STRING_START>{NL}): New rules for parsing character strings. (octave_base_lexer::have_continuation, octave_base_lexer::have_ellipsis_continuation, octave_base_lexer::handle_string): Delete.
author John W. Eaton <jwe@octave.org>
date Thu, 04 Jul 2013 20:33:02 -0400
parents 21d5e76891fe
children 55caca526827
files NEWS libinterp/parse-tree/lex.h libinterp/parse-tree/lex.ll
diffstat 3 files changed, 150 insertions(+), 237 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS	Thu Jul 04 19:05:59 2013 -0400
+++ b/NEWS	Thu Jul 04 20:33:02 2013 -0400
@@ -77,6 +77,17 @@
      c,...
     ] = deal (1,2,3)
 
+ ** Line continuations inside character strings have changed.
+
+    The sequence '...' is no longer recognized as a line continuation
+    inside character strings.  A backslash followed by a newline
+    character is no longer recognized as a line continuation inside
+    single-quoted character strings.  Inside double-quoted character
+    strings, a backslash followed by a newline character is still
+    recognized as a line continuation but the backslash character must
+    be followed immediately by the newline character.  No whitespace or
+    end-of-line comment may appear between them.
+
  ** Warning IDs renamed:
 
     Octave:array-as-scalar => Octave:array-to-scalar
--- a/libinterp/parse-tree/lex.h	Thu Jul 04 19:05:59 2013 -0400
+++ b/libinterp/parse-tree/lex.h	Thu Jul 04 20:33:02 2013 -0400
@@ -280,6 +280,7 @@
       looping (0), defining_func (0), looking_at_function_handle (0),
       block_comment_nesting_level (0), token_count (0),
       current_input_line (), comment_text (), help_text (),
+      string_text (), string_line (0), string_column (0),
       fcn_file_name (), fcn_file_full_name (), looking_at_object_index (),
       parsed_function_name (), pending_local_variables (),
       symtab_context (), nesting_level (), tokens ()
@@ -411,6 +412,13 @@
   // The current help text.
   std::string help_text;
 
+  // The current character string text.
+  std::string string_text;
+
+  // The position of the beginning of the current character string.
+  int string_line;
+  int string_column;
+
   // Simple name of function file we are reading.
   std::string fcn_file_name;
 
@@ -501,6 +509,8 @@
 
   void prep_for_file (void);
 
+  void begin_string (int state);
+
   virtual int fill_flex_buffer (char *buf, unsigned int max_size) = 0;
 
   bool at_end_of_buffer (void) const { return input_buf.empty (); }
@@ -535,12 +545,6 @@
 
   void finish_comment (octave_comment_elt::comment_type typ);
 
-  bool have_continuation (bool trailing_comments_ok = true);
-
-  bool have_ellipsis_continuation (bool trailing_comments_ok = true);
-
-  int handle_string (char delim);
-
   int handle_close_bracket (int bracket_type);
 
   bool looks_like_command_arg (void);
--- a/libinterp/parse-tree/lex.ll	Thu Jul 04 19:05:59 2013 -0400
+++ b/libinterp/parse-tree/lex.ll	Thu Jul 04 20:33:02 2013 -0400
@@ -51,6 +51,9 @@
 %x BLOCK_COMMENT_START
 %x LINE_COMMENT_START
 
+%x DQ_STRING_START
+%x SQ_STRING_START
+
 %{
 
 #include <cctype>
@@ -290,9 +293,9 @@
     curr_lexer->at_beginning_of_statement = false;
 
     curr_lexer->current_input_column++;
-    int tok = curr_lexer->handle_string (yytext[0]);
-
-    return curr_lexer->count_token_internal (tok);
+
+    curr_lexer->begin_string (yytext[0] == '"'
+                              ? DQ_STRING_START : SQ_STRING_START);
   }
 
 <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* {
@@ -622,6 +625,106 @@
   }
 
 %{
+// Double-quoted character strings.
+%}
+
+<DQ_STRING_START>\"\" {
+    curr_lexer->current_input_column += yyleng;
+    curr_lexer->string_text += '"';
+  }
+
+<DQ_STRING_START>\" {
+
+    curr_lexer->pop_start_state ();
+
+    curr_lexer->looking_for_object_index = true;
+    curr_lexer->at_beginning_of_statement = false;
+
+    curr_lexer->push_token (new token (DQ_STRING,
+                                       curr_lexer->string_text,
+                                       curr_lexer->string_line,
+                                       curr_lexer->string_column));
+
+    curr_lexer->string_text = "";
+
+    return curr_lexer->count_token_internal (DQ_STRING);
+  }
+
+<DQ_STRING_START>{NL} {
+    error ("unterminated character string constant");
+    return LEXICAL_ERROR;
+  }
+
+<DQ_STRING_START>\\[0-7]{1,3} {
+    int result;
+    sscanf (yytext+1, "%o", &result);
+
+    if (result > 0xff)
+      error ("invalid octal escape sequence in character string");
+    else
+      curr_lexer->string_text += static_cast<unsigned char> (result);
+  }
+
+<DQ_STRING_START>"\\a" { curr_lexer->string_text += '\a'; }
+<DQ_STRING_START>"\\b" { curr_lexer->string_text += '\b'; }
+<DQ_STRING_START>"\\f" { curr_lexer->string_text += '\f'; }
+<DQ_STRING_START>"\\n" { curr_lexer->string_text += '\n'; }
+<DQ_STRING_START>"\\r" { curr_lexer->string_text += '\r'; }
+<DQ_STRING_START>"\\t" { curr_lexer->string_text += '\t'; }
+<DQ_STRING_START>"\\v" { curr_lexer->string_text += '\v'; }
+
+<DQ_STRING_START>\\{ANY_INCLUDING_NL} {
+    curr_lexer->string_text += yytext[1];
+  }
+
+<DQ_STRING_START>[^\\\n\"]+ {
+    curr_lexer->string_text += yytext;
+  }
+
+%{
+// Single-quoted character strings.
+%}
+
+<SQ_STRING_START>[^\'\n\r]*\' {
+    yytext[yyleng-1] = 0;
+    curr_lexer->string_text += yytext;
+
+    curr_lexer->current_input_column += yyleng;
+
+    int c = curr_lexer->text_yyinput ();
+
+    if (c == '\'')
+      {
+        curr_lexer->string_text += c;
+
+        curr_lexer->current_input_column++;
+      }
+    else
+      {
+        curr_lexer->xunput (c);
+
+        curr_lexer->pop_start_state ();
+
+        curr_lexer->looking_for_object_index = true;
+        curr_lexer->at_beginning_of_statement = false;
+
+        curr_lexer->push_token (new token (SQ_STRING,
+                                           curr_lexer->string_text,
+                                           curr_lexer->string_line,
+                                           curr_lexer->string_column));
+
+        curr_lexer->string_text = "";
+
+        return curr_lexer->count_token_internal (SQ_STRING);
+      }      
+  }
+
+<SQ_STRING_START>{NL} {
+    error ("unterminated character string constant");
+    return LEXICAL_ERROR;
+  }
+
+%{
 // Imaginary numbers.
 %}
 
@@ -867,14 +970,14 @@
     if (curr_lexer->previous_token_may_be_command ()
         &&  curr_lexer->space_follows_previous_token ())
       {
-        yyless (0);
+        curr_lexer->current_input_column++;
         curr_lexer->push_start_state (COMMAND_START);
+        curr_lexer->begin_string (SQ_STRING_START);
       }
     else if (curr_lexer->at_beginning_of_statement)
       {
         curr_lexer->current_input_column++;
-        int retval = curr_lexer->handle_string ('\'');
-        return curr_lexer->count_token_internal (retval);
+        curr_lexer->begin_string (SQ_STRING_START);
       }
     else
       {
@@ -888,8 +991,7 @@
                     || curr_lexer->previous_token_is_binop ())
                   {
                     curr_lexer->current_input_column++;
-                    int retval = curr_lexer->handle_string ('\'');
-                    return curr_lexer->count_token_internal (retval);
+                    curr_lexer->begin_string (SQ_STRING_START);
                   }
                 else
                   {
@@ -906,8 +1008,7 @@
                     || curr_lexer->previous_token_is_keyword ())
                   {
                     curr_lexer->current_input_column++;
-                    int retval = curr_lexer->handle_string ('\'');
-                    return curr_lexer->count_token_internal (retval);
+                    curr_lexer->begin_string (SQ_STRING_START);
                   }
                 else
                   return curr_lexer->count_token (HERMITIAN);
@@ -920,8 +1021,7 @@
                 || curr_lexer->previous_token_is_keyword ())
               {
                 curr_lexer->current_input_column++;
-                int retval = curr_lexer->handle_string ('\'');
-                return curr_lexer->count_token_internal (retval);
+                curr_lexer->begin_string (SQ_STRING_START);
               }
             else
               return curr_lexer->count_token (HERMITIAN);
@@ -939,8 +1039,9 @@
     if (curr_lexer->previous_token_may_be_command ()
         &&  curr_lexer->space_follows_previous_token ())
       {
-        yyless (0);
+        curr_lexer->current_input_column++;
         curr_lexer->push_start_state (COMMAND_START);
+        curr_lexer->begin_string (DQ_STRING_START);
       }
     else
       {
@@ -954,8 +1055,7 @@
                     || curr_lexer->previous_token_is_binop ())
                   {
                     curr_lexer->current_input_column++;
-                    int retval = curr_lexer->handle_string ('"');
-                    return curr_lexer->count_token_internal (retval);
+                    curr_lexer->begin_string (DQ_STRING_START);
                   }
                 else
                   {
@@ -968,15 +1068,13 @@
             else
               {
                 curr_lexer->current_input_column++;
-                int retval = curr_lexer->handle_string ('"');
-                return curr_lexer->count_token_internal (retval);
+                curr_lexer->begin_string (DQ_STRING_START);
               }
           }
         else
           {
             curr_lexer->current_input_column++;
-            int retval = curr_lexer->handle_string ('"');
-            return curr_lexer->count_token_internal (retval);
+            curr_lexer->begin_string (DQ_STRING_START);
           }
       }
   }
@@ -1555,6 +1653,9 @@
   current_input_line = "";
   comment_text = "";
   help_text = "";
+  string_text = "";
+  string_line = 0;
+  string_column = 0;
   fcn_file_name = "";
   fcn_file_full_name = "";
   looking_at_object_index.clear ();
@@ -1769,6 +1870,15 @@
   push_start_state (INPUT_FILE_START);
 }
 
+void
+octave_base_lexer::begin_string (int state)
+{
+  string_line = input_line_number;
+  string_column = current_input_column;
+
+  push_start_state (state);
+}
+
 int
 octave_base_lexer::handle_end_of_input (void)
 {
@@ -2257,218 +2367,6 @@
   at_beginning_of_statement = true;
 }
 
-// We have seen a backslash and need to find out if it should be
-// treated as a continuation character.  If so, this eats it, up to
-// and including the new line character.
-//
-// Match whitespace only, followed by a comment character or newline.
-// Once a comment character is found, discard all input until newline.
-// If non-whitespace characters are found before comment
-// characters, return 0.  Otherwise, return 1.
-
-// FIXME -- we need to handle block comments here.
-
-bool
-octave_base_lexer::have_continuation (bool trailing_comments_ok)
-{
-  std::ostringstream buf;
-
-  std::string comment_buf;
-
-  bool in_comment = false;
-  bool beginning_of_comment = false;
-
-  int c = 0;
-
-  while ((c = text_yyinput ()) != EOF)
-    {
-      buf << static_cast<char> (c);
-
-      switch (c)
-        {
-        case ' ':
-        case '\t':
-          if (in_comment)
-            {
-              comment_buf += static_cast<char> (c);
-              beginning_of_comment = false;
-            }
-          break;
-
-        case '%':
-        case '#':
-          if (trailing_comments_ok)
-            {
-              if (in_comment)
-                {
-                  if (! beginning_of_comment)
-                    comment_buf += static_cast<char> (c);
-                }
-              else
-                {
-                  maybe_gripe_matlab_incompatible_comment (c);
-                  in_comment = true;
-                  beginning_of_comment = true;
-                }
-            }
-          else
-            goto cleanup;
-          break;
-
-        case '\n':
-          if (in_comment)
-            {
-              comment_buf += static_cast<char> (c);
-              octave_comment_buffer::append (comment_buf);
-            }
-          current_input_column = 0;
-          decrement_promptflag ();
-          gripe_matlab_incompatible_continuation ();
-          return true;
-
-        default:
-          if (in_comment)
-            {
-              comment_buf += static_cast<char> (c);
-              beginning_of_comment = false;
-            }
-          else
-            goto cleanup;
-          break;
-        }
-    }
-
-  xunput (c);
-  return false;
-
-cleanup:
-
-  std::string s = buf.str ();
-
-  int len = s.length ();
-  while (len--)
-    xunput (s[len]);
-
-  return false;
-}
-
-// We have seen a '.' and need to see if it is the start of a
-// continuation.  If so, this eats it, up to and including the new
-// line character.
-
-bool
-octave_base_lexer::have_ellipsis_continuation (bool trailing_comments_ok)
-{
-  char c1 = text_yyinput ();
-  if (c1 == '.')
-    {
-      char c2 = text_yyinput ();
-      if (c2 == '.' && have_continuation (trailing_comments_ok))
-        return true;
-      else
-        {
-          xunput (c2);
-          xunput (c1);
-        }
-    }
-  else
-    xunput (c1);
-
-  return false;
-}
-
-int
-octave_base_lexer::handle_string (char delim)
-{
-  std::ostringstream buf;
-
-  int bos_line = input_line_number;
-  int bos_col = current_input_column;
-
-  int c;
-  int escape_pending = 0;
-
-  while ((c = text_yyinput ()) != EOF)
-    {
-      current_input_column++;
-
-      if (c == '\\')
-        {
-          if (delim == '\'' || escape_pending)
-            {
-              buf << static_cast<char> (c);
-              escape_pending = 0;
-            }
-          else
-            {
-              if (have_continuation (false))
-                escape_pending = 0;
-              else
-                {
-                  buf << static_cast<char> (c);
-                  escape_pending = 1;
-                }
-            }
-          continue;
-        }
-      else if (c == '.')
-        {
-          if (delim == '\'' || ! have_ellipsis_continuation (false))
-            buf << static_cast<char> (c);
-        }
-      else if (c == '\n')
-        {
-          error ("unterminated string constant");
-          break;
-        }
-      else if (c == delim)
-        {
-          if (escape_pending)
-            buf << static_cast<char> (c);
-          else
-            {
-              c = text_yyinput ();
-              if (c == delim)
-                {
-                  buf << static_cast<char> (c);
-                }
-              else
-                {
-                  std::string s;
-                  xunput (c);
-
-                  if (delim == '\'')
-                    s = buf.str ();
-                  else
-                    s = do_string_escapes (buf.str ());
-
-                  if (delim == '"')
-                    gripe_matlab_incompatible ("\" used as string delimiter");
-                  else if (delim == '\'')
-                    gripe_single_quote_string ();
-
-                  looking_for_object_index = true;
-                  at_beginning_of_statement = false;
-
-                  int tok = delim == '"' ? DQ_STRING : SQ_STRING;
-
-                  push_token (new token (tok, s, bos_line, bos_col));
-
-                  return tok;
-                }
-            }
-        }
-      else
-        {
-          buf << static_cast<char> (c);
-        }
-
-      escape_pending = 0;
-    }
-
-  return LEXICAL_ERROR;
-}
-
 int
 octave_base_lexer::handle_close_bracket (int bracket_type)
 {