changeset 27578:5aad387784aa

fix line count in lexer when mixing line and block comments (bug #57121) * lex.ll (^{S}*{CCHAR}\{{S}*{NL}): Don't handle LINE_COMMENT_START here since that can't happen. (<LINE_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL}): New pattern to match the beginning of a block comment when we are already processing a series of line-oriented comments. (<LINE_COMMENT_START>{S}*{CCHAR}{ANY_EXCEPT_NL}*{NL}): Simplify. There is no need to parse the comment text to attempt to match the beginning of a block comment as that is now handled by a separate pattern.
author John W. Eaton <jwe@octave.org>
date Mon, 28 Oct 2019 12:05:06 -0400
parents f90564fb99e0
children e7df1df55f0e
files libinterp/parse-tree/lex.ll
diffstat 1 files changed, 72 insertions(+), 73 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/parse-tree/lex.ll	Mon Oct 28 20:03:04 2019 +0900
+++ b/libinterp/parse-tree/lex.ll	Mon Oct 28 12:05:06 2019 -0400
@@ -630,13 +630,16 @@
   }
 
 %{
-// Gobble comments.
+// Gobble comments.  Both BLOCK_COMMENT_START and LINE_COMMENT_START
+// are exclusive start states.  We try to grab a continuous series of
+// line-oriented comments as a single collection of comments.
 %}
 
 %{
-// Start of a block comment.  If the comment marker appears immediately
-// after a block of full-line comments, finish the full line comment
-// block.
+// Start of a block comment.  Since comment start states are exclusive,
+// this pattern will not match a block comment that immediately follows
+// a line-oriented comment.  All we need to do is push the matched text
+// back on the input stream and push the new start state.
 %}
 
 ^{S}*{CCHAR}\{{S}*{NL} {
@@ -644,16 +647,7 @@
 
     yyless (0);
 
-    if (curr_lexer->start_state () == LINE_COMMENT_START)
-      {
-        if (! curr_lexer->m_comment_text.empty ())
-          curr_lexer->finish_comment (octave::comment_elt::full_line);
-
-        curr_lexer->pop_start_state ();
-      }
-
     curr_lexer->push_start_state (BLOCK_COMMENT_START);
-
   }
 
 <BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL} {
@@ -670,8 +664,12 @@
 
 %{
 // End of a block comment.  If this block comment is nested inside
-// another, wait for the outermost block comment block to be closed
-// before storing the comment.
+// another, wait for the outermost block comment to be closed before
+// storing the comment.
+
+// NOTE: This pattern must appear before the one below.  Both may match
+// the same text and this one should take precedence over the one that
+// follows.
 %}
 
 <BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL} {
@@ -714,72 +712,65 @@
     yyless (0);
   }
 
+%{
+// Beginning of a block comment while we are looking at a series of
+// line-oriented comments.  Finish previous comment, push current
+// text back on input stream, and switch start states.
+
+// NOTE: This pattern must appear before the one below.  Both may match
+// the same text and this one should take precedence over the one that
+// follows.
+%}
+
+<LINE_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL} {
+    curr_lexer->lexer_debug ("<LINE_COMMENT_START>^{S}*{CCHAR}\\{{S}*{NL}");
+
+    if (! curr_lexer->m_comment_text.empty ())
+      curr_lexer->finish_comment (octave::comment_elt::full_line);
+
+    curr_lexer->pop_start_state ();
+    curr_lexer->push_start_state (BLOCK_COMMENT_START);
+    yyless (0);
+  }
+
+%{
+// Line-oriented comment.  If we are at the beginning of a line, this is
+// part of a series of full-line comments.  Otherwise, this is an end of
+// line comment.  We don't need to parse the matched text to determine
+// whether we are looking at the start of a block comment as that
+// pattern is handled above.
+
+// NOTE: This pattern must appear before the one below.  Both may match
+// the same text and this one should take precedence over the one that
+// follows.
+%}
+
 <LINE_COMMENT_START>{S}*{CCHAR}{ANY_EXCEPT_NL}*{NL} {
     curr_lexer->lexer_debug ("<LINE_COMMENT_START>{S}*{CCHAR}{ANY_EXCEPT_NL}*{NL}");
 
-    bool full_line_comment = curr_lexer->m_current_input_column == 1;
-    curr_lexer->m_input_line_number++;
-    curr_lexer->m_current_input_column = 1;
-
-    bool have_space = false;
-    size_t len = yyleng;
+    // Grab text of comment without leading space or comment
+    // characters.
+
     size_t i = 0;
-    while (i < len)
-      {
-        char c = yytext[i];
-        if (is_space_or_tab (c))
-          {
-            have_space = true;
-            i++;
-          }
-        else
-          break;
-      }
-
-    size_t num_comment_chars = 0;
-
-    while (i < len)
-      {
-        char c = yytext[i];
-        if (c == '#' || c == '%')
-          {
-            num_comment_chars++;
-            i++;
-          }
-        else
-          break;
-      }
+    while (i < yyleng && is_space_or_tab (yytext[i]))
+      i++;
+
+    bool have_space = (i > 0);
+
+    while (i < yyleng && (yytext[i] == '#' || yytext[i] == '%'))
+      i++;
 
     curr_lexer->m_comment_text += &yytext[i];
 
-    if (full_line_comment)
+    if (curr_lexer->m_current_input_column == 1)
       {
-        if (num_comment_chars == 1 && yytext[i++] == '{')
-          {
-            bool looks_like_block_comment = true;
-
-            while (i < len)
-              {
-                char c = yytext[i++];
-                if (! is_space_or_tab_or_eol (c))
-                  {
-                    looks_like_block_comment = false;
-                    break;
-                  }
-              }
-
-            if (looks_like_block_comment)
-              {
-                yyless (0);
-
-                curr_lexer->finish_comment (octave::comment_elt::full_line);
-
-                curr_lexer->pop_start_state ();
-              }
-          }
+        curr_lexer->m_input_line_number++;
+        curr_lexer->m_current_input_column = 1;
       }
     else
       {
+        // End of line comment.
+
         if (have_space)
           curr_lexer->mark_previous_token_trailing_space ();
 
@@ -787,13 +778,21 @@
 
         curr_lexer->pop_start_state ();
 
+        // Push the newline character back on the input and skip
+        // incrementing the line count so we don't have to duplicate
+        // all the possible actions that happen with newlines here.
+
         curr_lexer->xunput ('\n');
-        curr_lexer->m_input_line_number--;
+
+        // The next action should recognize a newline character and set
+        // the input column back to 1, but we should try to keep the
+        // input column location accurate anyway, so update here.
+        curr_lexer->m_current_input_column += yyleng;
       }
   }
 
 %{
-// End of a block of full-line comments.
+// End of a series of full-line comments.
 %}
 
 <LINE_COMMENT_START>{ANY_INCLUDING_NL} {
@@ -807,7 +806,7 @@
   }
 
 %{
-// End of a block of full-line comments.
+// End of file will also end a series of full-line comments.
 %}
 
 <LINE_COMMENT_START><<EOF>> {