changeset 16228:e19b1632d7c1

revamp most comment handling * comment-list.h (octave_comment_elt::comment_type): New value, full_line. * lex.h (lexical_feedback::comment_text): New member variable. (lexical_feedback::finish_comment): New function. (octave_lexer::grab_block_comment, octave_lexer::grab_comment_block, octave_lexer::process_comment): Delete. * lex.ll (BLOCK_COMMENT_START, LINE_COMMENT_START): New exclusive start states. (ANY_INCLUDING_NL): New pattern. (<INPUT_FILE_START>{ANY_INCLUDING_NL}): Use it instead of ".". (^{S}*{CCHAR}\{{S}*{NL}, <BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL}, <BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL}, <BLOCK_COMMENT_START>.*{NL}, {S}*{CCHAR}.*{NL}, <LINE_COMMENT_START>{S}*{CCHAR}.*{NL}, <LINE_COMMENT_START>{ANY_INCLUDING_NL}): New patterns and rules for handling comments. ({CCHAR}, ^{S}*{CCHAR}\{{S}*{NL}): Delete old rules for comments. (display_start_state): Also handle BLOCK_COMMENT_START and LINE_COMMENT_START.
author John W. Eaton <jwe@octave.org>
date Fri, 08 Mar 2013 17:13:54 -0500
parents 054d9e8f99b6
children 7b7b1e4968e8
files libinterp/interp-core/comment-list.h libinterp/parse-tree/lex.h libinterp/parse-tree/lex.ll
diffstat 3 files changed, 222 insertions(+), 350 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/interp-core/comment-list.h	Fri Mar 08 16:57:28 2013 -0500
+++ b/libinterp/interp-core/comment-list.h	Fri Mar 08 17:13:54 2013 -0500
@@ -42,6 +42,7 @@
   {
     unknown,
     block,
+    full_line,
     end_of_line,
     doc_string,
     copyright
--- a/libinterp/parse-tree/lex.h	Fri Mar 08 16:57:28 2013 -0500
+++ b/libinterp/parse-tree/lex.h	Fri Mar 08 17:13:54 2013 -0500
@@ -27,6 +27,7 @@
 #include <set>
 #include <stack>
 
+#include "comment-list.h"
 #include "input.h"
 
 extern OCTINTERP_API void cleanup_parser (void);
@@ -177,8 +178,8 @@
       bracketflag (0), braceflag (0),
       looping (0), defining_func (0), looking_at_function_handle (0),
       block_comment_nesting_level (0), token_count (0),
-      current_input_line (), help_text (), fcn_file_name (),
-      fcn_file_full_name (), looking_at_object_index (),
+      current_input_line (), comment_text (), help_text (),
+      fcn_file_name (), fcn_file_full_name (), looking_at_object_index (),
       parsed_function_name (), pending_local_variables (),
       nesting_level (), token_stack ()
   {
@@ -191,6 +192,8 @@
 
   void reset (void);
 
+  int finish_comment (octave_comment_elt::comment_type typ);
+
   // true means that we have encountered eof on the input stream.
   bool end_of_input;
 
@@ -291,6 +294,9 @@
   // The current line of input.
   std::string current_input_line;
 
+  // The current comment text.
+  std::string comment_text;
+
   // The current help text.
   std::string help_text;
 
@@ -421,13 +427,6 @@
 
   bool is_variable (const std::string& name);
 
-  std::string grab_block_comment (stream_reader& reader, bool& eof);
-
-  std::string grab_comment_block (stream_reader& reader, bool at_bol,
-                                  bool& eof);
-
-  int process_comment (bool start_in_block, bool& eof);
-
   bool next_token_is_sep_op (void);
 
   bool next_token_is_postfix_unary_op (bool spc_prev);
--- a/libinterp/parse-tree/lex.ll	Fri Mar 08 16:57:28 2013 -0500
+++ b/libinterp/parse-tree/lex.ll	Fri Mar 08 17:13:54 2013 -0500
@@ -48,6 +48,9 @@
 
 %x INPUT_FILE_START
 
+%x BLOCK_COMMENT_START
+%x LINE_COMMENT_START
+
 %{
 
 #include <cctype>
@@ -252,6 +255,9 @@
 IDENT   ([_$a-zA-Z][_$a-zA-Z0-9]*)
 EXPON   ([DdEe][+-]?{D}+)
 NUMBER  (({D}+\.?{D}*{EXPON}?)|(\.{D}+{EXPON}?)|(0[xX][0-9a-fA-F]+))
+
+ANY_INCLUDING_NL (.|{NL})
+
 %%
 
 %{
@@ -259,8 +265,8 @@
 // the parser go down a special path.
 %}
 
-<INPUT_FILE_START>. {
-    LEXER_DEBUG ("<INPUT_FILE_START>.");
+<INPUT_FILE_START>{ANY_INCLUDING_NL} {
+    LEXER_DEBUG ("<INPUT_FILE_START>{ANY_INCLUDING_NL}");
 
     curr_lexer->xunput (yytext[0]);
 
@@ -564,6 +570,157 @@
   }
 
 %{
+// Gobble comments.
+%}
+
+%{
+// Start of a block comment.  If the comment marker appears immediately
+// after a block of full-line comments, finish the full line comment
+// block.
+%}
+
+^{S}*{CCHAR}\{{S}*{NL} {
+    LEXER_DEBUG ("^{S}*{CCHAR}\{{S}*{NL}");
+
+    int tok = 0;
+
+    if (curr_lexer->start_state () == LINE_COMMENT_START)
+      {
+        if (! curr_lexer->comment_text.empty ())
+          tok = curr_lexer->finish_comment (octave_comment_elt::full_line);
+
+        curr_lexer->pop_start_state ();
+      }
+
+    curr_lexer->push_start_state (BLOCK_COMMENT_START);
+
+    yyless (0);
+
+    if (tok > 0)
+      COUNT_TOK_AND_RETURN (tok);
+  }
+
+<BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL} {
+    LEXER_DEBUG ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL}");
+
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+
+    if (curr_lexer->block_comment_nesting_level)
+      curr_lexer->comment_text = "\n";
+
+    curr_lexer->block_comment_nesting_level++;
+  }
+
+%{
+// End of a block comment.  If this block comment is nested inside
+// another, wait for the outermost block comment block to be closed
+// before storing the comment.
+%}
+
+<BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL} {
+    LEXER_DEBUG ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL}");
+
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+
+    int tok = 0;
+
+    if (curr_lexer->block_comment_nesting_level > 1)
+      curr_lexer->comment_text = "\n";
+    else
+      tok = curr_lexer->finish_comment (octave_comment_elt::block);
+
+    curr_lexer->block_comment_nesting_level--;
+    curr_lexer->pop_start_state ();
+
+    if (tok > 0)
+      COUNT_TOK_AND_RETURN (tok);
+  }
+
+%{
+// Body of a block comment.
+%}
+
+<BLOCK_COMMENT_START>.*{NL} {
+    LEXER_DEBUG ("<BLOCK_COMMENT_START>.*{NL}");
+
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+    curr_lexer->comment_text += yytext;
+  }
+
+%{
+// Full-line or end-of-line comment.
+%}
+
+{S}*{CCHAR}.*{NL} {
+    LEXER_DEBUG ("{S}*{CCHAR}.*{NL}");
+
+    curr_lexer->push_start_state (LINE_COMMENT_START);
+    yyless (0);
+  }
+
+<LINE_COMMENT_START>{S}*{CCHAR}.*{NL} {
+    LEXER_DEBUG ("<LINE_COMMENT_START>{S}*{CCHAR}.*{NL}");
+
+    bool full_line_comment = curr_lexer->current_input_column == 1;
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+
+    size_t len = yyleng;
+    size_t i = 0;
+    while (i < len)
+      {
+        char c = yytext[i];
+        if (c == '#' || c == '%' || c == ' ' || c == '\t')
+          i++;
+        else
+          break;
+      }
+      
+    curr_lexer->comment_text += &yytext[i];
+
+    int tok = 0;
+
+    if (! full_line_comment)
+      {
+        tok = curr_lexer->finish_comment (octave_comment_elt::end_of_line);
+
+        curr_lexer->pop_start_state ();
+
+        if (curr_lexer->start_state () == COMMAND_START)
+          {
+            // Allow the actions for the end of a COMMAND line to be
+            // executed next.
+
+            tok = 0;
+            curr_lexer->xunput ('\n');
+          }
+      }
+
+    if (tok > 0)
+      COUNT_TOK_AND_RETURN (tok);
+  }
+
+%{
+// End of a block of full-line comments.
+%}
+
+<LINE_COMMENT_START>{ANY_INCLUDING_NL} {
+    LEXER_DEBUG ("<LINE_COMMENT_START>{ANY_INCLUDING_NL}");
+
+    curr_lexer->xunput (yytext[0]);
+
+    int tok = curr_lexer->finish_comment (octave_comment_elt::full_line);  
+
+    curr_lexer->pop_start_state ();
+
+    if (tok > 0)
+      COUNT_TOK_AND_RETURN (tok);
+  }
+
+%{
 // Imaginary numbers.
 %}
 
@@ -754,44 +911,6 @@
 }
 
 %{
-// Gobble comments.
-%}
-
-{CCHAR} {
-    LEXER_DEBUG ("{CCHAR}");
-
-    curr_lexer->looking_for_object_index = false;
-
-    curr_lexer->xunput (yytext[0]);
-
-    bool eof = false;
-    int tok = curr_lexer->process_comment (false, eof);
-
-    if (eof)
-      return curr_lexer->handle_end_of_input ();
-    else if (tok > 0)
-      COUNT_TOK_AND_RETURN (tok);
-  }
-
-%{
-// Block comments.
-%}
-
-^{S}*{CCHAR}\{{S}*{NL} {
-    LEXER_DEBUG ("^{S}*{CCHAR}\\{{S}*{NL}");
-
-    curr_lexer->looking_for_object_index = false;
-
-    curr_lexer->input_line_number++;
-    curr_lexer->current_input_column = 1;
-    curr_lexer->block_comment_nesting_level++;
-    curr_lexer->decrement_promptflag ();
-
-    bool eof = false;
-    curr_lexer->process_comment (true, eof);
-  }
-
-%{
 // Other operators.
 %}
 
@@ -1344,6 +1463,7 @@
   block_comment_nesting_level = 0;
   token_count = 0;
   current_input_line = "";
+  comment_text = "";
   help_text = "";
   fcn_file_name = "";
   fcn_file_full_name = "";
@@ -1360,6 +1480,51 @@
   reset_token_stack ();
 }
 
+static bool
+looks_like_copyright (const std::string& s)
+{
+  bool retval = false;
+
+  if (! s.empty ())
+    {
+      size_t offset = s.find_first_not_of (" \t");
+
+      retval = (s.substr (offset, 9) == "Copyright" || s.substr (offset, 6) == "Author");
+    }
+
+  return retval;
+}
+
+int
+lexical_feedback::finish_comment (octave_comment_elt::comment_type typ)
+{
+  bool copyright = looks_like_copyright (comment_text);
+
+  if (nesting_level.none () && help_text.empty ()
+    && ! comment_text.empty () && ! copyright)
+    help_text = comment_text;
+
+  if (copyright)
+    typ = octave_comment_elt::copyright;
+
+  octave_comment_buffer::append (comment_text, typ);
+
+  comment_text = "";
+
+  quote_is_transpose = false;
+  convert_spaces_to_comma = true;
+  at_beginning_of_statement = true;
+
+  if (nesting_level.none ())
+    return '\n';
+  else if (nesting_level.is_bracket_or_brace ())
+    // FIXME -- this result will be different if the comment follows a
+    // continuation token.
+    return ';';
+  else
+    return 0;
+}
+
 void
 lexical_feedback::reset_token_stack (void)
 {
@@ -1897,307 +2062,6 @@
               != pending_local_variables.end ()));
 }
 
-std::string
-octave_lexer::grab_block_comment (stream_reader& reader, bool& eof)
-{
-  std::string buf;
-
-  bool at_bol = true;
-  bool look_for_marker = false;
-
-  bool warned_incompatible = false;
-
-  int c = 0;
-
-  while ((c = reader.getc ()) != EOF)
-    {
-      current_input_column++;
-
-      if (look_for_marker)
-        {
-          at_bol = false;
-          look_for_marker = false;
-
-          if (c == '{' || c == '}')
-            {
-              std::string tmp_buf (1, static_cast<char> (c));
-
-              int type = c;
-
-              bool done = false;
-
-              while ((c = reader.getc ()) != EOF && ! done)
-                {
-                  current_input_column++;
-
-                  switch (c)
-                    {
-                    case ' ':
-                    case '\t':
-                      tmp_buf += static_cast<char> (c);
-                      break;
-
-                    case '\n':
-                      {
-                        current_input_column = 0;
-                        at_bol = true;
-                        done = true;
-
-                        if (type == '{')
-                          {
-                            block_comment_nesting_level++;
-                            decrement_promptflag ();
-                          }
-                        else
-                          {
-                            block_comment_nesting_level--;
-                            increment_promptflag ();
-
-                            if (block_comment_nesting_level == 0)
-                              {
-                                buf += grab_comment_block (reader, true, eof);
-
-                                return buf;
-                              }
-                          }
-                      }
-                      break;
-
-                    default:
-                      at_bol = false;
-                      tmp_buf += static_cast<char> (c);
-                      buf += tmp_buf;
-                      done = true;
-                      break;
-                    }
-                }
-            }
-        }
-
-      if (at_bol && (c == '%' || c == '#'))
-        {
-          if (c == '#' && ! warned_incompatible)
-            {
-              warned_incompatible = true;
-              maybe_gripe_matlab_incompatible_comment (c);
-            }
-
-          at_bol = false;
-          look_for_marker = true;
-        }
-      else
-        {
-          buf += static_cast<char> (c);
-
-          if (c == '\n')
-            {
-              current_input_column = 0;
-              at_bol = true;
-            }
-        }
-    }
-
-  if (c == EOF)
-    eof = true;
-
-  return buf;
-}
-
-std::string
-octave_lexer::grab_comment_block (stream_reader& reader, bool at_bol,
-                                  bool& eof)
-{
-  std::string buf;
-
-  // TRUE means we are at the beginning of a comment block.
-  bool begin_comment = false;
-
-  // TRUE means we are currently reading a comment block.
-  bool in_comment = false;
-
-  bool warned_incompatible = false;
-
-  int c = 0;
-
-  while ((c = reader.getc ()) != EOF)
-    {
-      current_input_column++;
-
-      if (begin_comment)
-        {
-          if (c == '%' || c == '#')
-            {
-              at_bol = false;
-              continue;
-            }
-          else if (at_bol && c == '{')
-            {
-              std::string tmp_buf (1, static_cast<char> (c));
-
-              bool done = false;
-
-              while ((c = reader.getc ()) != EOF && ! done)
-                {
-                  current_input_column++;
-
-                  switch (c)
-                    {
-                    case ' ':
-                    case '\t':
-                      tmp_buf += static_cast<char> (c);
-                      break;
-
-                    case '\n':
-                      {
-                        current_input_column = 0;
-                        at_bol = true;
-                        done = true;
-
-                        block_comment_nesting_level++;
-                        decrement_promptflag ();
-
-                        buf += grab_block_comment (reader, eof);
-
-                        in_comment = false;
-
-                        if (eof)
-                          goto done;
-                      }
-                      break;
-
-                    default:
-                      at_bol = false;
-                      tmp_buf += static_cast<char> (c);
-                      buf += tmp_buf;
-                      done = true;
-                      break;
-                    }
-                }
-            }
-          else
-            {
-              at_bol = false;
-              begin_comment = false;
-            }
-        }
-
-      if (in_comment)
-        {
-          buf += static_cast<char> (c);
-
-          if (c == '\n')
-            {
-              at_bol = true;
-              current_input_column = 0;
-              in_comment = false;
-
-              // FIXME -- bailing out here prevents things like
-              //
-              //    octave> # comment
-              //    octave> x = 1
-              //
-              // from failing at the command line, while still
-              // allowing blocks of comments to be grabbed properly
-              // for function doc strings.  But only the first line of
-              // a mult-line doc string will be picked up for
-              // functions defined on the command line.  We need a
-              // better way of collecting these comments...
-              if (! (reading_fcn_file || reading_script_file))
-                goto done;
-            }
-        }
-      else
-        {
-          switch (c)
-            {
-            case ' ':
-            case '\t':
-              break;
-
-            case '#':
-              if (! warned_incompatible)
-                {
-                  warned_incompatible = true;
-                  maybe_gripe_matlab_incompatible_comment (c);
-                }
-              // fall through...
-
-            case '%':
-              in_comment = true;
-              begin_comment = true;
-              break;
-
-            default:
-              current_input_column--;
-              reader.ungetc (c);
-              goto done;
-            }
-        }
-    }
-
- done:
-
-  if (c == EOF)
-    eof = true;
-
-  return buf;
-}
-
-static bool
-looks_like_copyright (const std::string& s)
-{
-  bool retval = false;
-
-  if (! s.empty ())
-    {
-      size_t offset = s.find_first_not_of (" \t");
-
-      retval = (s.substr (offset, 9) == "Copyright" || s.substr (offset, 6) == "Author");
-    }
-
-  return retval;
-}
-
-int
-octave_lexer::process_comment (bool start_in_block, bool& eof)
-{
-  eof = false;
-
-  char *yytxt = flex_yytext ();
-  flex_stream_reader flex_reader (this, yytxt);
-
-  // process_comment is only supposed to be called when we are not
-  // initially looking at a block comment.
-
-  std::string txt = start_in_block
-    ? grab_block_comment (flex_reader, eof)
-    : grab_comment_block (flex_reader, false, eof);
-
-  if (lexer_debug_flag)
-    std::cerr << "C: " << txt << std::endl;
-
-  if (nesting_level.none () && help_text.empty () && ! txt.empty ()
-      && ! looks_like_copyright (txt))
-    help_text = txt;
-
-  octave_comment_buffer::append (txt);
-
-  current_input_column = 1;
-  quote_is_transpose = false;
-  convert_spaces_to_comma = true;
-  at_beginning_of_statement = true;
-
-  if (start_state () == COMMAND_START)
-    pop_start_state ();
-
-  if (nesting_level.none ())
-    return '\n';
-  else if (nesting_level.is_bracket_or_brace ())
-    return ';';
-  else
-    return 0;
-}
-
 // Recognize separators.  If the separator is a CRLF pair, it is
 // replaced by a single LF.
 
@@ -3816,6 +3680,14 @@
       std::cerr << "INPUT_FILE_BEGIN" << std::endl;
       break;
 
+    case BLOCK_COMMENT_START:
+      std::cerr << "BLOCK_COMMENT_START" << std::endl;
+      break;
+
+    case LINE_COMMENT_START:
+      std::cerr << "LINE_COMMENT_START" << std::endl;
+      break;
+
     default:
       std::cerr << "UNKNOWN START STATE!" << std::endl;
       break;