changeset 16903:f21194531877

improve character string handling in the lexer * lex.ll: Add calls to lexer_debug for character string patterns. Attempt to be consistent with handling of backslash characters in patterns passed to lexer_debug. (<DQ_STRING_START>\\{NL}): Handle EOF and EOB conditions explicitly. (octave_base_lexer::display_start_state): Handle DQ_STRING_START and SQ_STRING_START states.
author John W. Eaton <jwe@octave.org>
date Fri, 05 Jul 2013 13:28:50 -0400
parents 51c1076a9c13
children f29dd5a7591d
files libinterp/parse-tree/lex.ll
diffstat 1 files changed, 104 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/parse-tree/lex.ll	Fri Jul 05 01:45:26 2013 +0100
+++ b/libinterp/parse-tree/lex.ll	Fri Jul 05 13:28:50 2013 -0400
@@ -452,7 +452,7 @@
 %}
 
 ^{S}*{CCHAR}\{{S}*{NL} {
-    curr_lexer->lexer_debug ("^{S}*{CCHAR}\{{S}*{NL}");
+    curr_lexer->lexer_debug ("^{S}*{CCHAR}\\{{S}*{NL}");
 
     yyless (0);
 
@@ -471,7 +471,7 @@
   }
 
 <BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL} {
-    curr_lexer->lexer_debug ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL}");
+    curr_lexer->lexer_debug ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\\{{S}*{NL}");
 
     curr_lexer->input_line_number++;
     curr_lexer->current_input_column = 1;
@@ -629,11 +629,14 @@
 %}
 
 <DQ_STRING_START>\"\" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\\\"\\\"");
+
     curr_lexer->current_input_column += yyleng;
     curr_lexer->string_text += '"';
   }
 
 <DQ_STRING_START>\" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\\\"");
 
     curr_lexer->pop_start_state ();
 
@@ -650,12 +653,9 @@
     return curr_lexer->count_token_internal (DQ_STRING);
   }
 
-<DQ_STRING_START>{NL} {
-    error ("unterminated character string constant");
-    return LEXICAL_ERROR;
-  }
-
 <DQ_STRING_START>\\[0-7]{1,3} {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\\\\[0-7]{1,3}");
+
     int result;
     sscanf (yytext+1, "%o", &result);
 
@@ -665,27 +665,98 @@
       curr_lexer->string_text += static_cast<unsigned char> (result);
   }
 
-<DQ_STRING_START>"\\a" { curr_lexer->string_text += '\a'; }
-<DQ_STRING_START>"\\b" { curr_lexer->string_text += '\b'; }
-<DQ_STRING_START>"\\f" { curr_lexer->string_text += '\f'; }
-<DQ_STRING_START>"\\n" { curr_lexer->string_text += '\n'; }
-<DQ_STRING_START>"\\r" { curr_lexer->string_text += '\r'; }
-<DQ_STRING_START>"\\t" { curr_lexer->string_text += '\t'; }
-<DQ_STRING_START>"\\v" { curr_lexer->string_text += '\v'; }
-
-<DQ_STRING_START>\\{ANY_INCLUDING_NL} {
+<DQ_STRING_START>"\\a" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\a\"");
+
+    curr_lexer->string_text += '\a';
+  }
+
+<DQ_STRING_START>"\\b" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\b\"");
+
+    curr_lexer->string_text += '\b';
+  }
+
+<DQ_STRING_START>"\\f" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\f\"");
+
+    curr_lexer->string_text += '\f';
+  }
+
+<DQ_STRING_START>"\\n" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\n\"");
+
+    curr_lexer->string_text += '\n';
+  }
+
+<DQ_STRING_START>"\\r" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\r\"");
+
+    curr_lexer->string_text += '\r';
+  }
+
+<DQ_STRING_START>"\\t" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\t\"");
+
+    curr_lexer->string_text += '\t';
+  }
+
+<DQ_STRING_START>"\\v" {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\"\\\\v\"");
+
+    curr_lexer->string_text += '\v';
+  }
+
+<DQ_STRING_START>\\{NL} {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\\\\{NL}");
+
+    curr_lexer->decrement_promptflag ();
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+
+    // We can't rely on the trick used elsewhere of sticking ASCII 1
+    // in the input buffer and recognizing it as a special case
+    // because ASCII 1 is a valid character for a character string.
+
+    if (curr_lexer->at_end_of_buffer ())
+      return -1;
+
+    if (curr_lexer->at_end_of_file ())
+      return curr_lexer->handle_end_of_input ();
+
+    // Otherwise, just keep going with the text from the current buffer.
+  }
+
+<DQ_STRING_START>\\. {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>\\\\.");
+
     curr_lexer->string_text += yytext[1];
   }
 
-<DQ_STRING_START>[^\\\n\"]+ {
+<DQ_STRING_START>[^\\\r\n\"]+ {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>[^\\\\\\r\\n\\\"]+");
+
     curr_lexer->string_text += yytext;
   }
 
+<DQ_STRING_START>{NL} {
+    curr_lexer->lexer_debug ("<DQ_STRING_START>{NL}");
+
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+
+    error ("unterminated character string constant");
+
+    return LEXICAL_ERROR;
+  }
+
 %{
 // Single-quoted character strings.
 %}
 
 <SQ_STRING_START>[^\'\n\r]*\' {
+    curr_lexer->lexer_debug ("<SQ_STRING_START>[^\\'\\n\\r]*\\'");
+
     yytext[yyleng-1] = 0;
     curr_lexer->string_text += yytext;
 
@@ -720,7 +791,13 @@
   }
 
 <SQ_STRING_START>{NL} {
+    curr_lexer->lexer_debug ("<SQ_STRING_START>{NL}");
+
+    curr_lexer->input_line_number++;
+    curr_lexer->current_input_column = 1;
+
     error ("unterminated character string constant");
+
     return LEXICAL_ERROR;
   }
 
@@ -764,7 +841,7 @@
 
 {D}+/\.[\*/\\^\'] |
 {NUMBER} {
-    curr_lexer->lexer_debug ("{D}+/\\.[\\*/\\^\\']|{NUMBER}");
+    curr_lexer->lexer_debug ("{D}+/\\.[\\*/\\\\^\\']|{NUMBER}");
 
     if (curr_lexer->previous_token_may_be_command ()
         &&  curr_lexer->space_follows_previous_token ())
@@ -1034,7 +1111,7 @@
 %}
 
 \" {
-    curr_lexer->lexer_debug ("\"");
+    curr_lexer->lexer_debug ("\\\"");
 
     if (curr_lexer->previous_token_may_be_command ()
         &&  curr_lexer->space_follows_previous_token ())
@@ -2888,6 +2965,14 @@
       std::cerr << "LINE_COMMENT_START" << std::endl;
       break;
 
+    case DQ_STRING_START:
+      std::cerr << "DQ_STRING_START" << std::endl;
+      break;
+
+    case SQ_STRING_START:
+      std::cerr << "SQ_STRING_START" << std::endl;
+      break;
+
     default:
       std::cerr << "UNKNOWN START STATE!" << std::endl;
       break;