comparison libinterp/parse-tree/lex.ll @ 16898:531473481084

rewrite string parsing to avoid unlimited lookahead * NEWS: Mention change. * lex.h, lex.ll (lexical_feedback::string_text, lexical_feedback::string_line, lexical_feedback::string_column): New data members (lexical_feedback::lexical_feedback): Initialize them. (lexical_feedback::reset): Initialize them. (octave_base_lexer::begin_string): New function. (\", "'", <COMMAND_START>[\"\']): Use begin_string to set start state * instead of calling handle_string to parse string. (DQ_STRING_START, SQ_STRING_START): New exclusive start states. (<DQ_STRING_START>\"\", <DQ_STRING_START>\", <DQ_STRING_START>{NL}, <DQ_STRING_START>\\[0-7]{1,3}, <DQ_STRING_START>"\\a", <DQ_STRING_START>"\\b", <DQ_STRING_START>"\\f", <DQ_STRING_START>"\\n", <DQ_STRING_START>"\\r", <DQ_STRING_START>"\\t", <DQ_STRING_START>"\\v", <DQ_STRING_START>\\{ANY_INCLUDING_NL}, <DQ_STRING_START>[^\\\n\"]+, <SQ_STRING_START>[^\'\n\r]*\', <SQ_STRING_START>{NL}): New rules for parsing character strings. (octave_base_lexer::have_continuation, octave_base_lexer::have_ellipsis_continuation, octave_base_lexer::handle_string): Delete.
author John W. Eaton <jwe@octave.org>
date Thu, 04 Jul 2013 20:33:02 -0400
parents f89de736eecd
children f21194531877
comparison
equal deleted inserted replaced
16897:21d5e76891fe 16898:531473481084
48 48
49 %x INPUT_FILE_START 49 %x INPUT_FILE_START
50 50
51 %x BLOCK_COMMENT_START 51 %x BLOCK_COMMENT_START
52 %x LINE_COMMENT_START 52 %x LINE_COMMENT_START
53
54 %x DQ_STRING_START
55 %x SQ_STRING_START
53 56
54 %{ 57 %{
55 58
56 #include <cctype> 59 #include <cctype>
57 #include <cstring> 60 #include <cstring>
288 curr_lexer->lexer_debug ("<COMMAND_START>[\\\"\\']"); 291 curr_lexer->lexer_debug ("<COMMAND_START>[\\\"\\']");
289 292
290 curr_lexer->at_beginning_of_statement = false; 293 curr_lexer->at_beginning_of_statement = false;
291 294
292 curr_lexer->current_input_column++; 295 curr_lexer->current_input_column++;
293 int tok = curr_lexer->handle_string (yytext[0]); 296
294 297 curr_lexer->begin_string (yytext[0] == '"'
295 return curr_lexer->count_token_internal (tok); 298 ? DQ_STRING_START : SQ_STRING_START);
296 } 299 }
297 300
298 <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* { 301 <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* {
299 curr_lexer->lexer_debug ("<COMMAND_START>[^#% \\t\\r\\n\\;\\,\\\"\\'][^ \\t\\r\\n\\;\\,]*{S}*"); 302 curr_lexer->lexer_debug ("<COMMAND_START>[^#% \\t\\r\\n\\;\\,\\\"\\'][^ \\t\\r\\n\\;\\,]*{S}*");
300 303
617 curr_lexer->xunput (yytext[0]); 620 curr_lexer->xunput (yytext[0]);
618 621
619 curr_lexer->finish_comment (octave_comment_elt::full_line); 622 curr_lexer->finish_comment (octave_comment_elt::full_line);
620 623
621 curr_lexer->pop_start_state (); 624 curr_lexer->pop_start_state ();
625 }
626
627 %{
628 // Double-quoted character strings.
629 %}
630
631 <DQ_STRING_START>\"\" {
632 curr_lexer->current_input_column += yyleng;
633 curr_lexer->string_text += '"';
634 }
635
636 <DQ_STRING_START>\" {
637
638 curr_lexer->pop_start_state ();
639
640 curr_lexer->looking_for_object_index = true;
641 curr_lexer->at_beginning_of_statement = false;
642
643 curr_lexer->push_token (new token (DQ_STRING,
644 curr_lexer->string_text,
645 curr_lexer->string_line,
646 curr_lexer->string_column));
647
648 curr_lexer->string_text = "";
649
650 return curr_lexer->count_token_internal (DQ_STRING);
651 }
652
653 <DQ_STRING_START>{NL} {
654 error ("unterminated character string constant");
655 return LEXICAL_ERROR;
656 }
657
658 <DQ_STRING_START>\\[0-7]{1,3} {
659 int result;
660 sscanf (yytext+1, "%o", &result);
661
662 if (result > 0xff)
663 error ("invalid octal escape sequence in character string");
664 else
665 curr_lexer->string_text += static_cast<unsigned char> (result);
666 }
667
668 <DQ_STRING_START>"\\a" { curr_lexer->string_text += '\a'; }
669 <DQ_STRING_START>"\\b" { curr_lexer->string_text += '\b'; }
670 <DQ_STRING_START>"\\f" { curr_lexer->string_text += '\f'; }
671 <DQ_STRING_START>"\\n" { curr_lexer->string_text += '\n'; }
672 <DQ_STRING_START>"\\r" { curr_lexer->string_text += '\r'; }
673 <DQ_STRING_START>"\\t" { curr_lexer->string_text += '\t'; }
674 <DQ_STRING_START>"\\v" { curr_lexer->string_text += '\v'; }
675
676 <DQ_STRING_START>\\{ANY_INCLUDING_NL} {
677 curr_lexer->string_text += yytext[1];
678 }
679
680 <DQ_STRING_START>[^\\\n\"]+ {
681 curr_lexer->string_text += yytext;
682 }
683
684 %{
685 // Single-quoted character strings.
686 %}
687
688 <SQ_STRING_START>[^\'\n\r]*\' {
689 yytext[yyleng-1] = 0;
690 curr_lexer->string_text += yytext;
691
692 curr_lexer->current_input_column += yyleng;
693
694 int c = curr_lexer->text_yyinput ();
695
696 if (c == '\'')
697 {
698 curr_lexer->string_text += c;
699
700 curr_lexer->current_input_column++;
701 }
702 else
703 {
704 curr_lexer->xunput (c);
705
706 curr_lexer->pop_start_state ();
707
708 curr_lexer->looking_for_object_index = true;
709 curr_lexer->at_beginning_of_statement = false;
710
711 curr_lexer->push_token (new token (SQ_STRING,
712 curr_lexer->string_text,
713 curr_lexer->string_line,
714 curr_lexer->string_column));
715
716 curr_lexer->string_text = "";
717
718 return curr_lexer->count_token_internal (SQ_STRING);
719 }
720 }
721
722 <SQ_STRING_START>{NL} {
723 error ("unterminated character string constant");
724 return LEXICAL_ERROR;
622 } 725 }
623 726
624 %{ 727 %{
625 // Imaginary numbers. 728 // Imaginary numbers.
626 %} 729 %}
865 curr_lexer->lexer_debug ("'"); 968 curr_lexer->lexer_debug ("'");
866 969
867 if (curr_lexer->previous_token_may_be_command () 970 if (curr_lexer->previous_token_may_be_command ()
868 && curr_lexer->space_follows_previous_token ()) 971 && curr_lexer->space_follows_previous_token ())
869 { 972 {
870 yyless (0); 973 curr_lexer->current_input_column++;
871 curr_lexer->push_start_state (COMMAND_START); 974 curr_lexer->push_start_state (COMMAND_START);
975 curr_lexer->begin_string (SQ_STRING_START);
872 } 976 }
873 else if (curr_lexer->at_beginning_of_statement) 977 else if (curr_lexer->at_beginning_of_statement)
874 { 978 {
875 curr_lexer->current_input_column++; 979 curr_lexer->current_input_column++;
876 int retval = curr_lexer->handle_string ('\''); 980 curr_lexer->begin_string (SQ_STRING_START);
877 return curr_lexer->count_token_internal (retval);
878 } 981 }
879 else 982 else
880 { 983 {
881 int tok = curr_lexer->previous_token_value (); 984 int tok = curr_lexer->previous_token_value ();
882 985
886 { 989 {
887 if (tok == '[' || tok == '{' 990 if (tok == '[' || tok == '{'
888 || curr_lexer->previous_token_is_binop ()) 991 || curr_lexer->previous_token_is_binop ())
889 { 992 {
890 curr_lexer->current_input_column++; 993 curr_lexer->current_input_column++;
891 int retval = curr_lexer->handle_string ('\''); 994 curr_lexer->begin_string (SQ_STRING_START);
892 return curr_lexer->count_token_internal (retval);
893 } 995 }
894 else 996 else
895 { 997 {
896 yyless (0); 998 yyless (0);
897 curr_lexer->xunput (','); 999 curr_lexer->xunput (',');
904 if (tok == '[' || tok == '{' 1006 if (tok == '[' || tok == '{'
905 || curr_lexer->previous_token_is_binop () 1007 || curr_lexer->previous_token_is_binop ()
906 || curr_lexer->previous_token_is_keyword ()) 1008 || curr_lexer->previous_token_is_keyword ())
907 { 1009 {
908 curr_lexer->current_input_column++; 1010 curr_lexer->current_input_column++;
909 int retval = curr_lexer->handle_string ('\''); 1011 curr_lexer->begin_string (SQ_STRING_START);
910 return curr_lexer->count_token_internal (retval);
911 } 1012 }
912 else 1013 else
913 return curr_lexer->count_token (HERMITIAN); 1014 return curr_lexer->count_token (HERMITIAN);
914 } 1015 }
915 } 1016 }
918 if (! tok || tok == '[' || tok == '{' || tok == '(' 1019 if (! tok || tok == '[' || tok == '{' || tok == '('
919 || curr_lexer->previous_token_is_binop () 1020 || curr_lexer->previous_token_is_binop ()
920 || curr_lexer->previous_token_is_keyword ()) 1021 || curr_lexer->previous_token_is_keyword ())
921 { 1022 {
922 curr_lexer->current_input_column++; 1023 curr_lexer->current_input_column++;
923 int retval = curr_lexer->handle_string ('\''); 1024 curr_lexer->begin_string (SQ_STRING_START);
924 return curr_lexer->count_token_internal (retval);
925 } 1025 }
926 else 1026 else
927 return curr_lexer->count_token (HERMITIAN); 1027 return curr_lexer->count_token (HERMITIAN);
928 } 1028 }
929 } 1029 }
937 curr_lexer->lexer_debug ("\""); 1037 curr_lexer->lexer_debug ("\"");
938 1038
939 if (curr_lexer->previous_token_may_be_command () 1039 if (curr_lexer->previous_token_may_be_command ()
940 && curr_lexer->space_follows_previous_token ()) 1040 && curr_lexer->space_follows_previous_token ())
941 { 1041 {
942 yyless (0); 1042 curr_lexer->current_input_column++;
943 curr_lexer->push_start_state (COMMAND_START); 1043 curr_lexer->push_start_state (COMMAND_START);
1044 curr_lexer->begin_string (DQ_STRING_START);
944 } 1045 }
945 else 1046 else
946 { 1047 {
947 int tok = curr_lexer->previous_token_value (); 1048 int tok = curr_lexer->previous_token_value ();
948 1049
952 { 1053 {
953 if (tok == '[' || tok == '{' 1054 if (tok == '[' || tok == '{'
954 || curr_lexer->previous_token_is_binop ()) 1055 || curr_lexer->previous_token_is_binop ())
955 { 1056 {
956 curr_lexer->current_input_column++; 1057 curr_lexer->current_input_column++;
957 int retval = curr_lexer->handle_string ('"'); 1058 curr_lexer->begin_string (DQ_STRING_START);
958 return curr_lexer->count_token_internal (retval);
959 } 1059 }
960 else 1060 else
961 { 1061 {
962 yyless (0); 1062 yyless (0);
963 curr_lexer->xunput (','); 1063 curr_lexer->xunput (',');
966 } 1066 }
967 } 1067 }
968 else 1068 else
969 { 1069 {
970 curr_lexer->current_input_column++; 1070 curr_lexer->current_input_column++;
971 int retval = curr_lexer->handle_string ('"'); 1071 curr_lexer->begin_string (DQ_STRING_START);
972 return curr_lexer->count_token_internal (retval);
973 } 1072 }
974 } 1073 }
975 else 1074 else
976 { 1075 {
977 curr_lexer->current_input_column++; 1076 curr_lexer->current_input_column++;
978 int retval = curr_lexer->handle_string ('"'); 1077 curr_lexer->begin_string (DQ_STRING_START);
979 return curr_lexer->count_token_internal (retval);
980 } 1078 }
981 } 1079 }
982 } 1080 }
983 1081
984 %{ 1082 %{
1553 block_comment_nesting_level = 0; 1651 block_comment_nesting_level = 0;
1554 token_count = 0; 1652 token_count = 0;
1555 current_input_line = ""; 1653 current_input_line = "";
1556 comment_text = ""; 1654 comment_text = "";
1557 help_text = ""; 1655 help_text = "";
1656 string_text = "";
1657 string_line = 0;
1658 string_column = 0;
1558 fcn_file_name = ""; 1659 fcn_file_name = "";
1559 fcn_file_full_name = ""; 1660 fcn_file_full_name = "";
1560 looking_at_object_index.clear (); 1661 looking_at_object_index.clear ();
1561 looking_at_object_index.push_front (false); 1662 looking_at_object_index.push_front (false);
1562 1663
1767 reading_script_file = true; 1868 reading_script_file = true;
1768 1869
1769 push_start_state (INPUT_FILE_START); 1870 push_start_state (INPUT_FILE_START);
1770 } 1871 }
1771 1872
1873 void
1874 octave_base_lexer::begin_string (int state)
1875 {
1876 string_line = input_line_number;
1877 string_column = current_input_column;
1878
1879 push_start_state (state);
1880 }
1881
1772 int 1882 int
1773 octave_base_lexer::handle_end_of_input (void) 1883 octave_base_lexer::handle_end_of_input (void)
1774 { 1884 {
1775 lexer_debug ("<<EOF>>"); 1885 lexer_debug ("<<EOF>>");
1776 1886
2253 octave_comment_buffer::append (comment_text, typ); 2363 octave_comment_buffer::append (comment_text, typ);
2254 2364
2255 comment_text = ""; 2365 comment_text = "";
2256 2366
2257 at_beginning_of_statement = true; 2367 at_beginning_of_statement = true;
2258 }
2259
2260 // We have seen a backslash and need to find out if it should be
2261 // treated as a continuation character. If so, this eats it, up to
2262 // and including the new line character.
2263 //
2264 // Match whitespace only, followed by a comment character or newline.
2265 // Once a comment character is found, discard all input until newline.
2266 // If non-whitespace characters are found before comment
2267 // characters, return 0. Otherwise, return 1.
2268
2269 // FIXME -- we need to handle block comments here.
2270
2271 bool
2272 octave_base_lexer::have_continuation (bool trailing_comments_ok)
2273 {
2274 std::ostringstream buf;
2275
2276 std::string comment_buf;
2277
2278 bool in_comment = false;
2279 bool beginning_of_comment = false;
2280
2281 int c = 0;
2282
2283 while ((c = text_yyinput ()) != EOF)
2284 {
2285 buf << static_cast<char> (c);
2286
2287 switch (c)
2288 {
2289 case ' ':
2290 case '\t':
2291 if (in_comment)
2292 {
2293 comment_buf += static_cast<char> (c);
2294 beginning_of_comment = false;
2295 }
2296 break;
2297
2298 case '%':
2299 case '#':
2300 if (trailing_comments_ok)
2301 {
2302 if (in_comment)
2303 {
2304 if (! beginning_of_comment)
2305 comment_buf += static_cast<char> (c);
2306 }
2307 else
2308 {
2309 maybe_gripe_matlab_incompatible_comment (c);
2310 in_comment = true;
2311 beginning_of_comment = true;
2312 }
2313 }
2314 else
2315 goto cleanup;
2316 break;
2317
2318 case '\n':
2319 if (in_comment)
2320 {
2321 comment_buf += static_cast<char> (c);
2322 octave_comment_buffer::append (comment_buf);
2323 }
2324 current_input_column = 0;
2325 decrement_promptflag ();
2326 gripe_matlab_incompatible_continuation ();
2327 return true;
2328
2329 default:
2330 if (in_comment)
2331 {
2332 comment_buf += static_cast<char> (c);
2333 beginning_of_comment = false;
2334 }
2335 else
2336 goto cleanup;
2337 break;
2338 }
2339 }
2340
2341 xunput (c);
2342 return false;
2343
2344 cleanup:
2345
2346 std::string s = buf.str ();
2347
2348 int len = s.length ();
2349 while (len--)
2350 xunput (s[len]);
2351
2352 return false;
2353 }
2354
2355 // We have seen a '.' and need to see if it is the start of a
2356 // continuation. If so, this eats it, up to and including the new
2357 // line character.
2358
2359 bool
2360 octave_base_lexer::have_ellipsis_continuation (bool trailing_comments_ok)
2361 {
2362 char c1 = text_yyinput ();
2363 if (c1 == '.')
2364 {
2365 char c2 = text_yyinput ();
2366 if (c2 == '.' && have_continuation (trailing_comments_ok))
2367 return true;
2368 else
2369 {
2370 xunput (c2);
2371 xunput (c1);
2372 }
2373 }
2374 else
2375 xunput (c1);
2376
2377 return false;
2378 }
2379
2380 int
2381 octave_base_lexer::handle_string (char delim)
2382 {
2383 std::ostringstream buf;
2384
2385 int bos_line = input_line_number;
2386 int bos_col = current_input_column;
2387
2388 int c;
2389 int escape_pending = 0;
2390
2391 while ((c = text_yyinput ()) != EOF)
2392 {
2393 current_input_column++;
2394
2395 if (c == '\\')
2396 {
2397 if (delim == '\'' || escape_pending)
2398 {
2399 buf << static_cast<char> (c);
2400 escape_pending = 0;
2401 }
2402 else
2403 {
2404 if (have_continuation (false))
2405 escape_pending = 0;
2406 else
2407 {
2408 buf << static_cast<char> (c);
2409 escape_pending = 1;
2410 }
2411 }
2412 continue;
2413 }
2414 else if (c == '.')
2415 {
2416 if (delim == '\'' || ! have_ellipsis_continuation (false))
2417 buf << static_cast<char> (c);
2418 }
2419 else if (c == '\n')
2420 {
2421 error ("unterminated string constant");
2422 break;
2423 }
2424 else if (c == delim)
2425 {
2426 if (escape_pending)
2427 buf << static_cast<char> (c);
2428 else
2429 {
2430 c = text_yyinput ();
2431 if (c == delim)
2432 {
2433 buf << static_cast<char> (c);
2434 }
2435 else
2436 {
2437 std::string s;
2438 xunput (c);
2439
2440 if (delim == '\'')
2441 s = buf.str ();
2442 else
2443 s = do_string_escapes (buf.str ());
2444
2445 if (delim == '"')
2446 gripe_matlab_incompatible ("\" used as string delimiter");
2447 else if (delim == '\'')
2448 gripe_single_quote_string ();
2449
2450 looking_for_object_index = true;
2451 at_beginning_of_statement = false;
2452
2453 int tok = delim == '"' ? DQ_STRING : SQ_STRING;
2454
2455 push_token (new token (tok, s, bos_line, bos_col));
2456
2457 return tok;
2458 }
2459 }
2460 }
2461 else
2462 {
2463 buf << static_cast<char> (c);
2464 }
2465
2466 escape_pending = 0;
2467 }
2468
2469 return LEXICAL_ERROR;
2470 } 2368 }
2471 2369
2472 int 2370 int
2473 octave_base_lexer::handle_close_bracket (int bracket_type) 2371 octave_base_lexer::handle_close_bracket (int bracket_type)
2474 { 2372 {