Mercurial > octave-nkf
comparison libinterp/parse-tree/lex.ll @ 16898:531473481084
rewrite string parsing to avoid unlimited lookahead
* NEWS: Mention change.
* lex.h, lex.ll (lexical_feedback::string_text,
lexical_feedback::string_line, lexical_feedback::string_column):
New data members.
(lexical_feedback::lexical_feedback): Initialize them.
(lexical_feedback::reset): Initialize them.
(octave_base_lexer::begin_string): New function.
(\", "'", <COMMAND_START>[\"\']): Use begin_string to set start state
instead of calling handle_string to parse string.
(DQ_STRING_START, SQ_STRING_START): New exclusive start states.
(<DQ_STRING_START>\"\", <DQ_STRING_START>\", <DQ_STRING_START>{NL},
<DQ_STRING_START>\\[0-7]{1,3}, <DQ_STRING_START>"\\a",
<DQ_STRING_START>"\\b", <DQ_STRING_START>"\\f",
<DQ_STRING_START>"\\n", <DQ_STRING_START>"\\r",
<DQ_STRING_START>"\\t", <DQ_STRING_START>"\\v",
<DQ_STRING_START>\\{ANY_INCLUDING_NL}, <DQ_STRING_START>[^\\\n\"]+,
<SQ_STRING_START>[^\'\n\r]*\', <SQ_STRING_START>{NL}):
New rules for parsing character strings.
(octave_base_lexer::have_continuation,
octave_base_lexer::have_ellipsis_continuation,
octave_base_lexer::handle_string): Delete.
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Thu, 04 Jul 2013 20:33:02 -0400 |
parents | f89de736eecd |
children | f21194531877 |
comparison
equal
deleted
inserted
replaced
16897:21d5e76891fe | 16898:531473481084 |
---|---|
48 | 48 |
49 %x INPUT_FILE_START | 49 %x INPUT_FILE_START |
50 | 50 |
51 %x BLOCK_COMMENT_START | 51 %x BLOCK_COMMENT_START |
52 %x LINE_COMMENT_START | 52 %x LINE_COMMENT_START |
53 | |
54 %x DQ_STRING_START | |
55 %x SQ_STRING_START | |
53 | 56 |
54 %{ | 57 %{ |
55 | 58 |
56 #include <cctype> | 59 #include <cctype> |
57 #include <cstring> | 60 #include <cstring> |
288 curr_lexer->lexer_debug ("<COMMAND_START>[\\\"\\']"); | 291 curr_lexer->lexer_debug ("<COMMAND_START>[\\\"\\']"); |
289 | 292 |
290 curr_lexer->at_beginning_of_statement = false; | 293 curr_lexer->at_beginning_of_statement = false; |
291 | 294 |
292 curr_lexer->current_input_column++; | 295 curr_lexer->current_input_column++; |
293 int tok = curr_lexer->handle_string (yytext[0]); | 296 |
294 | 297 curr_lexer->begin_string (yytext[0] == '"' |
295 return curr_lexer->count_token_internal (tok); | 298 ? DQ_STRING_START : SQ_STRING_START); |
296 } | 299 } |
297 | 300 |
298 <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* { | 301 <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* { |
299 curr_lexer->lexer_debug ("<COMMAND_START>[^#% \\t\\r\\n\\;\\,\\\"\\'][^ \\t\\r\\n\\;\\,]*{S}*"); | 302 curr_lexer->lexer_debug ("<COMMAND_START>[^#% \\t\\r\\n\\;\\,\\\"\\'][^ \\t\\r\\n\\;\\,]*{S}*"); |
300 | 303 |
617 curr_lexer->xunput (yytext[0]); | 620 curr_lexer->xunput (yytext[0]); |
618 | 621 |
619 curr_lexer->finish_comment (octave_comment_elt::full_line); | 622 curr_lexer->finish_comment (octave_comment_elt::full_line); |
620 | 623 |
621 curr_lexer->pop_start_state (); | 624 curr_lexer->pop_start_state (); |
625 } | |
626 | |
627 %{ | |
628 // Double-quoted character strings. | |
629 %} | |
630 | |
631 <DQ_STRING_START>\"\" { | |
632 curr_lexer->current_input_column += yyleng; | |
633 curr_lexer->string_text += '"'; | |
634 } | |
635 | |
636 <DQ_STRING_START>\" { | |
637 | |
638 curr_lexer->pop_start_state (); | |
639 | |
640 curr_lexer->looking_for_object_index = true; | |
641 curr_lexer->at_beginning_of_statement = false; | |
642 | |
643 curr_lexer->push_token (new token (DQ_STRING, | |
644 curr_lexer->string_text, | |
645 curr_lexer->string_line, | |
646 curr_lexer->string_column)); | |
647 | |
648 curr_lexer->string_text = ""; | |
649 | |
650 return curr_lexer->count_token_internal (DQ_STRING); | |
651 } | |
652 | |
653 <DQ_STRING_START>{NL} { | |
654 error ("unterminated character string constant"); | |
655 return LEXICAL_ERROR; | |
656 } | |
657 | |
658 <DQ_STRING_START>\\[0-7]{1,3} { | |
659 int result; | |
660 sscanf (yytext+1, "%o", &result); | |
661 | |
662 if (result > 0xff) | |
663 error ("invalid octal escape sequence in character string"); | |
664 else | |
665 curr_lexer->string_text += static_cast<unsigned char> (result); | |
666 } | |
667 | |
668 <DQ_STRING_START>"\\a" { curr_lexer->string_text += '\a'; } | |
669 <DQ_STRING_START>"\\b" { curr_lexer->string_text += '\b'; } | |
670 <DQ_STRING_START>"\\f" { curr_lexer->string_text += '\f'; } | |
671 <DQ_STRING_START>"\\n" { curr_lexer->string_text += '\n'; } | |
672 <DQ_STRING_START>"\\r" { curr_lexer->string_text += '\r'; } | |
673 <DQ_STRING_START>"\\t" { curr_lexer->string_text += '\t'; } | |
674 <DQ_STRING_START>"\\v" { curr_lexer->string_text += '\v'; } | |
675 | |
676 <DQ_STRING_START>\\{ANY_INCLUDING_NL} { | |
677 curr_lexer->string_text += yytext[1]; | |
678 } | |
679 | |
680 <DQ_STRING_START>[^\\\n\"]+ { | |
681 curr_lexer->string_text += yytext; | |
682 } | |
683 | |
684 %{ | |
685 // Single-quoted character strings. | |
686 %} | |
687 | |
688 <SQ_STRING_START>[^\'\n\r]*\' { | |
689 yytext[yyleng-1] = 0; | |
690 curr_lexer->string_text += yytext; | |
691 | |
692 curr_lexer->current_input_column += yyleng; | |
693 | |
694 int c = curr_lexer->text_yyinput (); | |
695 | |
696 if (c == '\'') | |
697 { | |
698 curr_lexer->string_text += c; | |
699 | |
700 curr_lexer->current_input_column++; | |
701 } | |
702 else | |
703 { | |
704 curr_lexer->xunput (c); | |
705 | |
706 curr_lexer->pop_start_state (); | |
707 | |
708 curr_lexer->looking_for_object_index = true; | |
709 curr_lexer->at_beginning_of_statement = false; | |
710 | |
711 curr_lexer->push_token (new token (SQ_STRING, | |
712 curr_lexer->string_text, | |
713 curr_lexer->string_line, | |
714 curr_lexer->string_column)); | |
715 | |
716 curr_lexer->string_text = ""; | |
717 | |
718 return curr_lexer->count_token_internal (SQ_STRING); | |
719 } | |
720 } | |
721 | |
722 <SQ_STRING_START>{NL} { | |
723 error ("unterminated character string constant"); | |
724 return LEXICAL_ERROR; | |
622 } | 725 } |
623 | 726 |
624 %{ | 727 %{ |
625 // Imaginary numbers. | 728 // Imaginary numbers. |
626 %} | 729 %} |
865 curr_lexer->lexer_debug ("'"); | 968 curr_lexer->lexer_debug ("'"); |
866 | 969 |
867 if (curr_lexer->previous_token_may_be_command () | 970 if (curr_lexer->previous_token_may_be_command () |
868 && curr_lexer->space_follows_previous_token ()) | 971 && curr_lexer->space_follows_previous_token ()) |
869 { | 972 { |
870 yyless (0); | 973 curr_lexer->current_input_column++; |
871 curr_lexer->push_start_state (COMMAND_START); | 974 curr_lexer->push_start_state (COMMAND_START); |
975 curr_lexer->begin_string (SQ_STRING_START); | |
872 } | 976 } |
873 else if (curr_lexer->at_beginning_of_statement) | 977 else if (curr_lexer->at_beginning_of_statement) |
874 { | 978 { |
875 curr_lexer->current_input_column++; | 979 curr_lexer->current_input_column++; |
876 int retval = curr_lexer->handle_string ('\''); | 980 curr_lexer->begin_string (SQ_STRING_START); |
877 return curr_lexer->count_token_internal (retval); | |
878 } | 981 } |
879 else | 982 else |
880 { | 983 { |
881 int tok = curr_lexer->previous_token_value (); | 984 int tok = curr_lexer->previous_token_value (); |
882 | 985 |
886 { | 989 { |
887 if (tok == '[' || tok == '{' | 990 if (tok == '[' || tok == '{' |
888 || curr_lexer->previous_token_is_binop ()) | 991 || curr_lexer->previous_token_is_binop ()) |
889 { | 992 { |
890 curr_lexer->current_input_column++; | 993 curr_lexer->current_input_column++; |
891 int retval = curr_lexer->handle_string ('\''); | 994 curr_lexer->begin_string (SQ_STRING_START); |
892 return curr_lexer->count_token_internal (retval); | |
893 } | 995 } |
894 else | 996 else |
895 { | 997 { |
896 yyless (0); | 998 yyless (0); |
897 curr_lexer->xunput (','); | 999 curr_lexer->xunput (','); |
904 if (tok == '[' || tok == '{' | 1006 if (tok == '[' || tok == '{' |
905 || curr_lexer->previous_token_is_binop () | 1007 || curr_lexer->previous_token_is_binop () |
906 || curr_lexer->previous_token_is_keyword ()) | 1008 || curr_lexer->previous_token_is_keyword ()) |
907 { | 1009 { |
908 curr_lexer->current_input_column++; | 1010 curr_lexer->current_input_column++; |
909 int retval = curr_lexer->handle_string ('\''); | 1011 curr_lexer->begin_string (SQ_STRING_START); |
910 return curr_lexer->count_token_internal (retval); | |
911 } | 1012 } |
912 else | 1013 else |
913 return curr_lexer->count_token (HERMITIAN); | 1014 return curr_lexer->count_token (HERMITIAN); |
914 } | 1015 } |
915 } | 1016 } |
918 if (! tok || tok == '[' || tok == '{' || tok == '(' | 1019 if (! tok || tok == '[' || tok == '{' || tok == '(' |
919 || curr_lexer->previous_token_is_binop () | 1020 || curr_lexer->previous_token_is_binop () |
920 || curr_lexer->previous_token_is_keyword ()) | 1021 || curr_lexer->previous_token_is_keyword ()) |
921 { | 1022 { |
922 curr_lexer->current_input_column++; | 1023 curr_lexer->current_input_column++; |
923 int retval = curr_lexer->handle_string ('\''); | 1024 curr_lexer->begin_string (SQ_STRING_START); |
924 return curr_lexer->count_token_internal (retval); | |
925 } | 1025 } |
926 else | 1026 else |
927 return curr_lexer->count_token (HERMITIAN); | 1027 return curr_lexer->count_token (HERMITIAN); |
928 } | 1028 } |
929 } | 1029 } |
937 curr_lexer->lexer_debug ("\""); | 1037 curr_lexer->lexer_debug ("\""); |
938 | 1038 |
939 if (curr_lexer->previous_token_may_be_command () | 1039 if (curr_lexer->previous_token_may_be_command () |
940 && curr_lexer->space_follows_previous_token ()) | 1040 && curr_lexer->space_follows_previous_token ()) |
941 { | 1041 { |
942 yyless (0); | 1042 curr_lexer->current_input_column++; |
943 curr_lexer->push_start_state (COMMAND_START); | 1043 curr_lexer->push_start_state (COMMAND_START); |
1044 curr_lexer->begin_string (DQ_STRING_START); | |
944 } | 1045 } |
945 else | 1046 else |
946 { | 1047 { |
947 int tok = curr_lexer->previous_token_value (); | 1048 int tok = curr_lexer->previous_token_value (); |
948 | 1049 |
952 { | 1053 { |
953 if (tok == '[' || tok == '{' | 1054 if (tok == '[' || tok == '{' |
954 || curr_lexer->previous_token_is_binop ()) | 1055 || curr_lexer->previous_token_is_binop ()) |
955 { | 1056 { |
956 curr_lexer->current_input_column++; | 1057 curr_lexer->current_input_column++; |
957 int retval = curr_lexer->handle_string ('"'); | 1058 curr_lexer->begin_string (DQ_STRING_START); |
958 return curr_lexer->count_token_internal (retval); | |
959 } | 1059 } |
960 else | 1060 else |
961 { | 1061 { |
962 yyless (0); | 1062 yyless (0); |
963 curr_lexer->xunput (','); | 1063 curr_lexer->xunput (','); |
966 } | 1066 } |
967 } | 1067 } |
968 else | 1068 else |
969 { | 1069 { |
970 curr_lexer->current_input_column++; | 1070 curr_lexer->current_input_column++; |
971 int retval = curr_lexer->handle_string ('"'); | 1071 curr_lexer->begin_string (DQ_STRING_START); |
972 return curr_lexer->count_token_internal (retval); | |
973 } | 1072 } |
974 } | 1073 } |
975 else | 1074 else |
976 { | 1075 { |
977 curr_lexer->current_input_column++; | 1076 curr_lexer->current_input_column++; |
978 int retval = curr_lexer->handle_string ('"'); | 1077 curr_lexer->begin_string (DQ_STRING_START); |
979 return curr_lexer->count_token_internal (retval); | |
980 } | 1078 } |
981 } | 1079 } |
982 } | 1080 } |
983 | 1081 |
984 %{ | 1082 %{ |
1553 block_comment_nesting_level = 0; | 1651 block_comment_nesting_level = 0; |
1554 token_count = 0; | 1652 token_count = 0; |
1555 current_input_line = ""; | 1653 current_input_line = ""; |
1556 comment_text = ""; | 1654 comment_text = ""; |
1557 help_text = ""; | 1655 help_text = ""; |
1656 string_text = ""; | |
1657 string_line = 0; | |
1658 string_column = 0; | |
1558 fcn_file_name = ""; | 1659 fcn_file_name = ""; |
1559 fcn_file_full_name = ""; | 1660 fcn_file_full_name = ""; |
1560 looking_at_object_index.clear (); | 1661 looking_at_object_index.clear (); |
1561 looking_at_object_index.push_front (false); | 1662 looking_at_object_index.push_front (false); |
1562 | 1663 |
1767 reading_script_file = true; | 1868 reading_script_file = true; |
1768 | 1869 |
1769 push_start_state (INPUT_FILE_START); | 1870 push_start_state (INPUT_FILE_START); |
1770 } | 1871 } |
1771 | 1872 |
1873 void | |
1874 octave_base_lexer::begin_string (int state) | |
1875 { | |
1876 string_line = input_line_number; | |
1877 string_column = current_input_column; | |
1878 | |
1879 push_start_state (state); | |
1880 } | |
1881 | |
1772 int | 1882 int |
1773 octave_base_lexer::handle_end_of_input (void) | 1883 octave_base_lexer::handle_end_of_input (void) |
1774 { | 1884 { |
1775 lexer_debug ("<<EOF>>"); | 1885 lexer_debug ("<<EOF>>"); |
1776 | 1886 |
2253 octave_comment_buffer::append (comment_text, typ); | 2363 octave_comment_buffer::append (comment_text, typ); |
2254 | 2364 |
2255 comment_text = ""; | 2365 comment_text = ""; |
2256 | 2366 |
2257 at_beginning_of_statement = true; | 2367 at_beginning_of_statement = true; |
2258 } | |
2259 | |
2260 // We have seen a backslash and need to find out if it should be | |
2261 // treated as a continuation character. If so, this eats it, up to | |
2262 // and including the new line character. | |
2263 // | |
2264 // Match whitespace only, followed by a comment character or newline. | |
2265 // Once a comment character is found, discard all input until newline. | |
2266 // If non-whitespace characters are found before comment | |
2267 // characters, return 0. Otherwise, return 1. | |
2268 | |
2269 // FIXME -- we need to handle block comments here. | |
2270 | |
2271 bool | |
2272 octave_base_lexer::have_continuation (bool trailing_comments_ok) | |
2273 { | |
2274 std::ostringstream buf; | |
2275 | |
2276 std::string comment_buf; | |
2277 | |
2278 bool in_comment = false; | |
2279 bool beginning_of_comment = false; | |
2280 | |
2281 int c = 0; | |
2282 | |
2283 while ((c = text_yyinput ()) != EOF) | |
2284 { | |
2285 buf << static_cast<char> (c); | |
2286 | |
2287 switch (c) | |
2288 { | |
2289 case ' ': | |
2290 case '\t': | |
2291 if (in_comment) | |
2292 { | |
2293 comment_buf += static_cast<char> (c); | |
2294 beginning_of_comment = false; | |
2295 } | |
2296 break; | |
2297 | |
2298 case '%': | |
2299 case '#': | |
2300 if (trailing_comments_ok) | |
2301 { | |
2302 if (in_comment) | |
2303 { | |
2304 if (! beginning_of_comment) | |
2305 comment_buf += static_cast<char> (c); | |
2306 } | |
2307 else | |
2308 { | |
2309 maybe_gripe_matlab_incompatible_comment (c); | |
2310 in_comment = true; | |
2311 beginning_of_comment = true; | |
2312 } | |
2313 } | |
2314 else | |
2315 goto cleanup; | |
2316 break; | |
2317 | |
2318 case '\n': | |
2319 if (in_comment) | |
2320 { | |
2321 comment_buf += static_cast<char> (c); | |
2322 octave_comment_buffer::append (comment_buf); | |
2323 } | |
2324 current_input_column = 0; | |
2325 decrement_promptflag (); | |
2326 gripe_matlab_incompatible_continuation (); | |
2327 return true; | |
2328 | |
2329 default: | |
2330 if (in_comment) | |
2331 { | |
2332 comment_buf += static_cast<char> (c); | |
2333 beginning_of_comment = false; | |
2334 } | |
2335 else | |
2336 goto cleanup; | |
2337 break; | |
2338 } | |
2339 } | |
2340 | |
2341 xunput (c); | |
2342 return false; | |
2343 | |
2344 cleanup: | |
2345 | |
2346 std::string s = buf.str (); | |
2347 | |
2348 int len = s.length (); | |
2349 while (len--) | |
2350 xunput (s[len]); | |
2351 | |
2352 return false; | |
2353 } | |
2354 | |
2355 // We have seen a '.' and need to see if it is the start of a | |
2356 // continuation. If so, this eats it, up to and including the new | |
2357 // line character. | |
2358 | |
2359 bool | |
2360 octave_base_lexer::have_ellipsis_continuation (bool trailing_comments_ok) | |
2361 { | |
2362 char c1 = text_yyinput (); | |
2363 if (c1 == '.') | |
2364 { | |
2365 char c2 = text_yyinput (); | |
2366 if (c2 == '.' && have_continuation (trailing_comments_ok)) | |
2367 return true; | |
2368 else | |
2369 { | |
2370 xunput (c2); | |
2371 xunput (c1); | |
2372 } | |
2373 } | |
2374 else | |
2375 xunput (c1); | |
2376 | |
2377 return false; | |
2378 } | |
2379 | |
2380 int | |
2381 octave_base_lexer::handle_string (char delim) | |
2382 { | |
2383 std::ostringstream buf; | |
2384 | |
2385 int bos_line = input_line_number; | |
2386 int bos_col = current_input_column; | |
2387 | |
2388 int c; | |
2389 int escape_pending = 0; | |
2390 | |
2391 while ((c = text_yyinput ()) != EOF) | |
2392 { | |
2393 current_input_column++; | |
2394 | |
2395 if (c == '\\') | |
2396 { | |
2397 if (delim == '\'' || escape_pending) | |
2398 { | |
2399 buf << static_cast<char> (c); | |
2400 escape_pending = 0; | |
2401 } | |
2402 else | |
2403 { | |
2404 if (have_continuation (false)) | |
2405 escape_pending = 0; | |
2406 else | |
2407 { | |
2408 buf << static_cast<char> (c); | |
2409 escape_pending = 1; | |
2410 } | |
2411 } | |
2412 continue; | |
2413 } | |
2414 else if (c == '.') | |
2415 { | |
2416 if (delim == '\'' || ! have_ellipsis_continuation (false)) | |
2417 buf << static_cast<char> (c); | |
2418 } | |
2419 else if (c == '\n') | |
2420 { | |
2421 error ("unterminated string constant"); | |
2422 break; | |
2423 } | |
2424 else if (c == delim) | |
2425 { | |
2426 if (escape_pending) | |
2427 buf << static_cast<char> (c); | |
2428 else | |
2429 { | |
2430 c = text_yyinput (); | |
2431 if (c == delim) | |
2432 { | |
2433 buf << static_cast<char> (c); | |
2434 } | |
2435 else | |
2436 { | |
2437 std::string s; | |
2438 xunput (c); | |
2439 | |
2440 if (delim == '\'') | |
2441 s = buf.str (); | |
2442 else | |
2443 s = do_string_escapes (buf.str ()); | |
2444 | |
2445 if (delim == '"') | |
2446 gripe_matlab_incompatible ("\" used as string delimiter"); | |
2447 else if (delim == '\'') | |
2448 gripe_single_quote_string (); | |
2449 | |
2450 looking_for_object_index = true; | |
2451 at_beginning_of_statement = false; | |
2452 | |
2453 int tok = delim == '"' ? DQ_STRING : SQ_STRING; | |
2454 | |
2455 push_token (new token (tok, s, bos_line, bos_col)); | |
2456 | |
2457 return tok; | |
2458 } | |
2459 } | |
2460 } | |
2461 else | |
2462 { | |
2463 buf << static_cast<char> (c); | |
2464 } | |
2465 | |
2466 escape_pending = 0; | |
2467 } | |
2468 | |
2469 return LEXICAL_ERROR; | |
2470 } | 2368 } |
2471 | 2369 |
2472 int | 2370 int |
2473 octave_base_lexer::handle_close_bracket (int bracket_type) | 2371 octave_base_lexer::handle_close_bracket (int bracket_type) |
2474 { | 2372 { |