# HG changeset patch # User John W. Eaton # Date 1594938459 14400 # Node ID 96e7dc4c2214ce23be9e50f1df8964b5c4bafcfb # Parent dc8de424fc725a26cc79d82d087e623b3a3282d9 improve Matlab compatibility for binary and hexadecimal constants * parser.tst: Update tests for new parsing rules for hex and binary integer literals. * lex.h, lex.ll (handle_number): Now a template with three specializations (binary, decimal, hexadecimal). Eliminate IMAG argument. (HANDLE_NUMBER): New macro. Use it to process binary, decimal, and hexadecimal constants with separate lexer patterns. (Im): Delete pattern macro. (EXPON, NUMBIN, NUMHEX, NUMREAL, NUMBER): Replace with DECIMAL_DIGITS, EXPONENT, REAL_DECIMAL, IMAG_DECIMAL, and DECIMAL_NUMBER for matching decimal numbers and SIZE_SUFFIX, BINARY_BITS, BINARY_NUMBER, HEXADECIMAL_BITS, and HEXADECIMAL_NUMBER for matching binary and hexadecimal numbers. ({NUMBER}{Im}): Delete rule. ({BINARY_NUMBER}, {HEXADECIMAL_NUMBER}): New rules. ({DECIMAL_DIGITS}/\.[\*/\\^\']|{DECIMAL_NUMBER}): New rule to replace {D}{D_}*/\.[\*/\\^\']|{NUMBER}. (make_integer_value): New static function. NEWS: Note change. diff -r dc8de424fc72 -r 96e7dc4c2214 NEWS --- a/NEWS Thu Jul 16 16:02:45 2020 -0400 +++ b/NEWS Thu Jul 16 18:27:39 2020 -0400 @@ -61,6 +61,24 @@ - The function `dec2bin` and `dec2hex` now support negative numbers. +- Binary and hexadecimal constants like `0b101` and `0xDEADBEEF` now create +integers (unsigned by default) with sizes determined from the number of +digits present. So, for example, `0xff` creates an `uint8` value and +`0xDEADBEEF` creates a `uint64` value. Binary constants are limited to 64 +binary digits and hexadecimal constants are limited to 16 hexadecimal +digits with no automatic rounding or conversion to floating point +values. Note that this may cause trouble for existing code. 
For +example, an expression like `[0x1; 0x100; 0x10000]` will be uint8 (because +of the rules of concatenating integers of different sizes) with the +larger values clipped to 255 (because of the saturation semantics of integer +values). To avoid these kinds of problems, pad constants in array +expressions with leading zeros so that they use the same number of +digits for each value. For example, `[0x00_00_01; 0x00_01_00; +0x01_00_00]`. You may also use a suffix of the form `s8`, `s16`, `s32`, +`s64`, `u8`, `u16`, `u32`, or `u64` to explicitly specify the data type +to use (`u` or `s` to indicate unsigned or signed and the number to +indicate the integer size). + - The function `importdata` now produces more compatible results when the file contains a 2-D text matrix. diff -r dc8de424fc72 -r 96e7dc4c2214 libinterp/parse-tree/lex.h --- a/libinterp/parse-tree/lex.h Thu Jul 16 16:02:45 2020 -0400 +++ b/libinterp/parse-tree/lex.h Thu Jul 16 18:27:39 2020 -0400 @@ -661,7 +661,9 @@ bool whitespace_is_significant (void); - int handle_number (bool imag); + // We only provide specializations with base equal to 2, 10, or 16. 
+ template + int handle_number (void); void handle_continuation (void); @@ -822,6 +824,10 @@ bool m_initial_input; }; + template <> int base_lexer::handle_number<2> (); + template <> int base_lexer::handle_number<10> (); + template <> int base_lexer::handle_number<16> (); + class push_lexer : public base_lexer { diff -r dc8de424fc72 -r 96e7dc4c2214 libinterp/parse-tree/lex.ll --- a/libinterp/parse-tree/lex.ll Thu Jul 16 16:02:45 2020 -0400 +++ b/libinterp/parse-tree/lex.ll Thu Jul 16 18:27:39 2020 -0400 @@ -92,6 +92,7 @@ #include #include +#include #include #include #include @@ -251,6 +252,35 @@ } \ while (0) +#define HANDLE_NUMBER(PATTERN, BASE) \ + do \ + { \ + curr_lexer->lexer_debug (PATTERN); \ + \ + if (curr_lexer->previous_token_may_be_command () \ + && curr_lexer->space_follows_previous_token ()) \ + { \ + yyless (0); \ + curr_lexer->push_start_state (COMMAND_START); \ + } \ + else \ + { \ + int tok = curr_lexer->previous_token_value (); \ + \ + if (curr_lexer->whitespace_is_significant () \ + && curr_lexer->space_follows_previous_token () \ + && ! (tok == '[' || tok == '{' \ + || curr_lexer->previous_token_is_binop ())) \ + { \ + yyless (0); \ + unput (','); \ + } \ + else \ + return curr_lexer->handle_number (); \ + } \ + } \ + while (0) + #define HANDLE_IDENTIFIER(pattern, get_set) \ do \ { \ @@ -326,15 +356,40 @@ D_ [0-9_] S [ \t] NL ((\n)|(\r)|(\r\n)) -Im [iIjJ] CCHAR [#%] IDENT ([_$a-zA-Z][_$a-zA-Z0-9]*) FQIDENT ({IDENT}({S}*\.{S}*{IDENT})*) -EXPON ([DdEe][+-]?{D}{D_}*) -NUMBIN (0[bB][01_]+) -NUMHEX (0[xX][0-9a-fA-F][0-9a-fA-F_]*) -NUMREAL (({D}{D_}*\.?{D_}*{EXPON}?)|(\.{D}{D_}*{EXPON}?)) -NUMBER ({NUMREAL}|{NUMHEX}|{NUMBIN}) + +%{ +// Decimal numbers may be real or imaginary but always create +// double precision constants initially. Any conversion to single +// precision happens as part of an expression evaluation in the +// interpreter, not the lexer and parser. 
+%} + +DECIMAL_DIGITS ({D}{D_}*) +EXPONENT ([DdEe][+-]?{DECIMAL_DIGITS}) +REAL_DECIMAL ((({DECIMAL_DIGITS}\.?)|({DECIMAL_DIGITS}?\.{DECIMAL_DIGITS})){EXPONENT}?) +IMAG_DECIMAL ({REAL_DECIMAL}[IiJj]) +DECIMAL_NUMBER ({REAL_DECIMAL}|{IMAG_DECIMAL}) + +%{ +// It is possible to specify signedness and size for binary and +// hexadecimal numbers but there is no special syntax for imaginary +// constants. Binary and hexadecimal constants always create integer +// valued constants ({u,}int{8,16,32,64}). If a size is not specified, +// the smallest integer type that will hold the value is used. Negative +// values may be created with a signed size specification by applying +// twos-complement conversion (for example, 0xffs8 produces an 8-bit +// signed integer equal to -1 and 0b10000000s8 produces an 8-bit signed +// integer equal to -128). +%} + +SIZE_SUFFIX ([su](8|16|32|64)) +BINARY_BITS (0[bB][01][01_]*) +BINARY_NUMBER ({BINARY_BITS}|{BINARY_BITS}{SIZE_SUFFIX}) +HEXADECIMAL_BITS (0[xX][0-9a-fA-F][0-9a-fA-F_]*) +HEXADECIMAL_NUMBER ({HEXADECIMAL_BITS}|{HEXADECIMAL_BITS}{SIZE_SUFFIX}) ANY_EXCEPT_NL [^\r\n] ANY_INCLUDING_NL (.|{NL}) @@ -1183,66 +1238,24 @@ curr_lexer->pop_start_state (); } -%{ -// Imaginary numbers. -%} - -{NUMBER}{Im} { - curr_lexer->lexer_debug ("{NUMBER}{Im}"); - - if (curr_lexer->previous_token_may_be_command () - && curr_lexer->space_follows_previous_token ()) - { - yyless (0); - curr_lexer->push_start_state (COMMAND_START); - } - else - { - int tok = curr_lexer->previous_token_value (); - - if (curr_lexer->whitespace_is_significant () - && curr_lexer->space_follows_previous_token () - && ! (tok == '[' || tok == '{' - || curr_lexer->previous_token_is_binop ())) - { - yyless (0); - unput (','); - } - else - return curr_lexer->handle_number (true); - } +{BINARY_NUMBER} { + HANDLE_NUMBER ("{BINARY_NUMBER}", 2); } %{ -// Real numbers. Don't grab the '.' part of a dot operator as part of -// the constant. +// Decimal numbers. 
For expressions that are just digits followed +// directly by an element-by-element operator, don't grab the '.' +// part of the operator as part of the constant (for example, in an +// expression like "13./x"). %} -{D}{D_}*/\.[\*/\\^\'] | -{NUMBER} { - curr_lexer->lexer_debug ("{D}{D_}*/\\.[\\*/\\\\^\\']|{NUMBER}"); - - if (curr_lexer->previous_token_may_be_command () - && curr_lexer->space_follows_previous_token ()) - { - yyless (0); - curr_lexer->push_start_state (COMMAND_START); - } - else - { - int tok = curr_lexer->previous_token_value (); - - if (curr_lexer->whitespace_is_significant () - && curr_lexer->space_follows_previous_token () - && ! (tok == '[' || tok == '{' - || curr_lexer->previous_token_is_binop ())) - { - yyless (0); - unput (','); - } - else - return curr_lexer->handle_number (false); - } +{DECIMAL_DIGITS}/\.[\*/\\^\'] | +{DECIMAL_NUMBER} { + HANDLE_NUMBER ("{DECIMAL_DIGITS}/\\.[\\*/\\\\^\\']|{DECIMAL_NUMBER}", 10); + } + +{HEXADECIMAL_NUMBER} { + HANDLE_NUMBER ("{HEXADECIMAL_NUMBER}", 16); } %{ @@ -2928,65 +2941,190 @@ return (len > 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X')); } +static inline octave_value +make_integer_value (uintmax_t long_int_val, bool unsigned_val, int bytes) +{ + if (unsigned_val) + { + switch (bytes) + { + case 1: + return octave_value (octave_uint8 (long_int_val)); + + case 2: + return octave_value (octave_uint16 (long_int_val)); + + case 4: + return octave_value (octave_uint32 (long_int_val)); + + case 8: + return octave_value (octave_uint64 (long_int_val)); + + default: + panic_impossible (); + }; + } + else + { + // FIXME: Conversion to signed values is supposed to follow + // twos-complement rules. Do we need to be more carefule here? 
+ + switch (bytes) + { + case 1: + return octave_value (octave_int8 (int8_t (long_int_val))); + + case 2: + return octave_value (octave_int16 (int16_t (long_int_val))); + + case 4: + return octave_value (octave_int32 (int32_t (long_int_val))); + + case 8: + return octave_value (octave_int64 (int64_t (long_int_val))); + + default: + panic_impossible (); + }; + } + + return octave_value (); +} + namespace octave { + template <> int - base_lexer::handle_number (bool imag) + base_lexer::handle_number<2> (void) { - double value = 0.0; - int nread = 0; - - char *yytxt = flex_yytext (); - - // Strip any underscores - char *tmptxt = strsave (yytxt); - char *rptr = tmptxt; - char *wptr = tmptxt; - while (*rptr) - { - *wptr = *rptr++; - wptr += (*wptr != '_'); - } - *wptr = '\0'; - - if (looks_like_hex (tmptxt, strlen (tmptxt))) + // Skip 0[bB] prefix. + std::string yytxt (flex_yytext () + 2); + + yytxt.erase (std::remove (yytxt.begin (), yytxt.end (), '_'), + yytxt.end ()); + + size_t pos = yytxt.find_first_of ("su"); + + bool unsigned_val = true; + int bytes = -1; + std::string size_str; + if (pos == std::string::npos) { - uintmax_t long_int_value; - - nread = sscanf (tmptxt, "%jx", &long_int_value); - - value = static_cast (long_int_value); - } - else if (looks_like_bin (tmptxt, strlen (tmptxt))) - { - uintmax_t long_int_value = 0; - - for (size_t i = 0; i < strlen (tmptxt); i++) - { - if (tmptxt[i] == '0') - long_int_value <<= 1; - else if (tmptxt[i] == '1') - { - long_int_value <<= 1; - long_int_value += 1; - } - } - - value = static_cast (long_int_value); - - nread = 1; // Just to pass the assert stmt below + size_t num_digits = yytxt.length (); + + if (num_digits <= 8) + bytes = 1; + else if (num_digits <= 16) + bytes = 2; + else if (num_digits <= 32) + bytes = 4; + else if (num_digits <= 64) + bytes = 8; } else { - char *idx = strpbrk (tmptxt, "Dd"); - - if (idx) - *idx = 'e'; - - nread = sscanf (tmptxt, "%lf", &value); + unsigned_val = (yytxt[pos] == 'u'); + 
std::string size_str = yytxt.substr (pos+1); + yytxt = yytxt.substr (0, pos); + size_t num_digits = yytxt.length (); + + if (size_str == "8" && num_digits <= 8) + bytes = 1; + else if (size_str == "16" && num_digits <= 16) + bytes = 2; + else if (size_str == "32" && num_digits <= 32) + bytes = 4; + else if (size_str == "64" && num_digits <= 64) + bytes = 8; + } + + if (bytes < 0) + { + token *tok + = new token (LEXICAL_ERROR, + "too many digits for binary constant", + m_tok_beg, m_tok_end); + + push_token (tok); + + return count_token_internal (LEXICAL_ERROR); } - delete [] tmptxt; + // FIXME: is there a better way? Can uintmax_t be anything other + // than long or long long? Should we just be using uint64_t instead + // of uintmax_t? + + errno = 0; + char *end; + uintmax_t long_int_val; + if (sizeof (uintmax_t) == sizeof (unsigned long long)) + long_int_val = strtoull (yytxt.c_str (), &end, 2); + else if (sizeof (uintmax_t) == sizeof (unsigned long)) + long_int_val = strtoul (yytxt.c_str (), &end, 2); + else + panic_impossible (); + + if (errno == ERANGE) + panic_impossible (); + + octave_value ov_value + = make_integer_value (long_int_val, unsigned_val, bytes); + + m_looking_for_object_index = false; + m_at_beginning_of_statement = false; + + update_token_positions (flex_yyleng ()); + + push_token (new token (NUMBER, ov_value, yytxt, m_tok_beg, m_tok_end)); + + return count_token_internal (NUMBER); + } + + template <> + int + base_lexer::handle_number<10> (void) + { + bool imag = false; + + char *yytxt = flex_yytext (); + size_t yylng = flex_yyleng (); + + OCTAVE_LOCAL_BUFFER (char, tmptxt, yylng + 1); + char *rp = yytxt; + char *p = &tmptxt[0]; + + char ch; + while ((ch = *rp++)) + { + switch (ch) + { + case '_': + break; + + case 'D': + case 'd': + *p++ = 'e'; + break; + + case 'I': + case 'i': + case 'J': + case 'j': + imag = true; + break; + + default: + *p++ = ch; + break; + } + } + + *p = '\0'; + + double value = 0.0; + int nread = 0; + + nread = sscanf 
(tmptxt, "%lf", &value); // If yytext doesn't contain a valid number, we are in deep doo doo. @@ -2995,7 +3133,7 @@ m_looking_for_object_index = false; m_at_beginning_of_statement = false; - update_token_positions (flex_yyleng ()); + update_token_positions (yylng); octave_value ov_value = imag ? octave_value (Complex (0.0, value)) : octave_value (value); @@ -3005,6 +3143,84 @@ return count_token_internal (NUMBER); } + // Lex a hexadecimal literal, with an optional [su]{8,16,32,64} size + // suffix, into an integer-valued NUMBER token (or a LEXICAL_ERROR). + template <> + int + base_lexer::handle_number<16> (void) + { + // Skip 0[xX] prefix. + std::string yytxt (flex_yytext () + 2); + + yytxt.erase (std::remove (yytxt.begin (), yytxt.end (), '_'), + yytxt.end ()); + + size_t pos = yytxt.find_first_of ("su"); + + bool unsigned_val = true; + int bytes = -1; + if (pos == std::string::npos) + { + size_t num_digits = yytxt.length (); + + if (num_digits <= 2) + bytes = 1; + else if (num_digits <= 4) + bytes = 2; + else if (num_digits <= 8) + bytes = 4; + else if (num_digits <= 16) + bytes = 8; + } + else + { + unsigned_val = (yytxt[pos] == 'u'); + std::string size_str = yytxt.substr (pos+1); + yytxt = yytxt.substr (0, pos); + size_t num_digits = yytxt.length (); + + if (size_str == "8" && num_digits <= 2) + bytes = 1; + else if (size_str == "16" && num_digits <= 4) + bytes = 2; + else if (size_str == "32" && num_digits <= 8) + bytes = 4; + else if (size_str == "64" && num_digits <= 16) + bytes = 8; + } + + if (bytes < 0) + { + token *tok + = new token (LEXICAL_ERROR, + "too many digits for hexadecimal constant", + m_tok_beg, m_tok_end); + + push_token (tok); + + return count_token_internal (LEXICAL_ERROR); + } + + // Convert outside of assert: with NDEBUG the assert expression, and + // thus the sscanf call, would be compiled out entirely. 
+ + uintmax_t long_int_val = 0; + if (sscanf (yytxt.c_str (), "%jx", &long_int_val) != 1) + panic_impossible (); + + octave_value ov_value + = make_integer_value (long_int_val, unsigned_val, bytes); + + m_looking_for_object_index = false; + m_at_beginning_of_statement = false; + + update_token_positions (flex_yyleng ()); + + push_token (new token (NUMBER, ov_value, yytxt, m_tok_beg, m_tok_end)); + + return count_token_internal (NUMBER); + } + void base_lexer::handle_continuation (void) { diff -r dc8de424fc72 -r 96e7dc4c2214 test/parser.tst --- a/test/parser.tst Thu Jul 16 16:02:45 2020 -0400 +++ b/test/parser.tst Thu Jul 16 18:27:39 2020 -0400 @@ -286,19 +286,19 @@ %!assert (123_456, 123456) %!assert (.123_456, .123456) %!assert (123_456.123_456, 123456.123456) -%!assert (0xAB_CD, 43981) +%!assert (0xAB_CD, uint16 (43981)) %!assert (2e0_1, 20) ## Test binary constants -%!assert (0b101, 5) +%!assert (0b101, uint8 (5)) %!assert (0B1100_0001, 0xC1) -%!assert (class (0b1), "double") +%!assert (class (0b1), "uint8") ## Test range of large binary and hexadecimal literals -%!assert (0x8000_0000_0000_0000, 2^63) -%!assert (0xFFFF_FFFF_FFFF_FFFF, 2^64) -%!assert (0b10000000_0000000_000000000_00000000_00000000_00000000_00000000_00000000, 2^63) -%!assert (0b11111111_1111111_111111111_11111111_11111111_11111111_11111111_11111111, 2^64) +%!assert (0x8000_0000_0000_0000, uint64 (2^63)) +%!assert (0xFFFF_FFFF_FFFF_FFFF, uint64 (2^64)) +%!assert (0b10000000_0000000_000000000_00000000_00000000_00000000_00000000_00000000, uint64 (2^63)) +%!assert (0b11111111_1111111_111111111_11111111_11111111_11111111_11111111_11111111, uint64 (2^64)) ## Test creation of anonymous functions