changeset 28583:96e7dc4c2214

improve Matlab compatibility for binary and hexadecimal constants * parser.tst: Update tests for new parsing rules for hex and binary integer literals. * lex.h, lex.ll (handle_number): Now a template with three specializations (binary, decimal, hexadecimal). Eliminate IMAG argument. (HANDLE_NUMBER): New macro. Use it to process binary, decimal, and hexadecimal constants with separate lexer patterns. (Im): Delete pattern macro. (EXPON, NUMBIN, NUMHEX, NUMREAL, NUMBER): Replace with DECIMAL_DIGITS, EXPONENT, REAL_DECIMAL, IMAG_DECIMAL, and DECIMAL_NUMBER for matching decimal numbers and SIZE_SUFFIX, BINARY_BITS, BINARY_NUMBER, HEXADECIMAL_BITS, and HEXADECIMAL_NUMBER for matching binary and hexadecimal numbers. ({NUMBER}{Im}): Delete rule. ({BINARY_NUMBER}, {HEXADECIMAL_NUMBER}): New rules. ({DECIMAL_DIGITS}/\.[\*/\\^\']|{DECIMAL_NUMBER}): New rule to replace {D}{D_}*/\.[\*/\\^\']|{NUMBER}. (make_integer_value): New static function. NEWS: Note change.
author John W. Eaton <jwe@octave.org>
date Thu, 16 Jul 2020 18:27:39 -0400
parents dc8de424fc72
children d40274873cb3
files NEWS libinterp/parse-tree/lex.h libinterp/parse-tree/lex.ll test/parser.tst
diffstat 4 files changed, 357 insertions(+), 119 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS	Thu Jul 16 16:02:45 2020 -0400
+++ b/NEWS	Thu Jul 16 18:27:39 2020 -0400
@@ -61,6 +61,24 @@
 
 - The function `dec2bin` and `dec2hex` now support negative numbers.
 
+- Binary and hexadecimal constants like `0b101` and `0xDEADBEEF` now create
+integers (unsigned by default) with sizes determined from the number of
+digits present.  So, for example, `0xff` creates an `uint8` value and
+`0xDEADBEEF` creates a `uint64` value.  Binary constants are limited to 64
+binary digits and hexadecimal constants are limited to 16 hexadecimal
+digits with no automatic rounding or conversion to floating point
+values.  Note that this may cause trouble for existing code.  For
+example, an expression like `[0x1; 0x100; 0x10000]` will be uint8 (because
+of the rules of concatenating integers of different sizes) with the
+larger values truncated (because of the saturation semantics of integer
+values).  To avoid these kinds of problems, pad constants in array
+expressions with leading zeros so that they use the same number of
+digits for each value.  For example, `[0x01_00_00; 0x00_01_00;
+0x01_00_00]`.  You may also use a suffix of the form `s8`, `s16`, `s32`,
+`s64`, `u8`, `u16`, `u32`, or `u64` to explicitly specify the data type
+to use (`u` or `s` to indicate signed or unsigned and the number to
+indicate the integer size).
+
 - The function `importdata` now produces more compatible results when
 the file contains a 2-D text matrix.
 
--- a/libinterp/parse-tree/lex.h	Thu Jul 16 16:02:45 2020 -0400
+++ b/libinterp/parse-tree/lex.h	Thu Jul 16 18:27:39 2020 -0400
@@ -661,7 +661,9 @@
 
     bool whitespace_is_significant (void);
 
-    int handle_number (bool imag);
+    // We only provide specializations with base equal to 2, 10, or 16.
+    template <int base>
+    int handle_number (void);
 
     void handle_continuation (void);
 
@@ -822,6 +824,10 @@
     bool m_initial_input;
   };
 
+  template <> int base_lexer::handle_number<2> ();
+  template <> int base_lexer::handle_number<10> ();
+  template <> int base_lexer::handle_number<16> ();
+
   class
   push_lexer : public base_lexer
   {
--- a/libinterp/parse-tree/lex.ll	Thu Jul 16 16:02:45 2020 -0400
+++ b/libinterp/parse-tree/lex.ll	Thu Jul 16 18:27:39 2020 -0400
@@ -92,6 +92,7 @@
 #include <cctype>
 #include <cstring>
 
+#include <algorithm>
 #include <iostream>
 #include <set>
 #include <sstream>
@@ -251,6 +252,35 @@
      }                                                  \
    while (0)
 
+#define HANDLE_NUMBER(PATTERN, BASE)                            \
+  do                                                            \
+    {                                                           \
+     curr_lexer->lexer_debug (PATTERN);                         \
+                                                                \
+     if (curr_lexer->previous_token_may_be_command ()           \
+         &&  curr_lexer->space_follows_previous_token ())       \
+       {                                                        \
+         yyless (0);                                            \
+         curr_lexer->push_start_state (COMMAND_START);          \
+       }                                                        \
+     else                                                       \
+       {                                                        \
+         int tok = curr_lexer->previous_token_value ();         \
+                                                                \
+         if (curr_lexer->whitespace_is_significant ()           \
+             && curr_lexer->space_follows_previous_token ()     \
+             && ! (tok == '[' || tok == '{'                     \
+                   || curr_lexer->previous_token_is_binop ()))  \
+           {                                                    \
+             yyless (0);                                        \
+             unput (',');                                       \
+           }                                                    \
+         else                                                   \
+           return curr_lexer->handle_number<BASE> ();           \
+       }                                                        \
+    }                                                           \
+  while (0)
+
 #define HANDLE_IDENTIFIER(pattern, get_set)                             \
    do                                                                   \
      {                                                                  \
@@ -326,15 +356,40 @@
 D_      [0-9_]
 S       [ \t]
 NL      ((\n)|(\r)|(\r\n))
-Im      [iIjJ]
 CCHAR   [#%]
 IDENT   ([_$a-zA-Z][_$a-zA-Z0-9]*)
 FQIDENT ({IDENT}({S}*\.{S}*{IDENT})*)
-EXPON   ([DdEe][+-]?{D}{D_}*)
-NUMBIN  (0[bB][01_]+)
-NUMHEX  (0[xX][0-9a-fA-F][0-9a-fA-F_]*)
-NUMREAL (({D}{D_}*\.?{D_}*{EXPON}?)|(\.{D}{D_}*{EXPON}?))
-NUMBER  ({NUMREAL}|{NUMHEX}|{NUMBIN})
+
+%{
+// Decimal numbers may be real or imaginary but always create
+// double precision constants initially.  Any conversion to single
+// precision happens as part of an expression evaluation in the
+// interpreter, not the lexer and parser.
+%}
+
+DECIMAL_DIGITS ({D}{D_}*)
+EXPONENT       ([DdEe][+-]?{DECIMAL_DIGITS})
+REAL_DECIMAL   ((({DECIMAL_DIGITS}\.?)|({DECIMAL_DIGITS}?\.{DECIMAL_DIGITS})){EXPONENT}?)
+IMAG_DECIMAL   ({REAL_DECIMAL}[IiJj])
+DECIMAL_NUMBER ({REAL_DECIMAL}|{IMAG_DECIMAL})
+
+%{
+// It is possible to specify signedness and size for binary and
+// hexadecimal numbers but there is no special syntax for imaginary
+// constants.  Binary and hexadecimal constants always create integer
+// valued constants ({u,}int{8,16,32,64}).  If a size is not specified,
+// the smallest integer type that will hold the value is used.  Negative
+// values may be created with a signed size specification by applying
+// twos-complement conversion (for example, 0xffs8 produces an 8-bit
+// signed integer equal to -1 and 0b10000000s8 produces an 8-bit signed
+// integer equal to -128).
+%}
+
+SIZE_SUFFIX        ([su](8|16|32|64))
+BINARY_BITS        (0[bB][01][01_]*)
+BINARY_NUMBER      ({BINARY_BITS}|{BINARY_BITS}{SIZE_SUFFIX})
+HEXADECIMAL_BITS   (0[xX][0-9a-fA-F][0-9a-fA-F_]*)
+HEXADECIMAL_NUMBER ({HEXADECIMAL_BITS}|{HEXADECIMAL_BITS}{SIZE_SUFFIX})
 
 ANY_EXCEPT_NL [^\r\n]
 ANY_INCLUDING_NL (.|{NL})
@@ -1183,66 +1238,24 @@
     curr_lexer->pop_start_state ();
   }
 
-%{
-// Imaginary numbers.
-%}
-
-{NUMBER}{Im} {
-    curr_lexer->lexer_debug ("{NUMBER}{Im}");
-
-    if (curr_lexer->previous_token_may_be_command ()
-        &&  curr_lexer->space_follows_previous_token ())
-      {
-        yyless (0);
-        curr_lexer->push_start_state (COMMAND_START);
-      }
-    else
-      {
-        int tok = curr_lexer->previous_token_value ();
-
-        if (curr_lexer->whitespace_is_significant ()
-            && curr_lexer->space_follows_previous_token ()
-            && ! (tok == '[' || tok == '{'
-                  || curr_lexer->previous_token_is_binop ()))
-          {
-            yyless (0);
-            unput (',');
-          }
-        else
-          return curr_lexer->handle_number (true);
-      }
+{BINARY_NUMBER} {
+    HANDLE_NUMBER ("{BINARY_NUMBER}", 2);
   }
 
 %{
-// Real numbers.  Don't grab the '.' part of a dot operator as part of
-// the constant.
+// Decimal numbers.  For expressions that are just digits followed
+// directly by an element-by-element operator, don't grab the '.'
+// part of the operator as part of the constant (for example, in an
+// expression like "13./x").
 %}
 
-{D}{D_}*/\.[\*/\\^\'] |
-{NUMBER} {
-    curr_lexer->lexer_debug ("{D}{D_}*/\\.[\\*/\\\\^\\']|{NUMBER}");
-
-    if (curr_lexer->previous_token_may_be_command ()
-        &&  curr_lexer->space_follows_previous_token ())
-      {
-        yyless (0);
-        curr_lexer->push_start_state (COMMAND_START);
-      }
-    else
-      {
-        int tok = curr_lexer->previous_token_value ();
-
-        if (curr_lexer->whitespace_is_significant ()
-            && curr_lexer->space_follows_previous_token ()
-            && ! (tok == '[' || tok == '{'
-                  || curr_lexer->previous_token_is_binop ()))
-          {
-            yyless (0);
-            unput (',');
-          }
-        else
-          return curr_lexer->handle_number (false);
-      }
+{DECIMAL_DIGITS}/\.[\*/\\^\'] |
+{DECIMAL_NUMBER} {
+    HANDLE_NUMBER ("{DECIMAL_DIGITS}/\\.[\\*/\\\\^\\']|{DECIMAL_NUMBER}", 10);
+  }
+
+{HEXADECIMAL_NUMBER} {
+    HANDLE_NUMBER ("{HEXADECIMAL_NUMBER}", 16);
   }
 
 %{
@@ -2928,65 +2941,190 @@
   return (len > 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X'));
 }
 
+static inline octave_value
+make_integer_value (uintmax_t long_int_val, bool unsigned_val, int bytes)
+{
+  if (unsigned_val)
+    {
+     switch (bytes)
+       {
+       case 1:
+         return octave_value (octave_uint8 (long_int_val));
+
+       case 2:
+         return octave_value (octave_uint16 (long_int_val));
+
+       case 4:
+         return octave_value (octave_uint32 (long_int_val));
+
+       case 8:
+         return octave_value (octave_uint64 (long_int_val));
+
+       default:
+         panic_impossible ();
+       };
+    }
+  else
+    {
+      // FIXME: Conversion to signed values is supposed to follow
+      // twos-complement rules.  Do we need to be more carefule here?
+
+      switch (bytes)
+        {
+        case 1:
+          return octave_value (octave_int8 (int8_t (long_int_val)));
+
+        case 2:
+          return octave_value (octave_int16 (int16_t (long_int_val)));
+
+        case 4:
+          return octave_value (octave_int32 (int32_t (long_int_val)));
+
+        case 8:
+        return octave_value (octave_int64 (int64_t (long_int_val)));
+
+        default:
+          panic_impossible ();
+        };
+    }
+
+  return octave_value ();
+}
+
 namespace octave
 {
+  template <>
   int
-  base_lexer::handle_number (bool imag)
+  base_lexer::handle_number<2> (void)
   {
-    double value = 0.0;
-    int nread = 0;
-
-    char *yytxt = flex_yytext ();
-
-    // Strip any underscores
-    char *tmptxt = strsave (yytxt);
-    char *rptr = tmptxt;
-    char *wptr = tmptxt;
-    while (*rptr)
-      {
-        *wptr = *rptr++;
-        wptr += (*wptr != '_');
-      }
-    *wptr = '\0';
-
-    if (looks_like_hex (tmptxt, strlen (tmptxt)))
+    // Skip 0[bB] prefix.
+    std::string yytxt (flex_yytext () + 2);
+
+    yytxt.erase (std::remove (yytxt.begin (), yytxt.end (), '_'),
+                 yytxt.end ());
+
+    size_t pos = yytxt.find_first_of ("su");
+
+    bool unsigned_val = true;
+    int bytes = -1;
+    std::string size_str;
+    if (pos == std::string::npos)
       {
-        uintmax_t long_int_value;
-
-        nread = sscanf (tmptxt, "%jx", &long_int_value);
-
-        value = static_cast<double> (long_int_value);
-      }
-    else if (looks_like_bin (tmptxt, strlen (tmptxt)))
-      {
-        uintmax_t long_int_value = 0;
-
-        for (size_t i = 0; i < strlen (tmptxt); i++)
-          {
-            if (tmptxt[i] == '0')
-              long_int_value <<= 1;
-            else if (tmptxt[i] == '1')
-            {
-              long_int_value <<= 1;
-              long_int_value += 1;
-            }
-          }
-
-        value = static_cast<double> (long_int_value);
-
-        nread = 1;  // Just to pass the assert stmt below
+        size_t num_digits = yytxt.length ();
+
+        if (num_digits <= 8)
+          bytes = 1;
+        else if (num_digits <= 16)
+          bytes = 2;
+        else if (num_digits <= 32)
+          bytes = 4;
+        else if (num_digits <= 64)
+          bytes = 8;
       }
     else
       {
-        char *idx = strpbrk (tmptxt, "Dd");
-
-        if (idx)
-          *idx = 'e';
-
-        nread = sscanf (tmptxt, "%lf", &value);
+        unsigned_val = (yytxt[pos] == 'u');
+        std::string size_str = yytxt.substr (pos+1);
+        yytxt = yytxt.substr (0, pos);
+        size_t num_digits = yytxt.length ();
+
+        if (size_str == "8" && num_digits <= 8)
+          bytes = 1;
+        else if (size_str == "16" && num_digits <= 16)
+          bytes = 2;
+        else if (size_str == "32" && num_digits <= 32)
+          bytes = 4;
+        else if (size_str == "64" && num_digits <= 64)
+          bytes = 8;
+      }
+
+    if (bytes < 0)
+      {
+        token *tok
+          = new token (LEXICAL_ERROR,
+                       "too many digits for binary constant",
+                       m_tok_beg, m_tok_end);
+
+        push_token (tok);
+
+        return count_token_internal (LEXICAL_ERROR);
       }
 
-    delete [] tmptxt;
+    // FIXME: is there a better way?  Can uintmax_t be anything other
+    // than long or long long?  Should we just be using uint64_t instead
+    // of uintmax_t?
+
+    errno = 0;
+    char *end;
+    uintmax_t long_int_val;
+    if (sizeof (uintmax_t) == sizeof (unsigned long long))
+      long_int_val = strtoull (yytxt.c_str (), &end, 2);
+    else if (sizeof (uintmax_t) == sizeof (unsigned long))
+      long_int_val = strtoul (yytxt.c_str (), &end, 2);
+    else
+      panic_impossible ();
+
+    if (errno == ERANGE)
+      panic_impossible ();
+
+    octave_value ov_value
+      = make_integer_value (long_int_val, unsigned_val, bytes);
+
+    m_looking_for_object_index = false;
+    m_at_beginning_of_statement = false;
+
+    update_token_positions (flex_yyleng ());
+
+    push_token (new token (NUMBER, ov_value, yytxt, m_tok_beg, m_tok_end));
+
+    return count_token_internal (NUMBER);
+  }
+
+  template <>
+  int
+  base_lexer::handle_number<10> (void)
+  {
+    bool imag = false;
+
+    char *yytxt = flex_yytext ();
+    size_t yylng = flex_yyleng ();
+
+    OCTAVE_LOCAL_BUFFER (char, tmptxt, yylng + 1);
+    char *rp = yytxt;
+    char *p = &tmptxt[0];
+
+    char ch;
+    while ((ch = *rp++))
+      {
+        switch (ch)
+          {
+          case '_':
+            break;
+
+          case 'D':
+          case 'd':
+            *p++ = 'e';
+            break;
+
+          case 'I':
+          case 'i':
+          case 'J':
+          case 'j':
+            imag = true;
+            break;
+
+          default:
+            *p++ = ch;
+            break;
+          }
+      }
+
+    *p = '\0';
+
+    double value = 0.0;
+    int nread = 0;
+
+    nread = sscanf (tmptxt, "%lf", &value);
 
     // If yytext doesn't contain a valid number, we are in deep doo doo.
 
@@ -2995,7 +3133,7 @@
     m_looking_for_object_index = false;
     m_at_beginning_of_statement = false;
 
-    update_token_positions (flex_yyleng ());
+    update_token_positions (yylng);
 
     octave_value ov_value
       = imag ? octave_value (Complex (0.0, value)) : octave_value (value);
@@ -3005,6 +3143,82 @@
     return count_token_internal (NUMBER);
   }
 
+  template <>
+  int
+  base_lexer::handle_number<16> (void)
+  {
+    // Skip 0[xX] prefix.
+    std::string yytxt (flex_yytext () + 2);
+
+    yytxt.erase (std::remove (yytxt.begin (), yytxt.end (), '_'),
+                 yytxt.end ());
+
+    size_t pos = yytxt.find_first_of ("su");
+
+    bool unsigned_val = true;
+    int bytes = -1;
+    std::string size_str;
+    if (pos == std::string::npos)
+      {
+        size_t num_digits = yytxt.length ();
+
+        if (num_digits <= 2)
+          bytes = 1;
+        else if (num_digits <= 4)
+          bytes = 2;
+        else if (num_digits <= 8)
+          bytes = 4;
+        else if (num_digits <= 16)
+          bytes = 8;
+      }
+    else
+      {
+        unsigned_val = (yytxt[pos] == 'u');
+        std::string size_str = yytxt.substr (pos+1);
+        yytxt = yytxt.substr (0, pos);
+        size_t num_digits = yytxt.length ();
+
+        if (size_str == "8" && num_digits <= 2)
+          bytes = 1;
+        else if (size_str == "16" && num_digits <= 4)
+          bytes = 2;
+        else if (size_str == "32" && num_digits <= 8)
+          bytes = 4;
+        else if (size_str == "64" && num_digits <= 16)
+          bytes = 8;
+      }
+
+    if (bytes < 0)
+      {
+        token *tok
+          = new token (LEXICAL_ERROR,
+                       "too many digits for hexadecimal constant",
+                       m_tok_beg, m_tok_end);
+
+        push_token (tok);
+
+        return count_token_internal (LEXICAL_ERROR);
+      }
+
+    // Assert here because if yytext doesn't contain a valid number, we
+    // are in deep doo doo.
+
+    uintmax_t long_int_val;
+    assert (sscanf (yytxt.c_str (), "%jx", &long_int_val));
+
+    octave_value ov_value
+      = make_integer_value (long_int_val, unsigned_val, bytes);
+
+    m_looking_for_object_index = false;
+    m_at_beginning_of_statement = false;
+
+    update_token_positions (flex_yyleng ());
+
+    push_token (new token (NUMBER, ov_value, yytxt, m_tok_beg, m_tok_end));
+
+    return count_token_internal (NUMBER);
+  }
+
   void
   base_lexer::handle_continuation (void)
   {
--- a/test/parser.tst	Thu Jul 16 16:02:45 2020 -0400
+++ b/test/parser.tst	Thu Jul 16 18:27:39 2020 -0400
@@ -286,19 +286,19 @@
 %!assert (123_456, 123456)
 %!assert (.123_456, .123456)
 %!assert (123_456.123_456, 123456.123456)
-%!assert (0xAB_CD, 43981)
+%!assert (0xAB_CD, uint16 (43981))
 %!assert (2e0_1, 20)
 
 ## Test binary constants
-%!assert (0b101, 5)
+%!assert (0b101, uint8 (5))
 %!assert (0B1100_0001, 0xC1)
-%!assert (class (0b1), "double")
+%!assert (class (0b1), "uint8")
 
 ## Test range of large binary and hexadecimal literals
-%!assert (0x8000_0000_0000_0000, 2^63)
-%!assert (0xFFFF_FFFF_FFFF_FFFF, 2^64)
-%!assert (0b10000000_0000000_000000000_00000000_00000000_00000000_00000000_00000000, 2^63)
-%!assert (0b11111111_1111111_111111111_11111111_11111111_11111111_11111111_11111111, 2^64)
+%!assert (0x8000_0000_0000_0000, uint64 (2^63))
+%!assert (0xFFFF_FFFF_FFFF_FFFF, uint64 (2^64))
+%!assert (0b10000000_0000000_000000000_00000000_00000000_00000000_00000000_00000000, uint64 (2^63))
+%!assert (0b11111111_1111111_111111111_11111111_11111111_11111111_11111111_11111111, uint64 (2^64))
 
 ## Test creation of anonymous functions