changeset 31424:d1165473e4b0

allow Octave to use PCRE2 (bug #61542) * acinclude.m4 (OCTAVE_CHECK_LIB_PCRE2_OK): New macro. * configure.ac: Check for libpcre2-8 and pcre2.h before checking for libpcre and pcre.h. If PCRE2 is found, also define PCRE_CPPFLAGS, PCRE_LDFLAGS, and PCRE_LIBS. Error if neither library is found. * lo-regexp.h (regexp::match_element::match_element): New constructor that accepts int values for start and end. * lo-regexp.cc: Include either PCRE2 or PCRE headers. Fail if neither HAVE_PCRE2 or HAVE_PCRE is defined. (octave_pcre_code, OCTAVE_PCRE_SIZE): New typedefs. (OCTAVE_PCRE_CASELESS, OCTAVE_PCRE_DOTALL, OCTAVE_PCRE_MULTILINE, OCTAVE_PCRE_EXTENDED, OCTAVE_PCRE_UTF, OCTAVE_PCRE_INFO_CAPTURECOUNT, OCTAVE_PCRE_INFO_NAMECOUNT, OCTAVE_PCRE_INFO_NAMEENTRYSIZE, OCTAVE_PCRE_INFO_NAMETABLE): New macro definitions. (octave_pcre_code_free): Define function pointer that references either pcre2_code_free or pcre_free depending on the library in use. (regexp::free): Call octave_pcre_code_free to free m_code. (octave_pcre_pattern_info): New function. (regexp::compile_internal): Allow use of either PCRE2 or PCRE. Use new macros and functions to hide differences in library interfaces.
author Rafael Laboissiere <rafael@laboissiere.net>
date Sun, 13 Nov 2022 10:17:17 -0500
parents fcd4bc97f5f7
children 9f4a9dd4a6ee
files configure.ac liboctave/util/lo-regexp.cc liboctave/util/lo-regexp.h m4/acinclude.m4
diffstat 4 files changed, 240 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/configure.ac	Fri Nov 11 15:40:45 2022 -0500
+++ b/configure.ac	Sun Nov 13 10:17:17 2022 -0500
@@ -1375,14 +1375,42 @@
     [Define to 1 to build experimental Virtual Machine evaluator.])
 fi
 
-### Check for PCRE regex library.
-
-OCTAVE_CHECK_LIB(pcre, PCRE,
-  [], [pcre.h pcre/pcre.h], [pcre_compile], [], [],
-  [OCTAVE_CHECK_LIB_PCRE_OK([],
-    [AC_MSG_ERROR([PCRE library must be built with UTF support (--enable-utf)])])
-  ],
-  [libpcre], [REQUIRED])
+### Check for PCRE2 or PCRE regex library, requiring one to exist.
+
+have_pcre2=no
+have_pcre=no
+save_CPPFLAGS="$CPPFLAGS"
+CPPFLAGS="-DPCRE2_CODE_UNIT_WIDTH=8 $CPPFLAGS"
+OCTAVE_CHECK_LIB(pcre2, PCRE2,
+  [], [pcre2.h pcre2/pcre2.h], [pcre2_compile_8], [], [],
+  [OCTAVE_CHECK_LIB_PCRE2_OK([have_pcre2=yes],
+    [AC_MSG_ERROR([PCRE2 library must be built with UTF support (--enable-utf)])])],
+  [libpcre2-8])
+CPPFLAGS="$save_CPPFLAGS"
+
+if test $have_pcre2 = no; then
+  OCTAVE_CHECK_LIB(pcre, PCRE,
+    [], [pcre.h pcre/pcre.h], [pcre_compile], [], [],
+    [OCTAVE_CHECK_LIB_PCRE_OK([have_pcre=yes],
+      [AC_MSG_ERROR([PCRE library must be built with UTF support (--enable-utf)])])],
+    [libpcre])
+fi
+
+if test $have_pcre2 = yes; then
+  AC_DEFINE(HAVE_PCRE2, 1, [Define to 1 if PCRE2 is available.])
+
+  ## Only one of PCRE2 or PCRE is used, so avoid having to define and use
+  ## both PCRE2_* and PCRE_* variables everywhere.
+
+  PCRE_CPPFLAGS="$PCRE2_CPPFLAGS"
+  PCRE_LDFLAGS="$PCRE2_LDFLAGS"
+  PCRE_LIBS="$PCRE2_LIBS"
+
+elif test $have_pcre = yes; then
+  AC_DEFINE(HAVE_PCRE, 1, [Define to 1 if PCRE is available.])
+else
+  AC_MSG_ERROR([to build Octave, you must have the PCRE or PCRE2 library and header files installed])
+fi
 
 ### Check for Qhull library.
 
--- a/liboctave/util/lo-regexp.cc	Fri Nov 11 15:40:45 2022 -0500
+++ b/liboctave/util/lo-regexp.cc	Sun Nov 13 10:17:17 2022 -0500
@@ -32,10 +32,19 @@
 #include <string>
 #include <vector>
 
-#if defined (HAVE_PCRE_H)
-#  include <pcre.h>
-#elif defined (HAVE_PCRE_PCRE_H)
-#  include <pcre/pcre.h>
+#if defined (HAVE_PCRE2_H) || defined (HAVE_PCRE2_PCRE2_H)
+#  define PCRE2_CODE_UNIT_WIDTH 8
+#  if defined (HAVE_PCRE2_H)
+#    include <pcre2.h>
+#  elif defined (HAVE_PCRE2_PCRE2_H)
+#    include <pcre2/pcre2.h>
+#  endif
+#elif defined (HAVE_PCRE_H) || defined (HAVE_PCRE_PCRE_H)
+#  if defined (HAVE_PCRE_H)
+#    include <pcre.h>
+#  elif defined (HAVE_PCRE_PCRE_H)
+#    include <pcre/pcre.h>
+#  endif
 #endif
 
 #include "Matrix.h"
@@ -46,6 +55,47 @@
 #include "lo-regexp.h"
 #include "str-vec.h"
 #include "unistr-wrappers.h"
+#include "unwind-prot.h"
+
+#if defined (HAVE_PCRE2)
+typedef pcre2_code octave_pcre_code;
+typedef PCRE2_SIZE OCTAVE_PCRE_SIZE;
+static void (*octave_pcre_code_free) (octave_pcre_code *) = pcre2_code_free;
+#  define OCTAVE_PCRE_CASELESS PCRE2_CASELESS
+#  define OCTAVE_PCRE_DOTALL PCRE2_DOTALL
+#  define OCTAVE_PCRE_MULTILINE PCRE2_MULTILINE
+#  define OCTAVE_PCRE_EXTENDED PCRE2_EXTENDED
+#  define OCTAVE_PCRE_UTF PCRE2_UTF
+#  define OCTAVE_PCRE_INFO_CAPTURECOUNT PCRE2_INFO_CAPTURECOUNT
+#  define OCTAVE_PCRE_INFO_NAMECOUNT PCRE2_INFO_NAMECOUNT
+#  define OCTAVE_PCRE_INFO_NAMEENTRYSIZE PCRE2_INFO_NAMEENTRYSIZE
+#  define OCTAVE_PCRE_INFO_NAMETABLE PCRE2_INFO_NAMETABLE
+#elif defined (HAVE_PCRE)
+typedef pcre octave_pcre_code;
+typedef int OCTAVE_PCRE_SIZE;
+static void (*octave_pcre_code_free) (void *) = pcre_free;
+#  define OCTAVE_PCRE_CASELESS PCRE_CASELESS
+#  define OCTAVE_PCRE_DOTALL PCRE_DOTALL
+#  define OCTAVE_PCRE_MULTILINE PCRE_MULTILINE
+#  define OCTAVE_PCRE_EXTENDED PCRE_EXTENDED
+#  define OCTAVE_PCRE_UTF PCRE_UTF8
+#  define OCTAVE_PCRE_INFO_CAPTURECOUNT PCRE_INFO_CAPTURECOUNT
+#  define OCTAVE_PCRE_INFO_NAMECOUNT PCRE_INFO_NAMECOUNT
+#  define OCTAVE_PCRE_INFO_NAMEENTRYSIZE PCRE_INFO_NAMEENTRYSIZE
+#  define OCTAVE_PCRE_INFO_NAMETABLE PCRE_INFO_NAMETABLE
+#else
+#  error "PCRE2 or PCRE library is required to build Octave"
+#endif
+
+static inline int
+octave_pcre_pattern_info (const octave_pcre_code *code, int what, void *where)
+{
+#if defined (HAVE_PCRE2)
+  return pcre2_pattern_info (code, what, where);
+#else  // HAVE_PCRE
+  return pcre_fullinfo (code, nullptr, what, where);
+#endif  // HAVE_PCRE2
+}
 
 namespace octave
 {
@@ -64,8 +114,7 @@
   void
   regexp::free (void)
   {
-    if (m_code)
-      pcre_free (static_cast<pcre *> (m_code));
+    octave_pcre_code_free (static_cast<octave_pcre_code *> (m_code));
   }
 
   void
@@ -229,22 +278,50 @@
     while ((pos = buf_str.find ('\0')) != std::string::npos)
       buf_str.replace (pos, 1, "\\000");
 
+    int pcre_options
+      = (  (m_options.case_insensitive () ? OCTAVE_PCRE_CASELESS : 0)
+         | (m_options.dotexceptnewline () ? 0 : OCTAVE_PCRE_DOTALL)
+         | (m_options.lineanchors () ? OCTAVE_PCRE_MULTILINE : 0)
+         | (m_options.freespacing () ? OCTAVE_PCRE_EXTENDED : 0)
+         | OCTAVE_PCRE_UTF);
+
+#if defined (HAVE_PCRE2)
+    PCRE2_SIZE erroffset;
+    int errnumber;
+
+    m_code = pcre2_compile (reinterpret_cast<PCRE2_SPTR> (buf_str.c_str ()),
+                            PCRE2_ZERO_TERMINATED, pcre_options,
+                            &errnumber, &erroffset, nullptr);
+
+    if (! m_code)
+      {
+        // PCRE docs say:
+        //
+        //   If the buffer is too small, the message is truncated (but
+        //   still with a trailing zero), and the negative error code
+        //   PCRE2_ERROR_NOMEMORY is returned. None of the messages are
+        //   very long; a buffer size of 120 code units is ample.
+        //
+        // so we assume that 256 will be large enough to avoid truncated
+        // messages.
+
+        PCRE2_UCHAR err [256];
+        pcre2_get_error_message (errnumber, err, sizeof (err));
+        (*current_liboctave_error_handler)
+          ("%s: %s at position %zu of expression", m_who.c_str (), err,
+           erroffset);
+      }
+#else
     const char *err;
     int erroffset;
 
-    int pcre_options
-      = (  (m_options.case_insensitive () ? PCRE_CASELESS : 0)
-         | (m_options.dotexceptnewline () ? 0 : PCRE_DOTALL)
-         | (m_options.lineanchors () ? PCRE_MULTILINE : 0)
-         | (m_options.freespacing () ? PCRE_EXTENDED : 0)
-         | PCRE_UTF8);
-
     m_code = pcre_compile (buf_str.c_str (), pcre_options,
                            &err, &erroffset, nullptr);
 
     if (! m_code)
       (*current_liboctave_error_handler)
         ("%s: %s at position %d of expression", m_who.c_str (), err, erroffset);
+#endif
   }
 
   regexp::match_data
@@ -266,14 +343,17 @@
     char *nametable;
     std::size_t idx = 0;
 
-    pcre *re = static_cast<pcre *> (m_code);
+    octave_pcre_code *re = static_cast<octave_pcre_code *> (m_code);
 
-    pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT,  &subpatterns);
-    pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount);
-    pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
-    pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable);
+    octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_CAPTURECOUNT, &subpatterns);
+    octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_NAMECOUNT, &namecount);
+    octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
+    octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_NAMETABLE, &nametable);
 
-    OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
+#if defined (HAVE_PCRE)
+    OCTAVE_LOCAL_BUFFER (OCTAVE_PCRE_SIZE, ovector, (subpatterns+1)*3);
+#endif
+
     OCTAVE_LOCAL_BUFFER (int, nidx, namecount);
 
     for (int i = 0; i < namecount; i++)
@@ -288,6 +368,27 @@
       {
         octave_quit ();
 
+#if defined (HAVE_PCRE2)
+        pcre2_match_data *m_data = pcre2_match_data_create_from_pattern (re, NULL);
+
+        unwind_action cleanup_match_data
+          ([=] () { pcre2_match_data_free (m_data); });
+
+        int matches = pcre2_match (re, reinterpret_cast<PCRE2_SPTR> (buffer.c_str ()),
+                                   buffer.length (), idx,
+                                   PCRE2_NO_UTF_CHECK | (idx ? PCRE2_NOTBOL : 0),
+                                   m_data, nullptr);
+
+        if (matches < 0 && matches != PCRE2_ERROR_NOMATCH)
+            (*current_liboctave_error_handler)
+              ("%s: internal error calling pcre2_match; "
+               "error code from pcre2_match is %i", m_who.c_str (), matches);
+
+        if (matches == PCRE2_ERROR_NOMATCH)
+          break;
+
+        OCTAVE_PCRE_SIZE *ovector = pcre2_get_ovector_pointer (m_data);
+#else
         int matches = pcre_exec (re, nullptr, buffer.c_str (),
                                  buffer.length (), idx,
                                  PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0),
@@ -330,7 +431,8 @@
 
         if (matches == PCRE_ERROR_NOMATCH)
           break;
-        else if (ovector[0] >= ovector[1] && ! m_options.emptymatch ())
+#endif
+        if (ovector[0] >= ovector[1] && ! m_options.emptymatch ())
           {
             // Zero length match.  Skip to next char.
             idx = ovector[0] + 1;
@@ -346,7 +448,12 @@
 
             for (int i = 1; i < matches; i++)
               {
-                if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
+#if defined (HAVE_PCRE2)
+                if (ovector[2*i] != PCRE2_SIZE_MAX
+#else
+                if (ovector[2*i] >= 0
+#endif
+                    && ovector[2*i+1] > 0
                     && (i == 1 || ovector[2*i] != ovector[2*i-2]
                         || ovector[2*i-1] != ovector[2*i+1]))
                   {
@@ -357,9 +464,14 @@
 
             token_extents.resize (pos_match, 2);
 
-            double start = double (ovector[0]+1);
-            double end = double (ovector[1]);
+            OCTAVE_PCRE_SIZE start = ovector[0] + 1;
+            OCTAVE_PCRE_SIZE end = ovector[1];
 
+#if defined (HAVE_PCRE2)
+             // Must use explicit length constructor as match can contain '\0'.
+            std::string match_string = std::string (buffer.c_str() + start - 1,
+                                                    end - start + 1);
+#else
             const char **listptr;
             int status = pcre_get_substring_list (buffer.c_str (), ovector,
                                                   matches, &listptr);
@@ -371,6 +483,7 @@
 
             // Must use explicit length constructor as match can contain '\0'.
             std::string match_string = std::string (*listptr, end - start + 1);
+#endif
 
             string_vector tokens (pos_match);
             string_vector named_tokens (m_names);
@@ -379,7 +492,12 @@
 
             for (int i = 1; i < matches; i++)
               {
-                if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
+#if defined (HAVE_PCRE2)
+                if (ovector[2*i] != PCRE2_SIZE_MAX
+#else
+                if (ovector[2*i] >= 0
+#endif
+                    && ovector[2*i+1] > 0)
                   {
                     if (i == 1 || ovector[2*i] != ovector[2*i-2]
                         || ovector[2*i-1] != ovector[2*i+1])
@@ -396,25 +514,45 @@
                                   {
                                     std::size_t len = ovector[2*i+1] - ovector[2*i];
                                     named_tokens(m_named_idx(j))
-                                      = std::string (*(listptr+i-pos_offset),
-                                                     len);
+#if defined (HAVE_PCRE2)
+                                      = std::string (buffer.c_str () + ovector[2*i], len);
+#else
+                                      = std::string (*(listptr+i-pos_offset), len);
+#endif
                                     break;
                                   }
                               }
                           }
 
                         std::size_t len = ovector[2*i+1] - ovector[2*i];
+#if defined (HAVE_PCRE2)
+                        tokens(pos_match++) = std::string (buffer.c_str() + ovector[2*i], len);
+#else
                         tokens(pos_match++) = std::string (*(listptr+i), len);
+#endif
                       }
                     else
                       pos_offset++;
                   }
               }
 
+#if ! defined (HAVE_PCRE2)
             pcre_free_substring_list (listptr);
+#endif
+
+            // FIXME: MATCH_ELEMENT uses double values for these,
+            // presumably because that is what the Octave interpreter
+            // uses.  Should we check that the values don't exceed
+            // flintmax here?  It seems unlikely that it would happen,
+            // but...
+
+            double dstart = static_cast<double> (start);
+            double dend = static_cast<double> (end);
 
             regexp::match_element new_elem (named_tokens, tokens, match_string,
-                                            token_extents, start, end);
+                                            token_extents,
+                                            dstart, dend);
+
             lst.push_back (new_elem);
 
             if (ovector[1] <= ovector[0])
--- a/liboctave/util/lo-regexp.h	Fri Nov 11 15:40:45 2022 -0500
+++ b/liboctave/util/lo-regexp.h	Sun Nov 13 10:17:17 2022 -0500
@@ -188,6 +188,10 @@
       string_vector m_named_tokens;
       string_vector m_tokens;
       Matrix m_token_extents;
+
+      // FIXME: Are these values declared as double because that's what
+      // Octave interpreter functions will store?  Should they be int or
+      // size_t instead?
       double m_start;
       double m_end;
     };
--- a/m4/acinclude.m4	Fri Nov 11 15:40:45 2022 -0500
+++ b/m4/acinclude.m4	Sun Nov 13 10:17:17 2022 -0500
@@ -1531,6 +1531,41 @@
   fi
 ])
 dnl
+dnl Check whether PCRE2 is compiled with --enable-utf.
+dnl
+AC_DEFUN([OCTAVE_CHECK_LIB_PCRE2_OK], [
+  AC_CACHE_CHECK([whether PCRE2 library was compiled with UTF support],
+    [octave_cv_lib_pcre2_ok],
+    [AC_LANG_PUSH(C++)
+    AC_RUN_IFELSE([AC_LANG_PROGRAM([[
+        #include <stdio.h>
+        #define PCRE2_CODE_UNIT_WIDTH 8
+        #if defined (HAVE_PCRE2_H)
+        #  include <pcre2.h>
+        #elif defined (HAVE_PCRE2_PCRE2_H)
+        #  include <pcre2/pcre2.h>
+        #endif
+        ]], [[
+        const char *pattern = "test";
+        int err;
+        PCRE2_SIZE erroffset;
+        pcre2_code *data = pcre2_compile ((PCRE2_SPTR) pattern, PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err, &erroffset, nullptr);
+        return (! data);
+      ]])],
+      octave_cv_lib_pcre2_ok=yes,
+      octave_cv_lib_pcre2_ok=no,
+      octave_cv_lib_pcre2_ok=yes)
+    AC_LANG_POP(C++)
+  ])
+  if test $octave_cv_lib_pcre2_ok = yes; then
+    $1
+    :
+  else
+    $2
+    :
+  fi
+])
+dnl
 dnl Check whether Qhull works (does not crash).
 dnl
 AC_DEFUN([OCTAVE_CHECK_LIB_QHULL_OK], [