Mercurial > octave
changeset 31424:d1165473e4b0
allow Octave to use PCRE2 (bug #61542)
* acinclude.m4 (OCTAVE_CHECK_LIB_PCRE2_OK): New macro.
* configure.ac: Check for libpcre2-8 and pcre2.h before checking for
libpcre and pcre.h. If PCRE2 is found, also define PCRE_CPPFLAGS,
PCRE_LDFLAGS, and PCRE_LIBS. Error if neither library is found.
* lo-regexp.h (regexp::match_element::match_element): New constructor
that accepts int values for start and end.
* lo-regexp.cc: Include either PCRE2 or PCRE headers. Fail if neither
HAVE_PCRE2 or HAVE_PCRE is defined.
(octave_pcre_code, OCTAVE_PCRE_SIZE): New typedefs.
(OCTAVE_PCRE_CASELESS, OCTAVE_PCRE_DOTALL, OCTAVE_PCRE_MULTILINE,
OCTAVE_PCRE_EXTENDED, OCTAVE_PCRE_UTF, OCTAVE_PCRE_INFO_CAPTURECOUNT,
OCTAVE_PCRE_INFO_NAMECOUNT, OCTAVE_PCRE_INFO_NAMEENTRYSIZE,
OCTAVE_PCRE_INFO_NAMETABLE): New macro definitions.
(octave_pcre_code_free): Define function pointer that references
either pcre2_code_free or pcre_free depending on the library in use.
(regexp::free): Call octave_pcre_code_free to free m_code.
(octave_pcre_pattern_info): New function.
(regexp::compile_internal): Allow use of either PCRE2 or PCRE. Use
new macros and functions to hide differences in library interfaces.
author | Rafael Laboissiere <rafael@laboissiere.net> |
---|---|
date | Sun, 13 Nov 2022 10:17:17 -0500 |
parents | fcd4bc97f5f7 |
children | 9f4a9dd4a6ee |
files | configure.ac liboctave/util/lo-regexp.cc liboctave/util/lo-regexp.h m4/acinclude.m4 |
diffstat | 4 files changed, 240 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- a/configure.ac Fri Nov 11 15:40:45 2022 -0500 +++ b/configure.ac Sun Nov 13 10:17:17 2022 -0500 @@ -1375,14 +1375,42 @@ [Define to 1 to build experimental Virtual Machine evaluator.]) fi -### Check for PCRE regex library. - -OCTAVE_CHECK_LIB(pcre, PCRE, - [], [pcre.h pcre/pcre.h], [pcre_compile], [], [], - [OCTAVE_CHECK_LIB_PCRE_OK([], - [AC_MSG_ERROR([PCRE library must be built with UTF support (--enable-utf)])]) - ], - [libpcre], [REQUIRED]) +### Check for PCRE2 or PCRE regex library, requiring one to exist. + +have_pcre2=no +have_pcre=no +save_CPPLAGS="$CPPFLAGS" +CPPFLAGS="-DPCRE2_CODE_UNIT_WIDTH=8 $CPPFLAGS" +OCTAVE_CHECK_LIB(pcre2, PCRE2, + [], [pcre2.h pcre2/pcre2.h], [pcre2_compile_8], [], [], + [OCTAVE_CHECK_LIB_PCRE2_OK([have_pcre2=yes], + [AC_MSG_ERROR([PCRE2 library must be built with UTF support (--enable-utf)])])], + [libpcre2-8]) +CPPFLAGS="$save_CPPFLAGS" + +if test $have_pcre2 = no; then + OCTAVE_CHECK_LIB(pcre, PCRE, + [], [pcre.h pcre/pcre.h], [pcre_compile], [], [], + [OCTAVE_CHECK_LIB_PCRE_OK([have_pcre=yes], + [AC_MSG_ERROR([PCRE library must be built with UTF support (--enable-utf)])])], + [libpcre]) +fi + +if test $have_pcre2 = yes; then + AC_DEFINE(HAVE_PCRE2, 1, [Define to 1 if PCRE2 is available.]) + + ## Only one of PCRE2 or PCRE is used, so avoid having to define and use + ## both PCRE2_* and PCRE_* variables everywhere. + + PCRE_CPPFLAGS="$PCRE2_CPPFLAGS" + PCRE_LDFLAGS="$PCRE2_LDFLAGS" + PCRE_LIBS="$PCRE2_LIBS" + +elif test $have_pcre = yes; then + AC_DEFINE(HAVE_PCRE, 1, [Define to 1 if PCRE is available.]) +else + AC_MSG_ERROR([to build Octave, you must have the PCRE or PCRE2 library and header files installed]) +fi ### Check for Qhull library.
--- a/liboctave/util/lo-regexp.cc Fri Nov 11 15:40:45 2022 -0500 +++ b/liboctave/util/lo-regexp.cc Sun Nov 13 10:17:17 2022 -0500 @@ -32,10 +32,19 @@ #include <string> #include <vector> -#if defined (HAVE_PCRE_H) -# include <pcre.h> -#elif defined (HAVE_PCRE_PCRE_H) -# include <pcre/pcre.h> +#if defined (HAVE_PCRE2_H) || defined (HAVE_PCRE2_PCRE2_H) +# define PCRE2_CODE_UNIT_WIDTH 8 +# if defined (HAVE_PCRE2_H) +# include <pcre2.h> +# elif defined (HAVE_PCRE2_PCRE2_H) +# include <pcre2/pcre2.h> +# endif +#elif defined (HAVE_PCRE_H) || defined (HAVE_PCRE_PCRE_H) +# if defined (HAVE_PCRE_H) +# include <pcre.h> +# elif defined (HAVE_PCRE_PCRE_H) +# include <pcre/pcre.h> +# endif #endif #include "Matrix.h" @@ -46,6 +55,47 @@ #include "lo-regexp.h" #include "str-vec.h" #include "unistr-wrappers.h" +#include "unwind-prot.h" + +#if defined (HAVE_PCRE2) +typedef pcre2_code octave_pcre_code; +typedef PCRE2_SIZE OCTAVE_PCRE_SIZE; +void (*octave_pcre_code_free) (octave_pcre_code *) = pcre2_code_free; +# define OCTAVE_PCRE_CASELESS PCRE2_CASELESS +# define OCTAVE_PCRE_DOTALL PCRE2_DOTALL +# define OCTAVE_PCRE_MULTILINE PCRE2_MULTILINE +# define OCTAVE_PCRE_EXTENDED PCRE2_EXTENDED +# define OCTAVE_PCRE_UTF PCRE2_UTF +# define OCTAVE_PCRE_INFO_CAPTURECOUNT PCRE2_INFO_CAPTURECOUNT +# define OCTAVE_PCRE_INFO_NAMECOUNT PCRE2_INFO_NAMECOUNT +# define OCTAVE_PCRE_INFO_NAMEENTRYSIZE PCRE2_INFO_NAMEENTRYSIZE +# define OCTAVE_PCRE_INFO_NAMETABLE PCRE2_INFO_NAMETABLE +#elif defined (HAVE_PCRE) +typedef pcre octave_pcre_code; +typedef int OCTAVE_PCRE_SIZE; +void (*octave_pcre_code_free) (void *) = pcre_free; +# define OCTAVE_PCRE_CASELESS PCRE_CASELESS +# define OCTAVE_PCRE_DOTALL PCRE_DOTALL +# define OCTAVE_PCRE_MULTILINE PCRE_MULTILINE +# define OCTAVE_PCRE_EXTENDED PCRE_EXTENDED +# define OCTAVE_PCRE_UTF PCRE_UTF8 +# define OCTAVE_PCRE_INFO_CAPTURECOUNT PCRE_INFO_CAPTURECOUNT +# define OCTAVE_PCRE_INFO_NAMECOUNT PCRE_INFO_NAMECOUNT +# define OCTAVE_PCRE_INFO_NAMEENTRYSIZE PCRE_INFO_NAMEENTRYSIZE +# define OCTAVE_PCRE_INFO_NAMETABLE PCRE_INFO_NAMETABLE +#else +# error "PCRE2 or PCRE library is required to build Octave" +#endif + +static inline int +octave_pcre_pattern_info (const octave_pcre_code *code, int what, void *where) +{ +#if defined (HAVE_PCRE2) + return pcre2_pattern_info (code, what, where); +#else + return pcre_fullinfo (code, nullptr, what, where); +#endif +} namespace octave { @@ -64,8 +114,7 @@ void regexp::free (void) { - if (m_code) - pcre_free (static_cast<pcre *> (m_code)); + octave_pcre_code_free (static_cast<octave_pcre_code *> (m_code)); } void @@ -229,22 +278,50 @@ while ((pos = buf_str.find ('\0')) != std::string::npos) buf_str.replace (pos, 1, "\\000"); + int pcre_options + = ( (m_options.case_insensitive () ? OCTAVE_PCRE_CASELESS : 0) + | (m_options.dotexceptnewline () ? 0 : OCTAVE_PCRE_DOTALL) + | (m_options.lineanchors () ? OCTAVE_PCRE_MULTILINE : 0) + | (m_options.freespacing () ? OCTAVE_PCRE_EXTENDED : 0) + | OCTAVE_PCRE_UTF); + +#if defined (HAVE_PCRE2) + PCRE2_SIZE erroffset; + int errnumber; + + m_code = pcre2_compile (reinterpret_cast<PCRE2_SPTR> (buf_str.c_str ()), + PCRE2_ZERO_TERMINATED, pcre_options, + &errnumber, &erroffset, nullptr); + + if (! m_code) + { + // PCRE docs say: + // + // If the buffer is too small, the message is truncated (but + // still with a trailing zero), and the negative error code + // PCRE2_ERROR_NOMEMORY is returned. None of the messages are + // very long; a buffer size of 120 code units is ample. + // + // so we assume that 256 will be large enough to avoid truncated + // messages. + + PCRE2_UCHAR err [256]; + pcre2_get_error_message (errnumber, err, sizeof (err)); + (*current_liboctave_error_handler) + ("%s: %s at position %zu of expression", m_who.c_str (), err, + erroffset); + } +#else const char *err; int erroffset; - int pcre_options - = ( (m_options.case_insensitive () ? PCRE_CASELESS : 0) - | (m_options.dotexceptnewline () ? 0 : PCRE_DOTALL) - | (m_options.lineanchors () ? PCRE_MULTILINE : 0) - | (m_options.freespacing () ? PCRE_EXTENDED : 0) - | PCRE_UTF8); - m_code = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, nullptr); if (! m_code) (*current_liboctave_error_handler) ("%s: %s at position %d of expression", m_who.c_str (), err, erroffset); +#endif } regexp::match_data @@ -266,14 +343,17 @@ char *nametable; std::size_t idx = 0; - pcre *re = static_cast<pcre *> (m_code); + octave_pcre_code *re = static_cast<octave_pcre_code *> (m_code); - pcre_fullinfo (re, nullptr, PCRE_INFO_CAPTURECOUNT, &subpatterns); - pcre_fullinfo (re, nullptr, PCRE_INFO_NAMECOUNT, &namecount); - pcre_fullinfo (re, nullptr, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); - pcre_fullinfo (re, nullptr, PCRE_INFO_NAMETABLE, &nametable); + octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_CAPTURECOUNT, &subpatterns); + octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_NAMECOUNT, &namecount); + octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); + octave_pcre_pattern_info (re, OCTAVE_PCRE_INFO_NAMETABLE, &nametable); - OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); +#if defined (HAVE_PCRE) + OCTAVE_LOCAL_BUFFER (OCTAVE_PCRE_SIZE, ovector, (subpatterns+1)*3); +#endif + OCTAVE_LOCAL_BUFFER (int, nidx, namecount); for (int i = 0; i < namecount; i++) @@ -288,6 +368,27 @@ { octave_quit (); +#if defined (HAVE_PCRE2) + pcre2_match_data *m_data = pcre2_match_data_create_from_pattern (re, NULL); + + unwind_action cleanup_match_data + ([=] () { pcre2_match_data_free (m_data); }); + + int matches = pcre2_match (re, reinterpret_cast<PCRE2_SPTR> (buffer.c_str ()), + buffer.length (), idx, + PCRE2_NO_UTF_CHECK | (idx ? PCRE2_NOTBOL : 0), + m_data, nullptr); + + if (matches < 0 && matches != PCRE2_ERROR_NOMATCH) + (*current_liboctave_error_handler) + ("%s: internal error calling pcre2_match; " + "error code from pcre2_match is %i", m_who.c_str (), matches); + + if (matches == PCRE2_ERROR_NOMATCH) + break; + + OCTAVE_PCRE_SIZE *ovector = pcre2_get_ovector_pointer (m_data); +#else int matches = pcre_exec (re, nullptr, buffer.c_str (), buffer.length (), idx, PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0), @@ -330,7 +431,8 @@ if (matches == PCRE_ERROR_NOMATCH) break; - else if (ovector[0] >= ovector[1] && ! m_options.emptymatch ()) +#endif + if (ovector[0] >= ovector[1] && ! m_options.emptymatch ()) { // Zero length match. Skip to next char. idx = ovector[0] + 1; @@ -346,7 +448,12 @@ for (int i = 1; i < matches; i++) { - if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 +#if defined (HAVE_PCRE2) + if (ovector[2*i] != PCRE2_SIZE_MAX +#else + if (ovector[2*i] >= 0 +#endif + && ovector[2*i+1] > 0 && (i == 1 || ovector[2*i] != ovector[2*i-2] || ovector[2*i-1] != ovector[2*i+1])) { @@ -357,9 +464,14 @@ token_extents.resize (pos_match, 2); - double start = double (ovector[0]+1); - double end = double (ovector[1]); + OCTAVE_PCRE_SIZE start = ovector[0] + 1; + OCTAVE_PCRE_SIZE end = ovector[1]; +#if defined (HAVE_PCRE2) + // Must use explicit length constructor as match can contain '\0'. + std::string match_string = std::string (buffer.c_str() + start - 1, + end - start + 1); +#else const char **listptr; int status = pcre_get_substring_list (buffer.c_str (), ovector, matches, &listptr); @@ -371,6 +483,7 @@ // Must use explicit length constructor as match can contain '\0'. std::string match_string = std::string (*listptr, end - start + 1); +#endif string_vector tokens (pos_match); string_vector named_tokens (m_names); @@ -379,7 +492,12 @@ for (int i = 1; i < matches; i++) { - if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) +#if defined (HAVE_PCRE2) + if (ovector[2*i] != PCRE2_SIZE_MAX +#else + if (ovector[2*i] >= 0 +#endif + && ovector[2*i+1] > 0) { if (i == 1 || ovector[2*i] != ovector[2*i-2] || ovector[2*i-1] != ovector[2*i+1]) @@ -396,25 +514,45 @@ { std::size_t len = ovector[2*i+1] - ovector[2*i]; named_tokens(m_named_idx(j)) - = std::string (*(listptr+i-pos_offset), - len); +#if defined (HAVE_PCRE2) + = std::string (buffer.c_str () + ovector[2*i], len); +#else + = std::string (*(listptr+i-pos_offset), len); +#endif break; } } } std::size_t len = ovector[2*i+1] - ovector[2*i]; +#if defined (HAVE_PCRE2) + tokens(pos_match++) = std::string (buffer.c_str() + ovector[2*i], len); +#else tokens(pos_match++) = std::string (*(listptr+i), len); +#endif } else pos_offset++; } } +#if ! defined (HAVE_PCRE2) pcre_free_substring_list (listptr); +#endif + + // FIXME: MATCH_ELEMENT uses double values for these, + // presumably because that is what the Octave interpreter + // uses. Should we check that the values don't exceed + // flintmax here? It seems unlikely that it would happen, + // but... + + double dstart = static_cast<double> (start); + double dend = static_cast<double> (end); regexp::match_element new_elem (named_tokens, tokens, match_string, - token_extents, start, end); + token_extents, + dstart, dend); + lst.push_back (new_elem); if (ovector[1] <= ovector[0])
--- a/liboctave/util/lo-regexp.h Fri Nov 11 15:40:45 2022 -0500 +++ b/liboctave/util/lo-regexp.h Sun Nov 13 10:17:17 2022 -0500 @@ -188,6 +188,10 @@ string_vector m_named_tokens; string_vector m_tokens; Matrix m_token_extents; + + // FIXME: Are these values declared as double because that's what + // Octave interpreter functions will store? Should they be int or + // size_t instead? double m_start; double m_end; };
--- a/m4/acinclude.m4 Fri Nov 11 15:40:45 2022 -0500 +++ b/m4/acinclude.m4 Sun Nov 13 10:17:17 2022 -0500 @@ -1531,6 +1531,41 @@ fi ]) dnl +dnl Check whether PCRE2 is compiled with --enable-utf. +dnl +AC_DEFUN([OCTAVE_CHECK_LIB_PCRE2_OK], [ + AC_CACHE_CHECK([whether PCRE2 library was compiled with UTF support], + [octave_cv_lib_pcre2_ok], + [AC_LANG_PUSH(C++) + AC_RUN_IFELSE([AC_LANG_PROGRAM([[ + #include <stdio.h> + #define PCRE2_CODE-uNIT_WIDTH 8 + #if defined (HAVE_PCRE2_H) + # include <pcre2.h> + #elif defined (HAVE_PCRE2_PCRE2_H) + # include <pcre2/pcre2.h> + #endif + ]], [[ + const char *pattern = "test"; + int err; + PCRE2_SIZE erroffset; + pcre2_code *data = pcre2_compile ((PCRE2_SPTR) pattern, PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err, &erroffset, nullptr); + return (! data); + ]])], + octave_cv_lib_pcre2_ok=yes, + octave_cv_lib_pcre2_ok=no, + octave_cv_lib_pcre2_ok=yes) + AC_LANG_POP(C++) + ]) + if test $octave_cv_lib_pcre2_ok = yes; then + $1 + : + else + $2 + : + fi +]) +dnl dnl Check whether Qhull works (does not crash). dnl AC_DEFUN([OCTAVE_CHECK_LIB_QHULL_OK], [