changeset 27523:19ad9150dd69

Checking whether the input to regexp is valid UTF-8 ourselves is faster than PCRE's checks (bug #57064). * lo-regexp.cc (regexp::match): Check if input is valid UTF-8 and disable the check in PCRE. Remove error handling for error that can no longer occur. * unistr-wrappers.[c/h] (octave_u8_check_wrapper): Add new wrapper. * bootstrap.conf: Add gnulib module unistr/u8-check.
author Markus Mützel <markus.muetzel@gmx.de>
date Thu, 17 Oct 2019 20:09:10 +0200
parents 3912e3a74e31
children 1dbe839bedcb
files bootstrap.conf liboctave/util/lo-regexp.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h
diffstat 4 files changed, 21 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/bootstrap.conf	Fri Oct 18 07:51:46 2019 -0700
+++ b/bootstrap.conf	Thu Oct 17 20:09:10 2019 +0200
@@ -111,6 +111,7 @@
   unictype/ctype-upper
   unictype/ctype-xdigit
   unistd
+  unistr/u8-check
   unistr/u8-strmblen
   unistr/u8-strmbtouc
   unistr/u8-to-u32
--- a/liboctave/util/lo-regexp.cc	Fri Oct 18 07:51:46 2019 -0700
+++ b/liboctave/util/lo-regexp.cc	Thu Oct 17 20:09:10 2019 +0200
@@ -44,6 +44,7 @@
 #include "quit.h"
 #include "lo-regexp.h"
 #include "str-vec.h"
+#include "unistr-wrappers.h"
 
 namespace octave
 {
@@ -246,6 +247,12 @@
   regexp::match_data
   regexp::match (const std::string& buffer)
   {
+    // check if input is valid utf-8
+    const uint8_t *buf_str = reinterpret_cast<const uint8_t *> (buffer.c_str ());
+    if (octave_u8_check_wrapper (buf_str, buffer.length ()))
+      (*current_liboctave_error_handler)
+        ("%s: the input string is invalid UTF-8", m_who.c_str ());
+
     regexp::match_data retval;
 
     std::list<regexp::match_element> lst;
@@ -280,7 +287,7 @@
 
         int matches = pcre_exec (re, nullptr, buffer.c_str (),
                                  buffer.length (), idx,
-                                 (idx ? PCRE_NOTBOL : 0),
+                                 PCRE_NO_UTF8_CHECK | (idx ? PCRE_NOTBOL : 0),
                                  ovector, (subpatterns+1)*3);
 
         if (matches == PCRE_ERROR_MATCHLIMIT)
@@ -307,16 +314,13 @@
                 pe.match_limit *= 10;
                 matches = pcre_exec (re, &pe, buffer.c_str (),
                                      buffer.length (), idx,
-                                     (idx ? PCRE_NOTBOL : 0),
+                                     PCRE_NO_UTF8_CHECK
+                                     | (idx ? PCRE_NOTBOL : 0),
                                      ovector, (subpatterns+1)*3);
               }
           }
 
-        if (matches == PCRE_ERROR_BADUTF8)
-          (*current_liboctave_error_handler)
-            ("%s: internal error calling pcre_exec; "
-             "the input string is invalid UTF-8", m_who.c_str ());
-        else if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
+        if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
           (*current_liboctave_error_handler)
             ("%s: internal error calling pcre_exec; "
              "error code from pcre_exec is %i", m_who.c_str (), matches);
--- a/liboctave/wrappers/unistr-wrappers.c	Fri Oct 18 07:51:46 2019 -0700
+++ b/liboctave/wrappers/unistr-wrappers.c	Thu Oct 17 20:09:10 2019 +0200
@@ -28,6 +28,12 @@
 
 #include "unistr-wrappers.h"
 
+const uint8_t *
+octave_u8_check_wrapper (const uint8_t *src, size_t n)
+{
+  return u8_check (src, n);
+}
+
 int
 octave_u8_strmblen_wrapper (const uint8_t *src)
 {
--- a/liboctave/wrappers/unistr-wrappers.h	Fri Oct 18 07:51:46 2019 -0700
+++ b/liboctave/wrappers/unistr-wrappers.h	Thu Oct 17 20:09:10 2019 +0200
@@ -27,6 +27,9 @@
 extern "C" {
 #endif
 
+const uint8_t *
+octave_u8_check_wrapper (const uint8_t *src, size_t n);
+
 extern int
 octave_u8_strmblen_wrapper (const uint8_t *src);