view liboctave/util/lo-regexp.cc @ 20651:e54ecb33727e

lo-array-gripes.cc: Remove FIXME's related to buffer size. * lo-array-gripes.cc: Remove FIXME's related to buffer size. Shorten sprintf buffers from 100 to 64 characters (still well more than 19 required). Use 'const' decorator on constant value for clarity. Remove extra space between variable and array bracket.
author Rik <rik@octave.org>
date Mon, 12 Oct 2015 21:13:47 -0700
parents a9574e3c6e9e
children
line wrap: on
line source

/*

Copyright (C) 2012 John W. Eaton
Copyright (C) 2005-2015 David Bateman
Copyright (C) 2002-2005 Paul Kienzle

This file is part of Octave.

Octave is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

Octave is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with Octave; see the file COPYING.  If not, see
<http://www.gnu.org/licenses/>.

*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <list>
#include <sstream>
#include <string>
#include <vector>

#if defined (HAVE_PCRE_H)
#include <pcre.h>
#elif defined (HAVE_PCRE_PCRE_H)
#include <pcre/pcre.h>
#endif

#include "Matrix.h"
#include "base-list.h"
#include "lo-error.h"
#include "oct-locbuf.h"
#include "quit.h"
#include "lo-regexp.h"
#include "str-vec.h"

// Define the maximum number of retries for a pattern
// that possibly results in an infinite recursion.
#define PCRE_MATCHLIMIT_MAX 10

// FIXME: should this be configurable?
#define MAXLOOKBEHIND 10

static bool lookbehind_warned = false;

// FIXME: don't bother collecting and composing return values
//        the user doesn't want.

void
regexp::free (void)
{
  if (data)
    pcre_free (static_cast<pcre *> (data));
}

void
regexp::compile_internal (void)
{
  // If we had a previously compiled pattern, release it.
  free ();

  size_t max_length = MAXLOOKBEHIND;

  size_t pos = 0;
  size_t new_pos;
  int inames = 0;
  std::ostringstream buf;

  while ((new_pos = pattern.find ("(?", pos)) != std::string::npos)
    {
      if (pattern.at (new_pos + 2) == '<'
          && !(pattern.at (new_pos + 3) == '='
               || pattern.at (new_pos + 3) == '!'))
        {
          // The syntax of named tokens in pcre is "(?P<name>...)" while
          // we need a syntax "(?<name>...)", so fix that here. Also an
          // expression like
          // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
          // should be perfectly legal, while pcre does not allow the same
          // named token name on both sides of the alternative. Also fix
          // that here by replacing name tokens by dummy names, and dealing
          // with the dummy names later.

          size_t tmp_pos = pattern.find_first_of ('>', new_pos);

          if (tmp_pos == std::string::npos)
            {
              (*current_liboctave_error_handler)
                ("regexp: syntax error in pattern");
              return;
            }

          std::string tmp_name =
            pattern.substr (new_pos+3, tmp_pos-new_pos-3);

          bool found = false;

          for (int i = 0; i < nnames; i++)
            {
              if (named_pats(i) == tmp_name)
                {
                  named_idx.resize (dim_vector (inames+1, 1));
                  named_idx(inames) = i;
                  found = true;
                  break;
                }
            }

          if (! found)
            {
              named_idx.resize (dim_vector (inames+1, 1));
              named_idx(inames) = nnames;
              named_pats.append (tmp_name);
              nnames++;
            }

          if (new_pos - pos > 0)
            buf << pattern.substr (pos, new_pos-pos);
          if (inames < 10)
            buf << "(?P<n00" << inames++;
          else if (inames < 100)
            buf << "(?P<n0" << inames++;
          else
            buf << "(?P<n" << inames++;

          pos = tmp_pos;
        }
      else if (pattern.at (new_pos + 2) == '<')
        {
          // Find lookbehind operators of arbitrary length (ie like
          // "(?<=[a-z]*)") and replace with a maximum length operator
          // as PCRE can not yet handle arbitrary length lookahead
          // operators. Use the string length as the maximum length to
          // avoid issues.

          int brackets = 1;
          size_t tmp_pos1 = new_pos + 2;
          size_t tmp_pos2 = tmp_pos1;

          while (tmp_pos1 < pattern.length () && brackets > 0)
            {
              char ch = pattern.at (tmp_pos1);

              if (ch == '(')
                brackets++;
              else if (ch == ')')
                {
                  if (brackets > 1)
                    tmp_pos2 = tmp_pos1;

                  brackets--;
                }

              tmp_pos1++;
            }

          if (brackets != 0)
            {
              buf << pattern.substr (pos, new_pos - pos) << "(?";
              pos = new_pos + 2;
            }
          else
            {
              size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);

              if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
                {
                  if (!lookbehind_warned)
                    {
                      lookbehind_warned = true;
                      (*current_liboctave_warning_with_id_handler)
                        ("Octave:regexp-lookbehind-limit",
                         "%s: arbitrary length lookbehind patterns are only supported up to length %d",
                         who.c_str (), MAXLOOKBEHIND);
                    }

                  buf << pattern.substr (pos, new_pos - pos) << "(";

                  size_t i;

                  if (pattern.at (tmp_pos3) == '*')
                    i = 0;
                  else
                    i = 1;

                  for (; i < max_length + 1; i++)
                    {
                      buf << pattern.substr (new_pos, tmp_pos3 - new_pos)
                          << "{" << i << "}";
                      buf << pattern.substr (tmp_pos3 + 1,
                                             tmp_pos1 - tmp_pos3 - 1);
                      if (i != max_length)
                        buf << "|";
                    }
                  buf << ")";
                }
              else
                buf << pattern.substr (pos, tmp_pos1 - pos);

              pos = tmp_pos1;
            }
        }
      else
        {
          buf << pattern.substr (pos, new_pos - pos) << "(?";
          pos = new_pos + 2;
        }

    }

  buf << pattern.substr (pos);

  const char *err;
  int erroffset;
  std::string buf_str = buf.str ();

  int pcre_options
    = ((options.case_insensitive () ? PCRE_CASELESS : 0)
       | (options.dotexceptnewline () ? 0 : PCRE_DOTALL)
       | (options.lineanchors () ? PCRE_MULTILINE : 0)
       | (options.freespacing () ? PCRE_EXTENDED : 0));

  data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0);

  if (! data)
    (*current_liboctave_error_handler)
      ("%s: %s at position %d of expression", who.c_str (),
       err, erroffset);
}

regexp::match_data
regexp::match (const std::string& buffer)
{
  regexp::match_data retval;

  std::list<regexp::match_element> lst;

  int subpatterns;
  int namecount;
  int nameentrysize;
  char *nametable;
  size_t idx = 0;

  pcre *re = static_cast<pcre *> (data);

  pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT,  &subpatterns);
  pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount);
  pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
  pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable);

  OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3);
  OCTAVE_LOCAL_BUFFER (int, nidx, namecount);

  for (int i = 0; i < namecount; i++)
    {
      // Index of subpattern in first two bytes MSB first of name.
      // Extract index.
      nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8
                | static_cast<int> (nametable[i*nameentrysize+1]);
    }

  while (true)
    {
      OCTAVE_QUIT;

      int matches = pcre_exec (re, 0, buffer.c_str (),
                               buffer.length (), idx,
                               (idx ? PCRE_NOTBOL : 0),
                               ovector, (subpatterns+1)*3);

      if (matches == PCRE_ERROR_MATCHLIMIT)
        {
          // Try harder; start with default value for MATCH_LIMIT
          // and increase it.
          (*current_liboctave_warning_with_id_handler)
            ("Octave:regexp-match-limit",
             "your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow");

          pcre_extra pe;

          pcre_config (PCRE_CONFIG_MATCH_LIMIT,
                       static_cast<void *> (&pe.match_limit));

          pe.flags = PCRE_EXTRA_MATCH_LIMIT;

          int i = 0;
          while (matches == PCRE_ERROR_MATCHLIMIT
                 && i++ < PCRE_MATCHLIMIT_MAX)
            {
              OCTAVE_QUIT;

              pe.match_limit *= 10;
              matches = pcre_exec (re, &pe, buffer.c_str (),
                                   buffer.length (), idx,
                                   (idx ? PCRE_NOTBOL : 0),
                                   ovector, (subpatterns+1)*3);
            }
        }

      if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
        {
          (*current_liboctave_error_handler)
            ("%s: internal error calling pcre_exec; error code from pcre_exec is %i",
             who.c_str (), matches);
          return retval;
        }
      else if (matches == PCRE_ERROR_NOMATCH)
        break;
      else if (ovector[1] <= ovector[0] && ! options.emptymatch ())
        {
          // Zero length match.  Skip to next char.
          idx = ovector[0] + 1;
          if (idx < buffer.length ())
            continue;
          else
            break;
        }
      else
        {
          int pos_match = 0;
          Matrix token_extents (matches-1, 2);

          for (int i = 1; i < matches; i++)
            {
              if (ovector[2*i] >= 0 && ovector[2*i+1] > 0
                  && (i == 1 || ovector[2*i] != ovector[2*i-2]
                      || ovector[2*i-1] != ovector[2*i+1]))
                {
                  token_extents(pos_match,0) = double (ovector[2*i]+1);
                  token_extents(pos_match++,1) = double (ovector[2*i+1]);
                }
            }

          token_extents.resize (pos_match, 2);

          double start = double (ovector[0]+1);
          double end = double (ovector[1]);

          const char **listptr;
          int status = pcre_get_substring_list (buffer.c_str (), ovector,
                                                matches, &listptr);

          if (status == PCRE_ERROR_NOMEMORY)
            {
              (*current_liboctave_error_handler)
                ("%s: cannot allocate memory in pcre_get_substring_list",
                 who.c_str ());
              return retval;
            }

          string_vector tokens (pos_match);
          string_vector named_tokens (nnames);
          int pos_offset = 0;
          pos_match = 0;

          for (int i = 1; i < matches; i++)
            {
              if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
                {
                  if (i == 1 || ovector[2*i] != ovector[2*i-2]
                      || ovector[2*i-1] != ovector[2*i+1])
                    {
                      if (namecount > 0)
                        {
                          // FIXME: Should probably do this with a map()
                          //        rather than a linear search.  However,
                          //        the number of captured, named expressions
                          //        is usually pretty small (< 4)
                          for (int j = 0; j < namecount; j++)
                            {
                              if (nidx[j] == i)
                                {
                                  named_tokens(named_idx(j)) =
                                    std::string (*(listptr+i-pos_offset));
                                  break;
                                }
                            }
                        }

                      tokens(pos_match++) = std::string (*(listptr+i));
                    }
                  else
                    pos_offset++;
                }
            }

          std::string match_string = std::string (*listptr);

          pcre_free_substring_list (listptr);

          regexp::match_element new_elem (named_tokens, tokens, match_string,
                                          token_extents, start, end);
          lst.push_back (new_elem);

          if (ovector[1] <= ovector[0])
            {
              // Zero length match.  Skip to next char.
              idx = ovector[0] + 1;
              if (idx <= buffer.length ())
                continue;
            }
          else
            idx = ovector[1];

          if (options.once () || idx >= buffer.length ())
            break;
        }
    }

  retval = regexp::match_data (lst, named_pats);

  return retval;
}

bool
regexp::is_match (const std::string& buffer)
{
  regexp::match_data rx_lst = match (buffer);

  return rx_lst.size () > 0;
}

Array<bool>
regexp::is_match (const string_vector& buffer)
{
  octave_idx_type len = buffer.numel ();

  Array<bool> retval (dim_vector (len, 1));

  for (octave_idx_type i = 0; i < buffer.numel (); i++)
    retval(i) = is_match (buffer(i));

  return retval;
}

// Declare rep_token_t used in processing replacement string
typedef struct
{
  size_t pos;
  int num;
} rep_token_t;


std::string
regexp::replace (const std::string& buffer, const std::string& replacement)
{
  std::string retval;

  regexp::match_data rx_lst = match (buffer);

  size_t num_matches = rx_lst.size ();

  if (num_matches == 0)
    {
      retval = buffer;
      return retval;
    }

  // Identify replacement tokens; build a vector of group numbers in
  // the replacement string so that we can quickly calculate the size
  // of the replacement.

  // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
  //        $11 represents $1 followed by the character '1' rather than
  //        the eleventh capture buffer.

  std::string repstr = replacement;
  std::vector<rep_token_t> tokens;
  tokens.reserve (5);  // Reserve memory for 5 pattern replacements

  for (size_t i=0; i < repstr.size (); i++)
    {
      if (repstr[i] == '\\')
        {
          if (i < repstr.size () - 1 && repstr[i+1] == '$')
            {
              repstr.erase (i,1);  // erase backslash
              i++;                 // skip over '$'
              continue;
            }
          if (i < repstr.size () - 1 && repstr[i+1] == '\\')
            {
              repstr.erase (i,1);  // erase 1st backslash
              continue;
            }
        }
      else if (repstr[i] == '$')
        {
          if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
            {
              rep_token_t tmp_token;

              tmp_token.pos = i;
              tmp_token.num = repstr[i+1]-'0';
              tokens.push_back (tmp_token);
            }
        }
    }

  std::string rep;
  int num_tokens = tokens.size ();

  if (num_tokens > 0)
    {
      // Determine replacement length
      const size_t replen = repstr.size () - 2*num_tokens;
      int delta = 0;
      regexp::match_data::const_iterator p = rx_lst.begin ();
      for (size_t i = 0; i < num_matches; i++)
        {
          OCTAVE_QUIT;

          double start = p->start ();
          double end = p->end ();

          const Matrix pairs (p->token_extents ());
          size_t pairlen = 0;
          for (int j = 0; j < num_tokens; j++)
            {
              if (tokens[j].num == 0)
                pairlen += static_cast<size_t> (end - start) + 1;
              else if (tokens[j].num <= pairs.rows ())
                pairlen += static_cast<size_t> (pairs(tokens[j].num-1,1)
                                                - pairs(tokens[j].num-1,0)) + 1;
            }
          delta += (static_cast<int> (replen + pairlen)
                    - static_cast<int> (end - start + 1));
          p++;
        }

      // Build replacement string
      rep.reserve (buffer.size () + delta);
      size_t from = 0;
      p = rx_lst.begin ();
      for (size_t i = 0; i < num_matches; i++)
        {
          OCTAVE_QUIT;

          double start = p->start ();
          double end = p->end ();

          const Matrix pairs (p->token_extents ());
          rep.append (&buffer[from], static_cast<size_t> (start - 1) - from);
          from = static_cast<size_t> (end);

          size_t cur_pos = 0;

          for (int j = 0; j < num_tokens; j++)
            {
              rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
              cur_pos = tokens[j].pos+2;

              int k = tokens[j].num;
              if (k == 0)
                {
                  // replace with entire match
                  rep.append (&buffer[static_cast<size_t> (end - 1)],
                              static_cast<size_t> (end - start) + 1);
                }
              else if (k <= pairs.rows ())
                {
                  // replace with group capture
                  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
                              static_cast<size_t> (pairs(k-1,1)
                                                   - pairs(k-1,0)) + 1);
                }
              else
                {
                  // replace with nothing
                }
            }
          if (cur_pos < repstr.size ())
            rep.append (&repstr[cur_pos], repstr.size () - cur_pos);

          p++;
        }
      rep.append (&buffer[from], buffer.size () - from);
    }
  else
    {
      // Determine repstr length
      const size_t replen = repstr.size ();
      int delta = 0;
      regexp::match_data::const_iterator p = rx_lst.begin ();
      for (size_t i = 0; i < num_matches; i++)
        {
          OCTAVE_QUIT;
          delta += static_cast<int> (replen)
                   - static_cast<int> (p->end () - p->start () + 1);
          p++;
        }

      // Build replacement string
      rep.reserve (buffer.size () + delta);
      size_t from = 0;
      p = rx_lst.begin ();
      for (size_t i = 0; i < num_matches; i++)
        {
          OCTAVE_QUIT;
          rep.append (&buffer[from],
                      static_cast<size_t> (p->start () - 1) - from);
          from = static_cast<size_t> (p->end ());
          rep.append (repstr);
          p++;
        }
      rep.append (&buffer[from], buffer.size () - from);
    }

  retval = rep;
  return retval;
}