view libinterp/corefcn/regexp.cc @ 31605:e88a07dec498 stable

maint: Use macros to begin/end C++ namespaces. * oct-conf-post-public.in.h: Define two macros (OCTAVE_BEGIN_NAMESPACE, OCTAVE_END_NAMESPACE) that can be used to start/end a namespace. * mk-opts.pl, build-env.h, build-env.in.cc, __betainc__.cc, __contourc__.cc, __dsearchn__.cc, __eigs__.cc, __expint__.cc, __ftp__.cc, __gammainc__.cc, __ichol__.cc, __ilu__.cc, __isprimelarge__.cc, __lin_interpn__.cc, __magick_read__.cc, __pchip_deriv__.cc, __qp__.cc, amd.cc, auto-shlib.cc, auto-shlib.h, balance.cc, base-text-renderer.cc, base-text-renderer.h, besselj.cc, bitfcns.cc, bsxfun.cc, c-file-ptr-stream.cc, c-file-ptr-stream.h, call-stack.cc, call-stack.h, ccolamd.cc, cellfun.cc, chol.cc, colamd.cc, colloc.cc, conv2.cc, daspk.cc, dasrt.cc, dassl.cc, data.cc, data.h, debug.cc, defaults.cc, defaults.h, defun-int.h, defun.cc, det.cc, dirfns.cc, display.cc, display.h, dlmread.cc, dmperm.cc, dot.cc, dynamic-ld.cc, dynamic-ld.h, eig.cc, ellipj.cc, environment.cc, environment.h, error.cc, error.h, errwarn.h, event-manager.cc, event-manager.h, event-queue.cc, event-queue.h, fcn-info.cc, fcn-info.h, fft.cc, fft2.cc, fftn.cc, file-io.cc, filter.cc, find.cc, ft-text-renderer.cc, ft-text-renderer.h, gcd.cc, getgrent.cc, getpwent.cc, getrusage.cc, givens.cc, gl-render.cc, gl-render.h, gl2ps-print.cc, gl2ps-print.h, graphics-toolkit.cc, graphics-toolkit.h, graphics.cc, graphics.in.h, gsvd.cc, gtk-manager.cc, gtk-manager.h, hash.cc, help.cc, help.h, hess.cc, hex2num.cc, hook-fcn.cc, hook-fcn.h, input.cc, input.h, interpreter-private.cc, interpreter-private.h, interpreter.cc, interpreter.h, inv.cc, jsondecode.cc, jsonencode.cc, kron.cc, latex-text-renderer.cc, latex-text-renderer.h, load-path.cc, load-path.h, load-save.cc, load-save.h, lookup.cc, ls-ascii-helper.cc, ls-ascii-helper.h, ls-oct-text.cc, ls-utils.cc, ls-utils.h, lsode.cc, lu.cc, mappers.cc, matrix_type.cc, max.cc, mex-private.h, mex.cc, mgorth.cc, nproc.cc, oct-fstrm.cc, oct-fstrm.h, oct-hdf5-types.cc, oct-hdf5-types.h, oct-hist.cc, oct-hist.h, oct-iostrm.cc, oct-iostrm.h, oct-opengl.h, oct-prcstrm.cc, oct-prcstrm.h, oct-procbuf.cc, oct-procbuf.h, oct-process.cc, oct-process.h, oct-stdstrm.h, oct-stream.cc, oct-stream.h, oct-strstrm.cc, oct-strstrm.h, oct-tex-lexer.in.ll, oct-tex-parser.yy, ordqz.cc, ordschur.cc, pager.cc, pager.h, pinv.cc, pow2.cc, pr-flt-fmt.cc, pr-output.cc, procstream.cc, procstream.h, psi.cc, qr.cc, quad.cc, quadcc.cc, qz.cc, rand.cc, rcond.cc, regexp.cc, schur.cc, settings.cc, settings.h, sighandlers.cc, sighandlers.h, sparse-xdiv.cc, sparse-xdiv.h, sparse-xpow.cc, sparse-xpow.h, sparse.cc, spparms.cc, sqrtm.cc, stack-frame.cc, stack-frame.h, stream-euler.cc, strfind.cc, strfns.cc, sub2ind.cc, svd.cc, sylvester.cc, symbfact.cc, syminfo.cc, syminfo.h, symrcm.cc, symrec.cc, symrec.h, symscope.cc, symscope.h, symtab.cc, symtab.h, syscalls.cc, sysdep.cc, sysdep.h, text-engine.cc, text-engine.h, text-renderer.cc, text-renderer.h, time.cc, toplev.cc, tril.cc, tsearch.cc, typecast.cc, url-handle-manager.cc, url-handle-manager.h, urlwrite.cc, utils.cc, utils.h, variables.cc, variables.h, xdiv.cc, xdiv.h, xnorm.cc, xnorm.h, xpow.cc, xpow.h, __delaunayn__.cc, __fltk_uigetfile__.cc, __glpk__.cc, __init_fltk__.cc, __init_gnuplot__.cc, __ode15__.cc, __voronoi__.cc, audiodevinfo.cc, audioread.cc, convhulln.cc, fftw.cc, gzip.cc, mk-build-env-features.sh, mk-builtins.pl, cdef-class.cc, cdef-class.h, cdef-fwd.h, cdef-manager.cc, cdef-manager.h, cdef-method.cc, cdef-method.h, cdef-object.cc, cdef-object.h, cdef-package.cc, cdef-package.h, cdef-property.cc, cdef-property.h, cdef-utils.cc, cdef-utils.h, ov-base.cc, ov-base.h, ov-bool-mat.cc, ov-builtin.h, ov-cell.cc, ov-class.cc, ov-class.h, ov-classdef.cc, ov-classdef.h, ov-complex.cc, ov-fcn-handle.cc, ov-fcn-handle.h, ov-fcn.h, ov-java.cc, ov-java.h, ov-mex-fcn.h, ov-null-mat.cc, ov-oncleanup.cc, ov-struct.cc, ov-typeinfo.cc, ov-typeinfo.h, ov-usr-fcn.cc, ov-usr-fcn.h, ov.cc, ov.h, octave.cc, octave.h, mk-ops.sh, op-b-b.cc, op-b-bm.cc, op-b-sbm.cc, op-bm-b.cc, op-bm-bm.cc, op-bm-sbm.cc, op-cdm-cdm.cc, op-cell.cc, op-chm.cc, op-class.cc, op-cm-cm.cc, op-cm-cs.cc, op-cm-m.cc, op-cm-s.cc, op-cm-scm.cc, op-cm-sm.cc, op-cs-cm.cc, op-cs-cs.cc, op-cs-m.cc, op-cs-s.cc, op-cs-scm.cc, op-cs-sm.cc, op-dm-dm.cc, op-dm-scm.cc, op-dm-sm.cc, op-dm-template.cc, op-dms-template.cc, op-fcdm-fcdm.cc, op-fcm-fcm.cc, op-fcm-fcs.cc, op-fcm-fm.cc, op-fcm-fs.cc, op-fcn.cc, op-fcs-fcm.cc, op-fcs-fcs.cc, op-fcs-fm.cc, op-fcs-fs.cc, op-fdm-fdm.cc, op-fm-fcm.cc, op-fm-fcs.cc, op-fm-fm.cc, op-fm-fs.cc, op-fs-fcm.cc, op-fs-fcs.cc, op-fs-fm.cc, op-fs-fs.cc, op-i16-i16.cc, op-i32-i32.cc, op-i64-i64.cc, op-i8-i8.cc, op-int-concat.cc, op-m-cm.cc, op-m-cs.cc, op-m-m.cc, op-m-s.cc, op-m-scm.cc, op-m-sm.cc, op-mi.cc, op-pm-pm.cc, op-pm-scm.cc, op-pm-sm.cc, op-pm-template.cc, op-range.cc, op-s-cm.cc, op-s-cs.cc, op-s-m.cc, op-s-s.cc, op-s-scm.cc, op-s-sm.cc, op-sbm-b.cc, op-sbm-bm.cc, op-sbm-sbm.cc, op-scm-cm.cc, op-scm-cs.cc, op-scm-m.cc, op-scm-s.cc, op-scm-scm.cc, op-scm-sm.cc, op-sm-cm.cc, op-sm-cs.cc, op-sm-m.cc, op-sm-s.cc, op-sm-scm.cc, op-sm-sm.cc, op-str-m.cc, op-str-s.cc, op-str-str.cc, op-struct.cc, op-ui16-ui16.cc, op-ui32-ui32.cc, op-ui64-ui64.cc, op-ui8-ui8.cc, ops.h, anon-fcn-validator.cc, anon-fcn-validator.h, bp-table.cc, bp-table.h, comment-list.cc, comment-list.h, filepos.h, lex.h, lex.ll, oct-lvalue.cc, oct-lvalue.h, oct-parse.yy, parse.h, profiler.cc, profiler.h, pt-anon-scopes.cc, pt-anon-scopes.h, pt-arg-list.cc, pt-arg-list.h, pt-args-block.cc, pt-args-block.h, pt-array-list.cc, pt-array-list.h, pt-assign.cc, pt-assign.h, pt-binop.cc, pt-binop.h, pt-bp.cc, pt-bp.h, pt-cbinop.cc, pt-cbinop.h, pt-cell.cc, pt-cell.h, pt-check.cc, pt-check.h, pt-classdef.cc, pt-classdef.h, pt-cmd.h, pt-colon.cc, pt-colon.h, pt-const.cc, pt-const.h, pt-decl.cc, pt-decl.h, pt-eval.cc, pt-eval.h, pt-except.cc, pt-except.h, pt-exp.cc, pt-exp.h, pt-fcn-handle.cc, pt-fcn-handle.h, pt-id.cc, pt-id.h, pt-idx.cc, pt-idx.h, pt-jump.h, pt-loop.cc, pt-loop.h, pt-mat.cc, pt-mat.h, pt-misc.cc, pt-misc.h, pt-pr-code.cc, pt-pr-code.h, pt-select.cc, pt-select.h, pt-spmd.cc, pt-spmd.h, pt-stmt.cc, pt-stmt.h, pt-tm-const.cc, pt-tm-const.h, pt-unop.cc, pt-unop.h, pt-vm-eval.cc, pt-walk.cc, pt-walk.h, pt.cc, pt.h, token.cc, token.h, Range.cc, Range.h, idx-vector.cc, idx-vector.h, range-fwd.h, CollocWt.cc, CollocWt.h, aepbalance.cc, aepbalance.h, chol.cc, chol.h, gepbalance.cc, gepbalance.h, gsvd.cc, gsvd.h, hess.cc, hess.h, lo-mappers.cc, lo-mappers.h, lo-specfun.cc, lo-specfun.h, lu.cc, lu.h, oct-convn.cc, oct-convn.h, oct-fftw.cc, oct-fftw.h, oct-norm.cc, oct-norm.h, oct-rand.cc, oct-rand.h, oct-spparms.cc, oct-spparms.h, qr.cc, qr.h, qrp.cc, qrp.h, randgamma.cc, randgamma.h, randmtzig.cc, randmtzig.h, randpoisson.cc, randpoisson.h, schur.cc, schur.h, sparse-chol.cc, sparse-chol.h, sparse-lu.cc, sparse-lu.h, sparse-qr.cc, sparse-qr.h, svd.cc, svd.h, child-list.cc, child-list.h, dir-ops.cc, dir-ops.h, file-ops.cc, file-ops.h, file-stat.cc, file-stat.h, lo-sysdep.cc, lo-sysdep.h, lo-sysinfo.cc, lo-sysinfo.h, mach-info.cc, mach-info.h, oct-env.cc, oct-env.h, oct-group.cc, oct-group.h, oct-password.cc, oct-password.h, oct-syscalls.cc, oct-syscalls.h, oct-time.cc, oct-time.h, oct-uname.cc, oct-uname.h, action-container.cc, action-container.h, base-list.h, cmd-edit.cc, cmd-edit.h, cmd-hist.cc, cmd-hist.h, f77-fcn.h, file-info.cc, file-info.h, lo-array-errwarn.cc, lo-array-errwarn.h, lo-hash.cc, lo-hash.h, lo-ieee.h, lo-regexp.cc, lo-regexp.h, lo-utils.cc, lo-utils.h, oct-base64.cc, oct-base64.h, oct-glob.cc, oct-glob.h, oct-inttypes.h, oct-mutex.cc, oct-mutex.h, oct-refcount.h, oct-shlib.cc, oct-shlib.h, oct-sparse.cc, oct-sparse.h, oct-string.h, octave-preserve-stream-state.h, pathsearch.cc, pathsearch.h, quit.cc, quit.h, unwind-prot.cc, unwind-prot.h, url-transfer.cc, url-transfer.h : Use new macros to begin/end C++ namespaces.
author Rik <rik@octave.org>
date Thu, 01 Dec 2022 14:23:45 -0800
parents 864d29fb932b
children aac27ad79be6
line wrap: on
line source

////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2002-2022 The Octave Project Developers
//
// See the file COPYRIGHT.md in the top-level directory of this
// distribution or <https://octave.org/copyright/>.
//
// This file is part of Octave.
//
// Octave is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Octave is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Octave; see the file COPYING.  If not, see
// <https://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////////////

#if defined (HAVE_CONFIG_H)
#  include "config.h"
#endif

#include <list>
#include <sstream>

#include "base-list.h"
#include "oct-locbuf.h"
#include "quit.h"
#include "lo-regexp.h"
#include "str-vec.h"

#include "defun.h"
#include "Cell.h"
#include "error.h"
#include "errwarn.h"
#include "oct-map.h"
#include "ovl.h"
#include "utils.h"

OCTAVE_BEGIN_NAMESPACE(octave)

// Replace backslash escapes in a string with the real values.  We need
// two special functions instead of the one in utils.cc because the set
// of escape sequences used for regexp patterns and replacement strings
// is different from those used in the *printf functions.

static std::string
do_regexp_ptn_string_escapes (const std::string& s, bool is_sq_str)
{
  std::string retval;

  std::size_t i = 0;
  std::size_t j = 0;
  std::size_t len = s.length ();

  retval.resize (len);

  while (j < len)
    {
      if (s[j] == '\\' && j+1 < len)
        {
          switch (s[++j])
            {
            case 'b': // backspace
              if (is_sq_str)
                retval[i] = '\b';
              else
                {
                  // Pass escape sequence through
                  retval[i] = '\\';
                  retval[++i] = 'b';
                }
              break;

            // Translate \< and \> to PCRE patterns for pseudo-word boundary
            case '<': // begin word boundary
              retval.insert (i, "(?<=\\W|^)");
              i += 8;
              break;

            case '>': // end word boundary
              retval.insert (i, "(?=\\W|$)");
              i += 7;
              break;

            case 'o': // octal input
            {
              bool bad_esc_seq = (j+1 >= len);

              bool brace = false;
              if (! bad_esc_seq && s[++j] == '{')
                {
                  brace = true;
                  j++;
                }

              int tmpi = 0;
              std::size_t k;
              for (k = j; k < std::min (j+3+brace, len); k++)
                {
                  int digit = s[k] - '0';
                  if (digit < 0 || digit > 7)
                    break;
                  tmpi <<= 3;
                  tmpi += digit;
                }
              if (bad_esc_seq || (brace && s[k++] != '}'))
                {
                  tmpi = 0;
                  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
                }
              retval[i] = tmpi;
              j = k - 1;
              break;
            }

            default:  // pass escape sequence through
              retval[i] = '\\';
              retval[++i] = s[j];
              break;
            }
        }
      else
        {
          retval[i] = s[j];
        }

      i++;
      j++;
    }

  retval.resize (i);

  return retval;
}

static std::string
do_regexp_rep_string_escapes (const std::string& s)
{
  std::string retval;

  std::size_t i = 0;
  std::size_t j = 0;
  std::size_t len = s.length ();

  retval.resize (len);

  while (j < len)
    {
      if (s[j] == '\\' && j+1 < len)
        {
          switch (s[++j])
            {
            case 'a': // alarm
              retval[i] = '\a';
              break;

            case 'b': // backspace
              retval[i] = '\b';
              break;

            case 'f': // formfeed
              retval[i] = '\f';
              break;

            case 'n': // newline
              retval[i] = '\n';
              break;

            case 'r': // carriage return
              retval[i] = '\r';
              break;

            case 't': // horizontal tab
              retval[i] = '\t';
              break;

            case 'v': // vertical tab
              retval[i] = '\v';
              break;

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7': // octal input
            {
              std::size_t k;
              int tmpi = s[j] - '0';
              for (k = j+1; k < std::min (j+3, len); k++)
                {
                  int digit = s[k] - '0';
                  if (digit < 0 || digit > 7)
                    break;
                  tmpi <<= 3;
                  tmpi += digit;
                }
              retval[i] = tmpi;
              j = k - 1;
              break;
            }

            case 'o': // octal input
            {
              bool bad_esc_seq = (j+1 >= len);

              bool brace = false;
              if (! bad_esc_seq && s[++j] == '{')
                {
                  brace = true;
                  j++;
                }

              int tmpi = 0;
              std::size_t k;
              for (k = j; k < std::min (j+3+brace, len); k++)
                {
                  int digit = s[k] - '0';
                  if (digit < 0 || digit > 7)
                    break;
                  tmpi <<= 3;
                  tmpi += digit;
                }
              if (bad_esc_seq || (brace && s[k++] != '}'))
                {
                  warning (R"(malformed octal escape sequence '\o' -- converting to '\0')");
                  tmpi = 0;
                }
              retval[i] = tmpi;
              j = k - 1;
              break;
            }

            case 'x': // hex input
            {
              bool bad_esc_seq = (j+1 >= len);

              bool brace = false;
              if (! bad_esc_seq && s[++j] == '{')
                {
                  brace = true;
                  j++;
                }

              int tmpi = 0;
              std::size_t k;
              for (k = j; k < std::min (j+2+brace, len); k++)
                {
                  if (! isxdigit (s[k]))
                    break;

                  tmpi <<= 4;
                  int digit = s[k];
                  if (digit >= 'a')
                    tmpi += digit - 'a' + 10;
                  else if (digit >= 'A')
                    tmpi += digit - 'A' + 10;
                  else
                    tmpi += digit - '0';
                }
              if (bad_esc_seq || (brace && s[k++] != '}'))
                {
                  warning (R"(malformed hex escape sequence '\x' -- converting to '\0')");
                  tmpi = 0;
                }
              retval[i] = tmpi;
              j = k - 1;
              break;
            }

            // Both dollar sign (for capture buffer) and backslash are
            // passed through with their escape backslash.  The processing
            // for these must occur during the actual replacement operation
            // in lo-regexp.cc.
            case '$':  // pass dollar sign through with escape
              retval[i] = '\\'; retval[++i] = '$';
              break;

            case '\\': // pass backslash through with escape
              retval[i] = '\\'; retval[++i] = '\\';
              break;

            default:   // convert escaped character to unescaped char
              retval[i] = s[j];
              break;
            }
        }
      else
        {
          retval[i] = s[j];
        }

      i++;
      j++;
    }

  retval.resize (i);

  return retval;
}

static void
parse_options (regexp::opts& options, const octave_value_list& args,
               const std::string& who, int skip, bool& extra_args)
{
  extra_args = false;

  for (int i = skip; i < args.length (); i++)
    {
      std::string str;

      str = args(i).xstring_value ("%s: optional arguments must be strings", who.c_str ());

      std::transform (str.begin (), str.end (), str.begin (), tolower);

      if (str.find ("once", 0) == 0)
        options.once (true);
      else if (str.find ("matchcase", 0) == 0)
        options.case_insensitive (false);
      else if (str.find ("ignorecase", 0) == 0)
        options.case_insensitive (true);
      else if (str.find ("dotall", 0) == 0)
        options.dotexceptnewline (false);
      else if (str.find ("stringanchors", 0) == 0)
        options.lineanchors (false);
      else if (str.find ("literalspacing", 0) == 0)
        options.freespacing (false);
      else if (str.find ("noemptymatch", 0) == 0)
        options.emptymatch (false);
      else if (str.find ("dotexceptnewline", 0) == 0)
        options.dotexceptnewline (true);
      else if (str.find ("lineanchors", 0) == 0)
        options.lineanchors (true);
      else if (str.find ("freespacing", 0) == 0)
        options.freespacing (true);
      else if (str.find ("emptymatch", 0) == 0)
        options.emptymatch (true);
      else if (str.find ("start", 0) == 0
               || str.find ("end", 0) == 0
               || str.find ("tokenextents", 0) == 0
               || str.find ("match", 0) == 0
               || str.find ("tokens", 0) == 0
               || str.find ("names", 0) == 0
               || str.find ("split", 0) == 0)
        extra_args = true;
      else
        error ("%s: unrecognized option", who.c_str ());
    }
}

static octave_value_list
octregexp (const octave_value_list& args, int nargout,
           const std::string& who, bool case_insensitive = false)
{
  octave_value_list retval;

  int nargin = args.length ();

  // Make sure we have string, pattern
  const std::string buffer = args(0).string_value ();

  std::string pattern = args(1).string_value ();

  // Rewrite pattern for PCRE
  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());

  regexp::opts options;
  options.case_insensitive (case_insensitive);
  bool extra_options = false;
  parse_options (options, args, who, 2, extra_options);

  const regexp::match_data rx_lst
    = regexp::match (pattern, buffer, options, who);

  string_vector named_pats = rx_lst.named_patterns ();

  std::size_t sz = rx_lst.size ();

  // Converted the linked list in the correct form for the return values

  octave_map nmap (dim_vector ((sz == 0 ? 0 : 1), sz), named_pats);

  retval.resize (7);

  if (sz != 0)
    {
      for (int j = 0; j < named_pats.numel (); j++)
        {
          Cell ctmp (dim_vector (1, sz));
          octave_idx_type i = 0;

          for (const auto& match_data : rx_lst)
            {
              string_vector named_tokens = match_data.named_tokens ();

              ctmp(i++) = named_tokens(j);
            }

          nmap.assign (named_pats(j), ctmp);
        }
    }
  retval(5) = nmap;

  if (options.once ())
    {
      auto p = rx_lst.begin ();

      retval(4) = (sz ? p->tokens () : Cell ());
      retval(3) = (sz ? p->match_string () : "");
      retval(2) = (sz ? p->token_extents () : Matrix ());

      if (sz)
        {
          double start = p->start ();
          double end = p->end ();

          Cell split (dim_vector (1, 2));
          split(0) = buffer.substr (0, start-1);
          split(1) = buffer.substr (end);

          retval(6) = split;
          retval(1) = end;
          retval(0) = start;
        }
      else
        {
          retval(6) = buffer;
          retval(1) = Matrix ();
          retval(0) = Matrix ();
        }
    }
  else
    {
      Cell tokens (dim_vector (1, sz));
      Cell match_string (dim_vector (1, sz));
      Cell token_extents (dim_vector (1, sz));
      NDArray end (dim_vector (1, sz));
      NDArray start (dim_vector (1, sz));
      Cell split (dim_vector (1, sz+1));
      std::size_t sp_start = 0;

      octave_idx_type i = 0;
      for (const auto& match_data : rx_lst)
        {
          double s = match_data.start ();
          double e = match_data.end ();

          string_vector tmp = match_data.tokens ();
          tokens(i) = Cell (dim_vector (1, tmp.numel ()), tmp);
          match_string(i) = match_data.match_string ();
          token_extents(i) = match_data.token_extents ();
          end(i) = e;
          start(i) = s;
          split(i) = buffer.substr (sp_start, s-sp_start-1);
          sp_start = e;
          i++;
        }

      split(i) = buffer.substr (sp_start);

      retval(6) = split;
      retval(4) = tokens;
      retval(3) = match_string;
      retval(2) = token_extents;
      retval(1) = end;
      retval(0) = start;
    }

  // Alter the order of the output arguments

  if (extra_options)
    {
      int n = 0;
      octave_value_list new_retval;
      new_retval.resize (nargout);

      bool arg_used[7] {};

      for (int j = 2; j < nargin; j++)
        {
          int k = 0;
          std::string str = args(j).string_value ();
          std::transform (str.begin (), str.end (), str.begin (), tolower);

          if (str.find ("once", 0) == 0
              || str.find ("stringanchors", 0) == 0
              || str.find ("lineanchors", 0) == 0
              || str.find ("matchcase", 0) == 0
              || str.find ("ignorecase", 0) == 0
              || str.find ("dotall", 0) == 0
              || str.find ("dotexceptnewline", 0) == 0
              || str.find ("literalspacing", 0) == 0
              || str.find ("freespacing", 0) == 0
              || str.find ("noemptymatch", 0) == 0
              || str.find ("emptymatch", 0) == 0)
            continue;
          else if (str.find ("start", 0) == 0)
            k = 0;
          else if (str.find ("end", 0) == 0)
            k = 1;
          else if (str.find ("tokenextents", 0) == 0)
            k = 2;
          else if (str.find ("match", 0) == 0)
            k = 3;
          else if (str.find ("tokens", 0) == 0)
            k = 4;
          else if (str.find ("names", 0) == 0)
            k = 5;
          else if (str.find ("split", 0) == 0)
            k = 6;

          new_retval(n++) = retval(k);
          arg_used[k] = true;

          if (n == nargout)
            break;
        }

      // Fill in the rest of the arguments
      if (n < nargout)
        {
          for (int j = 0; j < 7; j++)
            {
              if (! arg_used[j])
                new_retval(n++) = retval(j);
            }
        }

      retval = new_retval;
    }

  return retval;
}

static octave_value_list
octcellregexp (const octave_value_list& args, int nargout,
               const std::string& who, bool case_insensitive = false)
{
  octave_value_list retval;

  if (args(0).iscell ())
    {
      OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
      octave_value_list new_args = args;
      Cell cellstr = args(0).cell_value ();
      if (args(1).iscell ())
        {
          Cell cellpat = args(1).cell_value ();

          if (cellpat.numel () == 1)
            {
              for (int j = 0; j < nargout; j++)
                newretval[j].resize (cellstr.dims ());

              new_args(1) = cellpat(0);

              for (octave_idx_type i = 0; i < cellstr.numel (); i++)
                {
                  new_args(0) = cellstr(i);
                  octave_value_list tmp = octregexp (new_args, nargout, who,
                                                     case_insensitive);

                  for (int j = 0; j < nargout; j++)
                    newretval[j](i) = tmp(j);
                }
            }
          else if (cellstr.numel () == 1)
            {
              for (int j = 0; j < nargout; j++)
                newretval[j].resize (cellpat.dims ());

              new_args(0) = cellstr(0);

              for (octave_idx_type i = 0; i < cellpat.numel (); i++)
                {
                  new_args(1) = cellpat(i);
                  octave_value_list tmp = octregexp (new_args, nargout, who,
                                                     case_insensitive);

                  for (int j = 0; j < nargout; j++)
                    newretval[j](i) = tmp(j);
                }
            }
          else if (cellstr.numel () == cellpat.numel ())
            {
              if (cellstr.dims () != cellpat.dims ())
                error ("%s: inconsistent cell array dimensions", who.c_str ());

              for (int j = 0; j < nargout; j++)
                newretval[j].resize (cellstr.dims ());

              for (octave_idx_type i = 0; i < cellstr.numel (); i++)
                {
                  new_args(0) = cellstr(i);
                  new_args(1) = cellpat(i);

                  octave_value_list tmp = octregexp (new_args, nargout, who,
                                                     case_insensitive);

                  for (int j = 0; j < nargout; j++)
                    newretval[j](i) = tmp(j);
                }
            }
          else
            error ("regexp: cell array arguments must be scalar or equal size");
        }
      else
        {
          for (int j = 0; j < nargout; j++)
            newretval[j].resize (cellstr.dims ());

          for (octave_idx_type i = 0; i < cellstr.numel (); i++)
            {
              new_args(0) = cellstr(i);
              octave_value_list tmp = octregexp (new_args, nargout, who,
                                                 case_insensitive);

              for (int j = 0; j < nargout; j++)
                newretval[j](i) = tmp(j);
            }
        }

      for (int j = 0; j < nargout; j++)
        retval(j) = octave_value (newretval[j]);
    }
  else if (args(1).iscell ())
    {
      OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout);
      octave_value_list new_args = args;
      Cell cellpat = args(1).cell_value ();

      for (int j = 0; j < nargout; j++)
        newretval[j].resize (cellpat.dims ());

      for (octave_idx_type i = 0; i < cellpat.numel (); i++)
        {
          new_args(1) = cellpat(i);
          octave_value_list tmp = octregexp (new_args, nargout, who,
                                             case_insensitive);

          for (int j = 0; j < nargout; j++)
            newretval[j](i) = tmp(j);
        }

      for (int j = 0; j < nargout; j++)
        retval(j) = octave_value (newretval[j]);
    }
  else
    retval = octregexp (args, nargout, who, case_insensitive);

  return retval;

}

DEFUN (regexp, args, nargout,
       doc: /* -*- texinfo -*-
@deftypefn  {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexp (@var{str}, @var{pat})
@deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{})
Regular expression string matching.

Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
substrings of any matches, or empty values if there are none.

The matched pattern @var{pat} can include any of the standard regex
operators, including:

@table @code
@item .
Match any character

@item * + ? @{@}
Repetition operators, representing

@table @code
@item *
Match zero or more times

@item +
Match one or more times

@item ?
Match zero or one times

@item @{@var{n}@}
Match exactly @var{n} times

@item @{@var{n},@}
Match @var{n} or more times

@item @{@var{m},@var{n}@}
Match between @var{m} and @var{n} times
@end table

@item [@dots{}] [^@dots{}]

List operators.  The pattern will match any character listed between
@qcode{"["} and @qcode{"]"}.  If the first character is @qcode{"^"} then the
pattern is inverted and any character except those listed between brackets
will match.

Escape sequences defined below can also be used inside list operators.  For
example, a template for a floating point number might be @code{[-+.\d]+}.

@item () (?:)
Grouping operator.  The first form, parentheses only, also creates a token.

@item |
Alternation operator.  Match one of a choice of regular expressions.  The
alternatives must be delimited by the grouping operator @code{()} above.

@item ^ $
Anchoring operators.  Requires pattern to occur at the start (@code{^}) or
end (@code{$}) of the string.
@end table

In addition, the following escaped characters have special meaning.

@table @code

@item \d
Match any digit

@item \D
Match any non-digit

@item \s
Match any whitespace character

@item \S
Match any non-whitespace character

@item \w
Match any word character

@item \W
Match any non-word character

@item \<
Match the beginning of a word

@item \>
Match the end of a word

@item \B
Match within a word
@end table

Implementation Note: For compatibility with @sc{matlab}, escape sequences
in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
even when @var{pat} has been defined with single quotes.  To disable
expansion use a second backslash before the escape sequence (e.g.,
"@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
function.

The outputs of @code{regexp} default to the order given below

@table @var
@item s
The start indices of each matching substring

@item e
The end indices of each matching substring

@item te
The extents of each matched token surrounded by @code{(@dots{})} in
@var{pat}

@item m
A cell array of the text of each match

@item t
A cell array of the text of each token matched

@item nm
A structure containing the text of each matched named token, with the name
being used as the fieldname.  A named token is denoted by
@code{(?<name>@dots{})}.

@item sp
A cell array of the text not returned by match, i.e., what remains if you
split the string based on @var{pat}.
@end table

Particular output arguments, or the order of the output arguments, can be
selected by additional @var{opt} arguments.  These are strings and the
correspondence between the output arguments and the optional argument
are

@multitable @columnfractions 0.2 0.3 0.3 0.2
@item @tab @qcode{'start'}        @tab @var{s}  @tab
@item @tab @qcode{'end'}          @tab @var{e}  @tab
@item @tab @qcode{'tokenExtents'} @tab @var{te} @tab
@item @tab @qcode{'match'}        @tab @var{m}  @tab
@item @tab @qcode{'tokens'}       @tab @var{t}  @tab
@item @tab @qcode{'names'}        @tab @var{nm} @tab
@item @tab @qcode{'split'}        @tab @var{sp} @tab
@end multitable

Additional arguments are summarized below.

@table @samp
@item once
Return only the first occurrence of the pattern.

@item matchcase
Make the matching case sensitive.  (default)

Alternatively, use (?-i) in the pattern.

@item ignorecase
Ignore case when matching the pattern to the string.

Alternatively, use (?i) in the pattern.

@item stringanchors
Match the anchor characters at the beginning and end of the string.
(default)

Alternatively, use (?-m) in the pattern.

@item lineanchors
Match the anchor characters at the beginning and end of the line.

Alternatively, use (?m) in the pattern.

@item dotall
The pattern @code{.} matches all characters including the newline character.
 (default)

Alternatively, use (?s) in the pattern.

@item dotexceptnewline
The pattern @code{.} matches all characters except the newline character.

Alternatively, use (?-s) in the pattern.

@item literalspacing
All characters in the pattern, including whitespace, are significant and are
used in pattern matching.  (default)

Alternatively, use (?-x) in the pattern.

@item freespacing
The pattern may include arbitrary whitespace and also comments beginning
with the character @samp{#}.

Alternatively, use (?x) in the pattern.

@item noemptymatch
Zero-length matches are not returned.  (default)

@item emptymatch
Return zero-length matches.

@code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there
are zero or more @qcode{'b'} characters at positions 1 and end-of-string.

@end table

Stack Limitation Note: Pattern searches are done with a recursive function
which can overflow the program stack when there are a high number of matches.
For example,

@example
@code{regexp (repmat ('a', 1, 1e5), '(a)+')}
@end example

@noindent
may lead to a segfault.  As an alternative, consider constructing pattern
searches that reduce the number of matches (e.g., by creatively using set
complement), and then further processing the return variables (now reduced in
size) with successive @code{regexp} searches.
@seealso{regexpi, strfind, regexprep}
@end deftypefn */)
{
  if (args.length () < 2)
    print_usage ();

  octave_value_list retval;

  if (args(0).iscell () || args(1).iscell ())
    retval = (octcellregexp (args, (nargout > 0 ? nargout : 1), "regexp"));
  else
    retval = octregexp (args, nargout, "regexp");

  return retval;
}

/*
## PCRE_ERROR_MATCHLIMIT test
%!test
%! s = sprintf ('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
%! ws = warning ("query");
%! unwind_protect
%!   warning ("off");
%!   regexp (s, '(\s*-*\d+[.]*\d*\s*)+\n');
%! unwind_protect_cleanup
%!   warning (ws);
%! end_unwind_protect

## segfault test
%!assert (regexp ("abcde", "."), [1,2,3,4,5])
%!assert <*62704> (regexpi('(', '\(?'), 1)
## Infinite loop test
%!assert (isempty (regexp ("abcde", "")))

## Check that anchoring of pattern works correctly
%!assert (regexp ('abcabc', '^abc'), 1)
%!assert (regexp ('abcabc', 'abc$'), 4)
%!assert (regexp ('abcabc', '^abc$'), zeros (1,0))

## UTF-8 test with character vector "âé🙂ïõù"
%!assert (regexp (char ([195, 162, 195, 169, 240, 159, 153, 130, 195, 175, ...
%!                       195, 181, 195, 185]), "."), [1, 3, 5, 9, 11, 13])

%!test
%! [s, e, te, m, t] = regexp (' No Match ', 'f(.*)uck');
%! assert (s, zeros (1,0));
%! assert (e, zeros (1,0));
%! assert (te, cell (1,0));
%! assert (m, cell (1,0));
%! assert (t, cell (1,0));

%!test
%! [s, e, te, m, t] = regexp (' FiRetrUck ', 'f(.*)uck');
%! assert (s, zeros (1,0));
%! assert (e, zeros (1,0));
%! assert (te, cell (1,0));
%! assert (m, cell (1,0));
%! assert (t, cell (1,0));

%!test
%! [s, e, te, m, t] = regexp (' firetruck ', 'f(.*)uck');
%! assert (s, 2);
%! assert (e, 10);
%! assert (te{1}, [3, 7]);
%! assert (m{1}, 'firetruck');
%! assert (t{1}{1}, 'iretr');

%!test
%! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*');
%! assert (s, [1, 12]);
%! assert (e, [5, 17]);
%! assert (size (te), [1, 2]);
%! assert (isempty (te{1}));
%! assert (isempty (te{2}));
%! assert (m{1}, 'short');
%! assert (m{2}, 'string');
%! assert (size (t), [1, 2]);
%! assert (isempty (t{1}));
%! assert (isempty (t{2}));

%!test
%! [s, e, te, m, t] = regexp ('short test string', '\w*r\w*', 'once');
%! assert (s, 1);
%! assert (e, 5);
%! assert (isempty (te));
%! assert (m, 'short');
%! assert (isempty (t));

%!test
%! [m, te, e, s, t] = regexp ('short test string', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s, 1);
%! assert (e, 5);
%! assert (isempty (te));
%! assert (m, 'short');
%! assert (isempty (t));

%!test
%! [s, e, te, m, t, nm] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)');
%! assert (s, 1);
%! assert (e, 10);
%! assert (size (te), [1, 1]);
%! assert (te{1}, [1,5; 7,10]);
%! assert (m{1}, 'short test');
%! assert (size (t), [1, 1]);
%! assert (t{1}{1}, 'short');
%! assert (t{1}{2}, 'test');
%! assert (size (nm), [1, 1]);
%! assert (! isempty (fieldnames (nm)));
%! assert (sort (fieldnames (nm)), {'word1';'word2'});
%! assert (nm.word1, 'short');
%! assert (nm.word2, 'test');

%!test
%! [nm, m, te, e, s, t] = regexp ('short test string', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s, 1);
%! assert (e, 10);
%! assert (size (te), [1, 1]);
%! assert (te{1}, [1,5; 7,10]);
%! assert (m{1}, 'short test');
%! assert (size (t), [1, 1]);
%! assert (t{1}{1}, 'short');
%! assert (t{1}{2}, 'test');
%! assert (size (nm), [1, 1]);
%! assert (! isempty (fieldnames (nm)));
%! assert (sort (fieldnames (nm)), {'word1';'word2'});
%! assert (nm.word1, 'short');
%! assert (nm.word2, 'test');

%!test
%! [t, nm] = regexp ("John Davis\nRogers, James", '(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)', 'tokens', 'names');
%! assert (size (t), [1, 2]);
%! assert (t{1}{1}, "John");
%! assert (t{1}{2}, "Davis");
%! assert (t{2}{1}, "Rogers");
%! assert (t{2}{2}, "James");
%! assert (size (nm), [1, 2]);
%! assert (nm(1).first, "John");
%! assert (nm(1).last, "Davis");
%! assert (nm(2).first, "James");
%! assert (nm(2).last, "Rogers");

## Tests for nulls in strings properly matching
%!test
%! str = "A\0B\0\0C";
%! ptn = '(\0+)';  # also test null in single-quote pattern
%! M = regexp (str, ptn, "match");
%! assert (size (M), [1, 2]);
%! assert (double (M{1}), [0]);
%! assert (double (M{2}), [0, 0]);

%!test
%! str = "A\0B\0\0C";
%! ptn = "(\0+)";  # also test null in double-quote pattern
%! T = regexp (str, ptn, "tokens");
%! assert (size (T), [1, 2]);
%! assert (double (T{1}{1}), [0]);
%! assert (double (T{2}{1}), [0, 0]);

%!test
%! str = "A\0B\0\0C";
%! ptn = '(?<namedtoken>\0+)';
%! NT = regexp (str, ptn, "names");
%! assert (size (NT), [1, 2]);
%! assert (double (NT(1).namedtoken), [0]);
%! assert (double (NT(2).namedtoken), [0, 0]);

## Tests for named tokens
%!test
%! ## Parenthesis in named token (ie (int)) causes a problem
%! assert (regexp ('qwe int asd', ['(?<typestr>(int))'], 'names'),
%!         struct ('typestr', 'int'));

%!test <*35683>
%! ## Mix of named and unnamed tokens can cause segfault
%! str = "abcde";
%! ptn = '(?<T1>a)(\w+)(?<T2>d\w+)';
%! tokens = regexp (str, ptn, "names");
%! assert (isstruct (tokens) && numel (tokens) == 1);
%! assert (tokens.T1, "a");
%! assert (tokens.T2, "de");

## Test options to regexp
%!assert (regexp ("abc\nabc", '.'), [1:7])
%!assert (regexp ("abc\nabc", '.', 'dotall'), [1:7])
%!test
%! assert (regexp ("abc\nabc", '(?s).'), [1:7]);
%! assert (regexp ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
%! assert (regexp ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);

%!assert (regexp ("caseCaSe", 'case'), 1)
%!assert (regexp ("caseCaSe", 'case', "matchcase"), 1)
%!assert (regexp ("caseCaSe", 'case', "ignorecase"), [1,5])
%!test
%! assert (regexp ("caseCaSe", '(?-i)case'), 1);
%! assert (regexp ("caseCaSe", '(?i)case'), [1, 5]);

%!assert (regexp ("abc\nabc", 'c$'), 7)
%!assert (regexp ("abc\nabc", 'c$', "stringanchors"), 7)
%!test
%! assert (regexp ("abc\nabc", '(?-m)c$'), 7);
%! assert (regexp ("abc\nabc", 'c$',"lineanchors"), [3, 7]);
%! assert (regexp ("abc\nabc", '(?m)c$'), [3,7]);

%!assert (regexp ("this word", 's w'), 4)
%!assert (regexp ("this word", 's w', 'literalspacing'), 4)
%!test
%! assert (regexp ("this word", '(?-x)s w', 'literalspacing'), 4);
%! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0));
%! assert (regexp ("this word", '(?x)s w'), zeros (1,0));

%!test
%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch');
%! assert (s, [1 5]);
%! assert (e, [3 5]);
%! assert (te, { zeros(0,2), zeros(0,2) });
%! assert (m, { "OCT", "V" });
%! assert (t, { cell(1,0), cell(1,0) });
%! assert (isempty (fieldnames (nm)));
%! assert (sp, { "", "A", "E" });

%!test
%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch');
%! assert (s, [1 5]);
%! assert (e, [3 5]);
%! assert (te, { [1 3], [5 5] });
%! assert (m, { "OCT", "V" });
%! assert (t, { {"OCT"}, {"V"} });
%! assert (isempty (fieldnames (nm)));
%! assert (sp, { "", "A", "E" });

%!test
%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch');
%! assert (s, [1 4 5 6 7]);
%! assert (e, [3 3 5 5 6]);
%! assert (te, repmat ({zeros(0,2)}, [1, 5]));
%! assert (m, { "OCT", "", "V", "", "" });
%! assert (t, repmat({cell(1,0)}, [1, 5]));
%! assert (isempty (fieldnames (nm)));
%! assert (sp, { "", "", "A", "", "E", "" });

%!test
%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch');
%! assert (s, [1 4 5 6 7]);
%! assert (e, [3 3 5 5 6]);
%! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] });
%! assert (m, { "OCT", "", "V", "", "" });
%! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} });
%! assert (isempty (fieldnames (nm)));
%! assert (sp, { "", "", "A", "", "E", "" });

%!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
%!        {6;[1,5,9];zeros(1,0)})
%!assert (regexp ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
%!        {6;[3,7];[1,9]})
%!assert (regexp ('Strings', {'t','s'}), {2, 7})

## Test case for lookaround operators
%!test
%! assert (regexp ('Iraq', 'q(?!u)'), 4);
%! assert (regexp ('quit', 'q(?!u)'), zeros (1, 0));
%! assert (regexp ('quit', 'q(?=u)' , 'match'), {'q'});
%! assert (regexp ("quit", 'q(?=u+)', 'match'), {'q'});
%! assert (regexp ("qit",  'q(?=u+)', 'match'), cell (1, 0));
%! assert (regexp ("qit",  'q(?=u*)', 'match'), {'q'});
%! assert (regexp ('thingamabob', '(?<=a)b'), 9);

## Tests for split option.
%!shared str
%! str = "foo bar foo";
%!test
%! [a, b] = regexp (str, "f..", "match", "split");
%! assert (a, {"foo", "foo"});
%! assert (b, {"", " bar ", ""});
%!test
%! [a, b] = regexp (str, "f..", "match", "split", "once");
%! assert (a, "foo");
%! assert (b, {"", " bar foo"});
%!test
%! [a, b] = regexp (str, "fx.", "match", "split");
%! assert (a, cell (1, 0));
%! assert (b, {"foo bar foo"});
%!test
%! [a, b] = regexp (str, "fx.", "match", "split", "once");
%! assert (a, "");;
%! assert (b, "foo bar foo");

%!shared str
%! str = "foo bar";
%!test
%! [a, b] = regexp (str, "f..", "match", "split");
%! assert (a, {"foo"});
%! assert (b, {"", " bar"});
%!test
%! [a, b] = regexp (str, "b..", "match", "split");
%! assert (a, {"bar"});
%! assert (b, {"foo ", ""});
%!test
%! [a, b] = regexp (str, "x", "match", "split");
%! assert (a, cell (1, 0));
%! assert (b, {"foo bar"});
%!test
%! [a, b] = regexp (str, "[o]+", "match", "split");
%! assert (a, {"oo"});
%! assert (b, {"f", " bar"});

## Test escape sequences are expanded even in single-quoted strings
%!assert (regexp ("\n", '\n'), 1)
%!assert (regexp ("\n", "\n"), 1)

## Test escape sequences are silently converted
%!test <*45407>
%! assert (regexprep ('s', 's', 'x\.y'), 'x.y');
%! assert (regexprep ('s', '(s)', 'x\$1y'), 'x$1y');
%! assert (regexprep ('s', '(s)', 'x\\$1y'), 'x\sy');

## Test start-of-word / end-of-word patterns for Matlab compatibility
%!test <*59992>
%! assert (regexp ('foo!+bar', '\<\w'), [1, 6]);
%! assert (regexp ('foo!+bar', '.\>'), [3, 4, 8]);
%! assert (regexp ('foo!+bar\nbar!+foo', '.\>'), [3, 4, 8, 13, 14, 18]);
%! assert (regexp ('foo!+bar\nbar!+foo', '\<\w'), [1, 6, 10, 16]);

## Test "incomplete" named patterns
%!assert <*62705> (regexpi ('<', '\(?<'), 1)
%!assert <*62705> (regexpi ('<n>', '\(?<n\>'), 1)
%!assert <*62705> (regexpi ('<n>', '\(?<n\>\)?'), 1)
%!assert <62705> (regexpi ('<n>a', '\(?<n\>a\)?'), 1)

## Test input validation
%!error regexp ('string', 'tri', 'BadArg')
%!error regexp ('string')

*/

DEFUN (regexpi, args, nargout,
       doc: /* -*- texinfo -*-
@deftypefn  {} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}, @var{sp}] =} regexpi (@var{str}, @var{pat})
@deftypefnx {} {[@dots{}] =} regexpi (@var{str}, @var{pat}, "@var{opt1}", @dots{})

Case insensitive regular expression string matching.

Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and
substrings of any matches, or empty values if there are none.
@xref{XREFregexp,,@code{regexp}}, for details on the syntax of the search
pattern.
@seealso{regexp}
@end deftypefn */)
{
  if (args.length () < 2)
    print_usage ();

  if (args(0).iscell () || args(1).iscell ())
    return octcellregexp (args, (nargout > 0 ? nargout : 1), "regexpi", true);
  else
    return octregexp (args, nargout, "regexpi", true);
}

/*
## segfault test
%!assert (regexpi ("abcde", "."), [1,2,3,4,5])

## Check that anchoring of pattern works correctly
%!assert (regexpi ('abcabc', '^ABC'), 1)
%!assert (regexpi ('abcabc', 'ABC$'), 4)
%!assert (regexpi ('abcabc', '^ABC$'), zeros (1,0))

%!test
%! [s, e, te, m, t] = regexpi (' No Match ', 'f(.*)uck');
%! assert (s, zeros (1,0));
%! assert (e, zeros (1,0));
%! assert (te, cell (1,0));
%! assert (m, cell (1,0));
%! assert (t, cell (1,0));

%!test
%! [s, e, te, m, t] = regexpi (' FiRetrUck ', 'f(.*)uck');
%! assert (s, 2);
%! assert (e, 10);
%! assert (te{1}, [3, 7]);
%! assert (m{1}, 'FiRetrUck');
%! assert (t{1}{1}, 'iRetr');

%!test
%! [s, e, te, m, t] = regexpi (' firetruck ', 'f(.*)uck');
%! assert (s, 2);
%! assert (e, 10);
%! assert (te{1}, [3, 7]);
%! assert (m{1}, 'firetruck');
%! assert (t{1}{1}, 'iretr');

%!test
%! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*');
%! assert (s, [1, 12]);
%! assert (e, [5, 17]);
%! assert (size (te), [1, 2]);
%! assert (isempty (te{1}));
%! assert (isempty (te{2}));
%! assert (m{1}, 'ShoRt');
%! assert (m{2}, 'String');
%! assert (size (t), [1, 2]);
%! assert (isempty (t{1}));
%! assert (isempty (t{2}));

%!test
%! [s, e, te, m, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once');
%! assert (s, 1);
%! assert (e, 5);
%! assert (isempty (te));
%! assert (m, 'ShoRt');
%! assert (isempty (t));

%!test
%! [m, te, e, s, t] = regexpi ('ShoRt Test String', '\w*r\w*', 'once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s, 1);
%! assert (e, 5);
%! assert (isempty (te));
%! assert (m, 'ShoRt');
%! assert (isempty (t));

%!test
%! [s, e, te, m, t, nm] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)');
%! assert (s, 1);
%! assert (e, 10);
%! assert (size (te), [1, 1]);
%! assert (te{1}, [1,5; 7,10]);
%! assert (m{1}, 'ShoRt Test');
%! assert (size (t), [1, 1]);
%! assert (t{1}{1}, 'ShoRt');
%! assert (t{1}{2}, 'Test');
%! assert (size (nm), [1, 1]);
%! assert (! isempty (fieldnames (nm)));
%! assert (sort (fieldnames (nm)), {'word1';'word2'});
%! assert (nm.word1, 'ShoRt');
%! assert (nm.word2, 'Test');

%!test
%! [nm, m, te, e, s, t] = regexpi ('ShoRt Test String', '(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s, 1);
%! assert (e, 10);
%! assert (size (te), [1, 1]);
%! assert (te{1}, [1,5; 7,10]);
%! assert (m{1}, 'ShoRt Test');
%! assert (size (t), [1, 1]);
%! assert (t{1}{1}, 'ShoRt');
%! assert (t{1}{2}, 'Test');
%! assert (size (nm), [1, 1]);
%! assert (! isempty (fieldnames (nm)));
%! assert (sort (fieldnames (nm)), {'word1';'word2'});
%! assert (nm.word1, 'ShoRt');
%! assert (nm.word2, 'Test');

%!assert (regexpi ("abc\nabc", '.'), [1:7])
%!assert (regexpi ("abc\nabc", '.', 'dotall'), [1:7])
%!test
%! assert (regexpi ("abc\nabc", '(?s).'), [1:7]);
%! assert (regexpi ("abc\nabc", '.', 'dotexceptnewline'), [1,2,3,5,6,7]);
%! assert (regexpi ("abc\nabc", '(?-s).'), [1,2,3,5,6,7]);

%!assert (regexpi ("caseCaSe", 'case'), [1, 5])
%!assert (regexpi ("caseCaSe", 'case', "matchcase"), 1)
%!assert (regexpi ("caseCaSe", 'case', "ignorecase"), [1, 5])
%!test
%! assert (regexpi ("caseCaSe", '(?-i)case'), 1);
%! assert (regexpi ("caseCaSe", '(?i)case'), [1, 5]);

%!assert (regexpi ("abc\nabc", 'C$'), 7)
%!assert (regexpi ("abc\nabc", 'C$', "stringanchors"), 7)
%!test
%! assert (regexpi ("abc\nabc", '(?-m)C$'), 7);
%! assert (regexpi ("abc\nabc", 'C$', "lineanchors"), [3, 7]);
%! assert (regexpi ("abc\nabc", '(?m)C$'), [3, 7]);

%!assert (regexpi ("this word", 'S w'), 4)
%!assert (regexpi ("this word", 'S w', 'literalspacing'), 4)
%!test
%! assert (regexpi ("this word", '(?-x)S w', 'literalspacing'), 4);
%! assert (regexpi ("this word", 'S w', 'freespacing'), zeros (1,0));
%! assert (regexpi ("this word", '(?x)S w'), zeros (1,0));

%!error regexpi ('string', 'tri', 'BadArg')
%!error regexpi ('string')

%!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, '-'),
%!        {6;[1,5,9];zeros(1, 0)})
%!assert (regexpi ({'asdfg-dfd', '-dfd-dfd-', 'qasfdfdaq'}, '-'),
%!        {6, [1,5,9], zeros(1,0)})
%!assert (regexpi ({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'}, {'-';'f';'q'}),
%!        {6;[3,7];[1,9]})
%!assert (regexpi ('Strings', {'t', 's'}), {2, [1, 7]})

%!assert (regexpi ("\n", '\n'), 1)
%!assert (regexpi ("\n", "\n"), 1)
*/

static octave_value
octregexprep (const octave_value_list& args, const std::string& who)
{
  int nargin = args.length ();

  // Make sure we have string, pattern, replacement
  const std::string buffer = args(0).string_value ();

  std::string pattern = args(1).string_value ();

  // Rewrite pattern for PCRE
  pattern = do_regexp_ptn_string_escapes (pattern, args(1).is_sq_string ());

  std::string replacement = args(2).string_value ();

  // Matlab compatibility.
  if (args(2).is_sq_string ())
    replacement = do_regexp_rep_string_escapes (replacement);

  // Pack options excluding 'tokenize' and various output
  // reordering strings into regexp arg list
  octave_value_list regexpargs (nargin-3, octave_value ());

  int len = 0;
  for (int i = 3; i < nargin; i++)
    {
      const std::string opt = args(i).string_value ();
      if (opt != "tokenize" && opt != "start" && opt != "end"
          && opt != "tokenextents" && opt != "match" && opt != "tokens"
          && opt != "names"  && opt != "split" && opt != "warnings")
        {
          regexpargs(len++) = args(i);
        }
    }
  regexpargs.resize (len);

  regexp::opts options;
  bool extra_args = false;
  parse_options (options, regexpargs, who, 0, extra_args);

  return regexp::replace (pattern, buffer, replacement, options, who);
}

DEFUN (regexprep, args, ,
       doc: /* -*- texinfo -*-
@deftypefn  {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr})
@deftypefnx {} {@var{outstr} =} regexprep (@var{string}, @var{pat}, @var{repstr}, "@var{opt1}", @dots{})
Replace occurrences of pattern @var{pat} in @var{string} with @var{repstr}.

The pattern is a regular expression as documented for @code{regexp}.
@xref{XREFregexp,,@code{regexp}}.

All strings must be UTF-8 encoded.

The replacement string may contain @code{$i}, which substitutes for the ith
set of parentheses in the match string.  For example,

@example
regexprep ("Bill Dunn", '(\w+) (\w+)', '$2, $1')
@end example

@noindent
returns @qcode{"Dunn, Bill"}

Options in addition to those of @code{regexp} are

@table @samp

@item once
Replace only the first occurrence of @var{pat} in the result.

@item warnings
This option is present for compatibility but is ignored.

@end table

Implementation Note: For compatibility with @sc{matlab}, escape sequences
in @var{pat} (e.g., @qcode{"@backslashchar{}n"} => newline) are expanded
even when @var{pat} has been defined with single quotes.  To disable
expansion use a second backslash before the escape sequence (e.g.,
"@backslashchar{}@backslashchar{}n") or use the @code{regexptranslate}
function.
@seealso{regexp, regexpi, strrep}
@end deftypefn */)
{
  if (args.length () < 3)
    print_usage ();

  octave_value_list retval;

  if (args(0).iscell () || args(1).iscell () || args(2).iscell ())
    {
      Cell str, pat, rep;
      dim_vector dv0;
      dim_vector dv1 (1, 1);

      if (args(0).iscell ())
        str = args(0).cell_value ();
      else
        str = Cell (args(0));

      if (args(1).iscell ())
        pat = args(1).cell_value ();
      else
        pat = Cell (args(1));

      if (args(2).iscell ())
        rep = args(2).cell_value ();
      else
        rep = Cell (args(2));

      dv0 = str.dims ();
      if (pat.numel () != 1)
        {
          dv1 = pat.dims ();
          if (rep.numel () != 1 && dv1 != rep.dims ())
            error ("regexprep: inconsistent cell array dimensions");
        }
      else if (rep.numel () != 1)
        dv1 = rep.dims ();

      Cell ret (dv0);
      octave_value_list new_args = args;

      for (octave_idx_type i = 0; i < dv0.numel (); i++)
        {
          new_args(0) = str(i);
          if (pat.numel () == 1)
            new_args(1) = pat(0);
          if (rep.numel () == 1)
            new_args(2) = rep(0);

          for (octave_idx_type j = 0; j < dv1.numel (); j++)
            {
              if (pat.numel () != 1)
                new_args(1) = pat(j);
              if (rep.numel () != 1)
                new_args(2) = rep(j);
              new_args(0) = octregexprep (new_args, "regexprep");
            }

          ret(i) = new_args(0);
        }

      retval = (args(0).iscell () ? ovl (ret) : ovl (ret(0)));
    }
  else
    retval = octregexprep (args, "regexprep");

  return retval;
}

/*
%!test  # Replace with empty
%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
%! t = regexprep (xml, '<[!?][^>]*>', '');
%! assert (t, ' <tag v="hello">some stuff</tag>');

%!test  # Replace with non-empty
%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
%! t = regexprep (xml, '<[!?][^>]*>', '?');
%! assert (t, '? <tag v="hello">some stuff?</tag>');

%!test  # Check that 'tokenize' is ignored
%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
%! t = regexprep (xml, '<[!?][^>]*>', '', 'tokenize');
%! assert (t, ' <tag v="hello">some stuff</tag>');

## Test capture replacement
%!test
%! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
%! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
%! t = regexprep (data, '(?m)^(\w+)\s+(\w+)$', '$2, $1');
%! assert (t, result);

## Return the original if no match
%!assert (regexprep ('hello', 'world', 'earth'), 'hello')

## Test emptymatch option
%!assert (regexprep ('World', '^', 'Hello '), 'World')
%!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World')

## Test a general replacement
%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")

## Make sure replacements work at the beginning and end of string
%!assert (regexprep ("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g")
%!assert (regexprep ("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_")

## Test options "once" and "ignorecase"
%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"),
%!        "a_b]c{d}e-f=g")
%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"),
%!        "a_b_c_d_e_f_g")

## Option combinations
%!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"),
%!        "a_b]c{d}e-f=g")

## End conditions on replacement
%!assert (regexprep ("abc", "(b)", ".$1"), "a.bc")
%!assert (regexprep ("abc", "(b)", "$1"), "abc")
%!assert (regexprep ("abc", "(b)", "$1."), "ab.c")
%!assert (regexprep ("abc", "(b)", "$1.."), "ab..c")

## Test cell array arguments
%!assert (regexprep ("abc", {"b","a"}, "?"), "??c")
%!assert (regexprep ({"abc","cba"}, "b", "?"), {"a?c","c?a"})
%!assert (regexprep ({"abc","cba"}, {"b","a"}, {"?","!"}), {"!?c","c?!"})

## Nasty lookbehind expression
%!test
%! warning ("off", "Octave:regexp-lookbehind-limit", "local");
%! assert (regexprep ('x^(-1)+y(-1)+z(-1)=0', '(?<=[a-z]+)\(\-[1-9]*\)',
%!         '_minus1'),'x^(-1)+y_minus1+z_minus1=0');

## Verify escape sequences in pattern
%!assert (regexprep ("\n", '\n', "X"), "X")
%!assert (regexprep ("\n", "\n", "X"), "X")

## Verify NULLs in pattern and replacement string
%!assert (regexprep ("A\0A", "\0", ","), "A,A")
%!assert (regexprep ("A\0A", '\0', ","), "A,A")
%!assert (regexprep ("A,A", "A", "B\0B"), "B\0B,B\0B")
%!assert (regexprep ("A,A", "A", 'B\0B'), "B\0B,B\0B")

## Empty matches were broken on ARM architecture
%!test <*52810>
%! assert (strcmp (regexprep ("\nabc", "^(\t*)(abc)$", "$1$2", "lineanchors"),
%!                 "\nabc"));
*/

OCTAVE_END_NAMESPACE(octave)