Mercurial > octave-nkf
view src/DLD-FUNCTIONS/regexp.cc @ 5582:6bf56668b01a
[project @ 2005-12-15 01:08:20 by jwe]
author | jwe |
---|---|
date | Thu, 15 Dec 2005 01:10:05 +0000 |
parents | |
children | e9112ff172b1 |
line wrap: on
line source
/* Copyright (C) 2005 David Bateman Copyright (C) 2002-2005 Paul Kienzle Octave is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ // XXX FIXME XXX // regexprep should be written as an m-file based on regexp #ifdef HAVE_CONFIG_H #include <config.h> #endif #include "defun-dld.h" #include "error.h" #include "gripes.h" #include "oct-obj.h" #include "utils.h" #include "Cell.h" #include "oct-map.h" #include "str-vec.h" #ifdef HAVE_PCRE #include <pcre.h> #else #ifdef HAVE_REGEX #ifdef __MINGW32__ #define __restrict #endif #include <regex.h> #endif #endif static octave_value_list octregexp (const octave_value_list &args, int nargout, const std::string &nm, bool case_insensitive) { octave_value_list retval; #if defined (HAVE_REGEX) || defined (HAVE_PCRE) int nargin = args.length(); int nopts = nargin - 2; bool once = false; if (nargin < 2) { print_usage(nm); return retval; } std::string buffer = args(0).string_value (); if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(0)); return retval; } std::string pattern = args(1).string_value (); if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(1)); return retval; } for (int i = 2; i < nargin; i++) { std::string str = args(i).string_value(); if (error_state) { error ("%s: optional arguments must be strings", nm.c_str()); break; } std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find("once", 0) == 0) { once = true; nopts--; } #ifdef HAVE_PCRE // XXX FIXME XXX named tokens still broken. Disable for now #if 0 else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0) && str.find("names", 0)) error ("%s: unrecognized option", nm.c_str()); #else else if (str.find("names", 0) == 0) error ("%s: named tokens not implemented in this version", nm.c_str()); else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0)) error ("%s: unrecognized option", nm.c_str()); #endif #else else if (str.find("names", 0) == 0) error ("%s: named tokens not implemented in this version", nm.c_str()); else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0)) error ("%s: unrecognized option", nm.c_str()); #endif } if (!error_state) { Octave_map nmap; Cell t, m, te; NDArray s, e; // named tokens "(?<name>...)" are only treated with PCRE not regex. #if HAVE_PCRE // The syntax of named tokens in pcre is "(?P<name>...)" while we need // a syntax "(?<name>...)", so fix that here. Also an expression like // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" should // be perfectly legal, while pcre does not allow the same named token // name of both sides of the alternative. Also fix that here by replacing // duplicate name tokens by dummy names, and dealing with the dummy names // later. // Compile expression pcre *re; const char *err; int erroffset; re = pcre_compile(pattern.c_str(), (case_insensitive ? PCRE_CASELESS : 0), &err, &erroffset, NULL); if (re == NULL) { error("%s: %s at position %d of expression", nm.c_str(), err, erroffset); return retval; } int subpatterns; int namecount; int nameentrysize; char *nametable; int idx = 0; int sz = 0; pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &subpatterns); pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &namecount); pcre_fullinfo(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); pcre_fullinfo(re, NULL, PCRE_INFO_NAMETABLE, &nametable); OCTAVE_LOCAL_BUFFER(int, ovector, (subpatterns+1)*3); OCTAVE_LOCAL_BUFFER(int, nidx, namecount); string_vector names (namecount); for (int i = 0; i < namecount; i++) { // Index of subpattern in first two bytes MSB first of name. // Extract name and index. nidx[i] = ((int)nametable[i*nameentrysize]) << 8 | (int)nametable[i*nameentrysize+1]; names(i) = std::string((&(nametable[i*nameentrysize+2]))); } Cell named_tokens(dim_vector(namecount,1)); while(true) { int matches = pcre_exec(re, NULL, buffer.c_str(), buffer.length(), idx, (idx ? PCRE_NOTBOL : 0), ovector, (subpatterns+1)*3); if (matches < 0 && matches != PCRE_ERROR_NOMATCH) { error ("%s: internal error calling pcre_exec", nm.c_str()); pcre_free(re); return retval; } else if (matches == PCRE_ERROR_NOMATCH) break; else { s.resize (dim_vector(1, sz+1)); s(sz) = double (ovector[0]+1); e.resize (dim_vector(1, sz+1)); e(sz) = double (ovector[1]); te.resize(dim_vector(1, sz+1)); Matrix mat_te(matches-1,2); for (int i = 1; i < matches; i++) { mat_te(i-1,0) = double (ovector[2*i]+1); mat_te(i-1,1) = double (ovector[2*i+1]); } te(sz) = mat_te; const char **listptr; int status = pcre_get_substring_list(buffer.c_str(), ovector, matches, &listptr); if (status == PCRE_ERROR_NOMEMORY) { error("%s: cannot allocate memory in pcre_get_substring_list", nm.c_str()); pcre_free(re); return retval; } m.resize (dim_vector(1, sz+1)); m(sz) = std::string(*listptr); t.resize (dim_vector(1, sz+1)); Cell cell_t (dim_vector(1,matches-1)); for (int i = 1; i < matches; i++) cell_t(i-1) = std::string(*(listptr+i)); t(sz) = cell_t; for (int i = 0; i < namecount; i++) { Cell tmp = named_tokens(i); tmp.resize(dim_vector(1,sz+1)); tmp(sz) = std::string(*(listptr+nidx[i])); named_tokens(i) = tmp; } pcre_free_substring_list(listptr); if (once) break; idx = ovector[1]; sz++; } } for (int i = 0; i < namecount; i++) nmap.assign (names(i), named_tokens(i)); pcre_free(re); #else regex_t compiled; int err=regcomp(&compiled, pattern.c_str(), REG_EXTENDED | (case_insensitive ? REG_ICASE : 0)); if (err) { int len = regerror(err, &compiled, NULL, 0); char *errmsg = (char *)malloc(len); if (errmsg) { regerror(err, &compiled, errmsg, len); error("%s: %s in pattern (%s)", nm.c_str(), errmsg, pattern.c_str()); free(errmsg); } else { error("out of memory"); } regfree(&compiled); return retval; } int subexpr = 1; int idx = 0; int sz = 0; for (unsigned int i=0; i < pattern.length(); i++) subexpr += ( pattern[i] == '(' ? 1 : 0 ); OCTAVE_LOCAL_BUFFER (regmatch_t, match, subexpr ); while(true) { if (regexec(&compiled, buffer.c_str() + idx, subexpr, match, (idx ? REG_NOTBOL : 0)) == 0) { // Count actual matches int matches = 0; while (matches < subexpr && match[matches].rm_so >= 0) matches++; s.resize (dim_vector(1, sz+1)); s(sz) = double (match[0].rm_so+1+idx); e.resize (dim_vector(1, sz+1)); e(sz) = double (match[0].rm_eo+idx); te.resize(dim_vector(1, sz+1)); Matrix mat_te(matches-1,2); for (int i = 1; i < matches; i++) { mat_te(i-1,0) = double (match[i].rm_so+1+idx); mat_te(i-1,1) = double (match[i].rm_eo+idx); } te(sz) = mat_te; m.resize (dim_vector(1, sz+1)); m(sz) = buffer.substr (match[0].rm_so+idx, match[0].rm_eo-match[0].rm_so); t.resize (dim_vector(1, sz+1)); Cell cell_t (dim_vector(1,matches-1)); for (int i = 1; i < matches; i++) cell_t(i-1) = buffer.substr (match[i].rm_so+idx, match[i].rm_eo-match[i].rm_so); t(sz) = cell_t; idx += match[0].rm_eo; sz++; if (once) break; } else break; } regfree(&compiled); #endif retval(5) = nmap; retval(4) = t; retval(3) = m; retval(2) = te; retval(1) = e; retval(0) = s; // Alter the order of the output arguments if (nopts > 0) { int n = 0; octave_value_list new_retval; new_retval.resize(nargout); OCTAVE_LOCAL_BUFFER (int, arg_used, 6); for (int i = 0; i < 6; i++) arg_used[i] = false; for (int i = 2; i < nargin; i++) { int k = 0; std::string str = args(i).string_value(); std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find("once", 0) == 0) continue; else if (str.find("start", 0) == 0) k = 0; else if (str.find("end", 0) == 0) k = 1; else if (str.find("tokenextents", 0) == 0) k = 2; else if (str.find("match", 0) == 0) k = 3; else if (str.find("tokens", 0) == 0) k = 4; else if (str.find("names", 0) == 0) k = 5; new_retval(n++) = retval(k); arg_used[k] = true; if (n == nargout) break; } // Fill in the rest of the arguments if (n < nargout) { for (int i = 0; i < 6; i++) { if (! arg_used[i]) new_retval(n++) = retval(i); } } retval = new_retval; } } #else error ("%s: not available in this version of Octave", nm); #endif return retval; } DEFUN_DLD (regexp, args, nargout, "-*- texinfo -*-\n\ @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexp (@var{str}, @var{pat})\n\ @deftypefnx {Loadable Function} {[@dots{}] =} regexp (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ \n\ Regular expression string matching. Matches @var{pat} in @var{str} and\n\ returns the position and matching substrings or empty values if there are\n\ none.\n\ \n\ The matched pattern @var{pat} can include any of the standard regex\n\ operators, including:\n\ \n\ @table @code\n\ @item .\n\ Match any character\n\ @item * + ? @{@}\n\ Repetition operators, representing\n\ @table @code\n\ @item *\n\ Match zero or more times\n\ @item +\n\ Match one or more times\n\ @item ?\n\ Match zero or one times\n\ @item @{@}\n\ Match range operator, which is of the form @code{@{@var{n}@}} to match exactly\n\ @var{n} times, @code{@{@var{m},@}} to match @var{m} or more times,\n\ @code{@{@var{m},@var{n}@}} to match between @var{m} and @var{n} times.\n\ @end table\n\ @item [@dots{}] [^@dots{}]\n\ List operators, where for example @code{[ab]c} matches @code{ac} and @code{bc}\n\ @item ()\n\ Grouping operator\n\ @item |\n\ Alternation operator. Match one of a choice of regular expressions. The\n\ alternatives must be delimited by the grouoing operator @code{()} above\n\ @item ^ $\n\ Anchoring operator. @code{^} matches the start of the string @var{str} and\n\ @code{$} the end\n\ @end table\n\ \n\ In addition the following escaped characters have special meaning. It should\n\ be noted that it is recommended to quote @var{pat} in single quotes rather\n\ than double quotes, to avoid the escape sequences being interpreted by octave\n\ before being passed to @code{regexp}.\n\ \n\ @table @code\n\ @item \\b\n\ Match a word boundary\n\ @item \\B\n\ Match within a word\n\ @item \\w\n\ Matches any word character\n\ @item \\W\n\ Matches any non word character\n\ @item \\<\n\ Matches the beginning of a word\n\ @item \\>\n\ Matches the end of a word\n\ @item \\s\n\ Matches any whitespace character\n\ @item \\S\n\ Matches any non whitespace character\n\ @item \\d\n\ Matches any digit\n\ @item \\D\n\ Matches any non-digit\n\ @end table\n\ \n\ The outputs of @code{regexp} by default are in the order as given below\n\ \n\ @table @asis\n\ @item @var{s}\n\ The start indices of each of the matching substrings\n\ \n\ @item @var{e}\n\ The end indices of each matching substring\n\ \n\ @item @var{te}\n\ The extents of each of the matched token surrounded by @code{(@dots{})} in\n\ @var{pat}.\n\ \n\ @item @var{m}\n\ A cell array of the text of each match.\n\ \n\ @item @var{t}\n\ A cell array of the text of each token matched.\n\ \n\ @item @var{nm}\n\ A structure containing the text of each matched named token, with the name\n\ being used as the fieldname. A named token is denoted as\n\ @code{(?<name>@dots{})}\n\ @end table\n\ \n\ Particular output arguments or the order of the output arguments can be\n\ selected by additional @var{opts} arguments. These are strings and the\n\ correspondence between the output arguments and the optional argument\n\ are\n\ \n\ @multitable @columnfractions 0.2 0.3 0.3 0.2\n\ @item @tab 'start' @tab @var{s} @tab\n\ @item @tab 'end' @tab @var{e} @tab\n\ @item @tab 'tokenExtents' @tab @var{te} @tab\n\ @item @tab 'match' @tab @var{m} @tab\n\ @item @tab 'tokens' @tab @var{t} @tab\n\ @item @tab 'names' @tab @var{nm} @tab\n\ @end multitable\n\ \n\ A further optional argument is 'once', that limits the number of returned\n\ matches to the first match.\n\ @end deftypefn") { return octregexp (args, nargout, "regexp", false); } /* ## seg-fault test %!assert(regexp("abcde","."),[1,2,3,4,5]) ## Check that anchoring of pattern works correctly %!assert(regexp('abcabc','^abc'),1); %!assert(regexp('abcabc','abc$'),4); %!assert(regexp('abcabc','^abc$'),[]); %!test %! [s, e, te, m, t] = regexp(' No Match ', 'f(.*)uck'); %! assert (s,[]) %! assert (e,[]) %! assert (te,{}) %! assert (m, {}) %! assert (t, {}) %!test %! [s, e, te, m, t] = regexp(' FiRetrUck ', 'f(.*)uck'); %! assert (s,[]) %! assert (e,[]) %! assert (te,{}) %! assert (m, {}) %! assert (t, {}) %!test %! [s, e, te, m, t] = regexp(' firetruck ', 'f(.*)uck'); %! assert (s,2) %! assert (e,10) %! assert (te{1},[3,7]) %! assert (m{1}, 'firetruck') %! assert (t{1}{1}, 'iretr') %!test %! [s, e, te, m, t] = regexp('short test string','\w*r\w*'); %! assert (s,[1,12]) %! assert (e,[5,17]) %! assert (size(te), [1,2]) %! assert (isempty(te{1})) %! assert (isempty(te{2})) %! assert (m{1},'short') %! assert (m{2},'string') %! assert (size(t), [1,2]) %! assert (isempty(t{1})) %! assert (isempty(t{2})) %!test %! [s, e, te, m, t] = regexp('short test string','\w*r\w*','once'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'short') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) %!test %! [m, te, e, s, t] = regexp('short test string','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'short') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) ## XXX FIXME XXX Disable test for now as PCRE version not written %!#test %! ## This test is expected to fail if PCRE is not installed %! [s, e, te, m, t, nm] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'short test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'short') %! assert (t{1}{2},'test') %! assert (size(nm), [1,1]) %! assert (isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1','word2'}) %! assert (nm.word1,'short') %! assert (nm.word2,'test') ## XXX FIXME XXX Disable test for now as PCRE version not written %!#test %! ## This test is expected to fail if PCRE is not installed %! [nm, m, te, e, s, t] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'short test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'short') %! assert (t{1}{2},'test') %! assert (size(nm), [1,1]) %! assert (isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1','word2'}) %! assert (nm.word1,'short') %! assert (nm.word2,'test') %!error regexp('string', 'tri', 'BadArg'); %!error regexp('string'); */ DEFUN_DLD(regexpi, args, nargout, "-*- texinfo -*-\n\ @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexpi (@var{str}, @var{pat})\n\ @deftypefnx {Loadable Function} {[@dots{}] =} regexpi (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ \n\ Case insensitive regular expression string matching. Matches @var{pat} in\n\ @var{str} and returns the position and matching substrings or empty values\n\ if there are none. See @code{regexp} for more details\n\ @end deftypefn") { return octregexp (args, nargout, "regexp", true); } /* ## seg-fault test %!assert(regexpi("abcde","."),[1,2,3,4,5]) ## Check that anchoring of pattern works correctly %!assert(regexpi('abcabc','^abc'),1); %!assert(regexpi('abcabc','abc$'),4); %!assert(regexpi('abcabc','^abc$'),[]); %!test %! [s, e, te, m, t] = regexpi(' No Match ', 'f(.*)uck'); %! assert (s,[]) %! assert (e,[]) %! assert (te,{}) %! assert (m, {}) %! assert (t, {}) %!test %! [s, e, te, m, t] = regexpi(' FiRetrUck ', 'f(.*)uck'); %! assert (s,2) %! assert (e,10) %! assert (te{1},[3,7]) %! assert (m{1}, 'FiRetrUck') %! assert (t{1}{1}, 'iRetr') %!test %! [s, e, te, m, t] = regexpi(' firetruck ', 'f(.*)uck'); %! assert (s,2) %! assert (e,10) %! assert (te{1},[3,7]) %! assert (m{1}, 'firetruck') %! assert (t{1}{1}, 'iretr') %!test %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*'); %! assert (s,[1,12]) %! assert (e,[5,17]) %! assert (size(te), [1,2]) %! assert (isempty(te{1})) %! assert (isempty(te{2})) %! assert (m{1},'ShoRt') %! assert (m{2},'String') %! assert (size(t), [1,2]) %! assert (isempty(t{1})) %! assert (isempty(t{2})) %!test %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*','once'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'ShoRt') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) %!test %! [m, te, e, s, t] = regexpi('ShoRt Test String','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'ShoRt') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) ## XXX FIXME XXX Disable test for now as PCRE version not written %!#test %! ## This test is expected to fail if PCRE is not installed %! [s, e, te, m, t, nm] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'ShoRt Test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'ShoRt') %! assert (t{1}{2},'Test') %! assert (size(nm), [1,1]) %! assert (isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1','word2'}) %! assert (nm.word1,'ShoRt') %! assert (nm.word2,'Test') ## XXX FIXME XXX Disable test for now as PCRE version not written %!#test %! ## This test is expected to fail if PCRE is not installed %! [nm, m, te, e, s, t] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'ShoRt Test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'ShoRt') %! assert (t{1}{2},'Test') %! assert (size(nm), [1,1]) %! assert (isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1','word2'}) %! assert (nm.word1,'ShoRt') %! assert (nm.word2,'Test') %!error regexpi('string', 'tri', 'BadArg'); %!error regexpi('string'); */ /* ;;; Local Variables: *** ;;; mode: C++ *** ;;; End: *** */