Mercurial > octave
annotate src/DLD-FUNCTIONS/regexp.cc @ 8093:dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
author | David Bateman <dbateman@free.fr> |
---|---|
date | Tue, 09 Sep 2008 12:36:53 -0400 |
parents | 85184151822e |
children | cdd05e46f6c9 |
rev | line source |
---|---|
5582 | 1 /* |
2 | |
7017 | 3 Copyright (C) 2005, 2006, 2007 David Bateman |
7016 | 4 Copyright (C) 2002, 2003, 2004, 2005 Paul Kienzle |
5 | |
6 This file is part of Octave. | |
5582 | 7 |
8 Octave is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
7016 | 10 Free Software Foundation; either version 3 of the License, or (at your |
11 option) any later version. | |
5582 | 12 |
13 Octave is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
7016 | 19 along with Octave; see the file COPYING. If not, see |
20 <http://www.gnu.org/licenses/>. | |
5582 | 21 |
22 */ | |
23 | |
24 #ifdef HAVE_CONFIG_H | |
25 #include <config.h> | |
26 #endif | |
27 | |
5773 | 28 #include <algorithm> |
5765 | 29 #include <sstream> |
30 | |
5582 | 31 #include "defun-dld.h" |
32 #include "error.h" | |
33 #include "gripes.h" | |
34 #include "oct-obj.h" | |
35 #include "utils.h" | |
36 | |
37 #include "Cell.h" | |
38 #include "oct-map.h" | |
39 #include "str-vec.h" | |
5785 | 40 #include "quit.h" |
41 #include "parse.h" | |
5582 | 42 |
7173 | 43 #if defined (HAVE_PCRE) |
5582 | 44 #include <pcre.h> |
7117 | 45 #elif defined (HAVE_REGEX) |
46 #if defined (__MINGW32__) | |
5582 | 47 #define __restrict |
48 #endif | |
7237 | 49 #if defined (HAVE_SYS_TYPES_H) |
50 #include <sys/types.h> | |
51 #endif | |
5582 | 52 #include <regex.h> |
53 #endif | |
54 | |
5785 | 55 // The regexp is constructed as a linked list to avoid resizing the |
56 // return values in arrays at each new match. | |
57 | |
58 // FIXME don't bother collecting and composing return values the user | |
59 // doesn't want. | |
60 | |
61 class regexp_elem | |
5582 | 62 { |
5785 | 63 public: |
5787 | 64 regexp_elem (const string_vector& _named_token, const Cell& _t, |
65 const std::string& _m, const Matrix& _te, double _s, | |
66 double _e) : | |
5785 | 67 named_token (_named_token), t (_t), m (_m), te (_te), s (_s), e (_e) { } |
68 | |
69 regexp_elem (const regexp_elem &a) : named_token (a.named_token), t (a.t), | |
70 m (a.m), te (a.te), s (a.s), e (a.e) | |
71 { } | |
72 | |
73 string_vector named_token; | |
74 Cell t; | |
75 std::string m; | |
76 Matrix te; | |
77 double s; | |
78 double e; | |
79 }; | |
80 | |
// Read-only iterator type for walking a std::list of regexp_elem records.
81 typedef std::list<regexp_elem>::const_iterator const_iterator; | |
82 | |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
// Upper limit on the repetition count tried when a '*' or '+' inside a
// lookbehind group is expanded into a set of fixed-count alternatives.
// The limit actually used is the smaller of this value and the length
// of the string being searched; the value is also quoted in the
// warning issued when such an expansion takes place.
83 #define MAXLOOKBEHIND 10 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
// Set to true the first time the lookbehind-length warning is issued,
// so that the warning is not repeated on later calls.
84 static bool lookbehind_warned = false; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
85 |
5785 | 86 static int |
87 octregexp_list (const octave_value_list &args, const std::string &nm, | |
88 bool case_insensitive, std::list<regexp_elem> &lst, | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
89 string_vector &named, int &nopts, bool &once) |
5785 | 90 { |
91 int sz = 0; | |
5582 | 92 #if defined (HAVE_REGEX) || defined (HAVE_PCRE) |
93 int nargin = args.length(); | |
5779 | 94 bool lineanchors = false; |
95 bool dotexceptnewline = false; | |
96 bool freespacing = false; | |
5582 | 97 |
5785 | 98 nopts = nargin - 2; |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
99 once = false; |
5785 | 100 |
5582 | 101 std::string buffer = args(0).string_value (); |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
102 size_t max_length = (buffer.length () > MAXLOOKBEHIND ? |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
103 MAXLOOKBEHIND: buffer.length ()); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
104 |
5582 | 105 if (error_state) |
106 { | |
107 gripe_wrong_type_arg (nm.c_str(), args(0)); | |
5785 | 108 return 0; |
5582 | 109 } |
110 | |
111 std::string pattern = args(1).string_value (); | |
112 if (error_state) | |
113 { | |
114 gripe_wrong_type_arg (nm.c_str(), args(1)); | |
5785 | 115 return 0; |
5582 | 116 } |
117 | |
118 for (int i = 2; i < nargin; i++) | |
119 { | |
120 std::string str = args(i).string_value(); | |
121 if (error_state) | |
122 { | |
123 error ("%s: optional arguments must be strings", nm.c_str()); | |
124 break; | |
125 } | |
126 std::transform (str.begin (), str.end (), str.begin (), tolower); | |
127 if (str.find("once", 0) == 0) | |
128 { | |
129 once = true; | |
130 nopts--; | |
131 } | |
5779 | 132 else if (str.find("matchcase", 0) == 0) |
133 { | |
134 case_insensitive = false; | |
135 nopts--; | |
136 } | |
137 else if (str.find("ignorecase", 0) == 0) | |
138 { | |
139 case_insensitive = true; | |
140 nopts--; | |
141 } | |
5785 | 142 else if (str.find("dotall", 0) == 0) |
5779 | 143 { |
5785 | 144 dotexceptnewline = false; |
145 nopts--; | |
146 } | |
147 else if (str.find("stringanchors", 0) == 0) | |
148 { | |
149 lineanchors = false; | |
5779 | 150 nopts--; |
151 } | |
152 else if (str.find("literalspacing", 0) == 0) | |
153 { | |
154 freespacing = false; | |
155 nopts--; | |
156 } | |
5785 | 157 #if HAVE_PCRE |
158 // Only accept these options with pcre | |
159 else if (str.find("dotexceptnewline", 0) == 0) | |
160 { | |
161 dotexceptnewline = true; | |
162 nopts--; | |
163 } | |
164 else if (str.find("lineanchors", 0) == 0) | |
165 { | |
166 lineanchors = true; | |
167 nopts--; | |
168 } | |
169 else if (str.find("freespacing", 0) == 0) | |
170 { | |
171 freespacing = true; | |
172 nopts--; | |
173 } | |
5582 | 174 else if (str.find("start", 0) && str.find("end", 0) && |
175 str.find("tokenextents", 0) && str.find("match", 0) && | |
176 str.find("tokens", 0) && str.find("names", 0)) | |
177 error ("%s: unrecognized option", nm.c_str()); | |
178 #else | |
5779 | 179 else if (str.find("names", 0) == 0 || |
180 str.find("dotexceptnewline", 0) == 0 || | |
181 str.find("lineanchors", 0) == 0 || | |
182 str.find("freespacing", 0) == 0) | |
5785 | 183 error ("%s: %s not implemented in this version", str.c_str(), nm.c_str()); |
5582 | 184 else if (str.find("start", 0) && str.find("end", 0) && |
185 str.find("tokenextents", 0) && str.find("match", 0) && | |
186 str.find("tokens", 0)) | |
187 error ("%s: unrecognized option", nm.c_str()); | |
188 #endif | |
189 } | |
190 | |
191 if (!error_state) | |
192 { | |
5785 | 193 Cell t; |
194 std::string m; | |
195 double s, e; | |
5582 | 196 |
197 // named tokens "(?<name>...)" are only treated with PCRE not regex. | |
198 #if HAVE_PCRE | |
199 | |
5619 | 200 size_t pos = 0; |
201 size_t new_pos; | |
202 int nnames = 0; | |
203 int inames = 0; | |
5765 | 204 std::ostringstream buf; |
5619 | 205 Array<int> named_idx; |
5582 | 206 |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
207 while ((new_pos = pattern.find ("(?",pos)) != std::string::npos) |
5619 | 208 { |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
209 if (pattern.at (new_pos + 2) == '<' && |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
210 !(pattern.at (new_pos + 3) == '=' || |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
211 pattern.at (new_pos + 3) == '!')) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
212 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
213 // The syntax of named tokens in pcre is "(?P<name>...)" while |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
214 // we need a syntax "(?<name>...)", so fix that here. Also an |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
215 // expression like |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
216 // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
217 // should be perfectly legal, while pcre does not allow the same |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
218 // named token name on both sides of the alternative. Also fix |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
219 // that here by replacing name tokens by dummy names, and dealing |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
220 // with the dummy names later. |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
221 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
222 size_t tmp_pos = pattern.find_first_of ('>',new_pos); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
223 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
224 if (tmp_pos == std::string::npos) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
225 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
226 error ("syntax error in pattern"); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
227 break; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
228 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
229 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
230 std::string tmp_name = |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
231 pattern.substr(new_pos+3,tmp_pos-new_pos-3); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
232 bool found = false; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
233 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
234 for (int i = 0; i < nnames; i++) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
235 if (named(i) == tmp_name) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
236 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
237 named_idx.resize(inames+1); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
238 named_idx(inames) = i; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
239 found = true; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
240 break; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
241 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
242 if (! found) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
243 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
244 named_idx.resize(inames+1); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
245 named_idx(inames) = nnames; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
246 named.append(tmp_name); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
247 nnames++; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
248 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
249 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
250 if (new_pos - pos > 0) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
251 buf << pattern.substr(pos,new_pos-pos); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
252 if (inames < 10) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
253 buf << "(?P<n00" << inames++; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
254 else if (inames < 100) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
255 buf << "(?P<n0" << inames++; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
256 else |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
257 buf << "(?P<n" << inames++; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
258 pos = tmp_pos; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
259 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
260 else if (pattern.at (new_pos + 2) == '<') |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
261 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
262 // Find lookbehind operators of arbitrary length (ie like |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
263 // "(?<=[a-z]*)") and replace with a maximum length operator |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
264 // as PCRE can not yet handle arbitrary length lookbehind |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
265 // operators. Use the string length as the maximum length to |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
266 // avoid issues. |
5582 | 267 |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
268 int brackets = 1; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
269 size_t tmp_pos1 = new_pos + 2; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
270 size_t tmp_pos2 = tmp_pos1; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
271 while (tmp_pos1 <= pattern.length () && brackets > 0) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
272 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
273 char ch = pattern.at (tmp_pos1); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
274 if (ch == '(') |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
275 brackets++; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
276 else if (ch == ')') |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
277 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
278 if (brackets > 1) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
279 tmp_pos2 = tmp_pos1; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
280 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
281 brackets--; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
282 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
283 tmp_pos1++; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
284 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
285 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
286 if (brackets != 0) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
287 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
288 buf << pattern.substr (pos, new_pos - pos) << "(?"; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
289 pos = new_pos + 2; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
290 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
291 else |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
292 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
293 size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
294 if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
295 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
296 if (!lookbehind_warned) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
297 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
298 lookbehind_warned = true; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
299 warning ("%s: arbitrary length lookbehind patterns are only support up to length %d", nm.c_str(), MAXLOOKBEHIND); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
300 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
301 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
302 buf << pattern.substr (pos, new_pos - pos) << "("; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
303 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
304 size_t i; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
305 if (pattern.at (tmp_pos3) == '*') |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
306 i = 0; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
307 else |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
308 i = 1; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
309 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
310 for (; i < max_length + 1; i++) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
311 { |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
312 buf <<pattern.substr(new_pos, tmp_pos3 - new_pos) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
313 << "{" << i << "}"; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
314 buf << pattern.substr(tmp_pos3 + 1, |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
315 tmp_pos1 - tmp_pos3 - 1); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
316 if (i != max_length) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
317 buf << "|"; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
318 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
319 buf << ")"; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
320 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
321 else |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
322 buf << pattern.substr (pos, tmp_pos1 - pos); |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
323 pos = tmp_pos1; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
324 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
325 } |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
326 else |
5619 | 327 { |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
328 buf << pattern.substr (pos, new_pos - pos) << "(?"; |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
329 pos = new_pos + 2; |
5619 | 330 } |
331 | |
332 } | |
333 | |
5765 | 334 buf << pattern.substr(pos); |
5619 | 335 |
336 if (error_state) | |
5785 | 337 return 0; |
5582 | 338 |
339 // Compile expression | |
340 pcre *re; | |
341 const char *err; | |
342 int erroffset; | |
5765 | 343 std::string buf_str = buf.str (); |
344 re = pcre_compile (buf_str.c_str (), | |
5779 | 345 (case_insensitive ? PCRE_CASELESS : 0) | |
346 (dotexceptnewline ? 0 : PCRE_DOTALL) | | |
347 (lineanchors ? PCRE_MULTILINE : 0) | | |
348 (freespacing ? PCRE_EXTENDED : 0), | |
7520 | 349 &err, &erroffset, 0); |
5582 | 350 |
7520 | 351 if (re == 0) { |
5582 | 352 error("%s: %s at position %d of expression", nm.c_str(), |
353 err, erroffset); | |
5785 | 354 return 0; |
5582 | 355 } |
356 | |
357 int subpatterns; | |
358 int namecount; | |
359 int nameentrysize; | |
360 char *nametable; | |
361 int idx = 0; | |
362 | |
7520 | 363 pcre_fullinfo(re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); |
364 pcre_fullinfo(re, 0, PCRE_INFO_NAMECOUNT, &namecount); | |
365 pcre_fullinfo(re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); | |
366 pcre_fullinfo(re, 0, PCRE_INFO_NAMETABLE, &nametable); | |
5582 | 367 |
368 OCTAVE_LOCAL_BUFFER(int, ovector, (subpatterns+1)*3); | |
369 OCTAVE_LOCAL_BUFFER(int, nidx, namecount); | |
370 | |
371 for (int i = 0; i < namecount; i++) | |
372 { | |
373 // Index of subpattern in first two bytes MSB first of name. | |
5619 | 374 // Extract index. |
5779 | 375 nidx[i] = (static_cast<int>(nametable[i*nameentrysize])) << 8 | |
376 static_cast<int>(nametable[i*nameentrysize+1]); | |
5582 | 377 } |
378 | |
379 while(true) | |
380 { | |
5785 | 381 OCTAVE_QUIT; |
382 | |
7520 | 383 int matches = pcre_exec(re, 0, buffer.c_str(), |
5582 | 384 buffer.length(), idx, |
385 (idx ? PCRE_NOTBOL : 0), | |
386 ovector, (subpatterns+1)*3); | |
387 | |
388 if (matches < 0 && matches != PCRE_ERROR_NOMATCH) | |
389 { | |
390 error ("%s: internal error calling pcre_exec", nm.c_str()); | |
391 pcre_free(re); | |
5785 | 392 return 0; |
5582 | 393 } |
394 else if (matches == PCRE_ERROR_NOMATCH) | |
395 break; | |
5779 | 396 else if (ovector[1] <= ovector[0]) |
397 break; | |
5582 | 398 else |
399 { | |
5619 | 400 int pos_match = 0; |
5785 | 401 Matrix te(matches-1,2); |
5582 | 402 for (int i = 1; i < matches; i++) |
403 { | |
5619 | 404 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) |
405 { | |
5785 | 406 te(pos_match,0) = double (ovector[2*i]+1); |
407 te(pos_match++,1) = double (ovector[2*i+1]); | |
5619 | 408 } |
5582 | 409 } |
5785 | 410 te.resize(pos_match,2); |
411 s = double (ovector[0]+1); | |
412 e = double (ovector[1]); | |
5582 | 413 |
414 const char **listptr; | |
415 int status = pcre_get_substring_list(buffer.c_str(), ovector, | |
416 matches, &listptr); | |
417 | |
418 if (status == PCRE_ERROR_NOMEMORY) { | |
419 error("%s: cannot allocate memory in pcre_get_substring_list", | |
420 nm.c_str()); | |
421 pcre_free(re); | |
5785 | 422 return 0; |
5582 | 423 } |
424 | |
5619 | 425 Cell cell_t (dim_vector(1,pos_match)); |
426 pos_match = 0; | |
5582 | 427 for (int i = 1; i < matches; i++) |
5619 | 428 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) |
429 cell_t(pos_match++) = std::string(*(listptr+i)); | |
5582 | 430 |
5785 | 431 m = std::string(*listptr); |
432 t = cell_t; | |
433 | |
434 string_vector named_tokens(nnames); | |
5619 | 435 if (namecount > 0) |
436 for (int i = 1; i < matches; i++) | |
437 { | |
438 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) | |
439 { | |
5785 | 440 named_tokens(named_idx(i-1)) = |
441 std::string(*(listptr+nidx[i-1])); | |
5619 | 442 } |
443 } | |
5582 | 444 |
445 pcre_free_substring_list(listptr); | |
446 | |
5785 | 447 regexp_elem new_elem (named_tokens, t, m, te, s, e); |
448 lst.push_back (new_elem); | |
449 idx = ovector[1]; | |
450 sz++; | |
451 | |
5582 | 452 if (once) |
453 break; | |
454 | |
455 } | |
456 } | |
457 | |
458 pcre_free(re); | |
459 #else | |
460 regex_t compiled; | |
461 int err=regcomp(&compiled, pattern.c_str(), REG_EXTENDED | | |
462 (case_insensitive ? REG_ICASE : 0)); | |
463 if (err) | |
464 { | |
7520 | 465 int len = regerror(err, &compiled, 0, 0); |
5760 | 466 OCTAVE_LOCAL_BUFFER (char, errmsg, len); |
467 regerror(err, &compiled, errmsg, len); | |
468 error("%s: %s in pattern (%s)", nm.c_str(), errmsg, | |
469 pattern.c_str()); | |
5582 | 470 regfree(&compiled); |
5785 | 471 return 0; |
5582 | 472 } |
473 | |
474 int subexpr = 1; | |
475 int idx = 0; | |
476 for (unsigned int i=0; i < pattern.length(); i++) | |
477 subexpr += ( pattern[i] == '(' ? 1 : 0 ); | |
478 OCTAVE_LOCAL_BUFFER (regmatch_t, match, subexpr ); | |
479 | |
480 while(true) | |
481 { | |
5785 | 482 OCTAVE_QUIT; |
483 | |
5582 | 484 if (regexec(&compiled, buffer.c_str() + idx, subexpr, |
485 match, (idx ? REG_NOTBOL : 0)) == 0) | |
486 { | |
487 // Count actual matches | |
488 int matches = 0; | |
489 while (matches < subexpr && match[matches].rm_so >= 0) | |
490 matches++; | |
491 | |
5785 | 492 s = double (match[0].rm_so+1+idx); |
493 e = double (match[0].rm_eo+idx); | |
494 Matrix te(matches-1,2); | |
5582 | 495 for (int i = 1; i < matches; i++) |
496 { | |
5785 | 497 te(i-1,0) = double (match[i].rm_so+1+idx); |
498 te(i-1,1) = double (match[i].rm_eo+idx); | |
5582 | 499 } |
500 | |
5785 | 501 m = buffer.substr (match[0].rm_so+idx, |
5582 | 502 match[0].rm_eo-match[0].rm_so); |
503 | |
504 Cell cell_t (dim_vector(1,matches-1)); | |
505 for (int i = 1; i < matches; i++) | |
506 cell_t(i-1) = buffer.substr (match[i].rm_so+idx, | |
507 match[i].rm_eo-match[i].rm_so); | |
5785 | 508 t = cell_t; |
5582 | 509 |
510 idx += match[0].rm_eo; | |
5785 | 511 |
5866 | 512 string_vector sv; |
513 regexp_elem new_elem (sv, t, m, te, s, e); | |
5785 | 514 lst.push_back (new_elem); |
5582 | 515 sz++; |
516 | |
517 if (once) | |
518 break; | |
519 } | |
520 else | |
521 break; | |
522 } | |
523 regfree(&compiled); | |
524 #endif | |
5785 | 525 } |
526 #else | |
527 error ("%s: not available in this version of Octave", nm.c_str()); | |
528 #endif | |
529 return sz; | |
530 } | |
5582 | 531 |
5785 | 532 static octave_value_list |
533 octregexp (const octave_value_list &args, int nargout, const std::string &nm, | |
534 bool case_insensitive) | |
535 { | |
536 octave_value_list retval; | |
537 int nargin = args.length(); | |
538 std::list<regexp_elem> lst; | |
539 string_vector named; | |
540 int nopts; | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
541 bool once; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
542 int sz = octregexp_list (args, nm, case_insensitive, lst, named, nopts, once); |
5785 | 543 |
544 if (! error_state) | |
545 { | |
546 // Converted the linked list in the correct form for the return values | |
547 | |
548 octave_idx_type i = 0; | |
549 #ifdef HAVE_PCRE | |
550 Octave_map nmap; | |
551 if (sz == 1) | |
552 { | |
553 for (int j = 0; j < named.length(); j++) | |
554 nmap.assign (named(j), lst.begin()->named_token(j)); | |
555 retval(5) = nmap; | |
556 } | |
557 else | |
558 { | |
559 for (int j = 0; j < named.length (); j++) | |
560 { | |
561 i = 0; | |
562 Cell tmp(dim_vector (1, sz)); | |
563 for (const_iterator p = lst.begin(); p != lst.end(); p++) | |
564 tmp(i++) = p->named_token(j); | |
565 nmap.assign (named(j), octave_value (tmp)); | |
566 } | |
567 retval(5) = nmap; | |
568 } | |
569 #else | |
570 retval(5) = Octave_map(); | |
571 #endif | |
572 | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
573 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
574 retval(4) = sz ? lst.front ().t : Cell(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
575 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
576 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
577 Cell t (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
578 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
579 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
580 t(i++) = p->t; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
581 retval(4) = t; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
582 } |
5785 | 583 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
584 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
585 retval(3) = sz ? lst.front ().m : std::string(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
586 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
587 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
588 Cell m (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
589 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
590 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
591 m(i++) = p->m; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
592 retval(3) = m; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
593 } |
5785 | 594 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
595 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
596 retval(2) = sz ? lst.front ().te : Matrix(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
597 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
598 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
599 Cell te (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
600 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
601 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
602 te(i++) = p->te; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
603 retval(2) = te; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
604 } |
5785 | 605 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
606 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
607 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
608 if (sz) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
609 retval(1) = lst.front ().e; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
610 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
611 retval(1) = Matrix(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
612 } |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
613 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
614 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
615 NDArray e (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
616 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
617 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
618 e(i++) = p->e; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
619 retval(1) = e; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
620 } |
5785 | 621 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
622 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
623 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
624 if (sz) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
625 retval(0) = lst.front ().s; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
626 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
627 retval(0) = Matrix(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
628 } |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
629 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
630 { |
5785 | 631 NDArray s (dim_vector(1, sz)); |
632 i = 0; | |
633 for (const_iterator p = lst.begin(); p != lst.end(); p++) | |
634 s(i++) = p->s; | |
5582 | 635 retval(0) = s; |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
636 } |
5582 | 637 |
638 // Alter the order of the output arguments | |
639 if (nopts > 0) | |
640 { | |
641 int n = 0; | |
642 octave_value_list new_retval; | |
643 new_retval.resize(nargout); | |
644 | |
645 OCTAVE_LOCAL_BUFFER (int, arg_used, 6); | |
5785 | 646 for (int j = 0; j < 6; j++) |
647 arg_used[j] = false; | |
5582 | 648 |
5785 | 649 for (int j = 2; j < nargin; j++) |
5582 | 650 { |
651 int k = 0; | |
5785 | 652 std::string str = args(j).string_value(); |
5582 | 653 std::transform (str.begin (), str.end (), str.begin (), tolower); |
5779 | 654 if (str.find("once", 0) == 0 |
655 || str.find("stringanchors", 0) == 0 | |
656 || str.find("lineanchors", 0) == 0 | |
657 || str.find("matchcase", 0) == 0 | |
658 || str.find("ignorecase", 0) == 0 | |
659 || str.find("dotall", 0) == 0 | |
660 || str.find("dotexceptnewline", 0) == 0 | |
661 || str.find("literalspacing", 0) == 0 | |
662 || str.find("freespacing", 0) == 0 | |
663 ) | |
5582 | 664 continue; |
665 else if (str.find("start", 0) == 0) | |
666 k = 0; | |
667 else if (str.find("end", 0) == 0) | |
668 k = 1; | |
669 else if (str.find("tokenextents", 0) == 0) | |
670 k = 2; | |
671 else if (str.find("match", 0) == 0) | |
672 k = 3; | |
673 else if (str.find("tokens", 0) == 0) | |
674 k = 4; | |
675 else if (str.find("names", 0) == 0) | |
676 k = 5; | |
677 | |
678 new_retval(n++) = retval(k); | |
679 arg_used[k] = true; | |
680 | |
681 if (n == nargout) | |
682 break; | |
683 } | |
684 | |
685 // Fill in the rest of the arguments | |
686 if (n < nargout) | |
687 { | |
5785 | 688 for (int j = 0; j < 6; j++) |
5582 | 689 { |
5785 | 690 if (! arg_used[j]) |
691 new_retval(n++) = retval(j); | |
5582 | 692 } |
693 } | |
694 | |
695 retval = new_retval; | |
696 } | |
697 } | |
698 | |
699 return retval; | |
700 } | |
701 | |
6361 | 702 static octave_value_list |
703 octcellregexp (const octave_value_list &args, int nargout, const std::string &nm, | |
704 bool case_insensitive) | |
705 { | |
706 octave_value_list retval; | |
707 | |
708 if (args(0).is_cell()) | |
709 { | |
710 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); | |
711 octave_value_list new_args = args; | |
712 Cell cellstr = args(0).cell_value(); | |
713 if (args(1).is_cell()) | |
714 { | |
715 Cell cellpat = args(1).cell_value(); | |
716 | |
717 if (cellpat.numel() == 1) | |
718 { | |
719 for (int j = 0; j < nargout; j++) | |
720 newretval[j].resize(cellstr.dims()); | |
721 | |
722 new_args(1) = cellpat(0); | |
723 | |
724 for (octave_idx_type i = 0; i < cellstr.numel (); i++) | |
725 { | |
726 new_args(0) = cellstr(i); | |
727 octave_value_list tmp = octregexp (new_args, nargout, nm, | |
728 case_insensitive); | |
729 | |
730 if (error_state) | |
731 break; | |
732 | |
733 for (int j = 0; j < nargout; j++) | |
734 newretval[j](i) = tmp(j); | |
735 } | |
736 } | |
737 else if (cellstr.numel() == 1) | |
738 { | |
739 for (int j = 0; j < nargout; j++) | |
740 newretval[j].resize(cellpat.dims()); | |
741 | |
742 new_args(0) = cellstr(0); | |
743 | |
744 for (octave_idx_type i = 0; i < cellpat.numel (); i++) | |
745 { | |
746 new_args(1) = cellpat(i); | |
747 octave_value_list tmp = octregexp (new_args, nargout, nm, | |
748 case_insensitive); | |
749 | |
750 if (error_state) | |
751 break; | |
752 | |
753 for (int j = 0; j < nargout; j++) | |
754 newretval[j](i) = tmp(j); | |
755 } | |
756 } | |
757 else if (cellstr.numel() == cellpat.numel()) | |
758 { | |
759 | |
760 if (cellstr.dims() != cellpat.dims()) | |
761 error ("%s: Inconsistent cell array dimensions", nm.c_str()); | |
762 else | |
763 { | |
764 for (int j = 0; j < nargout; j++) | |
765 newretval[j].resize(cellstr.dims()); | |
766 | |
767 for (octave_idx_type i = 0; i < cellstr.numel (); i++) | |
768 { | |
769 new_args(0) = cellstr(i); | |
770 new_args(1) = cellpat(i); | |
771 | |
772 octave_value_list tmp = octregexp (new_args, nargout, nm, | |
773 case_insensitive); | |
774 | |
775 if (error_state) | |
776 break; | |
777 | |
778 for (int j = 0; j < nargout; j++) | |
779 newretval[j](i) = tmp(j); | |
780 } | |
781 } | |
782 } | |
783 else | |
784 error ("regexp: cell array arguments must be scalar or equal size"); | |
785 } | |
786 else | |
787 { | |
788 for (int j = 0; j < nargout; j++) | |
789 newretval[j].resize(cellstr.dims()); | |
790 | |
791 for (octave_idx_type i = 0; i < cellstr.numel (); i++) | |
792 { | |
793 new_args(0) = cellstr(i); | |
794 octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); | |
795 | |
796 if (error_state) | |
797 break; | |
798 | |
799 for (int j = 0; j < nargout; j++) | |
800 newretval[j](i) = tmp(j); | |
801 } | |
802 } | |
803 | |
804 if (!error_state) | |
805 for (int j = 0; j < nargout; j++) | |
806 retval(j) = octave_value (newretval[j]); | |
807 } | |
808 else if (args(1).is_cell()) | |
809 { | |
810 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); | |
811 octave_value_list new_args = args; | |
812 Cell cellpat = args(1).cell_value(); | |
813 | |
814 for (int j = 0; j < nargout; j++) | |
815 newretval[j].resize(cellpat.dims()); | |
816 | |
817 for (octave_idx_type i = 0; i < cellpat.numel (); i++) | |
818 { | |
819 new_args(1) = cellpat(i); | |
820 octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); | |
821 | |
822 if (error_state) | |
823 break; | |
824 | |
825 for (int j = 0; j < nargout; j++) | |
826 newretval[j](i) = tmp(j); | |
827 } | |
828 | |
829 if (!error_state) | |
830 for (int j = 0; j < nargout; j++) | |
831 retval(j) = octave_value (newretval[j]); | |
832 } | |
833 else | |
834 retval = octregexp (args, nargout, nm, case_insensitive); | |
835 | |
836 return retval; | |
837 | |
838 } | |
839 | |
5582 | 840 DEFUN_DLD (regexp, args, nargout, |
841 "-*- texinfo -*-\n\ | |
842 @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexp (@var{str}, @var{pat})\n\ | |
843 @deftypefnx {Loadable Function} {[@dots{}] =} regexp (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ | |
844 \n\ | |
845 Regular expression string matching. Matches @var{pat} in @var{str} and\n\ | |
846 returns the position and matching substrings or empty values if there are\n\ | |
847 none.\n\ | |
848 \n\ | |
849 The matched pattern @var{pat} can include any of the standard regex\n\ | |
850 operators, including:\n\ | |
851 \n\ | |
852 @table @code\n\ | |
853 @item .\n\ | |
854 Match any character\n\ | |
855 @item * + ? @{@}\n\ | |
856 Repetition operators, representing\n\ | |
857 @table @code\n\ | |
858 @item *\n\ | |
859 Match zero or more times\n\ | |
860 @item +\n\ | |
861 Match one or more times\n\ | |
862 @item ?\n\ | |
863 Match zero or one times\n\ | |
864 @item @{@}\n\ | |
865 Match range operator, which is of the form @code{@{@var{n}@}} to match exactly\n\ | |
866 @var{n} times, @code{@{@var{m},@}} to match @var{m} or more times,\n\ | |
867 @code{@{@var{m},@var{n}@}} to match between @var{m} and @var{n} times.\n\ | |
868 @end table\n\ | |
869 @item [@dots{}] [^@dots{}]\n\ | |
870 List operators, where for example @code{[ab]c} matches @code{ac} and @code{bc}\n\ | |
871 @item ()\n\ | |
872 Grouping operator\n\ | |
873 @item |\n\ | |
874 Alternation operator. Match one of a choice of regular expressions. The\n\ | |
7001 | 875 alternatives must be delimited by the grouping operator @code{()} above\n\ |
5582 | 876 @item ^ $\n\ |
877 Anchoring operator. @code{^} matches the start of the string @var{str} and\n\ | |
878 @code{$} the end\n\ | |
879 @end table\n\ | |
880 \n\ | |
881 In addition the following escaped characters have special meaning. It should\n\ | |
882 be noted that it is recommended to quote @var{pat} in single quotes rather\n\ | |
883 than double quotes, to avoid the escape sequences being interpreted by octave\n\ | |
884 before being passed to @code{regexp}.\n\ | |
885 \n\ | |
886 @table @code\n\ | |
887 @item \\b\n\ | |
888 Match a word boundary\n\ | |
889 @item \\B\n\ | |
890 Match within a word\n\ | |
891 @item \\w\n\ | |
892 Matches any word character\n\ | |
893 @item \\W\n\ | |
894 Matches any non word character\n\ | |
895 @item \\<\n\ | |
896 Matches the beginning of a word\n\ | |
897 @item \\>\n\ | |
898 Matches the end of a word\n\ | |
899 @item \\s\n\ | |
900 Matches any whitespace character\n\ | |
901 @item \\S\n\ | |
902 Matches any non whitespace character\n\ | |
903 @item \\d\n\ | |
904 Matches any digit\n\ | |
905 @item \\D\n\ | |
906 Matches any non-digit\n\ | |
907 @end table\n\ | |
908 \n\ | |
909 The outputs of @code{regexp} by default are in the order as given below\n\ | |
910 \n\ | |
911 @table @asis\n\ | |
912 @item @var{s}\n\ | |
913 The start indices of each of the matching substrings\n\ | |
914 \n\ | |
915 @item @var{e}\n\ | |
916 The end indices of each matching substring\n\ | |
917 \n\ | |
918 @item @var{te}\n\ | |
919 The extents of each of the matched token surrounded by @code{(@dots{})} in\n\ | |
920 @var{pat}.\n\ | |
921 \n\ | |
922 @item @var{m}\n\ | |
923 A cell array of the text of each match.\n\ | |
924 \n\ | |
925 @item @var{t}\n\ | |
926 A cell array of the text of each token matched.\n\ | |
927 \n\ | |
928 @item @var{nm}\n\ | |
929 A structure containing the text of each matched named token, with the name\n\ | |
930 being used as the fieldname. A named token is denoted as\n\ | |
931 @code{(?<name>@dots{})}\n\ | |
932 @end table\n\ | |
933 \n\ | |
934 Particular output arguments or the order of the output arguments can be\n\ | |
935 selected by additional @var{opts} arguments. These are strings and the\n\ | |
936 correspondence between the output arguments and the optional argument\n\ | |
937 are\n\ | |
938 \n\ | |
939 @multitable @columnfractions 0.2 0.3 0.3 0.2\n\ | |
940 @item @tab 'start' @tab @var{s} @tab\n\ | |
941 @item @tab 'end' @tab @var{e} @tab\n\ | |
942 @item @tab 'tokenExtents' @tab @var{te} @tab\n\ | |
943 @item @tab 'match' @tab @var{m} @tab\n\ | |
944 @item @tab 'tokens' @tab @var{t} @tab\n\ | |
945 @item @tab 'names' @tab @var{nm} @tab\n\ | |
946 @end multitable\n\ | |
947 \n\ | |
948 A further optional argument is 'once', that limits the number of returned\n\ | |
5779 | 949 matches to the first match. Additional arguments are\n\ |
950 \n\ | |
951 @table @asis\n\ | |
952 @item matchcase\n\ | |
953 Make the matching case sensitive.\n\ | |
954 @item ignorecase\n\ | |
955 Make the matching case insensitive.\n\ | |
956 @item stringanchors\n\ | |
957 Match the anchor characters at the beginning and end of the string.\n\ | |
958 @item lineanchors\n\ | |
959 Match the anchor characters at the beginning and end of the line.\n\ | |
960 @item dotall\n\ | |
961 The character @code{.} matches the newline character.\n\ | |
962 @item dotexceptnewline\n\ | |
963 The character @code{.} matches all but the newline character.\n\ | |
964 @item freespacing\n\ | |
965 The pattern can include arbitrary whitespace and comments starting with\n\ | |
966 @code{#}.\n\ | |
967 @item literalspacing\n\ | |
968 The pattern is taken literally.\n\ | |
969 @end table\n\ | |
5582 | 970 @end deftypefn") |
971 { | |
6361 | 972 octave_value_list retval; |
973 int nargin = args.length(); | |
974 | |
975 if (nargin < 2) | |
976 print_usage (); | |
977 else if (args(0).is_cell() || args(1).is_cell()) | |
978 retval = octcellregexp (args, nargout, "regexp", false); | |
979 else | |
980 retval = octregexp (args, nargout, "regexp", false); | |
981 | |
982 return retval; | |
5582 | 983 } |
984 | |
985 /* | |
986 | |
987 ## seg-fault test | |
988 %!assert(regexp("abcde","."),[1,2,3,4,5]) | |
989 | |
990 ## Check that anchoring of pattern works correctly | |
991 %!assert(regexp('abcabc','^abc'),1); | |
992 %!assert(regexp('abcabc','abc$'),4); | |
5785 | 993 %!assert(regexp('abcabc','^abc$'),zeros(1,0)); |
5582 | 994 |
995 %!test | |
996 %! [s, e, te, m, t] = regexp(' No Match ', 'f(.*)uck'); | |
5785 | 997 %! assert (s,zeros(1,0)) |
998 %! assert (e,zeros(1,0)) | |
999 %! assert (te,cell(1,0)) | |
1000 %! assert (m, cell(1,0)) | |
1001 %! assert (t, cell(1,0)) | |
5582 | 1002 |
1003 %!test | |
1004 %! [s, e, te, m, t] = regexp(' FiRetrUck ', 'f(.*)uck'); | |
5785 | 1005 %! assert (s,zeros(1,0)) |
1006 %! assert (e,zeros(1,0)) | |
1007 %! assert (te,cell(1,0)) | |
1008 %! assert (m, cell(1,0)) | |
1009 %! assert (t, cell(1,0)) | |
5582 | 1010 |
1011 %!test | |
1012 %! [s, e, te, m, t] = regexp(' firetruck ', 'f(.*)uck'); | |
1013 %! assert (s,2) | |
1014 %! assert (e,10) | |
1015 %! assert (te{1},[3,7]) | |
1016 %! assert (m{1}, 'firetruck') | |
1017 %! assert (t{1}{1}, 'iretr') | |
1018 | |
1019 %!test | |
1020 %! [s, e, te, m, t] = regexp('short test string','\w*r\w*'); | |
1021 %! assert (s,[1,12]) | |
1022 %! assert (e,[5,17]) | |
1023 %! assert (size(te), [1,2]) | |
1024 %! assert (isempty(te{1})) | |
1025 %! assert (isempty(te{2})) | |
1026 %! assert (m{1},'short') | |
1027 %! assert (m{2},'string') | |
1028 %! assert (size(t), [1,2]) | |
1029 %! assert (isempty(t{1})) | |
1030 %! assert (isempty(t{2})) | |
1031 | |
1032 %!test | |
1033 %! [s, e, te, m, t] = regexp('short test string','\w*r\w*','once'); | |
1034 %! assert (s,1) | |
1035 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1036 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1037 %! assert (m,'short') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1038 %! assert (isempty(t)) |
5582 | 1039 |
1040 %!test | |
1041 %! [m, te, e, s, t] = regexp('short test string','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); | |
1042 %! assert (s,1) | |
1043 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1044 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1045 %! assert (m,'short') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1046 %! assert (isempty(t)) |
5582 | 1047 |
7242 | 1048 %!testif HAVE_PCRE |
5582 | 1049 %! ## This test is expected to fail if PCRE is not installed |
7242 | 1050 %! [s, e, te, m, t, nm] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)'); |
1051 %! assert (s,1) | |
1052 %! assert (e,10) | |
1053 %! assert (size(te), [1,1]) | |
1054 %! assert (te{1}, [1 5; 7, 10]) | |
1055 %! assert (m{1},'short test') | |
1056 %! assert (size(t),[1,1]) | |
1057 %! assert (t{1}{1},'short') | |
1058 %! assert (t{1}{2},'test') | |
1059 %! assert (size(nm), [1,1]) | |
1060 %! assert (!isempty(fieldnames(nm))) | |
1061 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
1062 %! assert (nm.word1,'short') | |
1063 %! assert (nm.word2,'test') | |
5582 | 1064 |
7242 | 1065 %!testif HAVE_PCRE |
5582 | 1066 %! ## This test is expected to fail if PCRE is not installed |
7242 | 1067 %! [nm, m, te, e, s, t] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); |
1068 %! assert (s,1) | |
1069 %! assert (e,10) | |
1070 %! assert (size(te), [1,1]) | |
1071 %! assert (te{1}, [1 5; 7, 10]) | |
1072 %! assert (m{1},'short test') | |
1073 %! assert (size(t),[1,1]) | |
1074 %! assert (t{1}{1},'short') | |
1075 %! assert (t{1}{2},'test') | |
1076 %! assert (size(nm), [1,1]) | |
1077 %! assert (!isempty(fieldnames(nm))) | |
1078 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
1079 %! assert (nm.word1,'short') | |
1080 %! assert (nm.word2,'test') | |
5619 | 1081 |
7242 | 1082 %!testif HAVE_PCRE |
5619 | 1083 %! ## This test is expected to fail if PCRE is not installed |
7242 | 1084 %! [t, nm] = regexp("John Davis\nRogers, James",'(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)','tokens','names'); |
1085 %! assert (size(t), [1,2]); | |
1086 %! assert (t{1}{1},'John'); | |
1087 %! assert (t{1}{2},'Davis'); | |
1088 %! assert (t{2}{1},'Rogers'); | |
1089 %! assert (t{2}{2},'James'); | |
1090 %! assert (size(nm), [1,1]); | |
1091 %! assert (nm.first{1},'John'); | |
1092 %! assert (nm.first{2},'James'); | |
1093 %! assert (nm.last{1},'Davis'); | |
1094 %! assert (nm.last{2},'Rogers'); | |
5582 | 1095 |
5779 | 1096 %!assert(regexp("abc\nabc",'.'),[1:7]) |
1097 %!assert(regexp("abc\nabc",'.','dotall'),[1:7]) | |
7242 | 1098 %!testif HAVE_PCRE |
1099 %! assert(regexp("abc\nabc",'(?s).'),[1:7]) | |
1100 %! assert(regexp("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) | |
1101 %! assert(regexp("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) | |
5779 | 1102 |
1103 %!assert(regexp("caseCaSe",'case'),1) | |
1104 %!assert(regexp("caseCaSe",'case',"matchcase"),1) | |
1105 %!assert(regexp("caseCaSe",'case',"ignorecase"),[1,5]) | |
7242 | 1106 %!testif HAVE_PCRE |
1107 %! assert(regexp("caseCaSe",'(?-i)case'),1) | |
1108 %! assert(regexp("caseCaSe",'(?i)case'),[1,5]) | |
5779 | 1109 |
1110 %!assert (regexp("abc\nabc",'c$'),7) | |
1111 %!assert (regexp("abc\nabc",'c$',"stringanchors"),7) | |
7242 | 1112 %!testif HAVE_PCRE |
1113 %! assert (regexp("abc\nabc",'(?-m)c$'),7) | |
1114 %! assert (regexp("abc\nabc",'c$',"lineanchors"),[3,7]) | |
1115 %! assert (regexp("abc\nabc",'(?m)c$'),[3,7]) | |
5779 | 1116 |
1117 %!assert (regexp("this word",'s w'),4) | |
1118 %!assert (regexp("this word",'s w','literalspacing'),4) | |
7242 | 1119 %!testif HAVE_PCRE |
1120 %! assert (regexp("this word",'(?-x)s w','literalspacing'),4) | |
1121 %! assert (regexp("this word",'s w','freespacing'),zeros(1,0)) | |
1122 %! assert (regexp("this word",'(?x)s w'),zeros(1,0)) | |
5779 | 1123 |
5582 | 1124 %!error regexp('string', 'tri', 'BadArg'); |
1125 %!error regexp('string'); | |
1126 | |
6361 | 1127 %!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)}) |
1128 %!assert(regexp({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)}) | |
1129 %!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]}) | |
1130 %!assert(regexp('Strings',{'t','s'}),{2,7}) | |
1131 | |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1132 ## Test case for lookaround operators |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1133 %!assert(regexp('Iraq','q(?!u)'),4) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1134 %!assert(regexp('quit','q(?!u)'), zeros(1,0)) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1135 %!assert(regexp('quit','q(?=u)','match'), {'q'}) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1136 %!assert(regexp("quit",'q(?=u+)','match'), {'q'}) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1137 %!assert(regexp("qit",'q(?=u+)','match'), cell(1,0)) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1138 %!assert(regexp("qit",'q(?=u*)','match'), {'q'}) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1139 |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1140 %!assert(regexp('thingamabob','(?<=a)b'), 9) |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1141 |
5582 | 1142 */ |
1143 | |
6549 | 1144 DEFUN_DLD (regexpi, args, nargout, |
5582 | 1145 "-*- texinfo -*-\n\ |
1146 @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexpi (@var{str}, @var{pat})\n\ | |
1147 @deftypefnx {Loadable Function} {[@dots{}] =} regexpi (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ | |
1148 \n\ | |
1149 Case insensitive regular expression string matching. Matches @var{pat} in\n\ | |
1150 @var{str} and returns the position and matching substrings or empty values\n\ | |
1151 if there are none. See @code{regexp} for more details\n\ | |
1152 @end deftypefn") | |
1153 { | |
6361 | 1154 octave_value_list retval; |
1155 int nargin = args.length(); | |
1156 | |
1157 if (nargin < 2) | |
1158 print_usage (); | |
1159 else if (args(0).is_cell() || args(1).is_cell()) | |
1160 retval = octcellregexp (args, nargout, "regexpi", true); | |
1161 else | |
1162 retval = octregexp (args, nargout, "regexpi", true); | |
1163 | |
1164 return retval; | |
5582 | 1165 } |
1166 | |
1167 /* | |
1168 | |
1169 ## seg-fault test | |
1170 %!assert(regexpi("abcde","."),[1,2,3,4,5]) | |
1171 | |
1172 ## Check that anchoring of pattern works correctly | |
1173 %!assert(regexpi('abcabc','^abc'),1); | |
1174 %!assert(regexpi('abcabc','abc$'),4); | |
5785 | 1175 %!assert(regexpi('abcabc','^abc$'),zeros(1,0)); |
5582 | 1176 |
1177 %!test | |
1178 %! [s, e, te, m, t] = regexpi(' No Match ', 'f(.*)uck'); | |
5785 | 1179 %! assert (s,zeros(1,0)) |
1180 %! assert (e,zeros(1,0)) | |
1181 %! assert (te,cell(1,0)) | |
1182 %! assert (m, cell(1,0)) | |
1183 %! assert (t, cell(1,0)) | |
5582 | 1184 |
1185 %!test | |
1186 %! [s, e, te, m, t] = regexpi(' FiRetrUck ', 'f(.*)uck'); | |
1187 %! assert (s,2) | |
1188 %! assert (e,10) | |
1189 %! assert (te{1},[3,7]) | |
1190 %! assert (m{1}, 'FiRetrUck') | |
1191 %! assert (t{1}{1}, 'iRetr') | |
1192 | |
1193 %!test | |
1194 %! [s, e, te, m, t] = regexpi(' firetruck ', 'f(.*)uck'); | |
1195 %! assert (s,2) | |
1196 %! assert (e,10) | |
1197 %! assert (te{1},[3,7]) | |
1198 %! assert (m{1}, 'firetruck') | |
1199 %! assert (t{1}{1}, 'iretr') | |
1200 | |
1201 %!test | |
1202 %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*'); | |
1203 %! assert (s,[1,12]) | |
1204 %! assert (e,[5,17]) | |
1205 %! assert (size(te), [1,2]) | |
1206 %! assert (isempty(te{1})) | |
1207 %! assert (isempty(te{2})) | |
1208 %! assert (m{1},'ShoRt') | |
1209 %! assert (m{2},'String') | |
1210 %! assert (size(t), [1,2]) | |
1211 %! assert (isempty(t{1})) | |
1212 %! assert (isempty(t{2})) | |
1213 | |
1214 %!test | |
1215 %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*','once'); | |
1216 %! assert (s,1) | |
1217 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1218 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1219 %! assert (m,'ShoRt') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1220 %! assert (isempty(t)) |
5582 | 1221 |
1222 %!test | |
1223 %! [m, te, e, s, t] = regexpi('ShoRt Test String','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); | |
1224 %! assert (s,1) | |
1225 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1226 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1227 %! assert (m,'ShoRt') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1228 %! assert (isempty(t)) |
5582 | 1229 |
7242 | 1230 %!testif HAVE_PCRE |
5582 | 1231 %! ## This test is expected to fail if PCRE is not installed |
7242 | 1232 %! [s, e, te, m, t, nm] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)'); |
1233 %! assert (s,1) | |
1234 %! assert (e,10) | |
1235 %! assert (size(te), [1,1]) | |
1236 %! assert (te{1}, [1 5; 7, 10]) | |
1237 %! assert (m{1},'ShoRt Test') | |
1238 %! assert (size(t),[1,1]) | |
1239 %! assert (t{1}{1},'ShoRt') | |
1240 %! assert (t{1}{2},'Test') | |
1241 %! assert (size(nm), [1,1]) | |
1242 %! assert (!isempty(fieldnames(nm))) | |
1243 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
1244 %! assert (nm.word1,'ShoRt') | |
1245 %! assert (nm.word2,'Test') | |
5582 | 1246 |
7242 | 1247 %!testif HAVE_PCRE |
5582 | 1248 %! ## This test is expected to fail if PCRE is not installed |
7242 | 1249 %! [nm, m, te, e, s, t] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); |
1250 %! assert (s,1) | |
1251 %! assert (e,10) | |
1252 %! assert (size(te), [1,1]) | |
1253 %! assert (te{1}, [1 5; 7, 10]) | |
1254 %! assert (m{1},'ShoRt Test') | |
1255 %! assert (size(t),[1,1]) | |
1256 %! assert (t{1}{1},'ShoRt') | |
1257 %! assert (t{1}{2},'Test') | |
1258 %! assert (size(nm), [1,1]) | |
1259 %! assert (!isempty(fieldnames(nm))) | |
1260 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
1261 %! assert (nm.word1,'ShoRt') | |
1262 %! assert (nm.word2,'Test') | |
5582 | 1263 |
5779 | 1264 %!assert(regexpi("abc\nabc",'.'),[1:7]) |
1265 %!assert(regexpi("abc\nabc",'.','dotall'),[1:7]) | |
7242 | 1266 %!testif HAVE_PCRE |
1267 %! assert(regexpi("abc\nabc",'(?s).'),[1:7]) | |
1268 %! assert(regexpi("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) | |
1269 %! assert(regexpi("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) | |
5779 | 1270 |
1271 %!assert(regexpi("caseCaSe",'case'),[1,5]) | |
1272 %!assert(regexpi("caseCaSe",'case',"matchcase"),1) | |
1273 %!assert(regexpi("caseCaSe",'case',"ignorecase"),[1,5]) | |
7242 | 1274 %!testif HAVE_PCRE |
1275 %! assert(regexpi("caseCaSe",'(?-i)case'),1) | |
1276 %! assert(regexpi("caseCaSe",'(?i)case'),[1,5]) | |
5779 | 1277 |
1278 %!assert (regexpi("abc\nabc",'c$'),7) | |
1279 %!assert (regexpi("abc\nabc",'c$',"stringanchors"),7) | |
7242 | 1280 %!testif HAVE_PCRE |
1281 %! assert (regexpi("abc\nabc",'(?-m)c$'),7) | |
1282 %! assert (regexpi("abc\nabc",'c$',"lineanchors"),[3,7]) | |
1283 %! assert (regexpi("abc\nabc",'(?m)c$'),[3,7]) | |
5779 | 1284 |
1285 %!assert (regexpi("this word",'s w'),4) | |
1286 %!assert (regexpi("this word",'s w','literalspacing'),4) | |
7242 | 1287 %!testif HAVE_PCRE |
1288 %! assert (regexpi("this word",'(?-x)s w','literalspacing'),4) | |
1289 %! assert (regexpi("this word",'s w','freespacing'),zeros(1,0)) | |
1290 %! assert (regexpi("this word",'(?x)s w'),zeros(1,0)) | |
5779 | 1291 |
5582 | 1292 %!error regexpi('string', 'tri', 'BadArg'); |
1293 %!error regexpi('string'); | |
1294 | |
6361 | 1295 %!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)}) |
1296 %!assert(regexpi({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)}) | |
1297 %!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]}) | |
1298 %!assert(regexpi('Strings',{'t','s'}),{2,[1,7]}) | |
1299 | |
5582 | 1300 */ |
1301 | |
6361 | 1302 |
1303 static octave_value | |
1304 octregexprep (const octave_value_list &args, const std::string &nm) | |
5785 | 1305 { |
6361 | 1306 octave_value retval; |
5785 | 1307 int nargin = args.length(); |
1308 | |
1309 // Make sure we have string,pattern,replacement | |
1310 const std::string buffer = args(0).string_value (); | |
1311 if (error_state) return retval; | |
1312 const std::string pattern = args(1).string_value (); | |
1313 if (error_state) return retval; | |
1314 const std::string replacement = args(2).string_value (); | |
1315 if (error_state) return retval; | |
1316 | |
1317 // Pack options excluding 'tokenize' and various output | |
1318 // reordering strings into regexp arg list | |
1319 octave_value_list regexpargs(nargin-1,octave_value()); | |
1320 regexpargs(0) = args(0); | |
1321 regexpargs(1) = args(1); | |
1322 int len=2; | |
1323 for (int i = 3; i < nargin; i++) | |
1324 { | |
1325 const std::string opt = args(i).string_value(); | |
1326 if (opt != "tokenize" && opt != "start" && opt != "end" | |
1327 && opt != "tokenextents" && opt != "match" && opt != "tokens" | |
1328 && opt != "names" && opt != "warnings") | |
1329 { | |
1330 regexpargs(len++) = args(i); | |
1331 } | |
1332 } | |
1333 regexpargs.resize(len); | |
1334 | |
1335 // Identify replacement tokens; build a vector of group numbers in | |
1336 // the replacement string so that we can quickly calculate the size | |
1337 // of the replacement. | |
1338 int tokens = 0; | |
1339 for (size_t i=1; i < replacement.size(); i++) | |
1340 { | |
1341 if (replacement[i-1]=='$' && isdigit(replacement[i])) | |
1342 { | |
1343 tokens++, i++; | |
1344 } | |
1345 } | |
1346 std::vector<int> token(tokens); | |
1347 int kk = 0; | |
1348 for (size_t i = 1; i < replacement.size(); i++) | |
1349 { | |
1350 if (replacement[i-1]=='$' && isdigit(replacement[i])) | |
1351 { | |
1352 token[kk++] = replacement[i]-'0'; | |
1353 i++; | |
1354 } | |
1355 } | |
1356 | |
1357 // Perform replacement | |
1358 std::string rep; | |
1359 if (tokens > 0) | |
1360 { | |
1361 std::list<regexp_elem> lst; | |
1362 string_vector named; | |
1363 int nopts; | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1364 bool once; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1365 int sz = octregexp_list (regexpargs, nm , false, lst, named, nopts, once); |
5785 | 1366 |
1367 if (error_state) | |
1368 return retval; | |
1369 if (sz == 0) | |
1370 { | |
6361 | 1371 retval = args(0); |
5785 | 1372 return retval; |
1373 } | |
1374 | |
1375 // Determine replacement length | |
1376 const size_t replen = replacement.size() - 2*tokens; | |
1377 int delta = 0; | |
1378 const_iterator p = lst.begin(); | |
1379 for (int i = 0; i < sz; i++) | |
1380 { | |
1381 OCTAVE_QUIT; | |
1382 | |
1383 const Matrix pairs(p->te); | |
1384 size_t pairlen = 0; | |
1385 for (int j = 0; j < tokens; j++) | |
1386 { | |
1387 if (token[j] == 0) | |
1388 pairlen += static_cast<size_t>(p->e - p->s) + 1; | |
1389 else if (token[j] <= pairs.rows()) | |
1390 pairlen += static_cast<size_t>(pairs(token[j]-1,1) - | |
1391 pairs(token[j]-1,0)) + 1; | |
1392 } | |
1393 delta += static_cast<int>(replen + pairlen) - | |
1394 static_cast<int>(p->e - p->s + 1); | |
1395 p++; | |
1396 } | |
1397 | |
1398 // Build replacement string | |
1399 rep.reserve(buffer.size()+delta); | |
1400 size_t from = 0; | |
1401 p = lst.begin(); | |
1402 for (int i=0; i < sz; i++) | |
1403 { | |
1404 OCTAVE_QUIT; | |
1405 | |
1406 const Matrix pairs(p->te); | |
1407 rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); | |
1408 from = static_cast<size_t>(p->e - 1) + 1; | |
1409 for (size_t j = 1; j < replacement.size(); j++) | |
1410 { | |
1411 if (replacement[j-1]=='$' && isdigit(replacement[j])) | |
1412 { | |
1413 int k = replacement[j]-'0'; | |
1414 if (k == 0) | |
1415 { | |
1416 // replace with entire match | |
1417 rep.append(&buffer[static_cast<size_t>(p->e - 1)], | |
1418 static_cast<size_t>(p->e - p->s) + 1); | |
1419 } | |
1420 else if (k <= pairs.rows()) | |
1421 { | |
1422 // replace with group capture | |
1423 rep.append(&buffer[static_cast<size_t>(pairs(k-1,0)-1)], | |
1424 static_cast<size_t>(pairs(k-1,1) - | |
1425 pairs(k-1,0))+1); | |
1426 } | |
1427 else | |
1428 { | |
1429 // replace with nothing | |
1430 } | |
1431 j++; | |
1432 } | |
1433 else | |
1434 { | |
1435 rep.append(1,replacement[j-1]); | |
1436 } | |
1437 if (j+1 == replacement.size()) | |
1438 { | |
1439 rep.append(1,replacement[j]); | |
1440 } | |
1441 } | |
1442 p++; | |
1443 } | |
1444 rep.append(&buffer[from],buffer.size()-from); | |
1445 } | |
1446 else | |
1447 { | |
1448 std::list<regexp_elem> lst; | |
1449 string_vector named; | |
1450 int nopts; | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1451 bool once; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1452 int sz = octregexp_list (regexpargs, nm, false, lst, named, nopts, once); |
5785 | 1453 |
1454 if (error_state) | |
1455 return retval; | |
1456 if (sz == 0) | |
1457 { | |
6361 | 1458 retval = args(0); |
5785 | 1459 return retval; |
1460 } | |
1461 | |
1462 // Determine replacement length | |
1463 const size_t replen = replacement.size(); | |
1464 int delta = 0; | |
1465 const_iterator p = lst.begin(); | |
1466 for (int i = 0; i < sz; i++) | |
1467 { | |
1468 OCTAVE_QUIT; | |
1469 delta += static_cast<int>(replen) - | |
1470 static_cast<int>(p->e - p->s + 1); | |
1471 p++; | |
1472 } | |
1473 | |
1474 // Build replacement string | |
1475 rep.reserve(buffer.size()+delta); | |
1476 size_t from = 0; | |
1477 p = lst.begin(); | |
1478 for (int i=0; i < sz; i++) | |
1479 { | |
1480 OCTAVE_QUIT; | |
1481 rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); | |
1482 from = static_cast<size_t>(p->e - 1) + 1; | |
1483 rep.append(replacement); | |
1484 p++; | |
1485 } | |
1486 rep.append(&buffer[from],buffer.size()-from); | |
1487 } | |
1488 | |
6361 | 1489 retval = rep; |
1490 return retval; | |
1491 } | |
1492 | |
6549 | 1493 DEFUN_DLD (regexprep, args, , |
6361 | 1494 "-*- texinfo -*-\n\ |
6678 | 1495 @deftypefn {Loadable Function} {@var{string} =} regexprep (@var{string}, @var{pat}, @var{repstr}, @var{options})\n\ |
6361 | 1496 Replace matches of @var{pat} in @var{string} with @var{repstr}.\n\ |
1497 \n\ | |
1498 \n\ | |
7007 | 1499 The replacement can contain @code{$i}, which substitutes\n\ |
6361 | 1500 for the ith set of parentheses in the match string. E.g.,\n\ |
1501 @example\n\ | |
1502 \n\ | |
1503 regexprep(\"Bill Dunn\",'(\\w+) (\\w+)','$2, $1')\n\ | |
1504 \n\ | |
1505 @end example\n\ | |
1506 returns \"Dunn, Bill\"\n\ | |
1507 \n\ | |
1508 @var{options} may be zero or more of\n\ | |
1509 @table @samp\n\ | |
1510 \n\ | |
1511 @item once\n\ | |
7001 | 1512 Replace only the first occurrence of @var{pat} in the result.\n\ |
6361 | 1513 \n\ |
1514 @item warnings\n\ | |
1515 This option is present for compatibility but is ignored.\n\ | |
1516 \n\ | |
1517 @item ignorecase or matchcase\n\ | |
1518 Ignore case for the pattern matching (see @code{regexpi}).\n\ | |
1519 Alternatively, use (?i) or (?-i) in the pattern.\n\ | |
1520 \n\ | |
1521 @item lineanchors and stringanchors\n\ | |
1522 Whether characters ^ and $ match the beginning and ending of lines.\n\ | |
1523 Alternatively, use (?m) or (?-m) in the pattern.\n\ | |
1524 \n\ | |
1525 @item dotexceptnewline and dotall\n\ | |
1526 Whether . matches newlines in the string.\n\ | |
1527 Alternatively, use (?s) or (?-s) in the pattern.\n\ | |
1528 \n\ | |
1529 @item freespacing or literalspacing\n\ | |
1530 Whether whitespace and # comments can be used to make the regular expression more readable.\n\ | |
1531 Alternatively, use (?x) or (?-x) in the pattern.\n\ | |
1532 \n\ | |
1533 @end table\n\ | |
1534 @seealso{regexp,regexpi}\n\ | |
1535 @end deftypefn") | |
1536 { | |
1537 octave_value_list retval; | |
1538 int nargin = args.length(); | |
1539 | |
1540 if (nargin < 3) | |
1541 { | |
1542 print_usage (); | |
1543 return retval; | |
1544 } | |
1545 | |
1546 if (args(0).is_cell() || args(1).is_cell() || args(2).is_cell()) | |
1547 { | |
1548 Cell str; | |
1549 Cell pat; | |
1550 Cell rep; | |
6495 | 1551 dim_vector dv0; |
1552 dim_vector dv1(1,1); | |
6361 | 1553 |
1554 if (args(0).is_cell()) | |
1555 str = args(0).cell_value(); | |
1556 else | |
1557 str = Cell (args(0)); | |
1558 | |
1559 if (args(1).is_cell()) | |
1560 pat = args(1).cell_value(); | |
1561 else | |
1562 pat = Cell (args(1)); | |
1563 | |
1564 if (args(2).is_cell()) | |
1565 rep = args(2).cell_value(); | |
1566 else | |
1567 rep = Cell (args(2)); | |
1568 | |
6495 | 1569 dv0 = str.dims(); |
1570 if (pat.numel() != 1) | |
6361 | 1571 { |
6495 | 1572 dv1 = pat.dims(); |
1573 if (rep.numel() != 1 && dv1 != rep.dims()) | |
6361 | 1574 error ("regexprep: Inconsistent cell array dimensions"); |
1575 } | |
1576 else if (rep.numel() != 1) | |
6495 | 1577 dv1 = rep.dims(); |
6361 | 1578 |
1579 if (!error_state) | |
1580 { | |
6495 | 1581 Cell ret (dv0); |
6361 | 1582 octave_value_list new_args = args; |
1583 | |
6495 | 1584 for (octave_idx_type i = 0; i < dv0.numel(); i++) |
1585 { | |
1586 new_args(0) = str(i); | |
1587 if (pat.numel() == 1) | |
1588 new_args(1) = pat(0); | |
1589 if (rep.numel() == 1) | |
1590 new_args(2) = rep(0); | |
1591 for (octave_idx_type j = 0; j < dv1.numel(); j++) | |
1592 { | |
1593 if (pat.numel() != 1) | |
1594 new_args(1) = pat(j); | |
1595 if (rep.numel() != 1) | |
1596 new_args(2) = rep(j); | |
1597 new_args(0) = octregexprep (new_args, "regexprep"); | |
6361 | 1598 |
6495 | 1599 if (error_state) |
1600 break; | |
1601 } | |
6361 | 1602 |
1603 if (error_state) | |
1604 break; | |
6495 | 1605 |
1606 ret(i) = new_args(0); | |
6361 | 1607 } |
1608 | |
1609 if (!error_state) | |
1610 retval = octave_value (ret); | |
1611 } | |
1612 } | |
1613 else | |
1614 retval = octregexprep (args, "regexprep"); | |
1615 | |
5785 | 1616 return retval; |
1617 } | |
1618 | |
1619 /* | |
1620 %!test # Replace with empty | |
1621 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; | |
1622 %! t = regexprep(xml,'<[!?][^>]*>',''); | |
1623 %! assert(t,' <tag v="hello">some stuff</tag>') | |
1624 | |
1625 %!test # Replace with non-empty | |
1626 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; | |
1627 %! t = regexprep(xml,'<[!?][^>]*>','?'); | |
1628 %! assert(t,'? <tag v="hello">some stuff?</tag>') | |
1629 | |
1630 %!test # Check that 'tokenize' is ignored | |
1631 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; | |
1632 %! t = regexprep(xml,'<[!?][^>]*>','','tokenize'); | |
1633 %! assert(t,' <tag v="hello">some stuff</tag>') | |
1634 | |
7242 | 1635 %!testif HAVE_PCRE # Capture replacement |
1636 %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins"; | |
1637 %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam"; | |
1638 %! t = regexprep(data,'(?m)^(\w+)\s+(\w+)$','$2, $1'); | |
1639 %! assert(t,result) | |
5785 | 1640 |
1641 # Return the original if no match | |
1642 %!assert(regexprep('hello','world','earth'),'hello') | |
1643 | |
1644 ## Test a general replacement | |
1645 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g"); | |
1646 | |
1647 ## Make sure it works at the beginning and end | |
1648 %!assert(regexprep("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g"); | |
1649 %!assert(regexprep("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_"); | |
1650 | |
1651 ## Options | |
1652 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"), "a_b]c{d}e-f=g"); | |
1653 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"), "a_b_c_d_e_f_g"); | |
1654 | |
1655 ## Option combinations | |
1656 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"), "a_b]c{d}e-f=g"); | |
1657 | |
1658 ## End conditions on replacement | |
1659 %!assert(regexprep("abc","(b)",".$1"),"a.bc"); | |
1660 %!assert(regexprep("abc","(b)","$1"),"abc"); | |
1661 %!assert(regexprep("abc","(b)","$1."),"ab.c"); | |
1662 %!assert(regexprep("abc","(b)","$1.."),"ab..c"); | |
1663 | |
6361 | 1664 ## Test cell array arguments |
6503 | 1665 %!assert(regexprep("abc",{"b","a"},"?"),{"??c"}) |
6361 | 1666 %!assert(regexprep({"abc","cba"},"b","?"),{"a?c","c?a"}) |
6503 | 1667 %!assert(regexprep({"abc","cba"},{"b","a"},{"?","!"}),{"!?c","c?!"}) |
6361 | 1668 |
8093
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1669 # Nasty lookbehind expression |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1670 %!assert(regexprep('x^(-1)+y(-1)+z(-1)=0','(?<=[a-z]+)\(\-[1-9]*\)','_minus1'),'x^(-1)+y_minus1+z_minus1=0') |
dcc31f473596
Treat PCRE lookbehind operators in a manner that is approximately correct
David Bateman <dbateman@free.fr>
parents:
8021
diff
changeset
|
1671 |
5785 | 1672 */ |
1673 | |
5582 | 1674 /* |
1675 ;;; Local Variables: *** | |
1676 ;;; mode: C++ *** | |
1677 ;;; End: *** | |
1678 */ |