Mercurial > octave-nkf
annotate src/DLD-FUNCTIONS/regexp.cc @ 8021:85184151822e
fix typo in NPOS change
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Thu, 07 Aug 2008 15:31:17 -0400 |
parents | 0ef13e15319b |
children | dcc31f473596 |
rev | line source |
---|---|
5582 | 1 /* |
2 | |
7017 | 3 Copyright (C) 2005, 2006, 2007 David Bateman |
7016 | 4 Copyright (C) 2002, 2003, 2004, 2005 Paul Kienzle |
5 | |
6 This file is part of Octave. | |
5582 | 7 |
8 Octave is free software; you can redistribute it and/or modify it | |
9 under the terms of the GNU General Public License as published by the | |
7016 | 10 Free Software Foundation; either version 3 of the License, or (at your |
11 option) any later version. | |
5582 | 12 |
13 Octave is distributed in the hope that it will be useful, but WITHOUT | |
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 for more details. | |
17 | |
18 You should have received a copy of the GNU General Public License | |
7016 | 19 along with Octave; see the file COPYING. If not, see |
20 <http://www.gnu.org/licenses/>. | |
5582 | 21 |
22 */ | |
23 | |
24 #ifdef HAVE_CONFIG_H | |
25 #include <config.h> | |
26 #endif | |
27 | |
5773 | 28 #include <algorithm> |
5765 | 29 #include <sstream> |
30 | |
5582 | 31 #include "defun-dld.h" |
32 #include "error.h" | |
33 #include "gripes.h" | |
34 #include "oct-obj.h" | |
35 #include "utils.h" | |
36 | |
37 #include "Cell.h" | |
38 #include "oct-map.h" | |
39 #include "str-vec.h" | |
5785 | 40 #include "quit.h" |
41 #include "parse.h" | |
5582 | 42 |
7173 | 43 #if defined (HAVE_PCRE) |
5582 | 44 #include <pcre.h> |
7117 | 45 #elif defined (HAVE_REGEX) |
46 #if defined (__MINGW32__) | |
5582 | 47 #define __restrict |
48 #endif | |
7237 | 49 #if defined (HAVE_SYS_TYPES_H) |
50 #include <sys/types.h> | |
51 #endif | |
5582 | 52 #include <regex.h> |
53 #endif | |
54 | |
5785 | 55 // The regexp is constructed as a linked list to avoid resizing the |
56 // return values in arrays at each new match. | |
57 | |
58 // FIXME don't bother collecting and composing return values the user | |
59 // doesn't want. | |
60 | |
61 class regexp_elem | |
5582 | 62 { |
5785 | 63 public: |
5787 | 64 regexp_elem (const string_vector& _named_token, const Cell& _t, |
65 const std::string& _m, const Matrix& _te, double _s, | |
66 double _e) : | |
5785 | 67 named_token (_named_token), t (_t), m (_m), te (_te), s (_s), e (_e) { } |
68 | |
69 regexp_elem (const regexp_elem &a) : named_token (a.named_token), t (a.t), | |
70 m (a.m), te (a.te), s (a.s), e (a.e) | |
71 { } | |
72 | |
73 string_vector named_token; | |
74 Cell t; | |
75 std::string m; | |
76 Matrix te; | |
77 double s; | |
78 double e; | |
79 }; | |
80 | |
81 typedef std::list<regexp_elem>::const_iterator const_iterator; | |
82 | |
83 static int | |
84 octregexp_list (const octave_value_list &args, const std::string &nm, | |
85 bool case_insensitive, std::list<regexp_elem> &lst, | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
86 string_vector &named, int &nopts, bool &once) |
5785 | 87 { |
88 int sz = 0; | |
5582 | 89 #if defined (HAVE_REGEX) || defined (HAVE_PCRE) |
90 int nargin = args.length(); | |
5779 | 91 bool lineanchors = false; |
92 bool dotexceptnewline = false; | |
93 bool freespacing = false; | |
5582 | 94 |
5785 | 95 nopts = nargin - 2; |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
96 once = false; |
5785 | 97 |
5582 | 98 std::string buffer = args(0).string_value (); |
99 if (error_state) | |
100 { | |
101 gripe_wrong_type_arg (nm.c_str(), args(0)); | |
5785 | 102 return 0; |
5582 | 103 } |
104 | |
105 std::string pattern = args(1).string_value (); | |
106 if (error_state) | |
107 { | |
108 gripe_wrong_type_arg (nm.c_str(), args(1)); | |
5785 | 109 return 0; |
5582 | 110 } |
111 | |
112 for (int i = 2; i < nargin; i++) | |
113 { | |
114 std::string str = args(i).string_value(); | |
115 if (error_state) | |
116 { | |
117 error ("%s: optional arguments must be strings", nm.c_str()); | |
118 break; | |
119 } | |
120 std::transform (str.begin (), str.end (), str.begin (), tolower); | |
121 if (str.find("once", 0) == 0) | |
122 { | |
123 once = true; | |
124 nopts--; | |
125 } | |
5779 | 126 else if (str.find("matchcase", 0) == 0) |
127 { | |
128 case_insensitive = false; | |
129 nopts--; | |
130 } | |
131 else if (str.find("ignorecase", 0) == 0) | |
132 { | |
133 case_insensitive = true; | |
134 nopts--; | |
135 } | |
5785 | 136 else if (str.find("dotall", 0) == 0) |
5779 | 137 { |
5785 | 138 dotexceptnewline = false; |
139 nopts--; | |
140 } | |
141 else if (str.find("stringanchors", 0) == 0) | |
142 { | |
143 lineanchors = false; | |
5779 | 144 nopts--; |
145 } | |
146 else if (str.find("literalspacing", 0) == 0) | |
147 { | |
148 freespacing = false; | |
149 nopts--; | |
150 } | |
5785 | 151 #if HAVE_PCRE |
152 // Only accept these options with pcre | |
153 else if (str.find("dotexceptnewline", 0) == 0) | |
154 { | |
155 dotexceptnewline = true; | |
156 nopts--; | |
157 } | |
158 else if (str.find("lineanchors", 0) == 0) | |
159 { | |
160 lineanchors = true; | |
161 nopts--; | |
162 } | |
163 else if (str.find("freespacing", 0) == 0) | |
164 { | |
165 freespacing = true; | |
166 nopts--; | |
167 } | |
5582 | 168 else if (str.find("start", 0) && str.find("end", 0) && |
169 str.find("tokenextents", 0) && str.find("match", 0) && | |
170 str.find("tokens", 0) && str.find("names", 0)) | |
171 error ("%s: unrecognized option", nm.c_str()); | |
172 #else | |
5779 | 173 else if (str.find("names", 0) == 0 || |
174 str.find("dotexceptnewline", 0) == 0 || | |
175 str.find("lineanchors", 0) == 0 || | |
176 str.find("freespacing", 0) == 0) | |
5785 | 177 error ("%s: %s not implemented in this version", str.c_str(), nm.c_str()); |
5582 | 178 else if (str.find("start", 0) && str.find("end", 0) && |
179 str.find("tokenextents", 0) && str.find("match", 0) && | |
180 str.find("tokens", 0)) | |
181 error ("%s: unrecognized option", nm.c_str()); | |
182 #endif | |
183 } | |
184 | |
185 if (!error_state) | |
186 { | |
5785 | 187 Cell t; |
188 std::string m; | |
189 double s, e; | |
5582 | 190 |
191 // named tokens "(?<name>...)" are only treated with PCRE not regex. | |
192 #if HAVE_PCRE | |
193 // The syntax of named tokens in pcre is "(?P<name>...)" while we need | |
194 // a syntax "(?<name>...)", so fix that here. Also an expression like | |
195 // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" should | |
196 // be perfectly legal, while pcre does not allow the same named token | |
5619 | 197 // name on both sides of the alternative. Also fix that here by replacing |
198 // name tokens by dummy names, and dealing with the dummy names later. | |
5582 | 199 |
5619 | 200 size_t pos = 0; |
201 size_t new_pos; | |
202 int nnames = 0; | |
203 int inames = 0; | |
5765 | 204 std::ostringstream buf; |
5619 | 205 Array<int> named_idx; |
5582 | 206 |
8021 | 207 while ((new_pos = pattern.find ("(?<",pos)) != std::string::npos) |
5619 | 208 { |
209 size_t tmp_pos = pattern.find_first_of ('>',new_pos); | |
5582 | 210 |
8021 | 211 if (tmp_pos == std::string::npos) |
5619 | 212 { |
213 error ("syntax error in pattern"); | |
214 break; | |
215 } | |
216 | |
217 std::string tmp_name = pattern.substr(new_pos+3,tmp_pos-new_pos-3); | |
218 bool found = false; | |
219 | |
220 for (int i = 0; i < nnames; i++) | |
221 if (named(i) == tmp_name) | |
222 { | |
223 named_idx.resize(inames+1); | |
224 named_idx(inames) = i; | |
225 found = true; | |
226 break; | |
227 } | |
228 if (! found) | |
229 { | |
230 named_idx.resize(inames+1); | |
231 named_idx(inames) = nnames; | |
232 named.append(tmp_name); | |
233 nnames++; | |
234 } | |
235 | |
236 if (new_pos - pos > 0) | |
237 buf << pattern.substr(pos,new_pos-pos); | |
238 if (inames < 10) | |
239 buf << "(?P<n00" << inames++; | |
240 else if (inames < 100) | |
241 buf << "(?P<n0" << inames++; | |
242 else | |
243 buf << "(?P<n" << inames++; | |
244 pos = tmp_pos; | |
245 } | |
246 | |
5765 | 247 buf << pattern.substr(pos); |
5619 | 248 |
249 if (error_state) | |
5785 | 250 return 0; |
5582 | 251 |
252 // Compile expression | |
253 pcre *re; | |
254 const char *err; | |
255 int erroffset; | |
5765 | 256 std::string buf_str = buf.str (); |
257 re = pcre_compile (buf_str.c_str (), | |
5779 | 258 (case_insensitive ? PCRE_CASELESS : 0) | |
259 (dotexceptnewline ? 0 : PCRE_DOTALL) | | |
260 (lineanchors ? PCRE_MULTILINE : 0) | | |
261 (freespacing ? PCRE_EXTENDED : 0), | |
7520 | 262 &err, &erroffset, 0); |
5582 | 263 |
7520 | 264 if (re == 0) { |
5582 | 265 error("%s: %s at position %d of expression", nm.c_str(), |
266 err, erroffset); | |
5785 | 267 return 0; |
5582 | 268 } |
269 | |
270 int subpatterns; | |
271 int namecount; | |
272 int nameentrysize; | |
273 char *nametable; | |
274 int idx = 0; | |
275 | |
7520 | 276 pcre_fullinfo(re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); |
277 pcre_fullinfo(re, 0, PCRE_INFO_NAMECOUNT, &namecount); | |
278 pcre_fullinfo(re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); | |
279 pcre_fullinfo(re, 0, PCRE_INFO_NAMETABLE, &nametable); | |
5582 | 280 |
281 OCTAVE_LOCAL_BUFFER(int, ovector, (subpatterns+1)*3); | |
282 OCTAVE_LOCAL_BUFFER(int, nidx, namecount); | |
283 | |
284 for (int i = 0; i < namecount; i++) | |
285 { | |
286 // Index of subpattern in first two bytes MSB first of name. | |
5619 | 287 // Extract index. |
5779 | 288 nidx[i] = (static_cast<int>(nametable[i*nameentrysize])) << 8 | |
289 static_cast<int>(nametable[i*nameentrysize+1]); | |
5582 | 290 } |
291 | |
292 while(true) | |
293 { | |
5785 | 294 OCTAVE_QUIT; |
295 | |
7520 | 296 int matches = pcre_exec(re, 0, buffer.c_str(), |
5582 | 297 buffer.length(), idx, |
298 (idx ? PCRE_NOTBOL : 0), | |
299 ovector, (subpatterns+1)*3); | |
300 | |
301 if (matches < 0 && matches != PCRE_ERROR_NOMATCH) | |
302 { | |
303 error ("%s: internal error calling pcre_exec", nm.c_str()); | |
304 pcre_free(re); | |
5785 | 305 return 0; |
5582 | 306 } |
307 else if (matches == PCRE_ERROR_NOMATCH) | |
308 break; | |
5779 | 309 else if (ovector[1] <= ovector[0]) |
310 break; | |
5582 | 311 else |
312 { | |
5619 | 313 int pos_match = 0; |
5785 | 314 Matrix te(matches-1,2); |
5582 | 315 for (int i = 1; i < matches; i++) |
316 { | |
5619 | 317 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) |
318 { | |
5785 | 319 te(pos_match,0) = double (ovector[2*i]+1); |
320 te(pos_match++,1) = double (ovector[2*i+1]); | |
5619 | 321 } |
5582 | 322 } |
5785 | 323 te.resize(pos_match,2); |
324 s = double (ovector[0]+1); | |
325 e = double (ovector[1]); | |
5582 | 326 |
327 const char **listptr; | |
328 int status = pcre_get_substring_list(buffer.c_str(), ovector, | |
329 matches, &listptr); | |
330 | |
331 if (status == PCRE_ERROR_NOMEMORY) { | |
332 error("%s: cannot allocate memory in pcre_get_substring_list", | |
333 nm.c_str()); | |
334 pcre_free(re); | |
5785 | 335 return 0; |
5582 | 336 } |
337 | |
5619 | 338 Cell cell_t (dim_vector(1,pos_match)); |
339 pos_match = 0; | |
5582 | 340 for (int i = 1; i < matches; i++) |
5619 | 341 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) |
342 cell_t(pos_match++) = std::string(*(listptr+i)); | |
5582 | 343 |
5785 | 344 m = std::string(*listptr); |
345 t = cell_t; | |
346 | |
347 string_vector named_tokens(nnames); | |
5619 | 348 if (namecount > 0) |
349 for (int i = 1; i < matches; i++) | |
350 { | |
351 if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) | |
352 { | |
5785 | 353 named_tokens(named_idx(i-1)) = |
354 std::string(*(listptr+nidx[i-1])); | |
5619 | 355 } |
356 } | |
5582 | 357 |
358 pcre_free_substring_list(listptr); | |
359 | |
5785 | 360 regexp_elem new_elem (named_tokens, t, m, te, s, e); |
361 lst.push_back (new_elem); | |
362 idx = ovector[1]; | |
363 sz++; | |
364 | |
5582 | 365 if (once) |
366 break; | |
367 | |
368 } | |
369 } | |
370 | |
371 pcre_free(re); | |
372 #else | |
373 regex_t compiled; | |
374 int err=regcomp(&compiled, pattern.c_str(), REG_EXTENDED | | |
375 (case_insensitive ? REG_ICASE : 0)); | |
376 if (err) | |
377 { | |
7520 | 378 int len = regerror(err, &compiled, 0, 0); |
5760 | 379 OCTAVE_LOCAL_BUFFER (char, errmsg, len); |
380 regerror(err, &compiled, errmsg, len); | |
381 error("%s: %s in pattern (%s)", nm.c_str(), errmsg, | |
382 pattern.c_str()); | |
5582 | 383 regfree(&compiled); |
5785 | 384 return 0; |
5582 | 385 } |
386 | |
387 int subexpr = 1; | |
388 int idx = 0; | |
389 for (unsigned int i=0; i < pattern.length(); i++) | |
390 subexpr += ( pattern[i] == '(' ? 1 : 0 ); | |
391 OCTAVE_LOCAL_BUFFER (regmatch_t, match, subexpr ); | |
392 | |
393 while(true) | |
394 { | |
5785 | 395 OCTAVE_QUIT; |
396 | |
5582 | 397 if (regexec(&compiled, buffer.c_str() + idx, subexpr, |
398 match, (idx ? REG_NOTBOL : 0)) == 0) | |
399 { | |
400 // Count actual matches | |
401 int matches = 0; | |
402 while (matches < subexpr && match[matches].rm_so >= 0) | |
403 matches++; | |
404 | |
5785 | 405 s = double (match[0].rm_so+1+idx); |
406 e = double (match[0].rm_eo+idx); | |
407 Matrix te(matches-1,2); | |
5582 | 408 for (int i = 1; i < matches; i++) |
409 { | |
5785 | 410 te(i-1,0) = double (match[i].rm_so+1+idx); |
411 te(i-1,1) = double (match[i].rm_eo+idx); | |
5582 | 412 } |
413 | |
5785 | 414 m = buffer.substr (match[0].rm_so+idx, |
5582 | 415 match[0].rm_eo-match[0].rm_so); |
416 | |
417 Cell cell_t (dim_vector(1,matches-1)); | |
418 for (int i = 1; i < matches; i++) | |
419 cell_t(i-1) = buffer.substr (match[i].rm_so+idx, | |
420 match[i].rm_eo-match[i].rm_so); | |
5785 | 421 t = cell_t; |
5582 | 422 |
423 idx += match[0].rm_eo; | |
5785 | 424 |
5866 | 425 string_vector sv; |
426 regexp_elem new_elem (sv, t, m, te, s, e); | |
5785 | 427 lst.push_back (new_elem); |
5582 | 428 sz++; |
429 | |
430 if (once) | |
431 break; | |
432 } | |
433 else | |
434 break; | |
435 } | |
436 regfree(&compiled); | |
437 #endif | |
5785 | 438 } |
439 #else | |
440 error ("%s: not available in this version of Octave", nm.c_str()); | |
441 #endif | |
442 return sz; | |
443 } | |
5582 | 444 |
5785 | 445 static octave_value_list |
446 octregexp (const octave_value_list &args, int nargout, const std::string &nm, | |
447 bool case_insensitive) | |
448 { | |
449 octave_value_list retval; | |
450 int nargin = args.length(); | |
451 std::list<regexp_elem> lst; | |
452 string_vector named; | |
453 int nopts; | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
454 bool once; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
455 int sz = octregexp_list (args, nm, case_insensitive, lst, named, nopts, once); |
5785 | 456 |
457 if (! error_state) | |
458 { | |
459 // Converted the linked list in the correct form for the return values | |
460 | |
461 octave_idx_type i = 0; | |
462 #ifdef HAVE_PCRE | |
463 Octave_map nmap; | |
464 if (sz == 1) | |
465 { | |
466 for (int j = 0; j < named.length(); j++) | |
467 nmap.assign (named(j), lst.begin()->named_token(j)); | |
468 retval(5) = nmap; | |
469 } | |
470 else | |
471 { | |
472 for (int j = 0; j < named.length (); j++) | |
473 { | |
474 i = 0; | |
475 Cell tmp(dim_vector (1, sz)); | |
476 for (const_iterator p = lst.begin(); p != lst.end(); p++) | |
477 tmp(i++) = p->named_token(j); | |
478 nmap.assign (named(j), octave_value (tmp)); | |
479 } | |
480 retval(5) = nmap; | |
481 } | |
482 #else | |
483 retval(5) = Octave_map(); | |
484 #endif | |
485 | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
486 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
487 retval(4) = sz ? lst.front ().t : Cell(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
488 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
489 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
490 Cell t (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
491 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
492 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
493 t(i++) = p->t; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
494 retval(4) = t; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
495 } |
5785 | 496 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
497 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
498 retval(3) = sz ? lst.front ().m : std::string(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
499 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
500 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
501 Cell m (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
502 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
503 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
504 m(i++) = p->m; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
505 retval(3) = m; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
506 } |
5785 | 507 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
508 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
509 retval(2) = sz ? lst.front ().te : Matrix(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
510 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
511 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
512 Cell te (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
513 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
514 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
515 te(i++) = p->te; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
516 retval(2) = te; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
517 } |
5785 | 518 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
519 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
520 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
521 if (sz) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
522 retval(1) = lst.front ().e; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
523 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
524 retval(1) = Matrix(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
525 } |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
526 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
527 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
528 NDArray e (dim_vector(1, sz)); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
529 i = 0; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
530 for (const_iterator p = lst.begin(); p != lst.end(); p++) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
531 e(i++) = p->e; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
532 retval(1) = e; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
533 } |
5785 | 534 |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
535 if (once) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
536 { |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
537 if (sz) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
538 retval(0) = lst.front ().s; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
539 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
540 retval(0) = Matrix(); |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
541 } |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
542 else |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
543 { |
5785 | 544 NDArray s (dim_vector(1, sz)); |
545 i = 0; | |
546 for (const_iterator p = lst.begin(); p != lst.end(); p++) | |
547 s(i++) = p->s; | |
5582 | 548 retval(0) = s; |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
549 } |
5582 | 550 |
551 // Alter the order of the output arguments | |
552 if (nopts > 0) | |
553 { | |
554 int n = 0; | |
555 octave_value_list new_retval; | |
556 new_retval.resize(nargout); | |
557 | |
558 OCTAVE_LOCAL_BUFFER (int, arg_used, 6); | |
5785 | 559 for (int j = 0; j < 6; j++) |
560 arg_used[j] = false; | |
5582 | 561 |
5785 | 562 for (int j = 2; j < nargin; j++) |
5582 | 563 { |
564 int k = 0; | |
5785 | 565 std::string str = args(j).string_value(); |
5582 | 566 std::transform (str.begin (), str.end (), str.begin (), tolower); |
5779 | 567 if (str.find("once", 0) == 0 |
568 || str.find("stringanchors", 0) == 0 | |
569 || str.find("lineanchors", 0) == 0 | |
570 || str.find("matchcase", 0) == 0 | |
571 || str.find("ignorecase", 0) == 0 | |
572 || str.find("dotall", 0) == 0 | |
573 || str.find("dotexceptnewline", 0) == 0 | |
574 || str.find("literalspacing", 0) == 0 | |
575 || str.find("freespacing", 0) == 0 | |
576 ) | |
5582 | 577 continue; |
578 else if (str.find("start", 0) == 0) | |
579 k = 0; | |
580 else if (str.find("end", 0) == 0) | |
581 k = 1; | |
582 else if (str.find("tokenextents", 0) == 0) | |
583 k = 2; | |
584 else if (str.find("match", 0) == 0) | |
585 k = 3; | |
586 else if (str.find("tokens", 0) == 0) | |
587 k = 4; | |
588 else if (str.find("names", 0) == 0) | |
589 k = 5; | |
590 | |
591 new_retval(n++) = retval(k); | |
592 arg_used[k] = true; | |
593 | |
594 if (n == nargout) | |
595 break; | |
596 } | |
597 | |
598 // Fill in the rest of the arguments | |
599 if (n < nargout) | |
600 { | |
5785 | 601 for (int j = 0; j < 6; j++) |
5582 | 602 { |
5785 | 603 if (! arg_used[j]) |
604 new_retval(n++) = retval(j); | |
5582 | 605 } |
606 } | |
607 | |
608 retval = new_retval; | |
609 } | |
610 } | |
611 | |
612 return retval; | |
613 } | |
614 | |
6361 | 615 static octave_value_list |
616 octcellregexp (const octave_value_list &args, int nargout, const std::string &nm, | |
617 bool case_insensitive) | |
618 { | |
619 octave_value_list retval; | |
620 | |
621 if (args(0).is_cell()) | |
622 { | |
623 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); | |
624 octave_value_list new_args = args; | |
625 Cell cellstr = args(0).cell_value(); | |
626 if (args(1).is_cell()) | |
627 { | |
628 Cell cellpat = args(1).cell_value(); | |
629 | |
630 if (cellpat.numel() == 1) | |
631 { | |
632 for (int j = 0; j < nargout; j++) | |
633 newretval[j].resize(cellstr.dims()); | |
634 | |
635 new_args(1) = cellpat(0); | |
636 | |
637 for (octave_idx_type i = 0; i < cellstr.numel (); i++) | |
638 { | |
639 new_args(0) = cellstr(i); | |
640 octave_value_list tmp = octregexp (new_args, nargout, nm, | |
641 case_insensitive); | |
642 | |
643 if (error_state) | |
644 break; | |
645 | |
646 for (int j = 0; j < nargout; j++) | |
647 newretval[j](i) = tmp(j); | |
648 } | |
649 } | |
650 else if (cellstr.numel() == 1) | |
651 { | |
652 for (int j = 0; j < nargout; j++) | |
653 newretval[j].resize(cellpat.dims()); | |
654 | |
655 new_args(0) = cellstr(0); | |
656 | |
657 for (octave_idx_type i = 0; i < cellpat.numel (); i++) | |
658 { | |
659 new_args(1) = cellpat(i); | |
660 octave_value_list tmp = octregexp (new_args, nargout, nm, | |
661 case_insensitive); | |
662 | |
663 if (error_state) | |
664 break; | |
665 | |
666 for (int j = 0; j < nargout; j++) | |
667 newretval[j](i) = tmp(j); | |
668 } | |
669 } | |
670 else if (cellstr.numel() == cellpat.numel()) | |
671 { | |
672 | |
673 if (cellstr.dims() != cellpat.dims()) | |
674 error ("%s: Inconsistent cell array dimensions", nm.c_str()); | |
675 else | |
676 { | |
677 for (int j = 0; j < nargout; j++) | |
678 newretval[j].resize(cellstr.dims()); | |
679 | |
680 for (octave_idx_type i = 0; i < cellstr.numel (); i++) | |
681 { | |
682 new_args(0) = cellstr(i); | |
683 new_args(1) = cellpat(i); | |
684 | |
685 octave_value_list tmp = octregexp (new_args, nargout, nm, | |
686 case_insensitive); | |
687 | |
688 if (error_state) | |
689 break; | |
690 | |
691 for (int j = 0; j < nargout; j++) | |
692 newretval[j](i) = tmp(j); | |
693 } | |
694 } | |
695 } | |
696 else | |
697 error ("regexp: cell array arguments must be scalar or equal size"); | |
698 } | |
699 else | |
700 { | |
701 for (int j = 0; j < nargout; j++) | |
702 newretval[j].resize(cellstr.dims()); | |
703 | |
704 for (octave_idx_type i = 0; i < cellstr.numel (); i++) | |
705 { | |
706 new_args(0) = cellstr(i); | |
707 octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); | |
708 | |
709 if (error_state) | |
710 break; | |
711 | |
712 for (int j = 0; j < nargout; j++) | |
713 newretval[j](i) = tmp(j); | |
714 } | |
715 } | |
716 | |
717 if (!error_state) | |
718 for (int j = 0; j < nargout; j++) | |
719 retval(j) = octave_value (newretval[j]); | |
720 } | |
721 else if (args(1).is_cell()) | |
722 { | |
723 OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); | |
724 octave_value_list new_args = args; | |
725 Cell cellpat = args(1).cell_value(); | |
726 | |
727 for (int j = 0; j < nargout; j++) | |
728 newretval[j].resize(cellpat.dims()); | |
729 | |
730 for (octave_idx_type i = 0; i < cellpat.numel (); i++) | |
731 { | |
732 new_args(1) = cellpat(i); | |
733 octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); | |
734 | |
735 if (error_state) | |
736 break; | |
737 | |
738 for (int j = 0; j < nargout; j++) | |
739 newretval[j](i) = tmp(j); | |
740 } | |
741 | |
742 if (!error_state) | |
743 for (int j = 0; j < nargout; j++) | |
744 retval(j) = octave_value (newretval[j]); | |
745 } | |
746 else | |
747 retval = octregexp (args, nargout, nm, case_insensitive); | |
748 | |
749 return retval; | |
750 | |
751 } | |
752 | |
5582 | 753 DEFUN_DLD (regexp, args, nargout, |
754 "-*- texinfo -*-\n\ | |
755 @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexp (@var{str}, @var{pat})\n\ | |
756 @deftypefnx {Loadable Function} {[@dots{}] =} regexp (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ | |
757 \n\ | |
758 Regular expression string matching. Matches @var{pat} in @var{str} and\n\ | |
759 returns the position and matching substrings or empty values if there are\n\ | |
760 none.\n\ | |
761 \n\ | |
762 The matched pattern @var{pat} can include any of the standard regex\n\ | |
763 operators, including:\n\ | |
764 \n\ | |
765 @table @code\n\ | |
766 @item .\n\ | |
767 Match any character\n\ | |
768 @item * + ? @{@}\n\ | |
769 Repetition operators, representing\n\ | |
770 @table @code\n\ | |
771 @item *\n\ | |
772 Match zero or more times\n\ | |
773 @item +\n\ | |
774 Match one or more times\n\ | |
775 @item ?\n\ | |
776 Match zero or one times\n\ | |
777 @item @{@}\n\ | |
778 Match range operator, which is of the form @code{@{@var{n}@}} to match exactly\n\ | |
779 @var{n} times, @code{@{@var{m},@}} to match @var{m} or more times,\n\ | |
780 @code{@{@var{m},@var{n}@}} to match between @var{m} and @var{n} times.\n\ | |
781 @end table\n\ | |
782 @item [@dots{}] [^@dots{}]\n\ | |
783 List operators, where for example @code{[ab]c} matches @code{ac} and @code{bc}\n\ | |
784 @item ()\n\ | |
785 Grouping operator\n\ | |
786 @item |\n\ | |
787 Alternation operator. Match one of a choice of regular expressions. The\n\ | |
7001 | 788 alternatives must be delimited by the grouping operator @code{()} above\n\ |
5582 | 789 @item ^ $\n\ |
790 Anchoring operator. @code{^} matches the start of the string @var{str} and\n\ | |
791 @code{$} the end\n\ | |
792 @end table\n\ | |
793 \n\ | |
794 In addition the following escaped characters have special meaning. It should\n\ | |
795 be noted that it is recommended to quote @var{pat} in single quotes rather\n\ | |
796 than double quotes, to avoid the escape sequences being interpreted by octave\n\ | |
797 before being passed to @code{regexp}.\n\ | |
798 \n\ | |
799 @table @code\n\ | |
800 @item \\b\n\ | |
801 Match a word boundary\n\ | |
802 @item \\B\n\ | |
803 Match within a word\n\ | |
804 @item \\w\n\ | |
805 Matches any word character\n\ | |
806 @item \\W\n\ | |
807 Matches any non word character\n\ | |
808 @item \\<\n\ | |
809 Matches the beginning of a word\n\ | |
810 @item \\>\n\ | |
811 Matches the end of a word\n\ | |
812 @item \\s\n\ | |
813 Matches any whitespace character\n\ | |
814 @item \\S\n\ | |
815 Matches any non whitespace character\n\ | |
816 @item \\d\n\ | |
817 Matches any digit\n\ | |
818 @item \\D\n\ | |
819 Matches any non-digit\n\ | |
820 @end table\n\ | |
821 \n\ | |
822 The outputs of @code{regexp} by default are in the order as given below\n\ | |
823 \n\ | |
824 @table @asis\n\ | |
825 @item @var{s}\n\ | |
826 The start indices of each of the matching substrings\n\ | |
827 \n\ | |
828 @item @var{e}\n\ | |
829 The end indices of each matching substring\n\ | |
830 \n\ | |
831 @item @var{te}\n\ | |
832 The extents of each of the matched token surrounded by @code{(@dots{})} in\n\ | |
833 @var{pat}.\n\ | |
834 \n\ | |
835 @item @var{m}\n\ | |
836 A cell array of the text of each match.\n\ | |
837 \n\ | |
838 @item @var{t}\n\ | |
839 A cell array of the text of each token matched.\n\ | |
840 \n\ | |
841 @item @var{nm}\n\ | |
842 A structure containing the text of each matched named token, with the name\n\ | |
843 being used as the fieldname. A named token is denoted as\n\ | |
844 @code{(?<name>@dots{})}\n\ | |
845 @end table\n\ | |
846 \n\ | |
847 Particular output arguments or the order of the output arguments can be\n\ | |
848 selected by additional @var{opts} arguments. These are strings and the\n\ | |
849 correspondence between the output arguments and the optional argument\n\ | |
850 are\n\ | |
851 \n\ | |
852 @multitable @columnfractions 0.2 0.3 0.3 0.2\n\ | |
853 @item @tab 'start' @tab @var{s} @tab\n\ | |
854 @item @tab 'end' @tab @var{e} @tab\n\ | |
855 @item @tab 'tokenExtents' @tab @var{te} @tab\n\ | |
856 @item @tab 'match' @tab @var{m} @tab\n\ | |
857 @item @tab 'tokens' @tab @var{t} @tab\n\ | |
858 @item @tab 'names' @tab @var{nm} @tab\n\ | |
859 @end multitable\n\ | |
860 \n\ | |
861 A further optional argument is 'once', that limits the number of returned\n\ | |
5779 | 862 matches to the first match. Additional arguments are\n\ |
863 \n\ | |
864 @table @asis\n\ | |
865 @item matchcase\n\ | |
866 Make the matching case sensitive.\n\ | |
867 @item ignorecase\n\ | |
868 Make the matching case insensitive.\n\ | |
869 @item stringanchors\n\ | |
870 Match the anchor characters at the beginning and end of the string.\n\ | |
871 @item lineanchors\n\ | |
872 Match the anchor characters at the beginning and end of the line.\n\ | |
873 @item dotall\n\ | |
874 The character @code{.} matches the newline character.\n\ | |
875 @item dotexceptnewline\n\ | |
876 The character @code{.} matches all but the newline character.\n\ | |
877 @item freespacing\n\ | |
878 The pattern can include arbitrary whitespace and comments starting with\n\ | |
879 @code{#}.\n\ | |
880 @item literalspacing\n\ | |
881 The pattern is taken literally.\n\ | |
882 @end table\n\ | |
5582 | 883 @end deftypefn") |
884 { | |
6361 | 885 octave_value_list retval; |
886 int nargin = args.length(); | |
887 | |
888 if (nargin < 2) | |
889 print_usage (); | |
890 else if (args(0).is_cell() || args(1).is_cell()) | |
891 retval = octcellregexp (args, nargout, "regexp", false); | |
892 else | |
893 retval = octregexp (args, nargout, "regexp", false); | |
894 | |
895 return retval; | |
5582 | 896 } |
897 | |
898 /* | |
899 | |
900 ## seg-fault test | |
901 %!assert(regexp("abcde","."),[1,2,3,4,5]) | |
902 | |
903 ## Check that anchoring of pattern works correctly | |
904 %!assert(regexp('abcabc','^abc'),1); | |
905 %!assert(regexp('abcabc','abc$'),4); | |
5785 | 906 %!assert(regexp('abcabc','^abc$'),zeros(1,0)); |
5582 | 907 |
908 %!test | |
909 %! [s, e, te, m, t] = regexp(' No Match ', 'f(.*)uck'); | |
5785 | 910 %! assert (s,zeros(1,0)) |
911 %! assert (e,zeros(1,0)) | |
912 %! assert (te,cell(1,0)) | |
913 %! assert (m, cell(1,0)) | |
914 %! assert (t, cell(1,0)) | |
5582 | 915 |
916 %!test | |
917 %! [s, e, te, m, t] = regexp(' FiRetrUck ', 'f(.*)uck'); | |
5785 | 918 %! assert (s,zeros(1,0)) |
919 %! assert (e,zeros(1,0)) | |
920 %! assert (te,cell(1,0)) | |
921 %! assert (m, cell(1,0)) | |
922 %! assert (t, cell(1,0)) | |
5582 | 923 |
924 %!test | |
925 %! [s, e, te, m, t] = regexp(' firetruck ', 'f(.*)uck'); | |
926 %! assert (s,2) | |
927 %! assert (e,10) | |
928 %! assert (te{1},[3,7]) | |
929 %! assert (m{1}, 'firetruck') | |
930 %! assert (t{1}{1}, 'iretr') | |
931 | |
932 %!test | |
933 %! [s, e, te, m, t] = regexp('short test string','\w*r\w*'); | |
934 %! assert (s,[1,12]) | |
935 %! assert (e,[5,17]) | |
936 %! assert (size(te), [1,2]) | |
937 %! assert (isempty(te{1})) | |
938 %! assert (isempty(te{2})) | |
939 %! assert (m{1},'short') | |
940 %! assert (m{2},'string') | |
941 %! assert (size(t), [1,2]) | |
942 %! assert (isempty(t{1})) | |
943 %! assert (isempty(t{2})) | |
944 | |
945 %!test | |
946 %! [s, e, te, m, t] = regexp('short test string','\w*r\w*','once'); | |
947 %! assert (s,1) | |
948 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
949 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
950 %! assert (m,'short') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
951 %! assert (isempty(t)) |
5582 | 952 |
953 %!test | |
954 %! [m, te, e, s, t] = regexp('short test string','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); | |
955 %! assert (s,1) | |
956 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
957 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
958 %! assert (m,'short') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
959 %! assert (isempty(t)) |
5582 | 960 |
7242 | 961 %!testif HAVE_PCRE |
5582 | 962 %! ## This test is skipped if PCRE is not installed |
7242 | 963 %! [s, e, te, m, t, nm] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)'); |
964 %! assert (s,1) | |
965 %! assert (e,10) | |
966 %! assert (size(te), [1,1]) | |
967 %! assert (te{1}, [1 5; 7, 10]) | |
968 %! assert (m{1},'short test') | |
969 %! assert (size(t),[1,1]) | |
970 %! assert (t{1}{1},'short') | |
971 %! assert (t{1}{2},'test') | |
972 %! assert (size(nm), [1,1]) | |
973 %! assert (!isempty(fieldnames(nm))) | |
974 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
975 %! assert (nm.word1,'short') | |
976 %! assert (nm.word2,'test') | |
5582 | 977 |
7242 | 978 %!testif HAVE_PCRE |
5582 | 979 %! ## This test is skipped if PCRE is not installed |
7242 | 980 %! [nm, m, te, e, s, t] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); |
981 %! assert (s,1) | |
982 %! assert (e,10) | |
983 %! assert (size(te), [1,1]) | |
984 %! assert (te{1}, [1 5; 7, 10]) | |
985 %! assert (m{1},'short test') | |
986 %! assert (size(t),[1,1]) | |
987 %! assert (t{1}{1},'short') | |
988 %! assert (t{1}{2},'test') | |
989 %! assert (size(nm), [1,1]) | |
990 %! assert (!isempty(fieldnames(nm))) | |
991 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
992 %! assert (nm.word1,'short') | |
993 %! assert (nm.word2,'test') | |
5619 | 994 |
7242 | 995 %!testif HAVE_PCRE |
5619 | 996 %! ## This test is skipped if PCRE is not installed |
7242 | 997 %! [t, nm] = regexp("John Davis\nRogers, James",'(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)','tokens','names'); |
998 %! assert (size(t), [1,2]); | |
999 %! assert (t{1}{1},'John'); | |
1000 %! assert (t{1}{2},'Davis'); | |
1001 %! assert (t{2}{1},'Rogers'); | |
1002 %! assert (t{2}{2},'James'); | |
1003 %! assert (size(nm), [1,1]); | |
1004 %! assert (nm.first{1},'John'); | |
1005 %! assert (nm.first{2},'James'); | |
1006 %! assert (nm.last{1},'Davis'); | |
1007 %! assert (nm.last{2},'Rogers'); | |
5582 | 1008 |
5779 | 1009 %!assert(regexp("abc\nabc",'.'),[1:7]) |
1010 %!assert(regexp("abc\nabc",'.','dotall'),[1:7]) | |
7242 | 1011 %!testif HAVE_PCRE |
1012 %! assert(regexp("abc\nabc",'(?s).'),[1:7]) | |
1013 %! assert(regexp("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) | |
1014 %! assert(regexp("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) | |
5779 | 1015 |
1016 %!assert(regexp("caseCaSe",'case'),1) | |
1017 %!assert(regexp("caseCaSe",'case',"matchcase"),1) | |
1018 %!assert(regexp("caseCaSe",'case',"ignorecase"),[1,5]) | |
7242 | 1019 %!testif HAVE_PCRE |
1020 %! assert(regexp("caseCaSe",'(?-i)case'),1) | |
1021 %! assert(regexp("caseCaSe",'(?i)case'),[1,5]) | |
5779 | 1022 |
1023 %!assert (regexp("abc\nabc",'c$'),7) | |
1024 %!assert (regexp("abc\nabc",'c$',"stringanchors"),7) | |
7242 | 1025 %!testif HAVE_PCRE |
1026 %! assert (regexp("abc\nabc",'(?-m)c$'),7) | |
1027 %! assert (regexp("abc\nabc",'c$',"lineanchors"),[3,7]) | |
1028 %! assert (regexp("abc\nabc",'(?m)c$'),[3,7]) | |
5779 | 1029 |
1030 %!assert (regexp("this word",'s w'),4) | |
1031 %!assert (regexp("this word",'s w','literalspacing'),4) | |
7242 | 1032 %!testif HAVE_PCRE |
1033 %! assert (regexp("this word",'(?-x)s w','literalspacing'),4) | |
1034 %! assert (regexp("this word",'s w','freespacing'),zeros(1,0)) | |
1035 %! assert (regexp("this word",'(?x)s w'),zeros(1,0)) | |
5779 | 1036 |
5582 | 1037 %!error regexp('string', 'tri', 'BadArg'); |
1038 %!error regexp('string'); | |
1039 | |
6361 | 1040 %!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)}) |
1041 %!assert(regexp({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)}) | |
1042 %!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]}) | |
1043 %!assert(regexp('Strings',{'t','s'}),{2,7}) | |
1044 | |
5582 | 1045 */ |
1046 | |
6549 | 1047 DEFUN_DLD (regexpi, args, nargout, |
5582 | 1048 "-*- texinfo -*-\n\ |
1049 @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexpi (@var{str}, @var{pat})\n\ | |
1050 @deftypefnx {Loadable Function} {[@dots{}] =} regexpi (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ | |
1051 \n\ | |
1052 Case insensitive regular expression string matching. Matches @var{pat} in\n\ | |
1053 @var{str} and returns the position and matching substrings or empty values\n\ | |
1054 if there are none. See @code{regexp} for more details\n\ | |
1055 @end deftypefn") | |
1056 { | |
6361 | 1057 octave_value_list retval; |
1058 int nargin = args.length(); | |
1059 | |
1060 if (nargin < 2) | |
1061 print_usage (); | |
1062 else if (args(0).is_cell() || args(1).is_cell()) | |
1063 retval = octcellregexp (args, nargout, "regexpi", true); | |
1064 else | |
1065 retval = octregexp (args, nargout, "regexpi", true); | |
1066 | |
1067 return retval; | |
5582 | 1068 } |
1069 | |
1070 /* | |
1071 | |
1072 ## seg-fault test | |
1073 %!assert(regexpi("abcde","."),[1,2,3,4,5]) | |
1074 | |
1075 ## Check that anchoring of pattern works correctly | |
1076 %!assert(regexpi('abcabc','^abc'),1); | |
1077 %!assert(regexpi('abcabc','abc$'),4); | |
5785 | 1078 %!assert(regexpi('abcabc','^abc$'),zeros(1,0)); |
5582 | 1079 |
1080 %!test | |
1081 %! [s, e, te, m, t] = regexpi(' No Match ', 'f(.*)uck'); | |
5785 | 1082 %! assert (s,zeros(1,0)) |
1083 %! assert (e,zeros(1,0)) | |
1084 %! assert (te,cell(1,0)) | |
1085 %! assert (m, cell(1,0)) | |
1086 %! assert (t, cell(1,0)) | |
5582 | 1087 |
1088 %!test | |
1089 %! [s, e, te, m, t] = regexpi(' FiRetrUck ', 'f(.*)uck'); | |
1090 %! assert (s,2) | |
1091 %! assert (e,10) | |
1092 %! assert (te{1},[3,7]) | |
1093 %! assert (m{1}, 'FiRetrUck') | |
1094 %! assert (t{1}{1}, 'iRetr') | |
1095 | |
1096 %!test | |
1097 %! [s, e, te, m, t] = regexpi(' firetruck ', 'f(.*)uck'); | |
1098 %! assert (s,2) | |
1099 %! assert (e,10) | |
1100 %! assert (te{1},[3,7]) | |
1101 %! assert (m{1}, 'firetruck') | |
1102 %! assert (t{1}{1}, 'iretr') | |
1103 | |
1104 %!test | |
1105 %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*'); | |
1106 %! assert (s,[1,12]) | |
1107 %! assert (e,[5,17]) | |
1108 %! assert (size(te), [1,2]) | |
1109 %! assert (isempty(te{1})) | |
1110 %! assert (isempty(te{2})) | |
1111 %! assert (m{1},'ShoRt') | |
1112 %! assert (m{2},'String') | |
1113 %! assert (size(t), [1,2]) | |
1114 %! assert (isempty(t{1})) | |
1115 %! assert (isempty(t{2})) | |
1116 | |
1117 %!test | |
1118 %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*','once'); | |
1119 %! assert (s,1) | |
1120 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1121 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1122 %! assert (m,'ShoRt') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1123 %! assert (isempty(t)) |
5582 | 1124 |
1125 %!test | |
1126 %! [m, te, e, s, t] = regexpi('ShoRt Test String','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); | |
1127 %! assert (s,1) | |
1128 %! assert (e,5) | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1129 %! assert (isempty(te)) |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1130 %! assert (m,'ShoRt') |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1131 %! assert (isempty(t)) |
5582 | 1132 |
7242 | 1133 %!testif HAVE_PCRE |
5582 | 1134 %! ## This test is skipped if PCRE is not installed |
7242 | 1135 %! [s, e, te, m, t, nm] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)'); |
1136 %! assert (s,1) | |
1137 %! assert (e,10) | |
1138 %! assert (size(te), [1,1]) | |
1139 %! assert (te{1}, [1 5; 7, 10]) | |
1140 %! assert (m{1},'ShoRt Test') | |
1141 %! assert (size(t),[1,1]) | |
1142 %! assert (t{1}{1},'ShoRt') | |
1143 %! assert (t{1}{2},'Test') | |
1144 %! assert (size(nm), [1,1]) | |
1145 %! assert (!isempty(fieldnames(nm))) | |
1146 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
1147 %! assert (nm.word1,'ShoRt') | |
1148 %! assert (nm.word2,'Test') | |
5582 | 1149 |
7242 | 1150 %!testif HAVE_PCRE |
5582 | 1151 %! ## This test is skipped if PCRE is not installed |
7242 | 1152 %! [nm, m, te, e, s, t] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); |
1153 %! assert (s,1) | |
1154 %! assert (e,10) | |
1155 %! assert (size(te), [1,1]) | |
1156 %! assert (te{1}, [1 5; 7, 10]) | |
1157 %! assert (m{1},'ShoRt Test') | |
1158 %! assert (size(t),[1,1]) | |
1159 %! assert (t{1}{1},'ShoRt') | |
1160 %! assert (t{1}{2},'Test') | |
1161 %! assert (size(nm), [1,1]) | |
1162 %! assert (!isempty(fieldnames(nm))) | |
1163 %! assert (sort(fieldnames(nm)),{'word1';'word2'}) | |
1164 %! assert (nm.word1,'ShoRt') | |
1165 %! assert (nm.word2,'Test') | |
5582 | 1166 |
5779 | 1167 %!assert(regexpi("abc\nabc",'.'),[1:7]) |
1168 %!assert(regexpi("abc\nabc",'.','dotall'),[1:7]) | |
7242 | 1169 %!testif HAVE_PCRE |
1170 %! assert(regexpi("abc\nabc",'(?s).'),[1:7]) | |
1171 %! assert(regexpi("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) | |
1172 %! assert(regexpi("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) | |
5779 | 1173 |
1174 %!assert(regexpi("caseCaSe",'case'),[1,5]) | |
1175 %!assert(regexpi("caseCaSe",'case',"matchcase"),1) | |
1176 %!assert(regexpi("caseCaSe",'case',"ignorecase"),[1,5]) | |
7242 | 1177 %!testif HAVE_PCRE |
1178 %! assert(regexpi("caseCaSe",'(?-i)case'),1) | |
1179 %! assert(regexpi("caseCaSe",'(?i)case'),[1,5]) | |
5779 | 1180 |
1181 %!assert (regexpi("abc\nabc",'c$'),7) | |
1182 %!assert (regexpi("abc\nabc",'c$',"stringanchors"),7) | |
7242 | 1183 %!testif HAVE_PCRE |
1184 %! assert (regexpi("abc\nabc",'(?-m)c$'),7) | |
1185 %! assert (regexpi("abc\nabc",'c$',"lineanchors"),[3,7]) | |
1186 %! assert (regexpi("abc\nabc",'(?m)c$'),[3,7]) | |
5779 | 1187 |
1188 %!assert (regexpi("this word",'s w'),4) | |
1189 %!assert (regexpi("this word",'s w','literalspacing'),4) | |
7242 | 1190 %!testif HAVE_PCRE |
1191 %! assert (regexpi("this word",'(?-x)s w','literalspacing'),4) | |
1192 %! assert (regexpi("this word",'s w','freespacing'),zeros(1,0)) | |
1193 %! assert (regexpi("this word",'(?x)s w'),zeros(1,0)) | |
5779 | 1194 |
5582 | 1195 %!error regexpi('string', 'tri', 'BadArg'); |
1196 %!error regexpi('string'); | |
1197 | |
6361 | 1198 %!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)}) |
1199 %!assert(regexpi({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)}) | |
1200 %!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]}) | |
1201 %!assert(regexpi('Strings',{'t','s'}),{2,[1,7]}) | |
1202 | |
5582 | 1203 */ |
1204 | |
6361 | 1205 |
1206 static octave_value | |
1207 octregexprep (const octave_value_list &args, const std::string &nm) | |
5785 | 1208 { |
6361 | 1209 octave_value retval; |
5785 | 1210 int nargin = args.length(); |
1211 | |
1212 // Make sure we have string,pattern,replacement | |
1213 const std::string buffer = args(0).string_value (); | |
1214 if (error_state) return retval; | |
1215 const std::string pattern = args(1).string_value (); | |
1216 if (error_state) return retval; | |
1217 const std::string replacement = args(2).string_value (); | |
1218 if (error_state) return retval; | |
1219 | |
1220 // Pack options excluding 'tokenize' and various output | |
1221 // reordering strings into regexp arg list | |
1222 octave_value_list regexpargs(nargin-1,octave_value()); | |
1223 regexpargs(0) = args(0); | |
1224 regexpargs(1) = args(1); | |
1225 int len=2; | |
1226 for (int i = 3; i < nargin; i++) | |
1227 { | |
1228 const std::string opt = args(i).string_value(); | |
1229 if (opt != "tokenize" && opt != "start" && opt != "end" | |
1230 && opt != "tokenextents" && opt != "match" && opt != "tokens" | |
1231 && opt != "names" && opt != "warnings") | |
1232 { | |
1233 regexpargs(len++) = args(i); | |
1234 } | |
1235 } | |
1236 regexpargs.resize(len); | |
1237 | |
1238 // Identify replacement tokens; build a vector of group numbers in | |
1239 // the replacement string so that we can quickly calculate the size | |
1240 // of the replacement. | |
1241 int tokens = 0; | |
1242 for (size_t i=1; i < replacement.size(); i++) | |
1243 { | |
1244 if (replacement[i-1]=='$' && isdigit(replacement[i])) | |
1245 { | |
1246 tokens++, i++; | |
1247 } | |
1248 } | |
1249 std::vector<int> token(tokens); | |
1250 int kk = 0; | |
1251 for (size_t i = 1; i < replacement.size(); i++) | |
1252 { | |
1253 if (replacement[i-1]=='$' && isdigit(replacement[i])) | |
1254 { | |
1255 token[kk++] = replacement[i]-'0'; | |
1256 i++; | |
1257 } | |
1258 } | |
1259 | |
1260 // Perform replacement | |
1261 std::string rep; | |
1262 if (tokens > 0) | |
1263 { | |
1264 std::list<regexp_elem> lst; | |
1265 string_vector named; | |
1266 int nopts; | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1267 bool once; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1268 int sz = octregexp_list (regexpargs, nm , false, lst, named, nopts, once); |
5785 | 1269 |
1270 if (error_state) | |
1271 return retval; | |
1272 if (sz == 0) | |
1273 { | |
6361 | 1274 retval = args(0); |
5785 | 1275 return retval; |
1276 } | |
1277 | |
1278 // Determine replacement length | |
1279 const size_t replen = replacement.size() - 2*tokens; | |
1280 int delta = 0; | |
1281 const_iterator p = lst.begin(); | |
1282 for (int i = 0; i < sz; i++) | |
1283 { | |
1284 OCTAVE_QUIT; | |
1285 | |
1286 const Matrix pairs(p->te); | |
1287 size_t pairlen = 0; | |
1288 for (int j = 0; j < tokens; j++) | |
1289 { | |
1290 if (token[j] == 0) | |
1291 pairlen += static_cast<size_t>(p->e - p->s) + 1; | |
1292 else if (token[j] <= pairs.rows()) | |
1293 pairlen += static_cast<size_t>(pairs(token[j]-1,1) - | |
1294 pairs(token[j]-1,0)) + 1; | |
1295 } | |
1296 delta += static_cast<int>(replen + pairlen) - | |
1297 static_cast<int>(p->e - p->s + 1); | |
1298 p++; | |
1299 } | |
1300 | |
1301 // Build replacement string | |
1302 rep.reserve(buffer.size()+delta); | |
1303 size_t from = 0; | |
1304 p = lst.begin(); | |
1305 for (int i=0; i < sz; i++) | |
1306 { | |
1307 OCTAVE_QUIT; | |
1308 | |
1309 const Matrix pairs(p->te); | |
1310 rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); | |
1311 from = static_cast<size_t>(p->e - 1) + 1; | |
1312 for (size_t j = 1; j < replacement.size(); j++) | |
1313 { | |
1314 if (replacement[j-1]=='$' && isdigit(replacement[j])) | |
1315 { | |
1316 int k = replacement[j]-'0'; | |
1317 if (k == 0) | |
1318 { | |
1319 // replace with entire match | |
1320 rep.append(&buffer[static_cast<size_t>(p->e - 1)], | |
1321 static_cast<size_t>(p->e - p->s) + 1); | |
1322 } | |
1323 else if (k <= pairs.rows()) | |
1324 { | |
1325 // replace with group capture | |
1326 rep.append(&buffer[static_cast<size_t>(pairs(k-1,0)-1)], | |
1327 static_cast<size_t>(pairs(k-1,1) - | |
1328 pairs(k-1,0))+1); | |
1329 } | |
1330 else | |
1331 { | |
1332 // replace with nothing | |
1333 } | |
1334 j++; | |
1335 } | |
1336 else | |
1337 { | |
1338 rep.append(1,replacement[j-1]); | |
1339 } | |
1340 if (j+1 == replacement.size()) | |
1341 { | |
1342 rep.append(1,replacement[j]); | |
1343 } | |
1344 } | |
1345 p++; | |
1346 } | |
1347 rep.append(&buffer[from],buffer.size()-from); | |
1348 } | |
1349 else | |
1350 { | |
1351 std::list<regexp_elem> lst; | |
1352 string_vector named; | |
1353 int nopts; | |
7893
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1354 bool once; |
eb9ccb44ea41
make regexp(...,'once') matlab compatible
Jaroslav Hajek <highegg@gmail.com>
parents:
7520
diff
changeset
|
1355 int sz = octregexp_list (regexpargs, nm, false, lst, named, nopts, once); |
5785 | 1356 |
1357 if (error_state) | |
1358 return retval; | |
1359 if (sz == 0) | |
1360 { | |
6361 | 1361 retval = args(0); |
5785 | 1362 return retval; |
1363 } | |
1364 | |
1365 // Determine replacement length | |
1366 const size_t replen = replacement.size(); | |
1367 int delta = 0; | |
1368 const_iterator p = lst.begin(); | |
1369 for (int i = 0; i < sz; i++) | |
1370 { | |
1371 OCTAVE_QUIT; | |
1372 delta += static_cast<int>(replen) - | |
1373 static_cast<int>(p->e - p->s + 1); | |
1374 p++; | |
1375 } | |
1376 | |
1377 // Build replacement string | |
1378 rep.reserve(buffer.size()+delta); | |
1379 size_t from = 0; | |
1380 p = lst.begin(); | |
1381 for (int i=0; i < sz; i++) | |
1382 { | |
1383 OCTAVE_QUIT; | |
1384 rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); | |
1385 from = static_cast<size_t>(p->e - 1) + 1; | |
1386 rep.append(replacement); | |
1387 p++; | |
1388 } | |
1389 rep.append(&buffer[from],buffer.size()-from); | |
1390 } | |
1391 | |
6361 | 1392 retval = rep; |
1393 return retval; | |
1394 } | |
1395 | |
6549 | 1396 DEFUN_DLD (regexprep, args, , |
6361 | 1397 "-*- texinfo -*-\n\ |
6678 | 1398 @deftypefn {Loadable Function} {@var{string} =} regexprep (@var{string}, @var{pat}, @var{repstr}, @var{options})\n\ |
6361 | 1399 Replace matches of @var{pat} in @var{string} with @var{repstr}.\n\ |
1400 \n\ | |
1401 \n\ | |
7007 | 1402 The replacement can contain @code{$i}, which substitutes\n\ |
6361 | 1403 for the ith set of parentheses in the match string. E.g.,\n\ |
1404 @example\n\ | |
1405 \n\ | |
1406 regexprep(\"Bill Dunn\",'(\\w+) (\\w+)','$2, $1')\n\ | |
1407 \n\ | |
1408 @end example\n\ | |
1409 returns \"Dunn, Bill\"\n\ | |
1410 \n\ | |
1411 @var{options} may be zero or more of\n\ | |
1412 @table @samp\n\ | |
1413 \n\ | |
1414 @item once\n\ | |
7001 | 1415 Replace only the first occurrence of @var{pat} in the result.\n\ |
6361 | 1416 \n\ |
1417 @item warnings\n\ | |
1418 This option is present for compatibility but is ignored.\n\ | |
1419 \n\ | |
1420 @item ignorecase or matchcase\n\ | |
1421 Ignore case for the pattern matching (see @code{regexpi}).\n\ | |
1422 Alternatively, use (?i) or (?-i) in the pattern.\n\ | |
1423 \n\ | |
1424 @item lineanchors and stringanchors\n\ | |
1425 Whether characters ^ and $ match the beginning and ending of lines.\n\ | |
1426 Alternatively, use (?m) or (?-m) in the pattern.\n\ | |
1427 \n\ | |
1428 @item dotexceptnewline and dotall\n\ | |
1429 Whether . matches newlines in the string.\n\ | |
1430 Alternatively, use (?s) or (?-s) in the pattern.\n\ | |
1431 \n\ | |
1432 @item freespacing or literalspacing\n\ | |
1433 Whether whitespace and # comments can be used to make the regular expression more readable.\n\ | |
1434 Alternatively, use (?x) or (?-x) in the pattern.\n\ | |
1435 \n\ | |
1436 @end table\n\ | |
1437 @seealso{regexp,regexpi}\n\ | |
1438 @end deftypefn") | |
1439 { | |
1440 octave_value_list retval; | |
1441 int nargin = args.length(); | |
1442 | |
1443 if (nargin < 3) | |
1444 { | |
1445 print_usage (); | |
1446 return retval; | |
1447 } | |
1448 | |
1449 if (args(0).is_cell() || args(1).is_cell() || args(2).is_cell()) | |
1450 { | |
1451 Cell str; | |
1452 Cell pat; | |
1453 Cell rep; | |
6495 | 1454 dim_vector dv0; |
1455 dim_vector dv1(1,1); | |
6361 | 1456 |
1457 if (args(0).is_cell()) | |
1458 str = args(0).cell_value(); | |
1459 else | |
1460 str = Cell (args(0)); | |
1461 | |
1462 if (args(1).is_cell()) | |
1463 pat = args(1).cell_value(); | |
1464 else | |
1465 pat = Cell (args(1)); | |
1466 | |
1467 if (args(2).is_cell()) | |
1468 rep = args(2).cell_value(); | |
1469 else | |
1470 rep = Cell (args(2)); | |
1471 | |
6495 | 1472 dv0 = str.dims(); |
1473 if (pat.numel() != 1) | |
6361 | 1474 { |
6495 | 1475 dv1 = pat.dims(); |
1476 if (rep.numel() != 1 && dv1 != rep.dims()) | |
6361 | 1477 error ("regexprep: Inconsistent cell array dimensions"); |
1478 } | |
1479 else if (rep.numel() != 1) | |
6495 | 1480 dv1 = rep.dims(); |
6361 | 1481 |
1482 if (!error_state) | |
1483 { | |
6495 | 1484 Cell ret (dv0); |
6361 | 1485 octave_value_list new_args = args; |
1486 | |
6495 | 1487 for (octave_idx_type i = 0; i < dv0.numel(); i++) |
1488 { | |
1489 new_args(0) = str(i); | |
1490 if (pat.numel() == 1) | |
1491 new_args(1) = pat(0); | |
1492 if (rep.numel() == 1) | |
1493 new_args(2) = rep(0); | |
1494 for (octave_idx_type j = 0; j < dv1.numel(); j++) | |
1495 { | |
1496 if (pat.numel() != 1) | |
1497 new_args(1) = pat(j); | |
1498 if (rep.numel() != 1) | |
1499 new_args(2) = rep(j); | |
1500 new_args(0) = octregexprep (new_args, "regexprep"); | |
6361 | 1501 |
6495 | 1502 if (error_state) |
1503 break; | |
1504 } | |
6361 | 1505 |
1506 if (error_state) | |
1507 break; | |
6495 | 1508 |
1509 ret(i) = new_args(0); | |
6361 | 1510 } |
1511 | |
1512 if (!error_state) | |
1513 retval = octave_value (ret); | |
1514 } | |
1515 } | |
1516 else | |
1517 retval = octregexprep (args, "regexprep"); | |
1518 | |
5785 | 1519 return retval; |
1520 } | |
1521 | |
1522 /* | |
1523 %!test # Replace with empty | |
1524 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; | |
1525 %! t = regexprep(xml,'<[!?][^>]*>',''); | |
1526 %! assert(t,' <tag v="hello">some stuff</tag>') | |
1527 | |
1528 %!test # Replace with non-empty | |
1529 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; | |
1530 %! t = regexprep(xml,'<[!?][^>]*>','?'); | |
1531 %! assert(t,'? <tag v="hello">some stuff?</tag>') | |
1532 | |
1533 %!test # Check that 'tokenize' is ignored | |
1534 %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; | |
1535 %! t = regexprep(xml,'<[!?][^>]*>','','tokenize'); | |
1536 %! assert(t,' <tag v="hello">some stuff</tag>') | |
1537 | |
7242 | 1538 %!testif HAVE_PCRE # Capture replacement |
1539 %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins"; | |
1540 %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam"; | |
1541 %! t = regexprep(data,'(?m)^(\w+)\s+(\w+)$','$2, $1'); | |
1542 %! assert(t,result) | |
5785 | 1543 |
1544 # Return the original if no match | |
1545 %!assert(regexprep('hello','world','earth'),'hello') | |
1546 | |
1547 ## Test a general replacement | |
1548 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g"); | |
1549 | |
1550 ## Make sure it works at the beginning and end | |
1551 %!assert(regexprep("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g"); | |
1552 %!assert(regexprep("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_"); | |
1553 | |
1554 ## Options | |
1555 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"), "a_b]c{d}e-f=g"); | |
1556 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"), "a_b_c_d_e_f_g"); | |
1557 | |
1558 ## Option combinations | |
1559 %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"), "a_b]c{d}e-f=g"); | |
1560 | |
1561 ## End conditions on replacement | |
1562 %!assert(regexprep("abc","(b)",".$1"),"a.bc"); | |
1563 %!assert(regexprep("abc","(b)","$1"),"abc"); | |
1564 %!assert(regexprep("abc","(b)","$1."),"ab.c"); | |
1565 %!assert(regexprep("abc","(b)","$1.."),"ab..c"); | |
1566 | |
6361 | 1567 ## Test cell array arguments |
6503 | 1568 %!assert(regexprep("abc",{"b","a"},"?"),{"??c"}) |
6361 | 1569 %!assert(regexprep({"abc","cba"},"b","?"),{"a?c","c?a"}) |
6503 | 1570 %!assert(regexprep({"abc","cba"},{"b","a"},{"?","!"}),{"!?c","c?!"}) |
6361 | 1571 |
5785 | 1572 */ |
1573 | |
5582 | 1574 /* |
1575 ;;; Local Variables: *** | |
1576 ;;; mode: C++ *** | |
1577 ;;; End: *** | |
1578 */ |