Mercurial > octave
diff libinterp/corefcn/strfns.cc @ 25412:922a93fc73ec
Add function to index characters in UTF-8 encoded strings.
* strfns.cc (Funicode_idx): Add new function.
* unistr-wrappers.(cc/h): Add wrapper for "u8_strmblen".
* strings.txi: UTF-8 as expected encoding. Add doc string for "unicode_idx".
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Sun, 27 May 2018 19:20:47 +0200 |
parents | 6652d3823428 |
children | 996d78102a71 |
line wrap: on
line diff
--- a/libinterp/corefcn/strfns.cc Sat May 26 20:50:46 2018 +0200 +++ b/libinterp/corefcn/strfns.cc Sun May 27 19:20:47 2018 +0200 @@ -32,6 +32,7 @@ #include "dMatrix.h" #include "localcharset-wrapper.h" #include "uniconv-wrappers.h" +#include "unistr-wrappers.h" #include "Cell.h" #include "defun.h" @@ -828,6 +829,60 @@ return ovl (retval); } +DEFUN (unicode_idx, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{idx} =} unicode_idx (@var{str}) +Return an array with the indices for each UTF-8 encoded character in @var{str}. + +@example +@group +unicode_idx ("aäbc") + @result{} [1, 2, 2, 3, 4] +@end group +@end example + +@end deftypefn */) +{ + int nargin = args.length (); + + if (nargin != 1) + print_usage (); + + charNDArray str = args(0).xchar_array_value ("STR must be a string"); + Array<octave_idx_type> p (dim_vector (str.ndims (), 1)); + charNDArray str_p; + if (str.ndims () > 1) + { + for (octave_idx_type i=0; i < str.ndims (); i++) + p(i) = i; + p(0) = 1; + p(1) = 0; + str_p = str.permute (p); + } + + const uint8_t *src = reinterpret_cast<const uint8_t *> (str_p.data ()); + octave_idx_type srclen = str.numel (); + + NDArray idx (str_p.dims ()); + + octave_idx_type u8_char_num = 1; + for (octave_idx_type i = 0; i < srclen; u8_char_num++) + { + int mblen = octave_u8_strmblen_wrapper (src + i); + if (mblen < 1) + mblen = 1; + for (octave_idx_type j = 0; j < mblen; j++) + idx (i+j) = u8_char_num; + i += mblen; + } + + return ovl(str.ndims () > 1 ? idx.permute (p, true) : idx); +} + +/* +%!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6]); +*/ + DEFUN (list_in_columns, args, , doc: /* -*- texinfo -*- @deftypefn {} {} list_in_columns (@var{arg}, @var{width}, @var{prefix})