Mercurial > octave
changeset 25412:922a93fc73ec
Add function to index characters in UTF-8 encoded strings.
* strfns.cc (Funicode_idx): Add new function.
* unistr-wrappers.(c/h): Add wrapper for "u8_strmblen".
* strings.txi: UTF-8 as expected encoding. Add doc string for "unicode_idx".
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Sun, 27 May 2018 19:20:47 +0200 |
parents | ad3d018e595f |
children | 39cf8145405f |
files | doc/interpreter/strings.txi libinterp/corefcn/strfns.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h |
diffstat | 4 files changed, 69 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/doc/interpreter/strings.txi Sat May 26 20:50:46 2018 +0200 +++ b/doc/interpreter/strings.txi Sun May 27 19:20:47 2018 +0200 @@ -53,6 +53,9 @@ produces the string whose contents are @samp{foobarbaz}. @xref{Numeric Data Types}, for more information about creating matrices. +While strings can in principle store arbitrary content, most functions expect +them to be UTF-8 encoded Unicode strings. + @menu * Escape Sequences in String Constants:: * Character Arrays:: @@ -468,6 +471,8 @@ @DOCSTRING(untabify) +@DOCSTRING(unicode_idx) + @node String Conversions @section String Conversions
--- a/libinterp/corefcn/strfns.cc Sat May 26 20:50:46 2018 +0200 +++ b/libinterp/corefcn/strfns.cc Sun May 27 19:20:47 2018 +0200 @@ -32,6 +32,7 @@ #include "dMatrix.h" #include "localcharset-wrapper.h" #include "uniconv-wrappers.h" +#include "unistr-wrappers.h" #include "Cell.h" #include "defun.h" @@ -828,6 +829,60 @@ return ovl (retval); } +DEFUN (unicode_idx, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{idx} =} unicode_idx (@var{str}) +Return an array with the indices for each UTF-8 encoded character in @var{str}. + +@example +@group +unicode_idx ("aäbc") + @result{} [1, 2, 2, 3, 4] +@end group +@end example + +@end deftypefn */) +{ + int nargin = args.length (); + + if (nargin != 1) + print_usage (); + + charNDArray str = args(0).xchar_array_value ("STR must be a string"); + Array<octave_idx_type> p (dim_vector (str.ndims (), 1)); + charNDArray str_p; + if (str.ndims () > 1) + { + for (octave_idx_type i=0; i < str.ndims (); i++) + p(i) = i; + p(0) = 1; + p(1) = 0; + str_p = str.permute (p); + } + + const uint8_t *src = reinterpret_cast<const uint8_t *> (str_p.data ()); + octave_idx_type srclen = str.numel (); + + NDArray idx (str_p.dims ()); + + octave_idx_type u8_char_num = 1; + for (octave_idx_type i = 0; i < srclen; u8_char_num++) + { + int mblen = octave_u8_strmblen_wrapper (src + i); + if (mblen < 1) + mblen = 1; + for (octave_idx_type j = 0; j < mblen; j++) + idx (i+j) = u8_char_num; + i += mblen; + } + + return ovl(str.ndims () > 1 ? idx.permute (p, true) : idx); +} + +/* +%!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6]); +*/ + DEFUN (list_in_columns, args, , doc: /* -*- texinfo -*- @deftypefn {} {} list_in_columns (@var{arg}, @var{width}, @var{prefix})
--- a/liboctave/wrappers/unistr-wrappers.c Sat May 26 20:50:46 2018 +0200 +++ b/liboctave/wrappers/unistr-wrappers.c Sun May 27 19:20:47 2018 +0200 @@ -28,6 +28,12 @@ #include "unistr-wrappers.h" +int +octave_u8_strmblen_wrapper (const uint8_t *src) +{ + return u8_strmblen (src); +} + uint32_t * octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len, uint32_t *result_buf, size_t *lengthp)
--- a/liboctave/wrappers/unistr-wrappers.h Sat May 26 20:50:46 2018 +0200 +++ b/liboctave/wrappers/unistr-wrappers.h Sun May 27 19:20:47 2018 +0200 @@ -27,6 +27,9 @@ extern "C" { #endif +extern int +octave_u8_strmblen_wrapper (const uint8_t *src); + extern uint32_t * octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len, uint32_t *result_buf, size_t *lengthp);