Mercurial > octave
changeset 29823:7917b91a3c58
__unicode_length__: New function to get number of Unicode code points (bug #50855).
* libinterp/corefcn/strfns.cc (F__unicode_length__): Add new function that
returns the number of Unicode code points in a UTF-8 encoded character array
or cell string.
* liboctave/wrappers/unistr-wrappers.c, unistr-wrappers.h
(octave_u8_mbsnlen_wrapper): Add wrapper for u8_mbsnlen.
* bootstrap.conf: Add unistr/u8-mbsnlen to list of used gnulib modules.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Fri, 30 Apr 2021 20:43:30 +0200 |
parents | 0923ae48a4f7 |
children | a48151f59b69 |
files | bootstrap.conf libinterp/corefcn/strfns.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h |
diffstat | 4 files changed, 82 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/bootstrap.conf Fri Jun 25 15:39:48 2021 -0400
+++ b/bootstrap.conf Fri Apr 30 20:43:30 2021 +0200
@@ -123,6 +123,7 @@
   unistr/u16-to-u8
   unistr/u32-to-u8
   unistr/u8-check
+  unistr/u8-mbsnlen
   unistr/u8-strmblen
   unistr/u8-strmbtouc
   unistr/u8-to-u16
--- a/libinterp/corefcn/strfns.cc Fri Jun 25 15:39:48 2021 -0400
+++ b/libinterp/corefcn/strfns.cc Fri Apr 30 20:43:30 2021 +0200
@@ -1018,17 +1018,87 @@
       if (mblen < 1)
         mblen = 1;
       for (octave_idx_type j = 0; j < mblen; j++)
-        idx (i+j) = u8_char_num;
+        idx(i+j) = u8_char_num;
       i += mblen;
     }

-  return ovl(str.ndims () > 1 ? idx.permute (p, true) : idx);
+  return ovl (str.ndims () > 1 ? idx.permute (p, true) : idx);
 }

 /*
 %!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6])
 */

+DEFUN (__unicode_length__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{len} =} __unicode_length__ (@var{str})
+Return number of Unicode code points in @var{str}.
+
+The input @var{str} must be a UTF-8 encoded character vector or cell string.
+
+@example
+@group
+length ("aäbc")
+     @result{} 5
+__unicode_length__ ("aäbc")
+     @result{} 4
+@end group
+@end example
+
+@end deftypefn */)
+{
+  if (args.length () != 1)
+    print_usage ();
+
+  bool arg_char = args(0).is_char_matrix ();
+
+  if (! arg_char && ! args(0).iscellstr ())
+    error ("STR must be a character array or cell string.");
+
+  octave_value_list retval;
+
+  if (arg_char)
+    {
+      charNDArray str = args(0).char_array_value ();
+      Array<octave_idx_type> p (dim_vector (str.ndims (), 1));
+      if (str.ndims () > 1)
+        {
+          for (octave_idx_type i=0; i < str.ndims (); i++)
+            p(i) = i;
+          p(0) = 1;
+          p(1) = 0;
+          str = str.permute (p);
+        }
+
+      const uint8_t *src = reinterpret_cast<const uint8_t *> (str.data ());
+      octave_idx_type mbsnlen = octave_u8_mbsnlen_wrapper (src, str.numel ());
+
+      retval = ovl (mbsnlen);
+    }
+  else
+    {
+      const Array<std::string> cellstr = args(0).cellstr_value ();
+      NDArray output (args(0).dims (), false);
+      for (octave_idx_type i = 0; i < cellstr.numel (); i++)
+        {
+          const uint8_t *src
+            = reinterpret_cast<const uint8_t *> (cellstr(i).c_str ());
+          output(i) = octave_u8_mbsnlen_wrapper (src, cellstr(i).size ());
+        }
+
+      retval = ovl (output);
+    }
+
+  return retval;
+}
+
+/*
+%!assert (__unicode_length__ (""), 0)
+%!assert (__unicode_length__ ("aäbc"), 4)
+%!assert (__unicode_length__ (["aä"; "öo"]), 4)
+%!assert (__unicode_length__ ({"aäbc", "abc"}), [4, 3])
+*/
+
 DEFUN (__u8_validate__, args, ,
        doc: /* -*- texinfo -*-
 @deftypefn {} {@var{out_str} =} __u8_validate__ (in_str, mode)
--- a/liboctave/wrappers/unistr-wrappers.c Fri Jun 25 15:39:48 2021 -0400
+++ b/liboctave/wrappers/unistr-wrappers.c Fri Apr 30 20:43:30 2021 +0200
@@ -43,6 +43,12 @@
   return u8_strmblen (src);
 }

+size_t
+octave_u8_mbsnlen_wrapper (const uint8_t *src, size_t n)
+{
+  return u8_mbsnlen (src, n);
+}
+
 int
 octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src)
 {
--- a/liboctave/wrappers/unistr-wrappers.h Fri Jun 25 15:39:48 2021 -0400
+++ b/liboctave/wrappers/unistr-wrappers.h Fri Apr 30 20:43:30 2021 +0200
@@ -35,6 +35,9 @@
 extern OCTAVE_API int
 octave_u8_strmblen_wrapper (const uint8_t *src);

+extern OCTAVE_API size_t
+octave_u8_mbsnlen_wrapper (const uint8_t *src, size_t n);
+
 extern OCTAVE_API int
 octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src);
