changeset 29823:7917b91a3c58

__unicode_length__: New function to get number of Unicode code points (bug #50855). * libinterp/corefcn/strfns.cc (F__unicode_length__): Add new function that returns the number of Unicode code points in a UTF-8 encoded character array or cell string. * liboctave/wrappers/unistr-wrappers.c, unistr-wrappers.h (octave_u8_mbsnlen_wrapper): Add wrapper for u8_mbsnlen. * bootstrap.conf: Add unistr/u8-mbsnlen to list of used gnulib modules.
author Markus Mützel <markus.muetzel@gmx.de>
date Fri, 30 Apr 2021 20:43:30 +0200
parents 0923ae48a4f7
children a48151f59b69
files bootstrap.conf libinterp/corefcn/strfns.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h
diffstat 4 files changed, 82 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bootstrap.conf	Fri Jun 25 15:39:48 2021 -0400
+++ b/bootstrap.conf	Fri Apr 30 20:43:30 2021 +0200
@@ -123,6 +123,7 @@
   unistr/u16-to-u8
   unistr/u32-to-u8
   unistr/u8-check
+  unistr/u8-mbsnlen
   unistr/u8-strmblen
   unistr/u8-strmbtouc
   unistr/u8-to-u16
--- a/libinterp/corefcn/strfns.cc	Fri Jun 25 15:39:48 2021 -0400
+++ b/libinterp/corefcn/strfns.cc	Fri Apr 30 20:43:30 2021 +0200
@@ -1018,17 +1018,87 @@
       if (mblen < 1)
         mblen = 1;
       for (octave_idx_type j = 0; j < mblen; j++)
-        idx (i+j) = u8_char_num;
+        idx(i+j) = u8_char_num;
       i += mblen;
     }
 
-  return ovl(str.ndims () > 1 ? idx.permute (p, true) : idx);
+  return ovl (str.ndims () > 1 ? idx.permute (p, true) : idx);
 }
 
 /*
 %!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6])
 */
 
+DEFUN (__unicode_length__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{len} =} __unicode_length__ (@var{str})
+Return the number of Unicode code points in @var{str}.
+
+The input @var{str} must be a UTF-8 encoded character array or cell string;
+a cell string yields one count per element, with the same size as @var{str}.
+@example
+@group
+length ("aäbc")
+     @result{} 5
+__unicode_length__ ("aäbc")
+     @result{} 4
+@end group
+@end example
+
+@end deftypefn */)
+{
+  if (args.length () != 1)
+    print_usage ();
+
+  bool arg_char = args(0).is_char_matrix ();  // selects the char-array branch below
+
+  if (! arg_char && ! args(0).iscellstr ())
+    error ("STR must be a character array or cell string.");
+
+  octave_value_list retval;
+
+  if (arg_char)
+    {
+      charNDArray str = args(0).char_array_value ();
+      Array<octave_idx_type> p (dim_vector (str.ndims (), 1));  // permutation vector
+      if (str.ndims () > 1)
+        {
+          for (octave_idx_type i=0; i < str.ndims (); i++)
+            p(i) = i;  // start from the identity permutation
+          p(0) = 1;  // then swap the first two dimensions
+          p(1) = 0;
+          str = str.permute (p);  // presumably makes each row's bytes contiguous -- TODO confirm
+        }
+
+      const uint8_t *src = reinterpret_cast<const uint8_t *> (str.data ());
+      octave_idx_type mbsnlen = octave_u8_mbsnlen_wrapper (src, str.numel ());  // single count over all elements
+
+      retval = ovl (mbsnlen);
+    }
+  else
+    {
+      const Array<std::string> cellstr = args(0).cellstr_value ();
+      NDArray output (args(0).dims (), false);  // result has the dimensions of the input
+      for (octave_idx_type i = 0; i < cellstr.numel (); i++)
+        {
+          const uint8_t *src 
+            = reinterpret_cast<const uint8_t *> (cellstr(i).c_str ());
+          output(i) = octave_u8_mbsnlen_wrapper (src, cellstr(i).size ());  // count for element i
+        }
+
+      retval = ovl (output);
+    }
+
+  return retval;
+}
+
+/*
+%!assert (__unicode_length__ (""), 0)
+%!assert (__unicode_length__ ("aäbc"), 4)
+%!assert (__unicode_length__ (["aä"; "öo"]), 4)
+%!assert (__unicode_length__ ({"aäbc", "abc"}), [4, 3])
+*/
+
 DEFUN (__u8_validate__, args, ,
        doc: /* -*- texinfo -*-
 @deftypefn {} {@var{out_str} =} __u8_validate__ (in_str, mode)
--- a/liboctave/wrappers/unistr-wrappers.c	Fri Jun 25 15:39:48 2021 -0400
+++ b/liboctave/wrappers/unistr-wrappers.c	Fri Apr 30 20:43:30 2021 +0200
@@ -43,6 +43,12 @@
   return u8_strmblen (src);
 }
 
+size_t
+octave_u8_mbsnlen_wrapper (const uint8_t *src, size_t n)
+{
+  return u8_mbsnlen (src, n);  // thin pass-through: SRC and N are forwarded unchanged
+}
+
 int
 octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src)
 {
--- a/liboctave/wrappers/unistr-wrappers.h	Fri Jun 25 15:39:48 2021 -0400
+++ b/liboctave/wrappers/unistr-wrappers.h	Fri Apr 30 20:43:30 2021 +0200
@@ -35,6 +35,9 @@
 
 extern OCTAVE_API int octave_u8_strmblen_wrapper (const uint8_t *src);
 
+extern OCTAVE_API size_t
+octave_u8_mbsnlen_wrapper (const uint8_t *src, size_t n);  // wrapper for u8_mbsnlen
+
 extern OCTAVE_API int
 octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src);