changeset 25412:922a93fc73ec

Add function to index characters in UTF-8 encoded strings. * strfns.cc (Funicode_idx): Add new function. * unistr-wrappers.(cc/h): Add wrapper for "u8_strmblen". * strings.txi: UTF-8 as expected encoding. Add doc string for "unicode_idx".
author Markus Mützel <markus.muetzel@gmx.de>
date Sun, 27 May 2018 19:20:47 +0200
parents ad3d018e595f
children 39cf8145405f
files doc/interpreter/strings.txi libinterp/corefcn/strfns.cc liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h
diffstat 4 files changed, 69 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/doc/interpreter/strings.txi	Sat May 26 20:50:46 2018 +0200
+++ b/doc/interpreter/strings.txi	Sun May 27 19:20:47 2018 +0200
@@ -53,6 +53,9 @@
 produces the string whose contents are @samp{foobarbaz}.  @xref{Numeric Data
 Types}, for more information about creating matrices.
 
+While strings can in principle store arbitrary content, most functions expect
+them to be UTF-8 encoded Unicode strings.
+
 @menu
 * Escape Sequences in String Constants::
 * Character Arrays::
@@ -468,6 +471,8 @@
 
 @DOCSTRING(untabify)
 
+@DOCSTRING(unicode_idx)
+
 @node String Conversions
 @section String Conversions
 
--- a/libinterp/corefcn/strfns.cc	Sat May 26 20:50:46 2018 +0200
+++ b/libinterp/corefcn/strfns.cc	Sun May 27 19:20:47 2018 +0200
@@ -32,6 +32,7 @@
 #include "dMatrix.h"
 #include "localcharset-wrapper.h"
 #include "uniconv-wrappers.h"
+#include "unistr-wrappers.h"
 
 #include "Cell.h"
 #include "defun.h"
@@ -828,6 +829,60 @@
   return ovl (retval);
 }
 
+DEFUN (unicode_idx, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{idx} =} unicode_idx (@var{str})
+Return an array with the indices for each UTF-8 encoded character in @var{str}.
+
+@example
+@group
+unicode_idx ("aäbc")
+     @result{} [1, 2, 2, 3, 4]
+@end group
+@end example
+
+@end deftypefn */)
+{
+  int nargin = args.length ();
+
+  if (nargin != 1)
+    print_usage ();
+
+  charNDArray str = args(0).xchar_array_value ("STR must be a string");
+
+  // Permutation vector that swaps the first two dimensions, so that the
+  // bytes of one row end up adjacent in the linear buffer scanned below.
+  Array<octave_idx_type> p (dim_vector (str.ndims (), 1));
+
+  // Start from the unpermuted array so that the data pointer, the element
+  // count and the result dimensions always describe the same buffer, even
+  // if the permutation branch is not taken.
+  charNDArray str_p = str;
+  if (str.ndims () > 1)
+    {
+      for (octave_idx_type i = 0; i < str.ndims (); i++)
+        p(i) = i;
+      p(0) = 1;
+      p(1) = 0;
+      str_p = str.permute (p);
+    }
+
+  const uint8_t *src = reinterpret_cast<const uint8_t *> (str_p.data ());
+  octave_idx_type srclen = str_p.numel ();
+
+  // One output element per input byte, laid out like the permuted input.
+  NDArray idx (str_p.dims ());
+
+  // Running 1-based number of the character that the current byte belongs to.
+  octave_idx_type u8_char_num = 1;
+  for (octave_idx_type i = 0; i < srclen; u8_char_num++)
+    {
+      // NOTE(review): u8_strmblen is documented for NUL-terminated input,
+      // while SRC is a raw character buffer -- confirm that inspecting the
+      // bytes following a lead byte at the very end of the buffer is safe.
+      int mblen = octave_u8_strmblen_wrapper (src + i);
+
+      // A NUL byte (0) or an invalid sequence (negative value) is counted
+      // as a single one-byte character so that the scan always advances.
+      if (mblen < 1)
+        mblen = 1;
+
+      // Never write past the end of IDX: a lead byte close to the end of
+      // the buffer may announce more bytes than are actually left.
+      const octave_idx_type remaining = srclen - i;
+      if (mblen > remaining)
+        mblen = static_cast<int> (remaining);
+
+      for (octave_idx_type j = 0; j < mblen; j++)
+        idx(i+j) = u8_char_num;
+
+      i += mblen;
+    }
+
+  // Undo the permutation so the result has the same shape as STR.
+  return ovl (str.ndims () > 1 ? idx.permute (p, true) : idx);
+}
+
+/*
+%!assert (unicode_idx (["aäou"; "Ä∞"]), [1 2 2 3 4; 5 5 6 6 6]);
+*/
+
 DEFUN (list_in_columns, args, ,
        doc: /* -*- texinfo -*-
 @deftypefn {} {} list_in_columns (@var{arg}, @var{width}, @var{prefix})
--- a/liboctave/wrappers/unistr-wrappers.c	Sat May 26 20:50:46 2018 +0200
+++ b/liboctave/wrappers/unistr-wrappers.c	Sun May 27 19:20:47 2018 +0200
@@ -28,6 +28,12 @@
 
 #include "unistr-wrappers.h"
 
+/* Forward SRC to u8_strmblen and hand its result back unchanged.  */
+int
+octave_u8_strmblen_wrapper (const uint8_t *src)
+{
+  const int char_len = u8_strmblen (src);
+
+  return char_len;
+}
+
 uint32_t *
 octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len,
                           uint32_t *result_buf, size_t *lengthp)
--- a/liboctave/wrappers/unistr-wrappers.h	Sat May 26 20:50:46 2018 +0200
+++ b/liboctave/wrappers/unistr-wrappers.h	Sun May 27 19:20:47 2018 +0200
@@ -27,6 +27,9 @@
 extern "C" {
 #endif
 
+/* Wrapper around u8_strmblen; SRC is passed through and the result of
+   u8_strmblen is returned as is.  */
+extern int
+octave_u8_strmblen_wrapper (const uint8_t *src);
+
 extern uint32_t *
 octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len,
                           uint32_t *result_buf, size_t *lengthp);