changeset 25415:d4bc8590b5cf

Make "is*" string functions Unicode aware. * ov-ch-mat.cc (map): Use UTF-8 aware functions for "is*" string functions. * mappers.cc: Add tests for "is*" string functions. * unistr-wrappers.[c/h]: Add wrapper for "u8_strmbtouc". * unictype-wrappers.[c/h]: Add wrappers for UTF-8 aware C-like "is*" functions. * module.mk: Add new files. * bootstrap.conf: Add new modules.
author Markus Mützel <markus.muetzel@gmx.de>
date Sun, 27 May 2018 22:13:24 +0200
parents 8fae933e7228
children a741730fca5e
files bootstrap.conf libinterp/corefcn/mappers.cc libinterp/octave-value/ov-ch-mat.cc liboctave/wrappers/module.mk liboctave/wrappers/unictype-wrappers.c liboctave/wrappers/unictype-wrappers.h liboctave/wrappers/unistr-wrappers.c liboctave/wrappers/unistr-wrappers.h
diffstat 8 files changed, 258 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/bootstrap.conf	Tue May 29 09:08:25 2018 +0200
+++ b/bootstrap.conf	Sun May 27 22:13:24 2018 +0200
@@ -95,8 +95,21 @@
   unicase/u8-toupper
   uniconv/u8-conv-from-enc
   uniconv/u8-conv-to-enc
+  unictype/ctype-alnum
+  unictype/ctype-alpha
+  unictype/ctype-blank
+  unictype/ctype-cntrl
+  unictype/ctype-digit
+  unictype/ctype-graph
+  unictype/ctype-lower
+  unictype/ctype-print
+  unictype/ctype-punct
+  unictype/ctype-space
+  unictype/ctype-upper
+  unictype/ctype-xdigit
   unistd
   unistr/u8-strmblen
+  unistr/u8-strmbtouc
   unistr/u8-to-u32
   unlink
   unsetenv
--- a/libinterp/corefcn/mappers.cc	Tue May 29 09:08:25 2018 +0200
+++ b/libinterp/corefcn/mappers.cc	Sun May 27 22:13:24 2018 +0200
@@ -1214,6 +1214,7 @@
 %! result(double ("0":"9") + 1) = true;
 %! result(double ("a":"z") + 1) = true;
 %! assert (isalnum (charset), result);
+%!assert (isalnum(["Ä8Aa?"; "(Uß ;"]), logical ([1 1 1 1 1 0; 0 1 1 1 0 0]));
 
 %!error isalnum ()
 %!error isalnum (1, 2)
@@ -1242,6 +1243,7 @@
 %! result(double ("A":"Z") + 1) = true;
 %! result(double ("a":"z") + 1) = true;
 %! assert (isalpha (charset), result);
+%!assert (isalpha("Ä8Aa(Uß ;"), logical ([1 1 0 1 1 0 1 1 1 0 0]));
 
 %!error isalpha ()
 %!error isalpha (1, 2)
@@ -1317,6 +1319,7 @@
 %! result = false (1, 128);
 %! result(double ("0":"9") + 1) = true;
 %! assert (isdigit (charset), result);
+%!assert (isdigit("Ä8Aa(Uß ;"), logical ([0 0 1 0 0 0 0 0 0 0 0]));
 
 %!error isdigit ()
 %!error isdigit (1, 2)
@@ -1383,6 +1386,7 @@
 %! result = false (1, 128);
 %! result(34:127) = true;
 %! assert (isgraph (charset), result);
+%!assert (isgraph("Ä8Aa(Uß ;"), logical ([1 1 1 1 1 1 1 1 1 0 1]));
 
 %!error isgraph ()
 %!error isgraph (1, 2)
@@ -1408,6 +1412,7 @@
 %! result = false (1, 128);
 %! result(double ("a":"z") + 1) = true;
 %! assert (islower (charset), result);
+%!assert (islower("Ä8Aa(Uß ;"), logical ([0 0 0 0 1 0 0 1 1 0 0]));
 
 %!error islower ()
 %!error islower (1, 2)
@@ -1514,6 +1519,7 @@
 %! result = false (1, 128);
 %! result(33:127) = true;
 %! assert (isprint (charset), result);
+%!assert (isprint("Ä8Aa(Uß ;"), logical ([1 1 1 1 1 1 1 1 1 1 1]));
 
 %!error isprint ()
 %!error isprint (1, 2)
@@ -1542,6 +1548,7 @@
 %! result(92:97) = true;
 %! result(124:127) = true;
 %! assert (ispunct (charset), result);
+%!assert (ispunct("Ä8Aa(Uß ;"), logical ([0 0 0 0 0 1 0 0 0 0 1]));
 
 %!error ispunct ()
 %!error ispunct (1, 2)
@@ -1568,6 +1575,7 @@
 %! result = false (1, 128);
 %! result(double (" \f\n\r\t\v") + 1) = true;
 %! assert (isspace (charset), result);
+%!assert (isspace("Ä8Aa(Uß ;"), logical ([0 0 0 0 0 0 0 0 0 1 0]));
 
 %!error isspace ()
 %!error isspace (1, 2)
@@ -1593,6 +1601,7 @@
 %! result = false (1, 128);
 %! result(double ("A":"Z") + 1) = true;
 %! assert (isupper (charset), result);
+%!assert (isupper("Ä8Aa(Uß ;"), logical ([1 1 0 1 0 0 1 0 0 0 0]));
 
 %!error isupper ()
 %!error isupper (1, 2)
@@ -1620,6 +1629,7 @@
 %! result(double ("0":"9") + 1) = true;
 %! result(double ("a":"f") + 1) = true;
 %! assert (isxdigit (charset), result);
+%!assert (isxdigit("Ä8Aa(Uß ;"), logical ([0 0 1 1 1 0 0 0 0 0 0]));
 
 %!error isxdigit ()
 %!error isxdigit (1, 2)
--- a/libinterp/octave-value/ov-ch-mat.cc	Tue May 29 09:08:25 2018 +0200
+++ b/libinterp/octave-value/ov-ch-mat.cc	Sun May 27 22:13:24 2018 +0200
@@ -42,6 +42,8 @@
 #include "lo-ieee.h"
 #include "mx-base.h"
 #include "unicase-wrappers.h"
+#include "unictype-wrappers.h"
+#include "unistr-wrappers.h"
 
 #include "mxarray.h"
 #include "ov-base.h"
@@ -259,18 +261,49 @@
     case umap_ ## UMAP:                                               \
       return octave_value (matrix.map<TYPE, int (&) (int)> (FCN))
 
-    STRING_MAPPER (xisalnum, std::isalnum, bool);
-    STRING_MAPPER (xisalpha, std::isalpha, bool);
     STRING_MAPPER (xisascii, xisascii, bool);
-    STRING_MAPPER (xiscntrl, std::iscntrl, bool);
-    STRING_MAPPER (xisdigit, std::isdigit, bool);
-    STRING_MAPPER (xisgraph, std::isgraph, bool);
-    STRING_MAPPER (xislower, std::islower, bool);
-    STRING_MAPPER (xisprint, std::isprint, bool);
-    STRING_MAPPER (xispunct, std::ispunct, bool);
-    STRING_MAPPER (xisspace, std::isspace, bool);
-    STRING_MAPPER (xisupper, std::isupper, bool);
-    STRING_MAPPER (xisxdigit, std::isxdigit, bool);
+
+#define STRING_U8_MAPPER(UMAP,FCN)                                             \
+    case umap_ ## UMAP:                                                        \
+      {                                                                        \
+        /* Apply FCN to every UTF-8 encoded character of MATRIX and store      \
+           the result for each of the bytes that make up the character.  */    \
+        /* NOTE(review): the decoder takes no length; it may inspect bytes     \
+           past the end of the array for a truncated sequence -- confirm.  */  \
+        charNDArray in_m = matrix;                                             \
+        Array<octave_idx_type> p (dim_vector (matrix.ndims (), 1));            \
+        if (matrix.ndims () > 1)                                               \
+          {                                                                    \
+            /* Exchange the first two dimensions before decoding.  */          \
+            for (octave_idx_type i=0; i < matrix.ndims (); i++)                \
+              p(i) = i;                                                        \
+            p(0) = 1;                                                          \
+            p(1) = 0;                                                          \
+            in_m = matrix.permute (p);                                         \
+          }                                                                    \
+        const octave_idx_type len = in_m.numel ();                             \
+        boolNDArray b_array = boolNDArray (in_m.dims ());                      \
+        const uint8_t *in = reinterpret_cast<const uint8_t *> (in_m.data ());  \
+        for (octave_idx_type i = 0; i < len; )                                 \
+          {                                                                    \
+            const octave_idx_type n_left = len - i;                            \
+            uint32_t uc = 0;                                                   \
+            int mblen = octave_u8_strmbtouc_wrapper (&uc, in + i);             \
+            /* A negative length, or one reaching past the end of the          \
+               array, is not a usable character and matches no class.  */      \
+            const bool decoded = (mblen >= 0 && mblen <= n_left);              \
+            const bool is_match = (decoded && FCN (uc));                       \
+            /* Advance by at least one and at most N_LEFT elements.  */        \
+            if (mblen < 1)                                                     \
+              mblen = 1;                                                       \
+            else if (mblen > n_left)                                           \
+              mblen = static_cast<int> (n_left);                               \
+            for (int j = 0; j < mblen; j++)                                    \
+              b_array(i+j) = is_match;                                         \
+            i += mblen;                                                        \
+          }                                                                    \
+        return octave_value ((matrix.ndims () > 1) ? b_array.permute (p, true) \
+                                                   : b_array);                 \
+      }
+
+    STRING_U8_MAPPER (xisalnum, octave_uc_is_alnum_wrapper);
+    STRING_U8_MAPPER (xisalpha, octave_uc_is_alpha_wrapper);
+    STRING_U8_MAPPER (xiscntrl, octave_uc_is_cntrl_wrapper);
+    STRING_U8_MAPPER (xisdigit, octave_uc_is_digit_wrapper);
+    STRING_U8_MAPPER (xisgraph, octave_uc_is_graph_wrapper);
+    STRING_U8_MAPPER (xislower, octave_uc_is_lower_wrapper);
+    STRING_U8_MAPPER (xisprint, octave_uc_is_print_wrapper);
+    STRING_U8_MAPPER (xispunct, octave_uc_is_punct_wrapper);
+    STRING_U8_MAPPER (xisspace, octave_uc_is_space_wrapper);
+    STRING_U8_MAPPER (xisupper, octave_uc_is_upper_wrapper);
+    STRING_U8_MAPPER (xisxdigit, octave_uc_is_xdigit_wrapper);
 
 #define STRING_U8_FCN(UMAP,U8_FCN,STD_FCN)                                     \
     case umap_ ## UMAP:                                                        \
@@ -278,13 +311,13 @@
         charNDArray in_m = matrix;                                             \
         Array<octave_idx_type> p (dim_vector (matrix.ndims (), 1));            \
         if (matrix.ndims () > 1)                                               \
-        {                                                                      \
-          for (octave_idx_type i=0; i < matrix.ndims (); i++)                  \
-            p(i) = i;                                                          \
-          p(0) = 1;                                                            \
-          p(1) = 0;                                                            \
-          in_m = matrix.permute (p);                                           \
-        }                                                                      \
+          {                                                                    \
+            for (octave_idx_type i=0; i < matrix.ndims (); i++)                \
+              p(i) = i;                                                        \
+            p(0) = 1;                                                          \
+            p(1) = 0;                                                          \
+            in_m = matrix.permute (p);                                         \
+          }                                                                    \
         size_t output_length = in_m.numel ();                                  \
         charNDArray ch_array = charNDArray (in_m.dims ());                     \
         const uint8_t *in = reinterpret_cast<const uint8_t *> (in_m.data ());  \
--- a/liboctave/wrappers/module.mk	Tue May 29 09:08:25 2018 +0200
+++ b/liboctave/wrappers/module.mk	Sun May 27 22:13:24 2018 +0200
@@ -31,6 +31,7 @@
   %reldir%/uname-wrapper.h \
   %reldir%/unicase-wrappers.h \
   %reldir%/uniconv-wrappers.h \
+  %reldir%/unictype-wrappers.h \
   %reldir%/unistd-wrappers.h \
   %reldir%/unistr-wrappers.h \
   %reldir%/unsetenv-wrapper.h \
@@ -71,6 +72,7 @@
   %reldir%/uname-wrapper.c \
   %reldir%/unicase-wrappers.c \
   %reldir%/uniconv-wrappers.c \
+  %reldir%/unictype-wrappers.c \
   %reldir%/unistd-wrappers.c \
   %reldir%/unistr-wrappers.c \
   %reldir%/unsetenv-wrapper.c \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/liboctave/wrappers/unictype-wrappers.c	Sun May 27 22:13:24 2018 +0200
@@ -0,0 +1,101 @@
+/*
+
+Copyright (C) 2018 Markus Mützel
+
+This file is part of Octave.
+
+Octave is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Octave is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Octave; see the file COPYING.  If not, see
+<https://www.gnu.org/licenses/>.
+
+*/
+
+#if defined (HAVE_CONFIG_H)
+#  include "config.h"
+#endif
+
+#include "unictype.h"
+
+#include "unictype-wrappers.h"
+
+/* Return the result of uc_is_alnum for the character UC.  */
+bool
+octave_uc_is_alnum_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_alnum (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_alpha for the character UC.  */
+bool
+octave_uc_is_alpha_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_alpha (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_blank for the character UC.  */
+bool
+octave_uc_is_blank_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_blank (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_cntrl for the character UC.  */
+bool
+octave_uc_is_cntrl_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_cntrl (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_digit for the character UC.  */
+bool
+octave_uc_is_digit_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_digit (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_graph for the character UC.  */
+bool
+octave_uc_is_graph_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_graph (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_lower for the character UC.  */
+bool
+octave_uc_is_lower_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_lower (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_print for the character UC.  */
+bool
+octave_uc_is_print_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_print (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_punct for the character UC.  */
+bool
+octave_uc_is_punct_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_punct (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_space for the character UC.  */
+bool
+octave_uc_is_space_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_space (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_upper for the character UC.  */
+bool
+octave_uc_is_upper_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_upper (uc);
+
+  return is_member;
+}
+
+/* Return the result of uc_is_xdigit for the character UC.  */
+bool
+octave_uc_is_xdigit_wrapper (ucs4_t uc)
+{
+  const bool is_member = uc_is_xdigit (uc);
+
+  return is_member;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/liboctave/wrappers/unictype-wrappers.h	Sun May 27 22:13:24 2018 +0200
@@ -0,0 +1,72 @@
+/*
+
+Copyright (C) 2018 Markus Mützel
+
+This file is part of Octave.
+
+Octave is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Octave is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Octave; see the file COPYING.  If not, see
+<https://www.gnu.org/licenses/>.
+
+*/
+
+#if ! defined (octave_unictype_wrappers_h)
+#define octave_unictype_wrappers_h 1
+
+/* Declarations of C-callable wrappers for the uc_is_* character
+   classification functions.  Each wrapper takes one character as a
+   ucs4_t value and returns a bool.  */
+
+/* Keep this header self-contained: uint32_t and (in C) bool are used
+   below and must not depend on what the including file pulled in.  */
+#if ! defined __cplusplus
+#  include <stdbool.h>
+#endif
+#include <stdint.h>
+
+typedef uint32_t ucs4_t;
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+extern bool
+octave_uc_is_alnum_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_alpha_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_blank_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_cntrl_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_digit_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_graph_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_lower_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_print_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_punct_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_space_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_upper_wrapper (ucs4_t uc);
+
+extern bool
+octave_uc_is_xdigit_wrapper (ucs4_t uc);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif
--- a/liboctave/wrappers/unistr-wrappers.c	Tue May 29 09:08:25 2018 +0200
+++ b/liboctave/wrappers/unistr-wrappers.c	Sun May 27 22:13:24 2018 +0200
@@ -34,6 +34,12 @@
   return u8_strmblen (src);
 }
 
+/* Pass PUC and SRC on to u8_strmbtouc and hand back its return value
+   unchanged.  */
+int
+octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src)
+{
+  const int n_units = u8_strmbtouc (puc, src);
+
+  return n_units;
+}
+
 uint32_t *
 octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len,
                           uint32_t *result_buf, size_t *lengthp)
--- a/liboctave/wrappers/unistr-wrappers.h	Tue May 29 09:08:25 2018 +0200
+++ b/liboctave/wrappers/unistr-wrappers.h	Sun May 27 22:13:24 2018 +0200
@@ -30,6 +30,9 @@
 extern int
 octave_u8_strmblen_wrapper (const uint8_t *src);
 
+/* Wrapper that forwards PUC and SRC to u8_strmbtouc and returns its
+   result; defined in unistr-wrappers.c.  */
+extern int
+octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src);
+
 extern uint32_t *
 octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len,
                           uint32_t *result_buf, size_t *lengthp);