changeset 25413:39cf8145405f

Make "tolower" and "toupper" Unicode aware (bug #53873). * ov-ch-mat.cc (map): Use UTF-8 aware functions for "tolower" and "toupper". * mappers.cc: Add tests for "tolower" and "toupper". * unicase-wrappers.[c/h]: Add wrappers for "u8_tolower" and "u8_toupper". * module.mk: Add new files. * bootstrap.conf: Add modules "unicase/u8-tolower" and "unicase/u8-toupper".
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 16 May 2018 21:36:27 +0200
parents 922a93fc73ec
children 8fae933e7228
files bootstrap.conf libinterp/corefcn/mappers.cc libinterp/octave-value/ov-ch-mat.cc liboctave/wrappers/module.mk liboctave/wrappers/unicase-wrappers.c liboctave/wrappers/unicase-wrappers.h
diffstat 6 files changed, 129 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bootstrap.conf	Sun May 27 19:20:47 2018 +0200
+++ b/bootstrap.conf	Wed May 16 21:36:27 2018 +0200
@@ -91,6 +91,8 @@
   tempname
   tmpfile
   uname
+  unicase/u8-tolower
+  unicase/u8-toupper
   uniconv/u8-conv-from-enc
   uniconv/u8-conv-to-enc
   unistd
--- a/libinterp/corefcn/mappers.cc	Sun May 27 19:20:47 2018 +0200
+++ b/libinterp/corefcn/mappers.cc	Wed May 16 21:36:27 2018 +0200
@@ -2139,6 +2139,8 @@
 %!assert (tolower ({"ABC", "DEF", {"GHI", {"JKL"}}}), {"abc", "def", {"ghi", {"jkl"}}})
 %!assert (tolower (["ABC"; "DEF"]), ["abc"; "def"])
 %!assert (tolower ({["ABC"; "DEF"]}), {["abc";"def"]})
+%!assert (tolower (["ABCÄÖÜSS"; "abcäöüß"]), ["abcäöüss"; "abcäöüß"])
+%!assert (tolower (repmat ("ÄÖÜ", 2, 1, 3)), repmat ("äöü", 2, 1, 3))
 %!assert (tolower (68), 68)
 %!assert (tolower ({[68, 68; 68, 68]}), {[68, 68; 68, 68]})
 %!assert (tolower (68i), 68i)
@@ -2203,6 +2205,8 @@
 %!assert (toupper ({"abc", "def", {"ghi", {"jkl"}}}), {"ABC", "DEF", {"GHI", {"JKL"}}})
 %!assert (toupper (["abc"; "def"]), ["ABC"; "DEF"])
 %!assert (toupper ({["abc"; "def"]}), {["ABC";"DEF"]})
+%!assert (toupper (["ABCÄÖÜSS"; "abcäöüß"]), ["ABCÄÖÜSS"; "ABCÄÖÜSS"])
+%!assert (toupper (repmat ("äöü", 2, 1, 3)), repmat ("ÄÖÜ", 2, 1, 3))
 %!assert (toupper (100), 100)
 %!assert (toupper ({[100, 100; 100, 100]}), {[100, 100; 100, 100]})
 %!assert (toupper (100i), 100i)
--- a/libinterp/octave-value/ov-ch-mat.cc	Sun May 27 19:20:47 2018 +0200
+++ b/libinterp/octave-value/ov-ch-mat.cc	Wed May 16 21:36:27 2018 +0200
@@ -41,6 +41,7 @@
 
 #include "lo-ieee.h"
 #include "mx-base.h"
+#include "unicase-wrappers.h"
 
 #include "mxarray.h"
 #include "ov-base.h"
@@ -270,8 +271,57 @@
     STRING_MAPPER (xisspace, std::isspace, bool);
     STRING_MAPPER (xisupper, std::isupper, bool);
     STRING_MAPPER (xisxdigit, std::isxdigit, bool);
-    STRING_MAPPER (xtolower, std::tolower, char);
-    STRING_MAPPER (xtoupper, std::toupper, char);
+
+    // Case conversion (tolower/toupper) that treats the characters as UTF-8.
+    //
+    // UMAP is the mapper id without its "umap_" prefix, U8_FCN is the
+    // UTF-8 aware conversion function, and STD_FCN is the byte-wise
+    // function used as fallback.  The whole array is converted by a single
+    // call to U8_FCN on a copy whose first two dimensions are swapped
+    // (presumably so that the characters of a row are adjacent in memory);
+    // the result is permuted back with the same vector.  If the conversion
+    // fails or changes the number of bytes, a warning is issued and the
+    // input is mapped byte by byte with STD_FCN instead.
+    //
+    // NOTE(review): the return value of U8_FCN may point to newly allocated
+    // memory when the result does not fit into the supplied buffer (see the
+    // libunistring documentation); nothing releases it here -- TODO confirm
+    // and free it.
+#define STRING_U8_FCN(UMAP,U8_FCN,STD_FCN)                                     \
+    case umap_ ## UMAP:                                                        \
+      {                                                                        \
+        charNDArray in_m = matrix;                                             \
+        Array<octave_idx_type> p (dim_vector (matrix.ndims (), 1));            \
+        if (matrix.ndims () > 1)                                               \
+        {                                                                      \
+          for (octave_idx_type i=0; i < matrix.ndims (); i++)                  \
+            p(i) = i;                                                          \
+          p(0) = 1;                                                            \
+          p(1) = 0;                                                            \
+          in_m = matrix.permute (p);                                           \
+        }                                                                      \
+        size_t output_length = in_m.numel ();                                  \
+        charNDArray ch_array = charNDArray (in_m.dims ());                     \
+        const uint8_t *in = reinterpret_cast<const uint8_t *> (in_m.data ());  \
+        uint8_t *buf = reinterpret_cast<uint8_t *> (ch_array.fortran_vec ());  \
+        /* U8_FCN returns a null pointer if the conversion failed.  */         \
+        const uint8_t *res                                                     \
+          = U8_FCN (in, matrix.numel (), nullptr, buf, &output_length);        \
+        if (! res                                                              \
+            || output_length != static_cast<size_t> (matrix.numel ()))         \
+          {                                                                    \
+            /* Mapper ids are the function name prefixed with "x".  */         \
+            warning_with_id ("octave:multi_byte_char_length",                  \
+                             "%s: Possible multi-byte error.",                 \
+                             &(#UMAP)[1]);                                     \
+            return octave_value (matrix.map<char, int (&) (int)> (STD_FCN));   \
+          }                                                                    \
+        return octave_value ((matrix.ndims () > 1) ? ch_array.permute (p, true)\
+                                                   : ch_array);                \
+      }
+
+    STRING_U8_FCN (xtolower, octave_u8_tolower_wrapper, std::tolower);
+    STRING_U8_FCN (xtoupper, octave_u8_toupper_wrapper, std::toupper);
 
     // For Matlab compatibility, these should work on ASCII values
     // without error or warning.
--- a/liboctave/wrappers/module.mk	Sun May 27 19:20:47 2018 +0200
+++ b/liboctave/wrappers/module.mk	Wed May 16 21:36:27 2018 +0200
@@ -29,6 +29,7 @@
   %reldir%/time-wrappers.h \
   %reldir%/tmpfile-wrapper.h \
   %reldir%/uname-wrapper.h \
+  %reldir%/unicase-wrappers.h \
   %reldir%/uniconv-wrappers.h \
   %reldir%/unistd-wrappers.h \
   %reldir%/unistr-wrappers.h \
@@ -68,6 +69,7 @@
   %reldir%/time-wrappers.c \
   %reldir%/tmpfile-wrapper.c \
   %reldir%/uname-wrapper.c \
+  %reldir%/unicase-wrappers.c \
   %reldir%/uniconv-wrappers.c \
   %reldir%/unistd-wrappers.c \
   %reldir%/unistr-wrappers.c \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/liboctave/wrappers/unicase-wrappers.c	Wed May 16 21:36:27 2018 +0200
@@ -0,0 +1,57 @@
+/*
+
+Copyright (C) 2018 Markus Mützel
+
+This file is part of Octave.
+
+Octave is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Octave is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Octave; see the file COPYING.  If not, see
+<https://www.gnu.org/licenses/>.
+
+*/
+
+#if defined (HAVE_CONFIG_H)
+#  include "config.h"
+#endif
+
+#include "unicase.h"
+
+#include "unicase-wrappers.h"
+
+/* Lower-case conversion of UTF-8 data.  Forwards all arguments unchanged
+   to u8_tolower, passing a null pointer for its normalization form
+   argument, and returns what u8_tolower returns.  */
+uint8_t *
+octave_u8_tolower_wrapper (const uint8_t *s, size_t n,
+                           const char *iso639_language,
+                           uint8_t *resultbuf, size_t *lengthp)
+{
+  uint8_t *lowered
+    = u8_tolower (s, n, iso639_language, NULL, resultbuf, lengthp);
+
+  return lowered;
+}
+
+/* Upper-case conversion of UTF-8 data.  Forwards all arguments unchanged
+   to u8_toupper, passing a null pointer for its normalization form
+   argument, and returns what u8_toupper returns.  */
+uint8_t *
+octave_u8_toupper_wrapper (const uint8_t *s, size_t n,
+                           const char *iso639_language,
+                           uint8_t *resultbuf, size_t *lengthp)
+{
+  uint8_t *raised
+    = u8_toupper (s, n, iso639_language, NULL, resultbuf, lengthp);
+
+  return raised;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/liboctave/wrappers/unicase-wrappers.h	Wed May 16 21:36:27 2018 +0200
@@ -0,0 +1,54 @@
+/*
+
+Copyright (C) 2018 Markus Mützel
+
+This file is part of Octave.
+
+Octave is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Octave is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Octave; see the file COPYING.  If not, see
+<https://www.gnu.org/licenses/>.
+
+*/
+
+#if ! defined (octave_unicase_wrappers_h)
+#define octave_unicase_wrappers_h 1
+
+/* The declarations below use size_t and uint8_t.  Include the headers that
+   define them so that this file can be included on its own.  */
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/* Wrappers around the UTF-8 case conversion functions u8_tolower and
+   u8_toupper declared in "unicase.h".  All arguments are passed on
+   unchanged, a null pointer is passed for the normalization form argument
+   of the wrapped function, and its return value is returned.  */
+
+extern uint8_t *
+octave_u8_tolower_wrapper (const uint8_t *s, size_t n,
+                           const char *iso639_language,
+                           uint8_t *resultbuf, size_t *lengthp);
+
+extern uint8_t *
+octave_u8_toupper_wrapper (const uint8_t *s, size_t n,
+                           const char *iso639_language,
+                           uint8_t *resultbuf, size_t *lengthp);
+
+#if defined __cplusplus
+}
+#endif
+
+#endif