# HG changeset patch # User Markus Mützel # Date 1527452004 -7200 # Node ID d4bc8590b5cfc07211bce13823f33fa789d971cb # Parent 8fae933e7228506d6f4492ba451a96297d7a10e0 Make "is*" string functions Unicode aware. * ov-ch-mat.cc (map): Use UTF-8 aware functions for "is*" string functions. * mappers.cc: Add tests for "is*" string functions. * unistr-wrappers.[c/h]: Add wrapper for "u8_strmbtouc". * unictype-wrappers.[c/h]: Add wrappers for UTF-8 aware C-like "is*" functions. * module.mk: Add new files. * bootstrap.conf: Add new modules. diff -r 8fae933e7228 -r d4bc8590b5cf bootstrap.conf --- a/bootstrap.conf Tue May 29 09:08:25 2018 +0200 +++ b/bootstrap.conf Sun May 27 22:13:24 2018 +0200 @@ -95,8 +95,21 @@ unicase/u8-toupper uniconv/u8-conv-from-enc uniconv/u8-conv-to-enc + unictype/ctype-alnum + unictype/ctype-alpha + unictype/ctype-blank + unictype/ctype-cntrl + unictype/ctype-digit + unictype/ctype-graph + unictype/ctype-lower + unictype/ctype-print + unictype/ctype-punct + unictype/ctype-space + unictype/ctype-upper + unictype/ctype-xdigit unistd unistr/u8-strmblen + unistr/u8-strmbtouc unistr/u8-to-u32 unlink unsetenv diff -r 8fae933e7228 -r d4bc8590b5cf libinterp/corefcn/mappers.cc --- a/libinterp/corefcn/mappers.cc Tue May 29 09:08:25 2018 +0200 +++ b/libinterp/corefcn/mappers.cc Sun May 27 22:13:24 2018 +0200 @@ -1214,6 +1214,7 @@ %! result(double ("0":"9") + 1) = true; %! result(double ("a":"z") + 1) = true; %! assert (isalnum (charset), result); +%!assert (isalnum(["Ä8Aa?"; "(Uß ;"]), logical ([1 1 1 1 1 0; 0 1 1 1 0 0])); %!error isalnum () %!error isalnum (1, 2) @@ -1242,6 +1243,7 @@ %! result(double ("A":"Z") + 1) = true; %! result(double ("a":"z") + 1) = true; %! assert (isalpha (charset), result); +%!assert (isalpha("Ä8Aa(Uß ;"), logical ([1 1 0 1 1 0 1 1 1 0 0])); %!error isalpha () %!error isalpha (1, 2) @@ -1317,6 +1319,7 @@ %! result = false (1, 128); %! result(double ("0":"9") + 1) = true; %! 
assert (isdigit (charset), result); +%!assert (isdigit("Ä8Aa(Uß ;"), logical ([0 0 1 0 0 0 0 0 0 0 0])); %!error isdigit () %!error isdigit (1, 2) @@ -1383,6 +1386,7 @@ %! result = false (1, 128); %! result(34:127) = true; %! assert (isgraph (charset), result); +%!assert (isgraph("Ä8Aa(Uß ;"), logical ([1 1 1 1 1 1 1 1 1 0 1])); %!error isgraph () %!error isgraph (1, 2) @@ -1408,6 +1412,7 @@ %! result = false (1, 128); %! result(double ("a":"z") + 1) = true; %! assert (islower (charset), result); +%!assert (islower("Ä8Aa(Uß ;"), logical ([0 0 0 0 1 0 0 1 1 0 0])); %!error islower () %!error islower (1, 2) @@ -1514,6 +1519,7 @@ %! result = false (1, 128); %! result(33:127) = true; %! assert (isprint (charset), result); +%!assert (isprint("Ä8Aa(Uß ;"), logical ([1 1 1 1 1 1 1 1 1 1 1])); %!error isprint () %!error isprint (1, 2) @@ -1542,6 +1548,7 @@ %! result(92:97) = true; %! result(124:127) = true; %! assert (ispunct (charset), result); +%!assert (ispunct("Ä8Aa(Uß ;"), logical ([0 0 0 0 0 1 0 0 0 0 1])); %!error ispunct () %!error ispunct (1, 2) @@ -1568,6 +1575,7 @@ %! result = false (1, 128); %! result(double (" \f\n\r\t\v") + 1) = true; %! assert (isspace (charset), result); +%!assert (isspace("Ä8Aa(Uß ;"), logical ([0 0 0 0 0 0 0 0 0 1 0])); %!error isspace () %!error isspace (1, 2) @@ -1593,6 +1601,7 @@ %! result = false (1, 128); %! result(double ("A":"Z") + 1) = true; %! assert (isupper (charset), result); +%!assert (isupper("Ä8Aa(Uß ;"), logical ([1 1 0 1 0 0 1 0 0 0 0])); %!error isupper () %!error isupper (1, 2) @@ -1620,6 +1629,7 @@ %! result(double ("0":"9") + 1) = true; %! result(double ("a":"f") + 1) = true; %! 
assert (isxdigit (charset), result); +%!assert (isxdigit("Ä8Aa(Uß ;"), logical ([0 0 1 1 1 0 0 0 0 0 0])); %!error isxdigit () %!error isxdigit (1, 2) diff -r 8fae933e7228 -r d4bc8590b5cf libinterp/octave-value/ov-ch-mat.cc --- a/libinterp/octave-value/ov-ch-mat.cc Tue May 29 09:08:25 2018 +0200 +++ b/libinterp/octave-value/ov-ch-mat.cc Sun May 27 22:13:24 2018 +0200 @@ -42,6 +42,8 @@ #include "lo-ieee.h" #include "mx-base.h" #include "unicase-wrappers.h" +#include "unictype-wrappers.h" +#include "unistr-wrappers.h" #include "mxarray.h" #include "ov-base.h" @@ -259,18 +261,53 @@ case umap_ ## UMAP: \ return octave_value (matrix.map (FCN)) - STRING_MAPPER (xisalnum, std::isalnum, bool); - STRING_MAPPER (xisalpha, std::isalpha, bool); STRING_MAPPER (xisascii, xisascii, bool); - STRING_MAPPER (xiscntrl, std::iscntrl, bool); - STRING_MAPPER (xisdigit, std::isdigit, bool); - STRING_MAPPER (xisgraph, std::isgraph, bool); - STRING_MAPPER (xislower, std::islower, bool); - STRING_MAPPER (xisprint, std::isprint, bool); - STRING_MAPPER (xispunct, std::ispunct, bool); - STRING_MAPPER (xisspace, std::isspace, bool); - STRING_MAPPER (xisupper, std::isupper, bool); - STRING_MAPPER (xisxdigit, std::isxdigit, bool); + +/* Apply the Unicode predicate FCN to every UTF-8 encoded character of + matrix (scanned with its first two dimensions swapped). All bytes of a + character get that character's result; bytes that do not start a valid + UTF-8 sequence are classified as false. */ +#define STRING_U8_MAPPER(UMAP,FCN) \ + case umap_ ## UMAP: \ + { \ + charNDArray in_m = matrix; \ + Array<octave_idx_type> p (dim_vector (matrix.ndims (), 1)); \ + if (matrix.ndims () > 1) \ + { \ + for (octave_idx_type i=0; i < matrix.ndims (); i++) \ + p(i) = i; \ + p(0) = 1; \ + p(1) = 0; \ + in_m = matrix.permute (p); \ + } \ + boolNDArray b_array = boolNDArray (in_m.dims ()); \ + const uint8_t *in = reinterpret_cast<const uint8_t *> (in_m.data ()); \ + uint32_t uc = 0; \ + for (octave_idx_type i = 0; i < in_m.numel (); ) \ + { \ + int mblen = octave_u8_strmbtouc_wrapper (&uc, in + i); \ + bool is_match = (mblen >= 0) && FCN (uc); /* uc is not written when decoding fails (-1) */ \ + if (mblen < 1) \ + mblen = 1; /* NUL (0) or invalid byte (-1): advance by one byte */ \ + for (int j = 0; j < mblen && i+j < in_m.numel (); j++) /* never write past the end of b_array */ \ + b_array(i+j) = is_match; \ + i += mblen; \ + } \ + return octave_value ((matrix.ndims () > 1) ?
b_array.permute (p, true) \ + : b_array); \ + } + + STRING_U8_MAPPER (xisalnum, octave_uc_is_alnum_wrapper); + STRING_U8_MAPPER (xisalpha, octave_uc_is_alpha_wrapper); + STRING_U8_MAPPER (xiscntrl, octave_uc_is_cntrl_wrapper); + STRING_U8_MAPPER (xisdigit, octave_uc_is_digit_wrapper); + STRING_U8_MAPPER (xisgraph, octave_uc_is_graph_wrapper); + STRING_U8_MAPPER (xislower, octave_uc_is_lower_wrapper); + STRING_U8_MAPPER (xisprint, octave_uc_is_print_wrapper); + STRING_U8_MAPPER (xispunct, octave_uc_is_punct_wrapper); + STRING_U8_MAPPER (xisspace, octave_uc_is_space_wrapper); + STRING_U8_MAPPER (xisupper, octave_uc_is_upper_wrapper); + STRING_U8_MAPPER (xisxdigit, octave_uc_is_xdigit_wrapper); #define STRING_U8_FCN(UMAP,U8_FCN,STD_FCN) \ case umap_ ## UMAP: \ @@ -278,13 +315,13 @@ charNDArray in_m = matrix; \ Array<octave_idx_type> p (dim_vector (matrix.ndims (), 1)); \ if (matrix.ndims () > 1) \ - { \ - for (octave_idx_type i=0; i < matrix.ndims (); i++) \ - p(i) = i; \ - p(0) = 1; \ - p(1) = 0; \ - in_m = matrix.permute (p); \ - } \ + { \ + for (octave_idx_type i=0; i < matrix.ndims (); i++) \ + p(i) = i; \ + p(0) = 1; \ + p(1) = 0; \ + in_m = matrix.permute (p); \ + } \ size_t output_length = in_m.numel (); \ charNDArray ch_array = charNDArray (in_m.dims ()); \ const uint8_t *in = reinterpret_cast<const uint8_t *> (in_m.data ()); \ diff -r 8fae933e7228 -r d4bc8590b5cf liboctave/wrappers/module.mk --- a/liboctave/wrappers/module.mk Tue May 29 09:08:25 2018 +0200 +++ b/liboctave/wrappers/module.mk Sun May 27 22:13:24 2018 +0200 @@ -31,6 +31,7 @@ %reldir%/uname-wrapper.h \ %reldir%/unicase-wrappers.h \ %reldir%/uniconv-wrappers.h \ + %reldir%/unictype-wrappers.h \ %reldir%/unistd-wrappers.h \ %reldir%/unistr-wrappers.h \ %reldir%/unsetenv-wrapper.h \ @@ -71,6 +72,7 @@ %reldir%/uname-wrapper.c \ %reldir%/unicase-wrappers.c \ %reldir%/uniconv-wrappers.c \ + %reldir%/unictype-wrappers.c \ %reldir%/unistd-wrappers.c \ %reldir%/unistr-wrappers.c \ %reldir%/unsetenv-wrapper.c \ diff -r 8fae933e7228 -r
d4bc8590b5cf liboctave/wrappers/unictype-wrappers.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/liboctave/wrappers/unictype-wrappers.c Sun May 27 22:13:24 2018 +0200 @@ -0,0 +1,101 @@ +/* + +Copyright (C) 2018 Markus Mützel + +This file is part of Octave. + +Octave is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Octave is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Octave; see the file COPYING. If not, see +<https://www.gnu.org/licenses/>. + +*/ + +#if defined (HAVE_CONFIG_H) +# include "config.h" +#endif + +#include "unictype.h" + +#include "unictype-wrappers.h" /* Each function below forwards its code point argument to the uc_is_* function of the same name and returns that result as bool. */ + +bool +octave_uc_is_alnum_wrapper (ucs4_t uc) +{ + return uc_is_alnum (uc); +} + +bool +octave_uc_is_alpha_wrapper (ucs4_t uc) +{ + return uc_is_alpha (uc); +} + +bool +octave_uc_is_blank_wrapper (ucs4_t uc) +{ + return uc_is_blank (uc); +} + +bool +octave_uc_is_cntrl_wrapper (ucs4_t uc) +{ + return uc_is_cntrl (uc); +} + +bool +octave_uc_is_digit_wrapper (ucs4_t uc) +{ + return uc_is_digit (uc); +} + +bool +octave_uc_is_graph_wrapper (ucs4_t uc) +{ + return uc_is_graph (uc); +} + +bool +octave_uc_is_lower_wrapper (ucs4_t uc) +{ + return uc_is_lower (uc); +} + +bool +octave_uc_is_print_wrapper (ucs4_t uc) +{ + return uc_is_print (uc); +} + +bool +octave_uc_is_punct_wrapper (ucs4_t uc) +{ + return uc_is_punct (uc); +} + +bool +octave_uc_is_space_wrapper (ucs4_t uc) +{ + return uc_is_space (uc); +} + +bool +octave_uc_is_upper_wrapper (ucs4_t uc) +{ + return uc_is_upper (uc); +} + +bool +octave_uc_is_xdigit_wrapper (ucs4_t uc) +{ + return uc_is_xdigit (uc); +} diff -r 8fae933e7228 -r d4bc8590b5cf
liboctave/wrappers/unictype-wrappers.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/liboctave/wrappers/unictype-wrappers.h Sun May 27 22:13:24 2018 +0200 @@ -0,0 +1,72 @@ +/* + +Copyright (C) 2018 Markus Mützel + +This file is part of Octave. + +Octave is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Octave is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Octave; see the file COPYING. If not, see +<https://www.gnu.org/licenses/>. + +*/ + +#if ! defined (octave_unictype_wrappers_h) +#define octave_unictype_wrappers_h 1 + +typedef uint32_t ucs4_t; /* NOTE(review): this header includes nothing itself, so uint32_t (and bool, for C users) must already be declared by the including file -- confirm this is intended. */ + +#if defined __cplusplus +extern "C" { /* C linkage: the definitions live in unictype-wrappers.c. */ +#endif + +extern bool +octave_uc_is_alnum_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_alpha_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_blank_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_cntrl_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_digit_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_graph_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_lower_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_print_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_punct_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_space_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_upper_wrapper (ucs4_t uc); + +extern bool +octave_uc_is_xdigit_wrapper (ucs4_t uc); + +#if defined __cplusplus +} +#endif + +#endif diff -r 8fae933e7228 -r d4bc8590b5cf liboctave/wrappers/unistr-wrappers.c --- a/liboctave/wrappers/unistr-wrappers.c Tue May 29 09:08:25 2018 +0200 +++ b/liboctave/wrappers/unistr-wrappers.c Sun May 27 22:13:24 2018 +0200 @@ -34,6 +34,12 @@ return u8_strmblen
(src); } +int +octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src) /* Passes both arguments to u8_strmbtouc and returns its result unchanged. */ +{ + return u8_strmbtouc (puc, src); +} + uint32_t * octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len, uint32_t *result_buf, size_t *lengthp) diff -r 8fae933e7228 -r d4bc8590b5cf liboctave/wrappers/unistr-wrappers.h --- a/liboctave/wrappers/unistr-wrappers.h Tue May 29 09:08:25 2018 +0200 +++ b/liboctave/wrappers/unistr-wrappers.h Sun May 27 22:13:24 2018 +0200 @@ -30,6 +30,9 @@ extern int octave_u8_strmblen_wrapper (const uint8_t *src); +extern int +octave_u8_strmbtouc_wrapper (uint32_t *puc, const uint8_t *src); /* Defined in unistr-wrappers.c as a direct call to u8_strmbtouc. */ + extern uint32_t * octave_u8_to_u32_wrapper (const uint8_t *src, size_t src_len, uint32_t *result_buf, size_t *lengthp);