Mercurial > octave
changeset 23122:e310b5b6da6f
Add functions native2unicode and unicode2native (bug #49842).
* native2unicode.m, unicode2native.m: Add new functions that wrap
__native2unicode__ and __unicode2native__ with input conversions and
checks.
* strfns.cc (F__native2unicode__, F__unicode2native__): New functions.
* bootstrap.conf (gnulib_modules): Add libunistring to the list.
* __unimplemented__.m: Delete names from list of missing functions.
* scripts/strings/module.mk: Update.
* strings.txi: Add doc strings to manual.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Sun, 22 Jan 2017 13:58:57 +0100 |
parents | 0fe35c34fdc2 |
children | c6ca5fe1505c |
files | bootstrap.conf doc/interpreter/strings.txi libinterp/corefcn/strfns.cc scripts/help/__unimplemented__.m scripts/strings/module.mk scripts/strings/native2unicode.m scripts/strings/unicode2native.m |
diffstat | 7 files changed, 297 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/bootstrap.conf Tue Jan 31 06:22:13 2017 -0500 +++ b/bootstrap.conf Sun Jan 22 13:58:57 2017 +0100 @@ -52,6 +52,7 @@ glob isatty largefile + libunistring link lstat malloc-gnu
--- a/doc/interpreter/strings.txi Tue Jan 31 06:22:13 2017 -0500 +++ b/doc/interpreter/strings.txi Sun Jan 22 13:58:57 2017 +0100 @@ -506,6 +506,10 @@ @DOCSTRING(toupper) +@DOCSTRING(unicode2native) + +@DOCSTRING(native2unicode) + @DOCSTRING(do_string_escapes) @DOCSTRING(undo_string_escapes)
--- a/libinterp/corefcn/strfns.cc Tue Jan 31 06:22:13 2017 -0500 +++ b/libinterp/corefcn/strfns.cc Sun Jan 22 13:58:57 2017 +0100 @@ -29,6 +29,10 @@ #include <queue> #include <sstream> +#ifdef HAVE_LIBUNISTRING +# include <uniconv.h> +#endif + #include "dMatrix.h" #include "Cell.h" @@ -730,6 +734,99 @@ %!assert (strncmpi ("abc123", "ABC456", 3), true) */ +DEFUN (__native2unicode__, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{utf8_str} =} __native2unicode__ (@var{native_bytes}, @var{codepage}) +Convert byte stream @var{native_bytes} to UTF-8 using @var{codepage}. + + +@seealso{native2unicode, __unicode2native__} +@end deftypefn */) +{ +#ifdef HAVE_LIBUNISTRING + int nargin = args.length (); + + if (nargin < 1 || nargin > 2) + print_usage (); + + if (args(0).is_string ()) + return ovl(args(0)); + + // codepage + const char *codepage = locale_charset (); + string_vector tmp; + if (! args(1).is_numeric_type ()) + { + tmp = args(1).string_vector_value (); + codepage = tmp(0).c_str (); + } + + // convert byte stream with local encoding to UTF-8 + charNDArray native_bytes = args(0).char_array_value (); + size_t length; + char *utf8_str = reinterpret_cast<char *> + (u8_conv_from_encoding (codepage, iconveh_question_mark, + native_bytes.fortran_vec (), + native_bytes.numel (), NULL, + NULL, &length)); + if (utf8_str == NULL) + error("native2unicode: Error '%s' converting from codepage '%s' to UTF-8.", + std::strerror (errno), codepage); + + std::string ret_val = std::string (utf8_str, length); + free (utf8_str); + return ovl (charNDArray (ret_val)); +#else + octave_unused_parameter (args); + + err_disabled_feature ("__native2unicode__", "libunistring"); +#endif +} + +DEFUN (__unicode2native__, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{native_bytes} =} __unicode2native__ (@var{utf8_str}, @var{codepage}) +Convert UTF-8 string @var{utf8_str} to byte stream @var{native_bytes} using +@var{codepage}. 
+ + +@seealso{unicode2native, __native2unicode__} +@end deftypefn */) +{ +#ifdef HAVE_LIBUNISTRING + int nargin = args.length (); + + if (nargin != 2) + print_usage (); + + // codepage + const char *codepage = locale_charset (); + string_vector tmp; + if (! args(1).is_numeric_type ()) + { + tmp = args(1).string_vector_value (); + codepage = tmp(0).c_str (); + } + + // convert UTF-8 string vector to byte-stream with local encoding + charNDArray utf8_str = args(0).char_array_value (); + size_t length; + char *native_bytes = u8_conv_to_encoding (codepage, iconveh_question_mark, + reinterpret_cast<uint8_t*> (utf8_str.fortran_vec ()), + utf8_str.numel (), NULL, NULL, &length); + if (native_bytes == NULL) + error("native2unicode: Error '%s' converting from UTF-8 to codepage '%s'.", + std::strerror (errno), codepage); + + std::string ret_val = std::string (native_bytes, length); + free (native_bytes); + return ovl (NDArray (ret_val)); +#else + octave_unused_parameter (args); + + err_disabled_feature ("__unicode2native__", "libunistring"); +#endif +} DEFUN (list_in_columns, args, , doc: /* -*- texinfo -*- @deftypefn {} {} list_in_columns (@var{arg}, @var{width}, @var{prefix})
--- a/scripts/help/__unimplemented__.m Tue Jan 31 06:22:13 2017 -0500 +++ b/scripts/help/__unimplemented__.m Sun Jan 22 13:58:57 2017 +0100 @@ -723,7 +723,6 @@ "multibandread", "multibandwrite", "NaT", - "native2unicode", "nccreate", "ncdisp", "ncinfo", @@ -827,7 +826,6 @@ "uitoolbar", "uiwait", "undocheckout", - "unicode2native", "unloadlibrary", "unmesh", "unstack",
--- a/scripts/strings/module.mk Tue Jan 31 06:22:13 2017 -0500 +++ b/scripts/strings/module.mk Sun Jan 22 13:58:57 2017 +0100 @@ -15,6 +15,7 @@ scripts/strings/isletter.m \ scripts/strings/isstrprop.m \ scripts/strings/mat2str.m \ + scripts/strings/native2unicode.m \ scripts/strings/ostrsplit.m \ scripts/strings/regexptranslate.m \ scripts/strings/rindex.m \ @@ -29,6 +30,7 @@ scripts/strings/strtrim.m \ scripts/strings/strtrunc.m \ scripts/strings/substr.m \ + scripts/strings/unicode2native.m \ scripts/strings/untabify.m \ scripts/strings/validatestring.m
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/strings/native2unicode.m Sun Jan 22 13:58:57 2017 +0100 @@ -0,0 +1,104 @@ +## Copyright (C) 2016 Markus Mützel +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## <http://www.gnu.org/licenses/>. + +## -*- texinfo -*- +## @deftypefn {} {@var{utf8_str} =} native2unicode (@var{native_bytes}, @var{codepage}) +## @deftypefnx {} {@var{utf8_str} =} native2unicode (@var{native_bytes}) +## Convert byte stream @var{native_bytes} to UTF-8 using @var{codepage}. +## +## The numbers in the vector @var{native_bytes} are rounded and clipped to +## integers between 0 and 255. This byte stream is then mapped into the +## codepage given by the string @var{codepage} and returned in the string +## @var{utf8_str}. Octave uses UTF-8 as its internal encoding. +## The string @var{codepage} must be an identifier of a valid codepage. +## Examples for valid codepages are "ISO 8859-1", "Latin-1" or "Shift-JIS". +## If @var{codepage} is omitted or empty, the system default codepage is used. +## +## If @var{native_bytes} is a string vector, it is returned as is. 
+## +## @seealso{unicode2native} +## @end deftypefn + +function utf8_str = native2unicode (native_bytes, codepage) + + ## check input + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + if (ischar (native_bytes)) + utf8_str = native_bytes; + return + endif + + if (! isnumeric (native_bytes) || ! isvector (native_bytes)) + error ("native2unicode: NATIVE_BYTES must be a numeric vector.") + endif + + is_column = false; + if (! isrow (native_bytes)) + is_column = true; + native_bytes = native_bytes'; + endif + + if (nargin < 2 || isempty (codepage)) + codepage = 0; + endif + + if (! ischar (codepage) && codepage != 0) + error ("native2unicode: CODEPAGE must be a string or 0.") + endif + + native_bytes = round (native_bytes); + native_bytes(native_bytes < 0) = 0; + native_bytes(native_bytes > 255) = 255; + + ## pass to internal function + utf8_str = __native2unicode__ (native_bytes, codepage); + + if (is_column) + utf8_str = utf8_str'; + endif + +endfunction + +%!testif(HAVE_LIBUNISTRING) +%! assert (double (native2unicode (164:170, 'ISO 8859-5')), +%! [208 132 208 133 208 134 208 135 208 136 208 137 208 138]); # "ЄЅІЇЈЉЊ" +%!testif(HAVE_LIBUNISTRING) +%! assert (double (native2unicode ([164:166 0 167:170], 'ISO 8859-5')), +%! [208 132 208 133 208 134 0 208 135 208 136 208 137 208 138]); # ["ЄЅІ" 0 "ЇЈЉЊ"] +%!testif(HAVE_LIBUNISTRING) +%! assert (native2unicode ("foobar"), "foobar"); +%!testif(HAVE_LIBUNISTRING) +%! assert (double (native2unicode ([0 0 120.3 0 0 122.6 0 0])), +%! [0 0 120 0 0 123 0 0]); +%!testif(HAVE_LIBUNISTRING) +%! fail ("native2unicode ([1 2; 3 4])", "NATIVE_BYTES must be a numeric vector"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("native2unicode ({1 2 3 4})", "NATIVE_BYTES must be a numeric vector"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("native2unicode (164:170, 123)", "CODEPAGE must be a string or 0"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("native2unicode (234, 'foo')", +%! 
"Error .* converting from codepage 'foo' to UTF-8"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("native2unicode ()", "Invalid call"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("native2unicode (1, 'Latin-1', 'test')", "Invalid call"); + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/strings/unicode2native.m Sun Jan 22 13:58:57 2017 +0100 @@ -0,0 +1,89 @@ +## Copyright (C) 2016 Markus Mützel +## +## This file is part of Octave. +## +## Octave is free software; you can redistribute it and/or modify it +## under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## Octave is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Octave; see the file COPYING. If not, see +## <http://www.gnu.org/licenses/>. + +## -*- texinfo -*- +## @deftypefn {} {@var{native_bytes} =} unicode2native (@var{utf8_str}, @var{codepage}) +## @deftypefnx {} {@var{native_bytes} =} unicode2native (@var{utf8_str}) +## Convert UTF-8 string @var{utf8_str} to byte stream using @var{codepage}. +## +## The character vector @var{utf8_str} is converted to a byte stream +## @var{native_bytes} using the codepage given by @var{codepage}. +## The string @var{codepage} must be an identifier of a valid codepage. +## Examples for valid codepages are "ISO 8859-1", "Latin-1" or "Shift-JIS". +## If @var{codepage} is omitted or empty, the system default codepage is used. +## +## If any of the characters cannot be mapped into the codepage @var{codepage}, +## they are replaced with the appropriate substitution sequence for that +## codepage. +## +## @seealso{native2unicode} +## @end deftypefn + +function native_bytes = unicode2native (utf8_str, codepage) + + ## check input + if (nargin < 1 || nargin > 2) + print_usage (); + endif + + + if (! ischar (utf8_str) || ! 
isvector (utf8_str)) + error ("unicode2native: UTF8_STR must be a character vector.") + endif + + is_column = false; + if (! isrow (utf8_str)) + is_column = true; + utf8_str = utf8_str'; + endif + + if (nargin < 2 || isempty (codepage)) + codepage = 0; + endif + + if (! ischar (codepage) && codepage != 0) + error ("unicode2native: CODEPAGE must be a string or 0.") + endif + + ## pass to internal function + native_bytes = __unicode2native__ (utf8_str, codepage); + + if (is_column) + native_bytes = native_bytes'; + endif + +endfunction + +%!testif(HAVE_LIBUNISTRING) +%! assert (unicode2native ("ЄЅІЇЈЉЊ", "ISO 8859-5"), 164:170); +%!testif(HAVE_LIBUNISTRING) +%! assert (unicode2native (["ЄЅІ" 0 "ЇЈЉЊ"], "ISO 8859-5"), [164:166 0 167:170]); +%!testif(HAVE_LIBUNISTRING) +%! fail ("unicode2native (['ab'; 'cd'])", "UTF8_STR must be a character vector"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("unicode2native ({1 2 3 4})", "UTF8_STR must be a character vector"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("unicode2native ('ЄЅІЇЈЉЊ', 123)", "CODEPAGE must be a string or 0"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("unicode2native ('a', 'foo')", +%! "Error .* converting from UTF-8 to codepage 'foo'"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("unicode2native ()", "Invalid call"); +%!testif(HAVE_LIBUNISTRING) +%! fail ("unicode2native ('a', 'Latin-1', 'test')", "Invalid call"); + \ No newline at end of file