changeset 23122:e310b5b6da6f

Add functions native2unicode and unicode2native (bug #49842). * native2unicode.m, unicode2native.m: Add new functions that wrap __native2unicode__ and __unicode2native__ with input conversions and checks. * strfns.cc (F__native2unicode__, F__unicode2native__): New functions. * bootstrap.conf (gnulib_modules): Add libunistring to the list. * __unimplemented.m: Delete names from list of missing functions. * scripts/strings/module.mk: Update. * strings.txi: Add doc strings to manual.
author Markus Mützel <markus.muetzel@gmx.de>
date Sun, 22 Jan 2017 13:58:57 +0100
parents 0fe35c34fdc2
children c6ca5fe1505c
files bootstrap.conf doc/interpreter/strings.txi libinterp/corefcn/strfns.cc scripts/help/__unimplemented__.m scripts/strings/module.mk scripts/strings/native2unicode.m scripts/strings/unicode2native.m
diffstat 7 files changed, 297 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bootstrap.conf	Tue Jan 31 06:22:13 2017 -0500
+++ b/bootstrap.conf	Sun Jan 22 13:58:57 2017 +0100
@@ -52,6 +52,7 @@
   glob
   isatty
   largefile
+  libunistring
   link
   lstat
   malloc-gnu
--- a/doc/interpreter/strings.txi	Tue Jan 31 06:22:13 2017 -0500
+++ b/doc/interpreter/strings.txi	Sun Jan 22 13:58:57 2017 +0100
@@ -506,6 +506,10 @@
 
 @DOCSTRING(toupper)
 
+@DOCSTRING(unicode2native)
+
+@DOCSTRING(native2unicode)
+
 @DOCSTRING(do_string_escapes)
 
 @DOCSTRING(undo_string_escapes)
--- a/libinterp/corefcn/strfns.cc	Tue Jan 31 06:22:13 2017 -0500
+++ b/libinterp/corefcn/strfns.cc	Sun Jan 22 13:58:57 2017 +0100
@@ -29,6 +29,10 @@
 #include <queue>
 #include <sstream>
 
+#ifdef HAVE_LIBUNISTRING
+#  include <uniconv.h>
+#endif
+
 #include "dMatrix.h"
 
 #include "Cell.h"
@@ -730,6 +734,99 @@
 %!assert (strncmpi ("abc123", "ABC456", 3), true)
 */
 
+DEFUN (__native2unicode__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{utf8_str} =} __native2unicode__ (@var{native_bytes}, @var{codepage})
+Convert byte stream @var{native_bytes} to UTF-8 using @var{codepage}.
+
+
+@seealso{native2unicode, __unicode2native__}
+@end deftypefn */)
+{
+#ifdef HAVE_LIBUNISTRING
+  int nargin = args.length ();
+
+  if (nargin < 1 || nargin > 2)
+    print_usage ();
+
+  if (args(0).is_string ())
+    return ovl(args(0));
+
+  // codepage
+  const char *codepage = locale_charset ();
+  string_vector tmp; 
+  if (! args(1).is_numeric_type ())
+    {
+      tmp = args(1).string_vector_value ();
+      codepage = tmp(0).c_str ();
+    }
+
+  // convert byte stream with local encoding to UTF-8
+  charNDArray native_bytes = args(0).char_array_value ();
+  size_t length;
+  char *utf8_str = reinterpret_cast<char *>
+                   (u8_conv_from_encoding (codepage, iconveh_question_mark,
+                                           native_bytes.fortran_vec (),
+                                           native_bytes.numel (), NULL,
+                                           NULL, &length));
+  if (utf8_str == NULL)
+    error("native2unicode: Error '%s' converting from codepage '%s' to UTF-8.",
+          std::strerror (errno), codepage);
+
+  std::string ret_val = std::string (utf8_str, length);
+  free (utf8_str);
+  return ovl (charNDArray (ret_val));
+#else
+  octave_unused_parameter (args);
+
+  err_disabled_feature ("__native2unicode__", "libunistring");
+#endif
+}
+
+DEFUN (__unicode2native__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{native_bytes} =} __unicode2native__ (@var{utf8_str}, @var{codepage})
+Convert UTF-8 string @var{utf8_str} to byte stream @var{native_bytes} using
+@var{codepage}.
+
+
+@seealso{unicode2native, __native2unicode__}
+@end deftypefn */)
+{
+#ifdef HAVE_LIBUNISTRING
+  int nargin = args.length ();
+
+  if (nargin != 2)
+    print_usage ();
+
+  // codepage
+  const char *codepage = locale_charset ();
+  string_vector tmp; 
+  if (! args(1).is_numeric_type ())
+    {
+      tmp = args(1).string_vector_value ();
+      codepage = tmp(0).c_str ();
+    }
+
+  // convert UTF-8 string vector to byte-stream with local encoding
+  charNDArray utf8_str = args(0).char_array_value ();
+  size_t length;
+  char *native_bytes = u8_conv_to_encoding (codepage, iconveh_question_mark,
+                            reinterpret_cast<uint8_t*> (utf8_str.fortran_vec ()),
+                            utf8_str.numel (), NULL, NULL, &length);
+  if (native_bytes == NULL)
+    error("native2unicode: Error '%s' converting from UTF-8 to codepage '%s'.",
+          std::strerror (errno), codepage);
+
+  std::string ret_val = std::string (native_bytes, length);
+  free (native_bytes);
+  return ovl (NDArray (ret_val));
+#else
+  octave_unused_parameter (args);
+
+  err_disabled_feature ("__unicode2native__", "libunistring");
+#endif
+}
 DEFUN (list_in_columns, args, ,
        doc: /* -*- texinfo -*-
 @deftypefn {} {} list_in_columns (@var{arg}, @var{width}, @var{prefix})
--- a/scripts/help/__unimplemented__.m	Tue Jan 31 06:22:13 2017 -0500
+++ b/scripts/help/__unimplemented__.m	Sun Jan 22 13:58:57 2017 +0100
@@ -723,7 +723,6 @@
   "multibandread",
   "multibandwrite",
   "NaT",
-  "native2unicode",
   "nccreate",
   "ncdisp",
   "ncinfo",
@@ -827,7 +826,6 @@
   "uitoolbar",
   "uiwait",
   "undocheckout",
-  "unicode2native",
   "unloadlibrary",
   "unmesh",
   "unstack",
--- a/scripts/strings/module.mk	Tue Jan 31 06:22:13 2017 -0500
+++ b/scripts/strings/module.mk	Sun Jan 22 13:58:57 2017 +0100
@@ -15,6 +15,7 @@
   scripts/strings/isletter.m \
   scripts/strings/isstrprop.m \
   scripts/strings/mat2str.m \
+  scripts/strings/native2unicode.m \
   scripts/strings/ostrsplit.m \
   scripts/strings/regexptranslate.m \
   scripts/strings/rindex.m \
@@ -29,6 +30,7 @@
   scripts/strings/strtrim.m \
   scripts/strings/strtrunc.m \
   scripts/strings/substr.m \
+  scripts/strings/unicode2native.m \
   scripts/strings/untabify.m \
   scripts/strings/validatestring.m
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/strings/native2unicode.m	Sun Jan 22 13:58:57 2017 +0100
@@ -0,0 +1,104 @@
+## Copyright (C) 2016 Markus Mützel
+##
+## This file is part of Octave.
+##
+## Octave is free software; you can redistribute it and/or modify it
+## under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 3 of the License, or
+## (at your option) any later version.
+##
+## Octave is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Octave; see the file COPYING.  If not, see
+## <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn  {} {@var{utf8_str} =} native2unicode (@var{native_bytes}, @var{codepage})
+## @deftypefnx {} {@var{utf8_str} =} native2unicode (@var{native_bytes})
+## Convert byte stream @var{native_bytes} to UTF-8 using @var{codepage}.
+##
+## The numbers in the vector @var{native_bytes} are rounded and clipped to
+## integers between 0 and 255.  This byte stream is then mapped into the
+## codepage given by the string @var{codepage} and returned in the string
+## @var{utf8_str}.  Octave uses UTF-8 as its internal encoding.
+## The string @var{codepage} must be an identifier of a valid codepage.
+## Examples for valid codepages are "ISO 8859-1", "Latin-1" or "Shift-JIS".
+## If @var{codepage} is omitted or empty, the system default codepage is used.
+##
+## If @var{native_bytes} is a string vector, it is returned as is.
+##
+## @seealso{unicode2native}
+## @end deftypefn
+
+function utf8_str = native2unicode (native_bytes, codepage)
+
+  ## check input
+  if (nargin < 1 || nargin > 2)
+    print_usage ();
+  endif
+
+  if (ischar (native_bytes))
+    utf8_str = native_bytes;
+    return
+  endif
+
+  if (! isnumeric (native_bytes) || ! isvector (native_bytes))
+    error ("native2unicode: NATIVE_BYTES must be a numeric vector.")
+  endif
+
+  is_column = false;
+  if (! isrow (native_bytes))
+    is_column = true;
+    native_bytes = native_bytes';
+  endif
+
+  if (nargin < 2 || isempty (codepage))
+    codepage = 0;
+  endif
+
+  if (! ischar (codepage) && codepage != 0)
+    error ("native2unicode: CODEPAGE must be a string or 0.")
+  endif
+
+  native_bytes = round (native_bytes);
+  native_bytes(native_bytes < 0) = 0;
+  native_bytes(native_bytes > 255) = 255;
+
+  ## pass to internal function
+  utf8_str = __native2unicode__ (native_bytes, codepage);
+
+  if (is_column)
+    utf8_str = utf8_str';
+  endif
+
+endfunction
+
+%!testif(HAVE_LIBUNISTRING)
+%! assert (double (native2unicode (164:170, 'ISO 8859-5')),
+%!         [208 132 208 133 208 134 208 135 208 136 208 137 208 138]);  # "ЄЅІЇЈЉЊ"
+%!testif(HAVE_LIBUNISTRING)
+%! assert (double (native2unicode ([164:166 0 167:170], 'ISO 8859-5')),
+%!         [208 132 208 133 208 134 0 208 135 208 136 208 137 208 138]);  # ["ЄЅІ" 0 "ЇЈЉЊ"]
+%!testif(HAVE_LIBUNISTRING)
+%! assert (native2unicode ("foobar"), "foobar");
+%!testif(HAVE_LIBUNISTRING)
+%! assert (double (native2unicode ([0 0 120.3 0 0 122.6 0 0])),
+%!         [0 0 120 0 0 123 0 0]);
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("native2unicode ([1 2; 3 4])", "NATIVE_BYTES must be a numeric vector");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("native2unicode ({1 2 3 4})", "NATIVE_BYTES must be a numeric vector");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("native2unicode (164:170, 123)", "CODEPAGE must be a string or 0");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("native2unicode (234, 'foo')", 
+%!       "Error .* converting from codepage 'foo' to UTF-8");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("native2unicode ()", "Invalid call");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("native2unicode (1, 'Latin-1', 'test')", "Invalid call");
+ 
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/strings/unicode2native.m	Sun Jan 22 13:58:57 2017 +0100
@@ -0,0 +1,89 @@
+## Copyright (C) 2016 Markus Mützel
+##
+## This file is part of Octave.
+##
+## Octave is free software; you can redistribute it and/or modify it
+## under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 3 of the License, or
+## (at your option) any later version.
+##
+## Octave is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Octave; see the file COPYING.  If not, see
+## <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn  {} {@var{native_bytes} =} unicode2native (@var{utf8_str}, @var{codepage})
+## @deftypefnx {} {@var{native_bytes} =} unicode2native (@var{utf8_str})
+## Convert UTF-8 string @var{utf8_str} to byte stream using @var{codepage}.
+##
+## The character vector @var{utf8_str} is converted to a byte stream
+## @var{native_bytes} using the codepage given by @var{codepage}.
+## The string @var{codepage} must be an identifier of a valid codepage.
+## Examples for valid codepages are "ISO 8859-1", "Latin-1" or "Shift-JIS".
+## If @var{codepage} is omitted or empty, the system default codepage is used.
+##
+## If any of the characters cannot be mapped into the codepage @var{codepage},
+## they are replaced with the appropriate substitution sequence for that
+## codepage.
+##
+## @seealso{native2unicode}
+## @end deftypefn
+
+function native_bytes = unicode2native (utf8_str, codepage)
+
+  ## check input
+  if (nargin < 1 || nargin > 2)
+    print_usage ();
+  endif
+
+
+  if (! ischar (utf8_str) || ! isvector (utf8_str))
+    error ("unicode2native: UTF8_STR must be a character vector.")
+  endif
+
+  is_column = false;
+  if (! isrow (utf8_str))
+    is_column = true;
+    utf8_str = utf8_str';
+  endif
+
+  if (nargin < 2 || isempty (codepage))
+    codepage = 0;
+  endif
+
+  if (! ischar (codepage) && codepage != 0)
+    error ("unicode2native: CODEPAGE must be a string or 0.")
+  endif
+
+  ## pass to internal function
+  native_bytes = __unicode2native__ (utf8_str, codepage);
+
+  if (is_column)
+    native_bytes = native_bytes';
+  endif
+
+endfunction
+
+%!testif(HAVE_LIBUNISTRING)
+%! assert (unicode2native ("ЄЅІЇЈЉЊ", "ISO 8859-5"), 164:170);
+%!testif(HAVE_LIBUNISTRING)
+%! assert (unicode2native (["ЄЅІ" 0 "ЇЈЉЊ"], "ISO 8859-5"), [164:166 0 167:170]);
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("unicode2native (['ab'; 'cd'])", "UTF8_STR must be a character vector");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("unicode2native ({1 2 3 4})", "UTF8_STR must be a character vector");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("unicode2native ('ЄЅІЇЈЉЊ', 123)", "CODEPAGE must be a string or 0");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("unicode2native ('a', 'foo')",
+%!       "Error .* converting from UTF-8 to codepage 'foo'");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("unicode2native ()", "Invalid call");
+%!testif(HAVE_LIBUNISTRING)
+%! fail ("unicode2native ('a', 'Latin-1', 'test')", "Invalid call");
+ 
\ No newline at end of file