view scripts/strings/unicode2native.m @ 32072:f7206b6577c2 stable

unicode2native: Fix conversion to UTF-16 (bug #64139). * liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern): Avoid appending a zero-byte when converting to UTF-* to avoid having to strip a varying number of bytes after the conversion. * scripts/strings/unicode2native.m: Add test for conversion to UTF-16.
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 03 May 2023 20:43:36 +0200
parents 470134b3fc28
children fab3e312a0b4
line wrap: on
line source

########################################################################
##
## Copyright (C) 2016-2023 The Octave Project Developers
##
## See the file COPYRIGHT.md in the top-level directory of this
## distribution or <https://octave.org/copyright/>.
##
## This file is part of Octave.
##
## Octave is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <https://www.gnu.org/licenses/>.
##
########################################################################

## -*- texinfo -*-
## @deftypefn  {} {@var{native_bytes} =} unicode2native (@var{utf8_str}, @var{codepage})
## @deftypefnx {} {@var{native_bytes} =} unicode2native (@var{utf8_str})
## Convert UTF-8 string @var{utf8_str} to byte stream using @var{codepage}.
##
## The character vector @var{utf8_str} is converted to a byte stream
## @var{native_bytes} using the code page given by @var{codepage}.  The
## string @var{codepage} must be an identifier of a valid code page.
## Examples for valid code pages are @qcode{"ISO-8859-1"},
## @qcode{"Shift-JIS"}, or @qcode{"UTF-16"}.  For a list of supported code
## pages, see @url{https://www.gnu.org/software/libiconv}.  If @var{codepage}
## is omitted or empty, the system default codepage is used.
##
## If any of the characters cannot be mapped into the codepage @var{codepage},
## they are replaced with the appropriate substitution sequence for that
## codepage.
##
## The result @var{native_bytes} is of class @code{uint8}.  If @var{utf8_str}
## is a column vector, the result is transposed to match.  If @var{utf8_str}
## is empty, an empty @code{uint8} array is returned without any conversion.
##
## @seealso{native2unicode}
## @end deftypefn

function native_bytes = unicode2native (utf8_str, codepage = "")

  if (nargin < 1)
    print_usage ();
  endif

  ## For Matlab compatibility, return empty output for empty input.
  ## Note that this shortcut is taken before any validation of CODEPAGE.
  if (isempty (utf8_str))
    native_bytes = uint8 ([]);
    return;
  endif

  if (! ischar (utf8_str) || ! isvector (utf8_str))
    error ("unicode2native: UTF8_STR must be a character vector");
  endif

  ## An empty character array selects the system default codepage.  The
  ## default value "" is a 0x0 array for which isrow() is false, so empty
  ## input must be accepted explicitly in addition to row vectors.
  if (! (ischar (codepage) && (isrow (codepage) || isempty (codepage))))
    error ("unicode2native: CODEPAGE must be a string");
  endif

  native_bytes = __unicode2native__ (utf8_str, codepage);

  ## Return the bytes in the same orientation as the input string.
  if (iscolumn (utf8_str))
    native_bytes = native_bytes.';
  endif

endfunction


%!testif HAVE_ICONV
%! assert (unicode2native ("ЄЅІЇЈЉЊ", "ISO-8859-5"), uint8 (164:170));
%!testif HAVE_ICONV
%! assert (unicode2native (["ЄЅІ" "\0" "ЇЈЉЊ"], "ISO-8859-5"),
%!         uint8 ([164:166 0 167:170]));
%!assert <*60480> (unicode2native (''), uint8 ([]))

# short character arrays with invalid UTF-8
%!testif HAVE_ICONV <*63930>
%! assert (unicode2native (char (230), 'windows-1252'), uint8 (63));
%!testif HAVE_ICONV <*63930>
%! assert (unicode2native (char (249), 'windows-1252'), uint8 (63));
%!testif HAVE_ICONV <*63930>
%! assert (unicode2native (char (230:231), 'windows-1252'), uint8 ([63, 63]));
%!testif HAVE_ICONV <*63930>
%! assert (unicode2native (char (230:234), 'windows-1252'),
%!         uint8 ([63, 63, 63, 63, 63]));
%!testif HAVE_ICONV <*63930>
%! assert (unicode2native (char ([230, 10]), 'windows-1252'),
%!         uint8 ([63, 10]));

# target encoding with code units larger than a byte
%!testif HAVE_ICONV <*64139>
%! assert (typecast (unicode2native ('abcde',
%!                                   ['utf-16', nthargout(3, 'computer'), 'e']),
%!                   'uint16'),
%!         uint16 (97:101));

%!error <Invalid call> unicode2native ()
%!error <called with too many inputs> unicode2native ('a', 'ISO-8859-1', 'test')
%!error <UTF8_STR must be a character vector> unicode2native (['ab'; 'cd'])
%!error <UTF8_STR must be a character vector> unicode2native ({1 2 3 4})
%!error <CODEPAGE must be a string> unicode2native ('ЄЅІЇЈЉЊ', 123)
%!error <CODEPAGE must be a string> unicode2native ('ЄЅІЇЈЉЊ', ['ISO-8859-1']')
%!testif HAVE_ICONV
%! fail ("unicode2native ('a', 'foo')",
%!       "converting from UTF-8 to codepage 'foo'");