octave: liboctave/wrappers/uniconv-wrappers.c comparison

comparison liboctave/wrappers/uniconv-wrappers.c @ 32072:f7206b6577c2 stable

unicode2native: Fix conversion to UTF-16 (bug #64139). * liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern): Do not append a zero byte when converting to UTF-*, so that a varying number of bytes does not have to be stripped after the conversion. * scripts/strings/unicode2native.m: Add test for conversion to UTF-16.

author	Markus Mützel <markus.muetzel@gmx.de>
date	Wed, 03 May 2023 20:43:36 +0200
parents	470134b3fc28
children	24752aa8be11 e2911d0176dc

comparison

equal deleted inserted replaced

-:bc46d7c2768f
+:f7206b6577c2
 {
 // FIXME: It looks like the input to u8_conv_to_encoding must be at least
 //        four bytes and zero-terminated to work correctly.  Zero-pad input.
 //        Should this be fixed in gnulib or iconv instead?
 size_t minlen = 4;
-size_t padlen = (srclen > minlen ? srclen : minlen) + 1;
+size_t padlen = (srclen > minlen ? srclen : minlen);
-uint8_t *u8_str = (uint8_t *) malloc (padlen);
-memcpy (u8_str, src, srclen);
+// Do not zero-terminate when the output encoding is a UTF encoding, i.e.,
-for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++)
+// when a code unit of the output may be wider than a single byte.
-u8_str[srclen+i_pad] = 0;
+if ((tocode[0] != 'u' && tocode[0] != 'U')
+|| (tocode[1] != 't' && tocode[1] != 'T')
+|| (tocode[2] != 'f' && tocode[2] != 'F'))
+padlen++;
+uint8_t *u8_str;
+const uint8_t *cu8_str;
+if (srclen < padlen)
+{
+u8_str = (uint8_t *) malloc (padlen);
+memcpy (u8_str, src, srclen);
+for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++)
+u8_str[srclen+i_pad] = 0;
+cu8_str = u8_str;
+}
+else
+cu8_str = src;
 // Convert from UTF-8 to output encoding
-char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen,
+char *ret = u8_conv_to_encoding (tocode, handler, cu8_str, padlen,
 offsets, NULL, lengthp);
-free ((void *) u8_str);
+if (srclen > padlen)
+free ((void *) u8_str);
 // FIXME: This assumes that "\0" is converted to a single byte.  This might
 //        not be true for some exotic output encodings (like UTF-7?).
 *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen));
 return ret;
 }
 char *
 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src,

Mercurial > octave

comparison liboctave/wrappers/uniconv-wrappers.c @ 32072:f7206b6577c2 stable