Mercurial > octave
comparison liboctave/wrappers/uniconv-wrappers.c @ 32072:f7206b6577c2 stable
unicode2native: Fix conversion to UTF-16 (bug #64139).
* liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern):
Do not append a zero byte when converting to UTF-*, so that a varying number of
bytes no longer has to be stripped after the conversion.
* scripts/strings/unicode2native.m: Add test for conversion to UTF-16.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Wed, 03 May 2023 20:43:36 +0200 |
parents | 470134b3fc28 |
children | 24752aa8be11 e2911d0176dc |
comparison
equal
deleted
inserted
replaced
32070:bc46d7c2768f | 32072:f7206b6577c2 |
---|---|
56 { | 56 { |
57 // FIXME: It looks like the input to u8_conv_to_encoding must be at least | 57 // FIXME: It looks like the input to u8_conv_to_encoding must be at least |
58 // four bytes and zero-terminated to work correctly. Zero-pad input. | 58 // four bytes and zero-terminated to work correctly. Zero-pad input. |
59 // Should this be fixed in gnulib or iconv instead? | 59 // Should this be fixed in gnulib or iconv instead? |
60 size_t minlen = 4; | 60 size_t minlen = 4; |
61 size_t padlen = (srclen > minlen ? srclen : minlen) + 1; | 61 size_t padlen = (srclen > minlen ? srclen : minlen); |
62 uint8_t *u8_str = (uint8_t *) malloc (padlen); | 62 |
63 memcpy (u8_str, src, srclen); | 63 // Do not zero-terminate when the output encoding is a UTF encoding, i.e., |
64 for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) | 64 // the surrogates are different than a byte. |
65 u8_str[srclen+i_pad] = 0; | 65 if ((tocode[0] != 'u' && tocode[0] != 'U') |
66 || (tocode[1] != 't' && tocode[1] != 'T') | |
67 || (tocode[2] != 'f' && tocode[2] != 'F')) | |
68 padlen++; | |
69 | |
70 uint8_t *u8_str; | |
71 const uint8_t *cu8_str; | |
72 if (srclen < padlen) | |
73 { | |
74 u8_str = (uint8_t *) malloc (padlen); | |
75 memcpy (u8_str, src, srclen); | |
76 for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) | |
77 u8_str[srclen+i_pad] = 0; | |
78 cu8_str = u8_str; | |
79 } | |
80 else | |
81 cu8_str = src; | |
66 | 82 |
67 // Convert from UTF-8 to output encoding | 83 // Convert from UTF-8 to output encoding |
68 char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen, | 84 char *ret = u8_conv_to_encoding (tocode, handler, cu8_str, padlen, |
69 offsets, NULL, lengthp); | 85 offsets, NULL, lengthp); |
70 free ((void *) u8_str); | 86 |
87 if (srclen > padlen) | |
88 free ((void *) u8_str); | |
71 | 89 |
72 // FIXME: This assumes that "\0" is converted to a single byte. This might | 90 // FIXME: This assumes that "\0" is converted to a single byte. This might |
73 // not be true for some exotic output encodings (like UTF-7?). | 91 // not be true for some exotic output encodings (like UTF-7?). |
74 *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen)); | 92 *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen)); |
75 | 93 |
76 return ret; | 94 return ret; |
77 } | 95 } |
78 | 96 |
79 char * | 97 char * |
80 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src, | 98 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src, |