comparison liboctave/wrappers/uniconv-wrappers.c @ 32072:f7206b6577c2 stable

unicode2native: Fix conversion to UTF-16 (bug #64139). * liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_to_encoding_intern): Avoid appending a zero-byte when converting to UTF-* to avoid having to strip a varying number of bytes after the conversion. * scripts/strings/unicode2native.m: Add test for conversion to UTF-16.
author Markus Mützel <markus.muetzel@gmx.de>
date Wed, 03 May 2023 20:43:36 +0200
parents 470134b3fc28
children 24752aa8be11 e2911d0176dc
comparison
equal deleted inserted replaced
32070:bc46d7c2768f 32072:f7206b6577c2
56 { 56 {
57 // FIXME: It looks like the input to u8_conv_to_encoding must be at least 57 // FIXME: It looks like the input to u8_conv_to_encoding must be at least
58 // four bytes and zero-terminated to work correctly. Zero-pad input. 58 // four bytes and zero-terminated to work correctly. Zero-pad input.
59 // Should this be fixed in gnulib or iconv instead? 59 // Should this be fixed in gnulib or iconv instead?
60 size_t minlen = 4; 60 size_t minlen = 4;
61 size_t padlen = (srclen > minlen ? srclen : minlen) + 1; 61 size_t padlen = (srclen > minlen ? srclen : minlen);
62 uint8_t *u8_str = (uint8_t *) malloc (padlen); 62
63 memcpy (u8_str, src, srclen); 63 // Do not zero-terminate when the output encoding is a UTF encoding, i.e.,
64 for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++) 64 // when a zero character might be converted to more than a single byte.
65 u8_str[srclen+i_pad] = 0; 65 if ((tocode[0] != 'u' && tocode[0] != 'U')
66 || (tocode[1] != 't' && tocode[1] != 'T')
67 || (tocode[2] != 'f' && tocode[2] != 'F'))
68 padlen++;
69
70 uint8_t *u8_str;
71 const uint8_t *cu8_str;
72 if (srclen < padlen)
73 {
74 u8_str = (uint8_t *) malloc (padlen);
75 memcpy (u8_str, src, srclen);
76 for (size_t i_pad = 0; i_pad < padlen-srclen; i_pad++)
77 u8_str[srclen+i_pad] = 0;
78 cu8_str = u8_str;
79 }
80 else
81 cu8_str = src;
66 82
67 // Convert from UTF-8 to output encoding 83 // Convert from UTF-8 to output encoding
68 char *ret = u8_conv_to_encoding (tocode, handler, u8_str, padlen, 84 char *ret = u8_conv_to_encoding (tocode, handler, cu8_str, padlen,
69 offsets, NULL, lengthp); 85 offsets, NULL, lengthp);
70 free ((void *) u8_str); 86
87 if (srclen > padlen)
88 free ((void *) u8_str);
71 89
72 // FIXME: This assumes that "\0" is converted to a single byte. This might 90 // FIXME: This assumes that "\0" is converted to a single byte. This might
73 // not be true for some exotic output encodings (like UTF-7?). 91 // not be true for some exotic output encodings (like UTF-7?).
74 *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen)); 92 *lengthp = (*lengthp <= (padlen-srclen) ? 0 : *lengthp - (padlen-srclen));
75 93
76 return ret; 94 return ret;
77 } 95 }
78 96
79 char * 97 char *
80 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src, 98 octave_u8_conv_to_encoding (const char *tocode, const uint8_t *src,