# HG changeset patch # User Markus Mützel # Date 1646511658 -3600 # Node ID 0826c503f294e879fb736b4162356adf2f05ff7b # Parent 7a647311ba8a5b9e47cdc5523dca393042aabbd6 Encoding facet based on gnulib uniconv for STL iostreams (bug #61839). * liboctave/util/oct-string.h, liboctave/util/oct-string.cc (codecvt_u8): Add encoding facet based on gnulib uniconv for STL iostreams. * liboctave/wrappers/uniconv-wrappers.h, liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_from_encoding_offsets, octave_u8_conv_to_encoding_offsets): Add new wrappers. * libinterp/corefcn/oct-stream.h, libinterp/corefcn/oct-stream.cc (octave::stream): Wrap output stream with encoding facet. * libinterp/corefcn/utils.h, libinterp/corefcn/utils.cc (format, vformat): Remove functions that are no longer needed. * libinterp/corefcn/file-io.cc: Add BIST. diff -r 7a647311ba8a -r 0826c503f294 libinterp/corefcn/file-io.cc --- a/libinterp/corefcn/file-io.cc Sun Mar 06 18:20:05 2022 -0800 +++ b/libinterp/corefcn/file-io.cc Sat Mar 05 21:20:58 2022 +0100 @@ -904,6 +904,34 @@ return puts_internal (interp, who, args); } +/* +## Check if text is correctly converted to output encoding +%!test <*61839> +%! str = "aäöu"; # string with non-ASCII characters +%! fname = tempname (); +%! fid = fopen (fname, "wt", "n", "ISO-8859-1"); +%! unwind_protect +%! fprintf (fid, '%s\n', str); +%! fdisp (fid, str); +%! fputs (fid, str); +%! fclose (fid); +%! ## re-open file for reading in binary mode +%! fid = fopen (fname, "rb"); +%! fb = fread (fid); +%! fclose (fid); +%! ## check file content +%! encoded = [97 228 246 117]; # original string in ISO-8859-1 encoding +%! if (ispc ()) +%! eol = double ("\r\n"); +%! else +%! eol = double ("\n"); +%! endif +%! assert (fb.', [encoded eol encoded eol encoded]) +%! unwind_protect_cleanup +%! unlink (fname); +%! end_unwind_protect +*/ + DEFMETHOD (puts, interp, args, , doc: /* -*- texinfo -*- @deftypefn {} {} puts (@var{string}) @@ -2277,6 +2305,7 @@ %! fid = fopen (f, "w+", "n", "iso-8859-1"); %! unwind_protect %! fprintf (fid, "abc,äöü\n"); +%! fflush (fid); %! fseek (fid, 0, "bof"); %! obs = textscan (fid, "%s", "delimiter", ","); %! fclose (fid); diff -r 7a647311ba8a -r 0826c503f294 libinterp/corefcn/oct-stream.cc --- a/libinterp/corefcn/oct-stream.cc Sun Mar 06 18:20:05 2022 -0800 +++ b/libinterp/corefcn/oct-stream.cc Sat Mar 05 21:20:58 2022 +0100 @@ -4045,7 +4045,7 @@ base_stream::clearerr (void) { std::istream *is = input_stream (); - std::ostream *os = output_stream (); + std::ostream *os = preferred_output_stream (); if (is) is->clear (); @@ -5425,7 +5425,7 @@ { int retval = -1; - std::ostream *os = output_stream (); + std::ostream *os = preferred_output_stream (); if (! os) invalid_operation ("fflush", "writing"); @@ -5635,24 +5635,23 @@ template static int - do_printf_conv (std::ostream& os, const std::string& encoding, - const char *fmt, int nsa, int sa_1, int sa_2, T arg, - const std::string& who) + do_printf_conv (std::ostream& os, const char *fmt, int nsa, int sa_1, + int sa_2, T arg, const std::string& who) { int retval = 0; switch (nsa) { case 2: - retval = format (os, encoding, fmt, sa_1, sa_2, arg); + retval = format (os, fmt, sa_1, sa_2, arg); break; case 1: - retval = format (os, encoding, fmt, sa_1, arg); + retval = format (os, fmt, sa_1, arg); break; case 0: - retval = format (os, encoding, fmt, arg); + retval = format (os, fmt, arg); break; default: @@ -5666,7 +5665,7 @@ static std::size_t do_printf_string (std::ostream& os, const printf_format_elt *elt, int nsa, int sa_1, int sa_2, const std::string& arg, - const std::string& encoding, const std::string& who) + const std::string& who) { if (nsa > 2) ::error ("%s: internal error handling format", who.c_str ()); @@ -5680,12 +5679,6 @@ std::size_t prec = (nsa > 1 ? sa_2 : (elt->prec == -1 ? len : elt->prec)); std::string print_str = prec < arg.length () ? arg.substr (0, prec) : arg; - if (encoding.compare ("utf-8")) - { - std::size_t src_len = print_str.length (); - print_str = string::u8_to_encoding (who, print_str, encoding); - len -= src_len - print_str.length (); - } std::size_t fw = (nsa > 0 ? sa_1 : (elt->fw == -1 ? len : elt->fw)); @@ -5812,8 +5805,8 @@ tval = (lo_ieee_is_NA (dval) ? "NA" : "NaN"); } - retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, sa_1, - sa_2, tval, who); + retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, tval, + who); } else { @@ -5832,8 +5825,8 @@ // Insert "long" modifier. tfmt.replace (tfmt.rfind (type), 1, llmod + type); - retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, - sa_1, sa_2, tval.value (), who); + retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, + tval.value (), who); } else { @@ -5841,8 +5834,8 @@ double dval = val.double_value (true); - retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, - sa_1, sa_2, dval, who); + retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, + dval, who); } break; @@ -5854,8 +5847,8 @@ // Insert "long" modifier. tfmt.replace (tfmt.rfind (type), 1, llmod + type); - retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, - sa_1, sa_2, tval.value (), who); + retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, + tval.value (), who); } else { @@ -5863,8 +5856,8 @@ double dval = val.double_value (true); - retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, - sa_1, sa_2, dval, who); + retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, + dval, who); } break; @@ -5873,8 +5866,8 @@ { double dval = val.double_value (true); - retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, - sa_1, sa_2, dval, who); + retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, + dval, who); } break; @@ -5906,7 +5899,7 @@ octave_idx_type m_nconv = fmt_list.num_conversions (); - std::ostream *osp = output_stream (); + std::ostream *osp = preferred_output_stream (); if (! osp) invalid_operation (who, "writing"); @@ -5959,18 +5952,12 @@ if (elt->type == '%') { - if (encoding ().compare ("utf-8")) - os << string::u8_to_encoding (who, "%", encoding ()); - else - os << '%'; + os << '%'; retval++; } else if (elt->args == 0 && ! elt->text.empty ()) { - if (encoding ().compare ("utf-8")) - os << string::u8_to_encoding (who, elt->text, encoding ()); - else - os << elt->text; + os << elt->text; retval += (elt->text.length ()); } else if (elt->type == 's' || elt->type == 'c') @@ -5984,8 +5971,7 @@ std::string sval = val.string_value (); retval += do_printf_string (os, elt, nsa, sa_1, - sa_2, sval, encoding (), - who); + sa_2, sval, who); } else retval += do_numeric_printf_conv (os, elt, nsa, sa_1, @@ -6042,7 +6028,7 @@ { int retval = -1; - std::ostream *osp = output_stream (); + std::ostream *osp = preferred_output_stream (); if (! osp) invalid_operation (who, "writing"); @@ -6050,10 +6036,7 @@ { std::ostream& os = *osp; - if (encoding ().compare ("utf-8")) - os << string::u8_to_encoding (who, s, encoding ()); - else - os << s; + os << s; if (! os) error (who, "write error"); @@ -6371,7 +6354,10 @@ stream::close (void) { if (stream_ok ()) + { + m_rep->flush (); m_rep->close (); + } } template diff -r 7a647311ba8a -r 0826c503f294 libinterp/corefcn/oct-stream.h --- a/libinterp/corefcn/oct-stream.h Sun Mar 06 18:20:05 2022 -0800 +++ b/libinterp/corefcn/oct-stream.h Sat Mar 05 21:20:58 2022 +0100 @@ -35,6 +35,8 @@ #include #include +#include "oct-string.h" + // These only appear as reference arguments or return values. class Cell; @@ -73,7 +75,8 @@ mach_info::float_format ff = mach_info::native_float_format (), const std::string& encoding = "utf-8") : m_mode (arg_md), m_flt_fmt (ff), m_encoding (encoding), - m_fail (false), m_open_state (true), m_errmsg () + m_conv_ostream (nullptr), m_fail (false), m_open_state (true), + m_errmsg () { } // No copying! @@ -115,6 +118,31 @@ virtual std::ostream * output_stream (void) { return nullptr; } + // Return either the original output stream or one wrapped with the + // encoding facet. + + std::ostream * preferred_output_stream (void) + { + if (! m_encoding.compare ("utf-8")) + return output_stream (); + + if (m_conv_ostream) + return m_conv_ostream; + + // wrap the output stream with encoding conversion facet + std::ostream *os = output_stream (); + if (os && *os) + { + convfacet_u8 *facet = new convfacet_u8 (m_encoding); + std::wbuffer_convert *converter + = new std::wbuffer_convert (os->rdbuf (), + facet); + m_conv_ostream = new std::ostream (converter); + } + + return (m_conv_ostream ? m_conv_ostream : output_stream ()); + } + // Return TRUE if this stream is open. bool is_open (void) const { return m_open_state; } @@ -183,6 +211,16 @@ // Code page std::string m_encoding; + // encoding conversion facet + typedef string::deletable_facet convfacet_u8; + + std::wbuffer_convert *m_converter; + + // wrappers for encoding conversion + // std::istream *m_conv_istream; + + std::ostream *m_conv_ostream; + // TRUE if an error has occurred. bool m_fail; @@ -415,7 +453,7 @@ std::ostream * output_stream (void) { - return m_rep ? m_rep->output_stream () : nullptr; + return (m_rep ? m_rep->preferred_output_stream () : nullptr); } void clearerr (void) { if (m_rep) m_rep->clearerr (); } diff -r 7a647311ba8a -r 0826c503f294 libinterp/corefcn/utils.cc --- a/libinterp/corefcn/utils.cc Sun Mar 06 18:20:05 2022 -0800 +++ b/libinterp/corefcn/utils.cc Sat Mar 05 21:20:58 2022 +0100 @@ -1492,34 +1492,6 @@ return s.length (); } - std::size_t format (std::ostream& os, const std::string& enc, - const char *fmt, ...) - { - std::size_t retval; - - va_list args; - va_start (args, fmt); - - retval = vformat (os, enc, fmt, args); - - va_end (args); - - return retval; - } - - std::size_t vformat (std::ostream& os, const std::string& enc, - const char *fmt, va_list args) - { - std::string s = vasprintf (fmt, args); - - if (enc.compare ("utf-8")) - os << string::u8_to_encoding ("printf", s, enc); - else - os << s; - - return s.length (); - } - std::string vasprintf (const char *fmt, va_list args) { std::string retval; diff -r 7a647311ba8a -r 0826c503f294 libinterp/corefcn/utils.h --- a/libinterp/corefcn/utils.h Sun Mar 06 18:20:05 2022 -0800 +++ b/libinterp/corefcn/utils.h Sat Mar 05 21:20:58 2022 +0100 @@ -169,15 +169,8 @@ format (std::ostream& os, const char *fmt, ...); extern OCTINTERP_API std::size_t - format (std::ostream& os, const std::string& enc, const char *fmt, ...); - - extern OCTINTERP_API std::size_t vformat (std::ostream& os, const char *fmt, va_list args); - extern OCTINTERP_API std::size_t - vformat (std::ostream& os, const std::string& enc, - const char *fmt, va_list args); - extern OCTINTERP_API std::string vasprintf (const char *fmt, va_list args); diff -r 7a647311ba8a -r 0826c503f294 liboctave/util/oct-string.cc --- a/liboctave/util/oct-string.cc Sun Mar 06 18:20:05 2022 -0800 +++ b/liboctave/util/oct-string.cc Sat Mar 05 21:20:58 2022 +0100 @@ -607,6 +607,90 @@ return num_replacements; } +typedef octave::string::codecvt_u8::InternT InternT; +typedef octave::string::codecvt_u8::ExternT ExternT; +typedef octave::string::codecvt_u8::StateT StateT; + +typename std::codecvt::result +octave::string::codecvt_u8::do_out + (StateT& /* state */, + const InternT* from, const InternT* from_end, const InternT*& from_next, + ExternT* to, ExternT* to_end, ExternT*& to_next) const +{ + if (from_end < from) + return std::codecvt::noconv; + + // Convert from UTF-8 to output encoding + std::size_t srclen = (from_end-from) * sizeof (InternT); + std::size_t lengthp = (to_end-to) * sizeof (ExternT); + const uint8_t *u8_str = reinterpret_cast (from); + char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str, srclen, + &lengthp); + + size_t max = to_end - to; + if (lengthp < max) + max = lengthp; + + // copy conversion result to output + // FIXME: Handle incomplete UTF-8 characters at end of buffer. + std::copy_n (enc_str, max, to); + ::free (enc_str); + + from_next = from + srclen; + to_next = to + max; + + return std::codecvt::ok; +} + +typename std::codecvt::result +octave::string::codecvt_u8::do_in + (StateT& /* state */, + const ExternT* from, const ExternT* from_end, const ExternT*& from_next, + InternT* to, InternT* to_end, InternT*& to_next) const +{ + // Convert from input encoding to UTF-8 + std::size_t srclen = (from_end-from) * sizeof (ExternT); + std::size_t lengthp = (to_end-to) * sizeof (InternT); + const char *enc_str = reinterpret_cast (from); + uint8_t *u8_str = octave_u8_conv_from_encoding (m_enc.c_str (), + enc_str, srclen, &lengthp); + + std::size_t max = to_end - to; + if (lengthp < max) + max = lengthp; + + // copy conversion result to output + std::copy_n (u8_str, max, to); + ::free (u8_str); + + from_next = from + srclen; + to_next = to + max; + + return std::codecvt::ok; +} + +int octave::string::codecvt_u8::do_length + (StateT& /* state */, const ExternT *src, const ExternT *end, + std::size_t max) const +{ + // return number of external characters that produce MAX internal ones + std::size_t srclen = end-src; + std::size_t offsets[srclen]; + std::size_t lengthp = max; + octave_u8_conv_from_encoding_offsets (m_enc.c_str (), src, srclen, offsets, + &lengthp); + std::size_t ext_char; + for (ext_char = 0; ext_char < srclen; ext_char++) + { + if (offsets[ext_char] != static_cast (-1) + && offsets[ext_char] >= max) + break; + } + + return ext_char; +} + + template std::string rational_approx (T val, int len) diff -r 7a647311ba8a -r 0826c503f294 liboctave/util/oct-string.h --- a/liboctave/util/oct-string.h Sun Mar 06 18:20:05 2022 -0800 +++ b/liboctave/util/oct-string.h Sat Mar 05 21:20:58 2022 +0100 @@ -28,6 +28,8 @@ #include "octave-config.h" +#include + #include "oct-cmplx.h" namespace octave @@ -155,6 +157,94 @@ extern OCTAVE_API unsigned int u8_validate (const std::string& who, std::string& in_string, const u8_fallback_type type = U8_REPLACEMENT_CHAR); + + + template + struct + deletable_facet : Facet + { + template + deletable_facet (Args&& ...args) + : Facet (std::forward (args)...) + { } + + // destructor needs to be public + ~deletable_facet () {} + }; + + class + OCTAVE_API + codecvt_u8 : public std::codecvt + { + public: + + // No copying! + + codecvt_u8 (codecvt_u8 &) = delete; + + codecvt_u8& operator = (codecvt_u8 &) = delete; + + codecvt_u8 (const std::string &enc) + : m_enc (enc) + { } + + virtual ~codecvt_u8 () { } + + typedef char InternT; + typedef char ExternT; + typedef std::mbstate_t StateT; + + private: + + OCTAVE_API + typename std::codecvt::result + do_out (StateT& state, + const InternT* from, const InternT* from_end, const InternT*& from_next, + ExternT* to, ExternT* to_end, ExternT*& to_next) const; + + OCTAVE_API + typename std::codecvt::result + do_in (StateT& state, + const ExternT* from, const ExternT* from_end, const ExternT*& from_next, + InternT* to, InternT* to_end, InternT*& to_next) const; + + typename std::codecvt::result + do_unshift (StateT& /* state */, ExternT* to, ExternT* /* to_end */, + ExternT*& to_next) const + { + // FIXME: What is the correct thing to unshift? + // Just reset? + to_next = to; + + return std::codecvt::ok; + } + + int do_encoding () const throw () + { + // return 0 because UTF-8 encoding is variable length + return 0; + } + + bool do_always_noconv () const throw () + { + // return false to indicate non-identity conversion + return false; + } + + OCTAVE_API int + do_length (StateT& state, const ExternT *src, const ExternT *end, + std::size_t max) const; + + int do_max_length() const throw () + { + // For UTF-8, a maximum of 4 bytes are needed for one character. + return 4; + } + + std::string m_enc; + + }; + } } diff -r 7a647311ba8a -r 0826c503f294 liboctave/wrappers/uniconv-wrappers.c --- a/liboctave/wrappers/uniconv-wrappers.c Sun Mar 06 18:20:05 2022 -0800 +++ b/liboctave/wrappers/uniconv-wrappers.c Sat Mar 05 21:20:58 2022 +0100 @@ -72,6 +72,24 @@ src, srclen, NULL, NULL, lengthp); } +uint8_t * +octave_u8_conv_from_encoding_offsets + (const char *fromcode, const char *src, size_t srclen, + size_t *offsets, size_t *lengthp) +{ + return u8_conv_from_encoding (fromcode, iconveh_question_mark, + src, srclen, offsets, NULL, lengthp); +} + +char * +octave_u8_conv_to_encoding_offsets + (const char *tocode, const uint8_t *src, size_t srclen, + size_t *offsets, size_t *lengthp) +{ + return u8_conv_to_encoding (tocode, iconveh_question_mark, + src, srclen, offsets, NULL, lengthp); +} + char * u8_from_wchar (const wchar_t *wc) { diff -r 7a647311ba8a -r 0826c503f294 liboctave/wrappers/uniconv-wrappers.h --- a/liboctave/wrappers/uniconv-wrappers.h Sun Mar 06 18:20:05 2022 -0800 +++ b/liboctave/wrappers/uniconv-wrappers.h Sat Mar 05 21:20:58 2022 +0100 @@ -57,6 +57,16 @@ octave_u32_conv_to_encoding_strict (const char *tocode, const uint32_t *src, size_t srclen, size_t *lengthp); +extern OCTAVE_API uint8_t * +octave_u8_conv_from_encoding_offsets + (const char *fromcode, const char *src, size_t srclen, + size_t *offsets, size_t *lengthp); + +extern OCTAVE_API char * +octave_u8_conv_to_encoding_offsets + (const char *tocode, const uint8_t *src, size_t srclen, + size_t *offsets, size_t *lengthp); + extern OCTAVE_API char * u8_from_wchar (const wchar_t *wc); extern OCTAVE_API wchar_t * u8_to_wchar (const char *u8_char);