changeset 30827:0826c503f294

Encoding facet based on gnulib uniconv for STL iostreams (bug #61839). * liboctave/util/oct-string.h, liboctave/util/oct-string.cc (codecvt_u8): Add encoding facet based on gnulib uniconv for STL iostreams. * liboctave/wrappers/uniconv-wrappers.h, liboctave/wrappers/uniconv-wrappers.c (octave_u8_conv_from_encoding_offsets, octave_u8_conv_to_encoding_offsets): Add new wrappers. * libinterp/corefcn/oct-stream.h, libinterp/corefcn/oct-stream.cc (octave::stream): Wrap output stream with encoding facet. * libinterp/corefcn/utils.h, libinterp/corefcn/utils.cc (format, vformat): Remove functions that are no longer needed. * libinterp/corefcn/file-io.cc: Add BIST.
author Markus Mützel <markus.muetzel@gmx.de>
date Sat, 05 Mar 2022 21:20:58 +0100
parents 7a647311ba8a
children aee523d3036f
files libinterp/corefcn/file-io.cc libinterp/corefcn/oct-stream.cc libinterp/corefcn/oct-stream.h libinterp/corefcn/utils.cc libinterp/corefcn/utils.h liboctave/util/oct-string.cc liboctave/util/oct-string.h liboctave/wrappers/uniconv-wrappers.c liboctave/wrappers/uniconv-wrappers.h
diffstat 9 files changed, 300 insertions(+), 80 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/file-io.cc	Sun Mar 06 18:20:05 2022 -0800
+++ b/libinterp/corefcn/file-io.cc	Sat Mar 05 21:20:58 2022 +0100
@@ -904,6 +904,34 @@
   return puts_internal (interp, who, args);
 }
 
+/*
+## Check if text is correctly converted to output encoding
+%!test <*61839>
+%! str = "aäöu";  # string with non-ASCII characters
+%! fname = tempname ();
+%! fid = fopen (fname, "wt", "n", "ISO-8859-1");
+%! unwind_protect
+%!   fprintf (fid, '%s\n', str);
+%!   fdisp (fid, str);
+%!   fputs (fid, str);
+%!   fclose (fid);
+%!   ## re-open file for reading in binary mode
+%!   fid = fopen (fname, "rb");
+%!   fb = fread (fid);
+%!   fclose (fid);
+%!   ## check file content
+%!   encoded = [97 228 246 117];  # original string in ISO-8859-1 encoding
+%!   if (ispc ())
+%!     eol = double ("\r\n");
+%!   else
+%!     eol = double ("\n");
+%!   endif
+%!   assert (fb.', [encoded eol encoded eol encoded])
+%! unwind_protect_cleanup
+%!   unlink (fname);
+%! end_unwind_protect
+*/
+
 DEFMETHOD (puts, interp, args, ,
            doc: /* -*- texinfo -*-
 @deftypefn  {} {} puts (@var{string})
@@ -2277,6 +2305,7 @@
 %! fid = fopen (f, "w+", "n", "iso-8859-1");
 %! unwind_protect
 %!   fprintf (fid, "abc,äöü\n");
+%!   fflush (fid);
 %!   fseek (fid, 0, "bof");
 %!   obs = textscan (fid, "%s", "delimiter", ",");
 %!   fclose (fid);
--- a/libinterp/corefcn/oct-stream.cc	Sun Mar 06 18:20:05 2022 -0800
+++ b/libinterp/corefcn/oct-stream.cc	Sat Mar 05 21:20:58 2022 +0100
@@ -4045,7 +4045,7 @@
   base_stream::clearerr (void)
   {
     std::istream *is = input_stream ();
-    std::ostream *os = output_stream ();
+    std::ostream *os = preferred_output_stream ();
 
     if (is)
       is->clear ();
@@ -5425,7 +5425,7 @@
   {
     int retval = -1;
 
-    std::ostream *os = output_stream ();
+    std::ostream *os = preferred_output_stream ();
 
     if (! os)
       invalid_operation ("fflush", "writing");
@@ -5635,24 +5635,23 @@
 
   template <typename T>
   static int
-  do_printf_conv (std::ostream& os, const std::string& encoding,
-                  const char *fmt, int nsa, int sa_1, int sa_2, T arg,
-                  const std::string& who)
+  do_printf_conv (std::ostream& os, const char *fmt, int nsa, int sa_1,
+                  int sa_2, T arg, const std::string& who)
   {
     int retval = 0;
 
     switch (nsa)
       {
       case 2:
-        retval = format (os, encoding, fmt, sa_1, sa_2, arg);
+        retval = format (os, fmt, sa_1, sa_2, arg);
         break;
 
       case 1:
-        retval = format (os, encoding, fmt, sa_1, arg);
+        retval = format (os, fmt, sa_1, arg);
         break;
 
       case 0:
-        retval = format (os, encoding, fmt, arg);
+        retval = format (os, fmt, arg);
         break;
 
       default:
@@ -5666,7 +5665,7 @@
   static std::size_t
   do_printf_string (std::ostream& os, const printf_format_elt *elt,
                     int nsa, int sa_1, int sa_2, const std::string& arg,
-                    const std::string& encoding, const std::string& who)
+                    const std::string& who)
   {
     if (nsa > 2)
       ::error ("%s: internal error handling format", who.c_str ());
@@ -5680,12 +5679,6 @@
     std::size_t prec = (nsa > 1 ? sa_2 : (elt->prec == -1 ? len : elt->prec));
 
     std::string print_str = prec < arg.length () ? arg.substr (0, prec) : arg;
-    if (encoding.compare ("utf-8"))
-      {
-        std::size_t src_len = print_str.length ();
-        print_str = string::u8_to_encoding (who, print_str, encoding);
-        len -= src_len - print_str.length ();
-      }
 
     std::size_t fw = (nsa > 0 ? sa_1 : (elt->fw == -1 ? len : elt->fw));
 
@@ -5812,8 +5805,8 @@
               tval = (lo_ieee_is_NA (dval) ? "NA" : "NaN");
           }
 
-        retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa, sa_1,
-                                  sa_2, tval, who);
+        retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2, tval,
+                                  who);
       }
     else
       {
@@ -5832,8 +5825,8 @@
                 // Insert "long" modifier.
                 tfmt.replace (tfmt.rfind (type), 1, llmod + type);
 
-                retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa,
-                                          sa_1, sa_2, tval.value (), who);
+                retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2,
+                                          tval.value (), who);
               }
             else
               {
@@ -5841,8 +5834,8 @@
 
                 double dval = val.double_value (true);
 
-                retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa,
-                                          sa_1, sa_2, dval, who);
+                retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2,
+                                          dval, who);
               }
             break;
 
@@ -5854,8 +5847,8 @@
                 // Insert "long" modifier.
                 tfmt.replace (tfmt.rfind (type), 1, llmod + type);
 
-                retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa,
-                                          sa_1, sa_2, tval.value (), who);
+                retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2,
+                                          tval.value (), who);
               }
             else
               {
@@ -5863,8 +5856,8 @@
 
                 double dval = val.double_value (true);
 
-                retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa,
-                                          sa_1, sa_2, dval, who);
+                retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2,
+                                          dval, who);
               }
             break;
 
@@ -5873,8 +5866,8 @@
             {
               double dval = val.double_value (true);
 
-              retval += do_printf_conv (os, encoding (), tfmt.c_str (), nsa,
-                                        sa_1, sa_2, dval, who);
+              retval += do_printf_conv (os, tfmt.c_str (), nsa, sa_1, sa_2,
+                                        dval, who);
             }
             break;
 
@@ -5906,7 +5899,7 @@
 
     octave_idx_type m_nconv = fmt_list.num_conversions ();
 
-    std::ostream *osp = output_stream ();
+    std::ostream *osp = preferred_output_stream ();
 
     if (! osp)
       invalid_operation (who, "writing");
@@ -5959,18 +5952,12 @@
 
             if (elt->type == '%')
               {
-                if (encoding ().compare ("utf-8"))
-                  os << string::u8_to_encoding (who, "%", encoding ());
-                else
-                  os << '%';
+                os << '%';
                 retval++;
               }
             else if (elt->args == 0 && ! elt->text.empty ())
               {
-                if (encoding ().compare ("utf-8"))
-                  os << string::u8_to_encoding (who, elt->text, encoding ());
-                else
-                  os << elt->text;
+                os << elt->text;
                 retval += (elt->text.length ());
               }
             else if (elt->type == 's' || elt->type == 'c')
@@ -5984,8 +5971,7 @@
                         std::string sval = val.string_value ();
 
                         retval += do_printf_string (os, elt, nsa, sa_1,
-                                                    sa_2, sval, encoding (),
-                                                    who);
+                                                    sa_2, sval, who);
                       }
                     else
                       retval += do_numeric_printf_conv (os, elt, nsa, sa_1,
@@ -6042,7 +6028,7 @@
   {
     int retval = -1;
 
-    std::ostream *osp = output_stream ();
+    std::ostream *osp = preferred_output_stream ();
 
     if (! osp)
       invalid_operation (who, "writing");
@@ -6050,10 +6036,7 @@
       {
         std::ostream& os = *osp;
 
-        if (encoding ().compare ("utf-8"))
-          os << string::u8_to_encoding (who, s, encoding ());
-        else
-          os << s;
+        os << s;
 
         if (! os)
           error (who, "write error");
@@ -6371,7 +6354,10 @@
   stream::close (void)
   {
     if (stream_ok ())
+    {
+      m_rep->flush ();
       m_rep->close ();
+    }
   }
 
   template <typename SRC_T, typename DST_T>
--- a/libinterp/corefcn/oct-stream.h	Sun Mar 06 18:20:05 2022 -0800
+++ b/libinterp/corefcn/oct-stream.h	Sat Mar 05 21:20:58 2022 +0100
@@ -35,6 +35,8 @@
 #include <memory>
 #include <string>
 
+#include "oct-string.h"
+
 // These only appear as reference arguments or return values.
 
 class Cell;
@@ -73,7 +75,8 @@
                  mach_info::float_format ff = mach_info::native_float_format (),
                  const std::string& encoding = "utf-8")
       : m_mode (arg_md), m_flt_fmt (ff), m_encoding (encoding),
-        m_fail (false), m_open_state (true), m_errmsg ()
+        m_conv_ostream (nullptr), m_fail (false), m_open_state (true),
+        m_errmsg ()
     { }
 
     // No copying!
@@ -115,6 +118,31 @@
 
     virtual std::ostream * output_stream (void) { return nullptr; }
 
+    // Stream that text output should be written to.  If the encoding of
+    // this stream is "utf-8", that is output_stream () itself.  Otherwise,
+    // it is a std::ostream (created on first use and cached in
+    // m_conv_ostream) whose buffer passes everything through the
+    // convfacet_u8 facet for m_encoding before handing it on to the buffer
+    // of output_stream ().
+    // NOTE(review): nothing allocated here is released in this function and
+    // m_converter is not assigned -- confirm where ownership is handled.
+
+    std::ostream * preferred_output_stream (void)
+    {
+      if (m_encoding == "utf-8")
+        return output_stream ();
+
+      if (! m_conv_ostream)
+        {
+          std::ostream *raw_os = output_stream ();
+
+          // Only wrap a stream that exists and has not failed.
+          if (raw_os && *raw_os)
+            {
+              typedef std::wbuffer_convert<convfacet_u8, char> conv_buf;
+
+              convfacet_u8 *cvt = new convfacet_u8 (m_encoding);
+              conv_buf *buf = new conv_buf (raw_os->rdbuf (), cvt);
+              m_conv_ostream = new std::ostream (buf);
+            }
+        }
+
+      return (m_conv_ostream ? m_conv_ostream : output_stream ());
+    }
+
     // Return TRUE if this stream is open.
 
     bool is_open (void) const { return m_open_state; }
@@ -183,6 +211,16 @@
     // Code page
     std::string m_encoding;
 
+    // encoding conversion facet
+    typedef string::deletable_facet<string::codecvt_u8> convfacet_u8;
+
+    std::wbuffer_convert<convfacet_u8, char> *m_converter;
+
+    // wrappers for encoding conversion
+    // std::istream *m_conv_istream;
+
+    std::ostream *m_conv_ostream;
+
     // TRUE if an error has occurred.
     bool m_fail;
 
@@ -415,7 +453,7 @@
 
     std::ostream * output_stream (void)
     {
-      return m_rep ? m_rep->output_stream () : nullptr;
+      return (m_rep ? m_rep->preferred_output_stream () : nullptr);
     }
 
     void clearerr (void) { if (m_rep) m_rep->clearerr (); }
--- a/libinterp/corefcn/utils.cc	Sun Mar 06 18:20:05 2022 -0800
+++ b/libinterp/corefcn/utils.cc	Sat Mar 05 21:20:58 2022 +0100
@@ -1492,34 +1492,6 @@
     return s.length ();
   }
 
-  std::size_t format (std::ostream& os, const std::string& enc,
-                      const char *fmt, ...)
-  {
-    std::size_t retval;
-
-    va_list args;
-    va_start (args, fmt);
-
-    retval = vformat (os, enc, fmt, args);
-
-    va_end (args);
-
-    return retval;
-  }
-
-  std::size_t vformat (std::ostream& os, const std::string& enc,
-                       const char *fmt, va_list args)
-  {
-    std::string s = vasprintf (fmt, args);
-
-    if (enc.compare ("utf-8"))
-      os << string::u8_to_encoding ("printf", s, enc);
-    else
-      os << s;
-
-    return s.length ();
-  }
-
   std::string vasprintf (const char *fmt, va_list args)
   {
     std::string retval;
--- a/libinterp/corefcn/utils.h	Sun Mar 06 18:20:05 2022 -0800
+++ b/libinterp/corefcn/utils.h	Sat Mar 05 21:20:58 2022 +0100
@@ -169,15 +169,8 @@
   format (std::ostream& os, const char *fmt, ...);
 
   extern OCTINTERP_API std::size_t
-  format (std::ostream& os, const std::string& enc, const char *fmt, ...);
-
-  extern OCTINTERP_API std::size_t
   vformat (std::ostream& os, const char *fmt, va_list args);
 
-  extern OCTINTERP_API std::size_t
-  vformat (std::ostream& os, const std::string& enc,
-           const char *fmt, va_list args);
-
   extern OCTINTERP_API std::string
   vasprintf (const char *fmt, va_list args);
 
--- a/liboctave/util/oct-string.cc	Sun Mar 06 18:20:05 2022 -0800
+++ b/liboctave/util/oct-string.cc	Sat Mar 05 21:20:58 2022 +0100
@@ -607,6 +607,90 @@
   return num_replacements;
 }
 
+typedef octave::string::codecvt_u8::InternT InternT;
+typedef octave::string::codecvt_u8::ExternT ExternT;
+typedef octave::string::codecvt_u8::StateT StateT;
+
+// Convert the UTF-8 bytes in [FROM, FROM_END) to the encoding m_enc and
+// store the result in [TO, TO_END).
+//
+// On return, FROM_NEXT points one past the last input byte that was
+// consumed and TO_NEXT one past the last output byte that was written.
+// The return value is
+//   ok       if all input was converted,
+//   partial  if the output range is too small for the complete result; in
+//            that case only input up to a UTF-8 character boundary is
+//            consumed and the caller has to pass the rest again,
+//   error    if the conversion failed,
+//   noconv   if FROM_END lies before FROM.
+
+typename std::codecvt<InternT, ExternT, StateT>::result
+octave::string::codecvt_u8::do_out
+  (StateT& /* state */,
+   const InternT* from, const InternT* from_end, const InternT*& from_next,
+   ExternT* to, ExternT* to_end, ExternT*& to_next) const
+{
+  typedef std::codecvt<InternT, ExternT, StateT> base_type;
+
+  // Never leave the output arguments unset, whichever path is taken below.
+  from_next = from;
+  to_next = to;
+
+  if (from_end < from)
+    return base_type::noconv;
+
+  if (from_end == from)
+    return base_type::ok;
+
+  // InternT and ExternT are both char, so element counts are byte counts.
+  const std::size_t total = from_end - from;
+  const std::size_t room = (to_end > to ? to_end - to : 0);
+
+  const uint8_t *u8_str = reinterpret_cast<const uint8_t *> (from);
+
+  // FIXME: Handle incomplete UTF-8 characters at end of buffer.
+  std::size_t srclen = total;
+  while (srclen > 0)
+    {
+      std::size_t enc_len = 0;
+      char *enc_str = octave_u8_conv_to_encoding (m_enc.c_str (), u8_str,
+                                                  srclen, &enc_len);
+
+      // A null pointer signals a failed conversion; there is nothing to
+      // copy (and nothing to free) in that case.
+      if (! enc_str)
+        return base_type::error;
+
+      if (enc_len <= room)
+        {
+          std::copy_n (enc_str, enc_len, to);
+          ::free (enc_str);
+
+          from_next = from + srclen;
+          to_next = to + enc_len;
+
+          return (srclen == total ? base_type::ok : base_type::partial);
+        }
+
+      ::free (enc_str);
+
+      // The result does not fit.  Rather than truncating it (which would
+      // silently drop output and might cut a multibyte character), retry
+      // with half of the input, moved back to the start of a UTF-8
+      // character (continuation bytes have the bit pattern 10xxxxxx).
+      srclen /= 2;
+      while (srclen > 0
+             && (static_cast<unsigned char> (from[srclen]) & 0xC0) == 0x80)
+        srclen--;
+    }
+
+  // Not even the first character fits into the output range.
+  return base_type::partial;
+}
+
+// Convert the bytes in [FROM, FROM_END), which are encoded in m_enc, to
+// UTF-8 and store the result in [TO, TO_END).
+//
+// On return, FROM_NEXT points one past the last input byte that was
+// consumed and TO_NEXT one past the last output byte that was written.
+// The return value is
+//   ok       if all input was converted (also for an empty input range),
+//   partial  if the output range is too small for the complete result; in
+//            that case only whole input characters whose conversion fits
+//            are consumed,
+//   error    if the conversion failed.
+
+typename std::codecvt<InternT, ExternT, StateT>::result
+octave::string::codecvt_u8::do_in
+  (StateT& /* state */,
+   const ExternT* from, const ExternT* from_end, const ExternT*& from_next,
+   InternT* to, InternT* to_end, InternT*& to_next) const
+{
+  typedef std::codecvt<InternT, ExternT, StateT> base_type;
+
+  // Never leave the output arguments unset, whichever path is taken below.
+  from_next = from;
+  to_next = to;
+
+  if (from_end <= from)
+    return base_type::ok;
+
+  // InternT and ExternT are both char, so element counts are byte counts.
+  const std::size_t srclen = from_end - from;
+  const std::size_t room = (to_end > to ? to_end - to : 0);
+  const char *enc_str = reinterpret_cast<const char *> (from);
+
+  // Convert from input encoding to UTF-8
+  std::size_t u8_len = 0;
+  uint8_t *u8_str = octave_u8_conv_from_encoding (m_enc.c_str (), enc_str,
+                                                  srclen, &u8_len);
+
+  // A null pointer signals a failed conversion.
+  if (! u8_str)
+    return base_type::error;
+
+  if (u8_len <= room)
+    {
+      std::copy_n (u8_str, u8_len, to);
+      ::free (u8_str);
+
+      from_next = from_end;
+      to_next = to + u8_len;
+
+      return base_type::ok;
+    }
+
+  ::free (u8_str);
+
+  // The result does not fit.  Character boundaries of the input encoding
+  // are not known here, so convert again and record for each input byte
+  // the offset of the corresponding character in the result.  Bytes that
+  // do not start a character are marked with (size_t)(-1).
+  std::size_t *offsets = new std::size_t [srclen];
+
+  u8_len = 0;
+  u8_str = octave_u8_conv_from_encoding_offsets (m_enc.c_str (), enc_str,
+                                                 srclen, offsets, &u8_len);
+
+  // Find the last input character boundary whose output still fits.
+  std::size_t consumed = 0;
+  std::size_t produced = 0;
+  if (u8_str)
+    {
+      for (std::size_t i = 1; i < srclen; i++)
+        {
+          if (offsets[i] == static_cast<std::size_t> (-1))
+            continue;
+
+          if (offsets[i] > room)
+            break;
+
+          consumed = i;
+          produced = offsets[i];
+        }
+    }
+
+  delete [] offsets;
+
+  if (! u8_str)
+    return base_type::error;
+
+  std::copy_n (u8_str, produced, to);
+  ::free (u8_str);
+
+  from_next = from + consumed;
+  to_next = to + produced;
+
+  return base_type::partial;
+}
+
+// Return the number of external bytes at the beginning of [SRC, END) that
+// consist of complete characters and whose conversion to UTF-8 produces at
+// most MAX internal bytes.  Returns 0 for an empty range or if the
+// conversion fails.
+
+int octave::string::codecvt_u8::do_length
+  (StateT& /* state */, const ExternT *src, const ExternT *end,
+   std::size_t max) const
+{
+  if (end <= src)
+    return 0;
+
+  const std::size_t srclen = end - src;
+
+  // For each input byte, the conversion stores the offset of the
+  // corresponding character in the result, or (size_t)(-1) if the byte
+  // does not start a character.  The array lives on the heap because
+  // variable-length arrays are not standard C++ and SRCLEN is unbounded.
+  std::size_t *offsets = new std::size_t [srclen];
+
+  std::size_t u8_len = 0;
+  uint8_t *u8_str
+    = octave_u8_conv_from_encoding_offsets (m_enc.c_str (), src, srclen,
+                                            offsets, &u8_len);
+
+  std::size_t ext_char = 0;
+
+  // OFFSETS is only meaningful if the conversion succeeded.
+  if (u8_str)
+    {
+      if (u8_len <= max)
+        ext_char = srclen;
+      else
+        {
+          // Last character boundary that does not produce more than MAX
+          // internal bytes.
+          for (std::size_t i = 1; i < srclen; i++)
+            {
+              if (offsets[i] == static_cast<std::size_t> (-1))
+                continue;
+
+              if (offsets[i] > max)
+                break;
+
+              ext_char = i;
+            }
+        }
+
+      // The converted string itself is not needed, only the offsets.
+      ::free (u8_str);
+    }
+
+  delete [] offsets;
+
+  return static_cast<int> (ext_char);
+}
+
+
 template <typename T>
 std::string
 rational_approx (T val, int len)
--- a/liboctave/util/oct-string.h	Sun Mar 06 18:20:05 2022 -0800
+++ b/liboctave/util/oct-string.h	Sat Mar 05 21:20:58 2022 +0100
@@ -28,6 +28,8 @@
 
 #include "octave-config.h"
 
+#include <locale>
+
 #include "oct-cmplx.h"
 
 namespace octave
@@ -155,6 +157,94 @@
     extern OCTAVE_API unsigned int
     u8_validate (const std::string& who, std::string& in_string,
                  const u8_fallback_type type = U8_REPLACEMENT_CHAR);
+
+
+    // Thin wrapper around a facet class FACET that forwards all
+    // constructor arguments to FACET and has a public destructor, so that
+    // objects of this type can be created and destroyed directly (e.g.,
+    // when used with std::wbuffer_convert).
+
+    template <typename Facet>
+    struct
+    deletable_facet : Facet
+    {
+      template <typename... Args>
+      deletable_facet (Args&&... ctor_args)
+      : Facet (std::forward<Args> (ctor_args)...)
+      { }
+
+      // The destructor must be publicly accessible.
+      ~deletable_facet () = default;
+    };
+
+    // Code conversion facet for STL streams that converts between UTF-8
+    // (the "internal" side) and the encoding ENC passed to the constructor
+    // (the "external" side).  Both sides use char as character type.
+
+    class
+    OCTAVE_API
+    codecvt_u8 : public std::codecvt<char, char, std::mbstate_t>
+    {
+    public:
+
+      // No copying!
+
+      codecvt_u8 (const codecvt_u8&) = delete;
+
+      codecvt_u8& operator = (const codecvt_u8&) = delete;
+
+      // ENC is the name of the external encoding.
+      codecvt_u8 (const std::string &enc)
+      : m_enc (enc)
+      { }
+
+      virtual ~codecvt_u8 () { }
+
+      typedef char InternT;
+      typedef char ExternT;
+      typedef std::mbstate_t StateT;
+
+    private:
+
+      // Convert from UTF-8 to the external encoding.
+      OCTAVE_API
+      typename std::codecvt<InternT, ExternT, StateT>::result
+      do_out (StateT& state,
+              const InternT* from, const InternT* from_end,
+              const InternT*& from_next,
+              ExternT* to, ExternT* to_end, ExternT*& to_next) const override;
+
+      // Convert from the external encoding to UTF-8.
+      OCTAVE_API
+      typename std::codecvt<InternT, ExternT, StateT>::result
+      do_in (StateT& state,
+             const ExternT* from, const ExternT* from_end,
+             const ExternT*& from_next,
+             InternT* to, InternT* to_end, InternT*& to_next) const override;
+
+      // Writes nothing (TO_NEXT is set to TO) and reports success.
+      typename std::codecvt<InternT, ExternT, StateT>::result
+      do_unshift (StateT& /* state */, ExternT* to, ExternT* /* to_end */,
+                  ExternT*& to_next) const override
+      {
+        // FIXME: What is the correct thing to unshift?
+        // Just reset?
+        to_next = to;
+
+        return std::codecvt<InternT, ExternT, StateT>::ok;
+      }
+
+      int do_encoding () const noexcept override
+      {
+        // return 0 because UTF-8 encoding is variable length
+        return 0;
+      }
+
+      bool do_always_noconv () const noexcept override
+      {
+        // return false to indicate non-identity conversion
+        return false;
+      }
+
+      // Number of external characters needed to produce at most MAX
+      // internal ones.
+      OCTAVE_API int
+      do_length (StateT& state, const ExternT *src, const ExternT *end,
+                 std::size_t max) const override;
+
+      int do_max_length () const noexcept override
+      {
+        // For UTF-8, a maximum of 4 bytes are needed for one character.
+        return 4;
+      }
+
+      // Name of the external encoding.
+      std::string m_enc;
+
+    };
+
   }
 }
 
--- a/liboctave/wrappers/uniconv-wrappers.c	Sun Mar 06 18:20:05 2022 -0800
+++ b/liboctave/wrappers/uniconv-wrappers.c	Sat Mar 05 21:20:58 2022 +0100
@@ -72,6 +72,24 @@
                                src, srclen, NULL, NULL, lengthp);
 }
 
+/* Call u8_conv_from_encoding with the iconveh_question_mark handler to
+   convert the SRCLEN bytes at SRC from the encoding FROMCODE.  OFFSETS and
+   LENGTHP are passed on unchanged; NULL is passed for the argument between
+   them.  The return value of u8_conv_from_encoding is returned as is.  */
+
+uint8_t *
+octave_u8_conv_from_encoding_offsets
+  (const char *fromcode, const char *src, size_t srclen,
+   size_t *offsets, size_t *lengthp)
+{
+  uint8_t *u8_str = u8_conv_from_encoding (fromcode, iconveh_question_mark,
+                                           src, srclen, offsets, NULL,
+                                           lengthp);
+
+  return u8_str;
+}
+
+/* Call u8_conv_to_encoding with the iconveh_question_mark handler to
+   convert the SRCLEN units at SRC to the encoding TOCODE.  OFFSETS and
+   LENGTHP are passed on unchanged; NULL is passed for the argument between
+   them.  The return value of u8_conv_to_encoding is returned as is.  */
+
+char *
+octave_u8_conv_to_encoding_offsets
+  (const char *tocode, const uint8_t *src, size_t srclen,
+   size_t *offsets, size_t *lengthp)
+{
+  char *enc_str = u8_conv_to_encoding (tocode, iconveh_question_mark,
+                                       src, srclen, offsets, NULL,
+                                       lengthp);
+
+  return enc_str;
+}
+
 char *
 u8_from_wchar (const wchar_t *wc)
 {
--- a/liboctave/wrappers/uniconv-wrappers.h	Sun Mar 06 18:20:05 2022 -0800
+++ b/liboctave/wrappers/uniconv-wrappers.h	Sat Mar 05 21:20:58 2022 +0100
@@ -57,6 +57,16 @@
 octave_u32_conv_to_encoding_strict (const char *tocode, const uint32_t *src,
                                     size_t srclen, size_t *lengthp);
 
+/* Wrapper around u8_conv_from_encoding that additionally takes OFFSETS
+   (an array provided by the caller) and passes it on to that function.  */
+extern OCTAVE_API uint8_t *
+octave_u8_conv_from_encoding_offsets
+  (const char *fromcode, const char *src, size_t srclen,
+   size_t *offsets, size_t *lengthp);
+
+/* Wrapper around u8_conv_to_encoding that additionally takes OFFSETS
+   (an array provided by the caller) and passes it on to that function.  */
+extern OCTAVE_API char *
+octave_u8_conv_to_encoding_offsets
+  (const char *tocode, const uint8_t *src, size_t srclen,
+   size_t *offsets, size_t *lengthp);
+
 extern OCTAVE_API char * u8_from_wchar (const wchar_t *wc);
 
 extern OCTAVE_API wchar_t * u8_to_wchar (const char *u8_char);