changeset 26706:ccea3574f36b

Support encoding of file streams in textscan (bug #55452). * oct-stream.cc (do_textscan): Pass encoding in constructor for textscan object. (textscan): Store encoding in object. Convert strings from encoding. * file-io.cc (textscan): Add BIST. * io.tst (fopen): Use code page identifier that works better cross-platform.
author Markus Mützel <markus.muetzel@gmx.de>
date Sat, 09 Feb 2019 20:05:47 +0100
parents c13143821eef
children f35db7d5b7a4
files libinterp/corefcn/file-io.cc libinterp/corefcn/oct-stream.cc test/io.tst
diffstat 3 files changed, 40 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/file-io.cc	Sat Feb 09 17:41:48 2019 +0100
+++ b/libinterp/corefcn/file-io.cc	Sat Feb 09 20:05:47 2019 +0100
@@ -2267,6 +2267,19 @@
 %! obs = textscan (str, "%f", "delimiter", {",",";","$"});
 %! assert (obs, { [0; 1; NaN; 2; 3] });
 
+## Read from a file stream opened with a non-UTF-8 encoding (bug #55452)
+%!test
+%! f = tempname ();
+%! fid = fopen (f, "w+", "n", "iso-8859-1");
+%! unwind_protect
+%!   fprintf (fid, "abc,äöü\n");   # written to disk as ISO-8859-1
+%!   fseek (fid, 0, "bof");        # rewind so textscan reads from the start
+%!   obs = textscan (fid, "%s", "delimiter", ",");
+%!   assert (obs, { {"abc"; "äöü"} });
+%! unwind_protect_cleanup
+%!   fclose (fid);                 # close before unlink, even on error
+%!   unlink (f);
+%! end_unwind_protect
 */
 
 // These tests have end-comment sequences, so can't just be in a comment
--- a/libinterp/corefcn/oct-stream.cc	Sat Feb 09 17:41:48 2019 +0100
+++ b/libinterp/corefcn/oct-stream.cc	Sat Feb 09 20:05:47 2019 +0100
@@ -1775,7 +1775,8 @@
   {
   public:
 
-    textscan (const std::string& who_arg = "textscan");
+    textscan (const std::string& who_arg = "textscan",
+              const std::string& encoding = "utf-8");
 
     // No copying!
 
@@ -1797,6 +1798,8 @@
     // What function name should be shown when reporting errors.
     std::string who;
 
+    std::string m_encoding;
+
     std::string buf;
 
     // Three cases for delim_table and delim_list
@@ -2506,13 +2509,13 @@
     return retval;             // May have returned 4 above.
   }
 
-  textscan::textscan (const std::string& who_arg)
-    : who (who_arg), buf (), whitespace_table (), delim_table (),
-      delims (), comment_style (), comment_len (0), comment_char (-2),
-      buffer_size (0), date_locale (), inf_nan (init_inf_nan ()),
-      empty_value (numeric_limits<double>::NaN ()), exp_chars ("edED"),
-      header_lines (0), treat_as_empty (), treat_as_empty_len (0),
-      whitespace (" \b\t"), eol1 ('\r'), eol2 ('\n'),
+  textscan::textscan (const std::string& who_arg, const std::string& encoding)
+    : who (who_arg), m_encoding (encoding), buf (), whitespace_table (),
+      delim_table (), delims (), comment_style (), comment_len (0),
+      comment_char (-2), buffer_size (0), date_locale (),
+      inf_nan (init_inf_nan ()), empty_value (numeric_limits<double>::NaN ()),
+      exp_chars ("edED"), header_lines (0), treat_as_empty (),
+      treat_as_empty_len (0), whitespace (" \b\t"), eol1 ('\r'), eol2 ('\n'),
       return_on_error (1), collect_output (false),
       multiple_delims_as_one (false), default_exp (true), lines (0)
   { }
@@ -3148,6 +3151,10 @@
         ends[i++] = eol2;
         val = textscan::read_until (is, delim_list, ends);
       }
+
+    // Convert VAL from the stream encoding to UTF-8 unless already UTF-8.
+    if (m_encoding.compare ("utf-8"))
+      val = string::u8_from_encoding ("textscan", val, m_encoding);
   }
 
   // Return in VAL the run of characters from IS contained in PATTERN.
@@ -3195,6 +3202,10 @@
             is.get_undelim ();
           }
       }
+
+    // Convert VAL from the stream encoding to UTF-8 unless already UTF-8.
+    if (m_encoding.compare ("utf-8"))
+      val = string::u8_from_encoding ("textscan", val, m_encoding);
   }
 
   // Read from IS into VAL a string of the next fmt.width characters,
@@ -3217,6 +3228,10 @@
             break;
           }
       }
+
+    // Convert VAL from the stream encoding to UTF-8 unless already UTF-8.
+    if (m_encoding.compare ("utf-8"))
+      val = string::u8_from_encoding ("textscan", val, m_encoding);
   }
 
   //  Read a single '%...' conversion and place it in position ROW of OV.
@@ -5309,7 +5324,7 @@
       invalid_operation (who, "reading");
     else
       {
-        textscan scanner (who);
+        textscan scanner (who, encoding ());
 
         retval = scanner.scan (*isp, fmt, ntimes, options, read_count);
       }
--- a/test/io.tst	Sat Feb 09 17:41:48 2019 +0100
+++ b/test/io.tst	Sat Feb 09 20:05:47 2019 +0100
@@ -652,12 +652,12 @@
 
 %!test   # write to and read from file with encoding
 %! temp_file = [tempname(), ".txt"];
-%! fid = fopen (temp_file, "wt", "n", "latin 1");
+%! fid = fopen (temp_file, "wt", "n", "iso-8859-1");
 %! unwind_protect
 %!   [name, mode, arch, codepage] = fopen (fid);
 %!   assert (name, temp_file);
 %!   assert (mode, "w");
-%!   assert (codepage, "latin 1");
+%!   assert (codepage, "iso-8859-1");
 %!   fprintf (fid, "aäu %s\n", "AÄU");
 %!   fclose (fid);
 %!   # open in binary mode
@@ -670,7 +670,7 @@
 %!   fclose (fid2);
 %!   assert (read_binary, [97 228 117 32 65 196 85 10].');
 %!   # open in text mode with correct encoding
-%!   fid3 = fopen (temp_file, "rt", "n", "latin 1");
+%!   fid3 = fopen (temp_file, "rt", "n", "iso-8859-1");
 %!   read_text = fscanf (fid3, "%s");
 %!   fclose (fid3);
 %!   assert (read_text, "aäuAÄU");