Mercurial > octave
changeset 26706:ccea3574f36b
Support encoding of file streams in textscan (bug #55452).
* oct-stream.cc (do_textscan): Pass encoding in constructor for textscan object.
(textscan): Store encoding in object. Convert strings from encoding.
* file-io.cc (textscan): Add BIST.
* io.tst (fopen): Use a code page identifier that works better cross-platform.
author | Markus Mützel <markus.muetzel@gmx.de> |
---|---|
date | Sat, 09 Feb 2019 20:05:47 +0100 |
parents | c13143821eef |
children | f35db7d5b7a4 |
files | libinterp/corefcn/file-io.cc libinterp/corefcn/oct-stream.cc test/io.tst |
diffstat | 3 files changed, 40 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/libinterp/corefcn/file-io.cc Sat Feb 09 17:41:48 2019 +0100 +++ b/libinterp/corefcn/file-io.cc Sat Feb 09 20:05:47 2019 +0100 @@ -2267,6 +2267,19 @@ %! obs = textscan (str, "%f", "delimiter", {",",";","$"}); %! assert (obs, { [0; 1; NaN; 2; 3] }); +## file stream with encoding +%!test +%! f = tempname (); +%! fid = fopen (f, "w+", "n", "iso-8859-1"); +%! unwind_protect +%! fprintf (fid, "abc,äöü\n"); +%! fseek (fid, 0, "bof"); +%! obs = textscan (fid, "%s", "delimiter", ","); +%! fclose (fid); +%! assert (obs, { {"abc"; "äöü"} }); +%! unwind_protect_cleanup +%! unlink (f); +%! end_unwind_protect */ // These tests have end-comment sequences, so can't just be in a comment
--- a/libinterp/corefcn/oct-stream.cc Sat Feb 09 17:41:48 2019 +0100 +++ b/libinterp/corefcn/oct-stream.cc Sat Feb 09 20:05:47 2019 +0100 @@ -1775,7 +1775,8 @@ { public: - textscan (const std::string& who_arg = "textscan"); + textscan (const std::string& who_arg = "textscan", + const std::string& encoding = "utf-8"); // No copying! @@ -1797,6 +1798,8 @@ // What function name should be shown when reporting errors. std::string who; + std::string m_encoding; + std::string buf; // Three cases for delim_table and delim_list @@ -2506,13 +2509,13 @@ return retval; // May have returned 4 above. } - textscan::textscan (const std::string& who_arg) - : who (who_arg), buf (), whitespace_table (), delim_table (), - delims (), comment_style (), comment_len (0), comment_char (-2), - buffer_size (0), date_locale (), inf_nan (init_inf_nan ()), - empty_value (numeric_limits<double>::NaN ()), exp_chars ("edED"), - header_lines (0), treat_as_empty (), treat_as_empty_len (0), - whitespace (" \b\t"), eol1 ('\r'), eol2 ('\n'), + textscan::textscan (const std::string& who_arg, const std::string& encoding) + : who (who_arg), m_encoding (encoding), buf (), whitespace_table (), + delim_table (), delims (), comment_style (), comment_len (0), + comment_char (-2), buffer_size (0), date_locale (), + inf_nan (init_inf_nan ()), empty_value (numeric_limits<double>::NaN ()), + exp_chars ("edED"), header_lines (0), treat_as_empty (), + treat_as_empty_len (0), whitespace (" \b\t"), eol1 ('\r'), eol2 ('\n'), return_on_error (1), collect_output (false), multiple_delims_as_one (false), default_exp (true), lines (0) { } @@ -3148,6 +3151,10 @@ ends[i++] = eol2; val = textscan::read_until (is, delim_list, ends); } + + // convert from codepage + if (m_encoding.compare ("utf-8")) + val = string::u8_from_encoding ("textscan", val, m_encoding); } // Return in VAL the run of characters from IS contained in PATTERN. 
@@ -3195,6 +3202,10 @@ is.get_undelim (); } } + + // convert from codepage + if (m_encoding.compare ("utf-8")) + val = string::u8_from_encoding ("textscan", val, m_encoding); } // Read from IS into VAL a string of the next fmt.width characters, @@ -3217,6 +3228,10 @@ break; } } + + // convert from codepage + if (m_encoding.compare ("utf-8")) + val = string::u8_from_encoding ("textscan", val, m_encoding); } // Read a single '%...' conversion and place it in position ROW of OV. @@ -5309,7 +5324,7 @@ invalid_operation (who, "reading"); else { - textscan scanner (who); + textscan scanner (who, encoding ()); retval = scanner.scan (*isp, fmt, ntimes, options, read_count); }
--- a/test/io.tst Sat Feb 09 17:41:48 2019 +0100 +++ b/test/io.tst Sat Feb 09 20:05:47 2019 +0100 @@ -652,12 +652,12 @@ %!test # write to and read from file with encoding %! temp_file = [tempname(), ".txt"]; -%! fid = fopen (temp_file, "wt", "n", "latin 1"); +%! fid = fopen (temp_file, "wt", "n", "iso-8859-1"); %! unwind_protect %! [name, mode, arch, codepage] = fopen (fid); %! assert (name, temp_file); %! assert (mode, "w"); -%! assert (codepage, "latin 1"); +%! assert (codepage, "iso-8859-1"); %! fprintf (fid, "aäu %s\n", "AÄU"); %! fclose (fid); %! # open in binary mode @@ -670,7 +670,7 @@ %! fclose (fid2); %! assert (read_binary, [97 228 117 32 65 196 85 10].'); %! # open in text mode with correct encoding -%! fid3 = fopen (temp_file, "rt", "n", "latin 1"); +%! fid3 = fopen (temp_file, "rt", "n", "iso-8859-1"); %! read_text = fscanf (fid3, "%s"); %! fclose (fid3); %! assert (read_text, "aäuAÄU");