diff libinterp/corefcn/dlmread.cc @ 28685:65fde9fe3b60

Strip BOM from UTF-8 encoded files in "dlmread" (bug #58813). * dlmread.cc (Fdlmread): Peek into file and strip Byte Order Mark if needed. Add BIST test for bug #58813.
author Markus Mützel <markus.muetzel@gmx.de>
date Fri, 28 Aug 2020 16:06:32 +0200
parents 83172e1c77f2
children 7854d5752dd2
line wrap: on
line diff
--- a/libinterp/corefcn/dlmread.cc	Thu Sep 03 17:43:52 2020 -0700
+++ b/libinterp/corefcn/dlmread.cc	Fri Aug 28 16:06:32 2020 +0200
@@ -305,6 +305,32 @@
   bool sep_is_wspace = (sep.find_first_of (" \t") != std::string::npos);
   bool auto_sep_is_wspace = false;
 
+  if (r0 == 0)  // r0 is the count of leading lines skipped further below
+    {
+      // Peek at the start of the stream and strip a UTF-8 Byte Order Mark (BOM)
+      const char BOM[3] = {'\xEF', '\xBB', '\xBF'};  // U+FEFF encoded as UTF-8
+      char buf[3];  // bytes consumed so far, kept so they can be put back
+      int i_bom;  // declared outside the loop because it is read after it
+      bool found_bom = true;  // cleared at the first byte that does not match
+      for (i_bom = 0; i_bom < 3; i_bom++)
+        {
+          char ch_p = input->peek ();  // look at the next byte without consuming
+          if (ch_p == BOM[i_bom])
+            buf[i_bom] = input->get ();  // matches so far: consume and remember
+          else
+            {
+              found_bom = false;
+              break;
+            }
+        }
+      // Put back the consumed bytes, last read first, if it wasn't a full BOM
+      if (! found_bom)
+        {
+          for (int i_ret = i_bom-1; i_ret >= 0; i_ret--)  // i_bom bytes were read
+            input->putback (buf[i_ret]);
+        }
+    }
+
   std::string line;
 
   // Skip the r0 leading lines
@@ -713,4 +739,18 @@
 %!   unlink (file);
 %! end_unwind_protect
 
+## Verify UTF-8 Byte Order Mark does not cause problems with reading
+%!test <*58813>
+%! file = tempname ();
+%! unwind_protect
+%!   fid = fopen (file, "wt");
+%!   fwrite (fid, char ([0xEF, 0xBB, 0xBF]));  # UTF-8 BOM
+%!   fwrite (fid, "1,2\n3,4");  # two comma-separated rows right after the BOM
+%!   fclose (fid);
+%!
+%!   assert (dlmread (file), [1, 2; 3, 4]);  # BOM must not alter the data read
+%! unwind_protect_cleanup
+%!   unlink (file);  # delete the temporary file whether or not the test passed
+%! end_unwind_protect
+
 */