Mercurial > octave
diff libinterp/dldfcn/xzip.cc @ 22160:766f934db568
Rewrite gzip and bzip2 in C++ instead of using its applications (bug #43431)
* bzip2.m, gzip.m, __xzip__.m: remove old implementation as m files that
copy all files into a temporary directory and then call gzip or bzip2
application. Add several new tests and remove duplication of existing
tests.
* scripts/miscellaneous/module.mk: unlist removed files.
* xzip.cc: new implementation of bzip2 and gzip functions making direct
use of the libraries in C++. Also add more tests.
* libinterp/dldfcn/module-files: list new file and required flags.
* configure.ac: add check for bzip2 library.
author | Carnë Draug <carandraug@octave.org> |
---|---|
date | Sun, 26 Jun 2016 13:32:03 +0200 |
parents | |
children | 8de49f15e182 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libinterp/dldfcn/xzip.cc Sun Jun 26 13:32:03 2016 +0200 @@ -0,0 +1,698 @@ +// Copyright (C) 2016 Carnë Draug +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +//! Octave interface to the compression and uncompression libraries. +/*! + This was originally implemented as an m file which directly called + bzip2 and gzip applications. This may look simpler but causes some + issues (see bug #43431) because we have no control over the output + file: + + - created file is always in the same directory as the original file; + - automatically skip files that already have gz/bz2/etc extension; + - some olders versions miss the --keep option. + + In addition, because system() does not have a method that allows + passing a list of arguments, there is the issue of having to escape + filenames. + + A solution is to pipe file contents into the applications instead of + filenames. However, that solution causes: + + # missing file header with original file information; + # implementing ourselves the recursive transversion of directories; + # do the above in a m file which will be slow; + # popen2 is frail on windows. + +*/ + +#if defined (HAVE_CONFIG_H) +# include "config.h" +#endif + +#include <cstdio> +#include <cstring> + +#include <string> +#include <list> +#include <functional> +#include <stdexcept> +#include <iostream> +#include <fstream> + +#ifdef HAVE_BZLIB_H +# include <bzlib.h> +#endif + +#ifdef HAVE_ZLIB_H +# include <zlib.h> +#endif + +#include "Array.h" +#include "str-vec.h" +#include "glob-match.h" +#include "file-ops.h" +#include "dir-ops.h" +#include "file-stat.h" +#include "oct-env.h" + +#include "defun-dld.h" +#include "defun-int.h" +#include "errwarn.h" + +class CFile +{ +public: + FILE* fp; + + CFile (const std::string& path, const std::string& mode) + { + fp = std::fopen (path.c_str (), mode.c_str ()); + if (! fp) + throw std::runtime_error ("unable to open file"); + } + + ~CFile () + { + if (std::fclose (fp)) + // Not pedantic. If this is dest, maybe it failed to flush + // so we should signal this before someone removes the source. + throw std::runtime_error ("unable to close file"); + } +}; + +#ifdef HAVE_BZ2 +class bz2 +{ +private: + class zipper + { + private: + int status = BZ_OK; + CFile source; + CFile dest; + BZFILE* bz; + + public: + zipper (const std::string& source_path, const std::string& dest_path) + : source (source_path, "rb"), dest (dest_path, "wb") + { + bz = BZ2_bzWriteOpen (&status, dest.fp, 9, 0, 30); + if (status != BZ_OK) + throw std::runtime_error ("failed to open bzip2 stream"); + } + + void + deflate (void) + { + const std::size_t buf_len = 8192; + char buf[buf_len]; + std::size_t n_read; + while ((n_read = std::fread (buf, sizeof (buf[0]), buf_len, source.fp)) != 0) + { + if (std::ferror (source.fp)) + throw std::runtime_error ("failed to read from source file"); + BZ2_bzWrite (&status, bz, buf, n_read); + if (status == BZ_IO_ERROR) + throw std::runtime_error ("failed to write or compress"); + } + if (std::ferror (source.fp)) + throw std::runtime_error ("failed to read from source file"); + } + + ~zipper () + { + int abandon = (status == BZ_IO_ERROR) ? 1 : 0; + BZ2_bzWriteClose (&status, bz, abandon, 0, 0); + if (status != BZ_OK) + throw std::runtime_error ("failed to close bzip2 stream"); + } + }; + +public: + static const constexpr char* extension = ".bz2"; + + static void + zip (const std::string& source_path, const std::string& dest_path) + { + bz2::zipper (source_path, dest_path).deflate (); + } + +}; +#endif // HAVE_BZL2 + +// Note about zlib and gzip +// +// gzip is a format for compressed single files. zlib is a format +// designed for in-memory and communication channel applications. +// gzip uses the same format internally for the compressed data but +// has different headers and trailers. +// +// zlib is also a library but gzip is not. Very old versions of zlib do +// not include functions to create useful gzip headers and trailers: +// +// Note that you cannot specify special gzip header contents (e.g. +// a file name or modification date), nor will inflate tell you what +// was in the gzip header. If you need to customize the header or +// see what's in it, you can use the raw deflate and inflate +// operations and the crc32() function and roll your own gzip +// encoding and decoding. Read the gzip RFC 1952 for details of the +// header and trailer format. +// zlib FAQ +// +// Recent versions (on which we are already dependent) have deflateInit2() +// to do it. We still need to get the right metadata for the header +// ourselves though. +// +// The header is defined in RFC #1952 +// GZIP file format specification version 4.3 + + +#ifdef HAVE_Z +class gz +{ +private: + + // Util class to get a non-const char* + class uchar_array + { + public: + // Bytef is a typedef for unsigned char + unsigned char* p; + + uchar_array (const std::string& str) + { + p = new Bytef[str.length () +1]; + std::strcpy (reinterpret_cast<char*> (p), str.c_str ()); + } + + ~uchar_array (void) { delete[] p; } + }; + + // This is the really thing that needs to be + class gzip_stream : public z_stream + { + public: + gzip_stream () + { + zalloc = Z_NULL; + zfree = Z_NULL; + opaque = Z_NULL; + } + + ~gzip_stream () + { + int status = deflateEnd (this); + if (status != Z_OK) + throw std::runtime_error ("failed to close zlib stream"); + } + }; + + class gzip_header : public gz_header + { + private: + uchar_array basename; + + public: + gzip_header (const std::string& source_path) + : basename (octave::sys::env::base_pathname (source_path)) + { + const octave::sys::file_stat source_stat (source_path); + if (! source_stat) + throw std::runtime_error ("unable to stat source file"); + + // time_t may be a signed int in which case it will be a + // positive number so it is safe to uLong. Or is it? Can + // unix_time really never be negative? + time = uLong (source_stat.mtime ().unix_time ()); + + // If FNAME is set, an original file name is present, + // terminated by a zero byte. The name must consist of ISO + // 8859-1 (LATIN-1) characters; on operating systems using + // EBCDIC or any other character set for file names, the name + // must be translated to the ISO LATIN-1 character set. This + // is the original name of the file being compressed, with any + // directory components removed, and, if the file being + // compressed is on a file system with case insensitive names, + // forced to lower case. + name = basename.p; + + // If we don't set it to Z_NULL, then it will set FCOMMENT (4th bit) + // on the FLG byte, and then write {0, 3} comment. + comment = Z_NULL; + + // Seems to already be the default but we are not taking chances. + extra = Z_NULL; + + // We do not want a CRC for the header. That would be only 2 more + // bytes, and maybe it would be a good thing but we want to generate + // gz files similar to the default gzip application. + hcrc = 0; + + // OS (Operating System): + // 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32) + // 1 - Amiga + // 2 - VMS (or OpenVMS) + // 3 - Unix + // 4 - VM/CMS + // 5 - Atari TOS + // 6 - HPFS filesystem (OS/2, NT) + // 7 - Macintosh + // 8 - Z-System + // 9 - CP/M + // 10 - TOPS-20 + // 11 - NTFS filesystem (NT) + // 12 - QDOS + // 13 - Acorn RISCOS + // 255 - unknown + // + // The list is problematic because it mixes OS and filesystem. It + // also does not specify whether filesystem relates to source or + // destination file. + +#if defined (__WIN32__) + os = 0; // or should it be 11? +#elif defined (__APPLE__) + os = 7; +#else // unix by default? + os = 3; +#endif + } + }; + + class zipper + { + private: + CFile source; + CFile dest; + gzip_header header; + gzip_stream strm = gzip_stream (); + + public: + zipper (const std::string& source_path, const std::string& dest_path) + : source (source_path, "rb"), dest (dest_path, "wb"), + header (source_path) + { } + + void + deflate () + { + // int deflateInit2 (z_streamp strm, + // int level, // compression level (default is 8) + // int method, + // int windowBits, // 15 (default) + 16 (gzip format) + // int memLevel, // memory usage (default is 8) + // int strategy); + + int status = deflateInit2 (&strm, 8, Z_DEFLATED, 31, 8, + Z_DEFAULT_STRATEGY); + if (status != Z_OK) + throw std::runtime_error ("failed to open zlib stream"); + + deflateSetHeader (&strm, &header); + + const std::size_t buf_len = 8192; + unsigned char buf_in[buf_len]; + unsigned char buf_out[buf_len]; + + while ((strm.avail_in = std::fread (buf_in, sizeof (buf_in[0]), + buf_len, source.fp)) != 0) + { + if (std::ferror (source.fp)) + throw std::runtime_error ("failed to read source file"); + + strm.next_in = buf_in; + const int flush = std::feof (source.fp) ? Z_FINISH : Z_NO_FLUSH; + + // If deflate returns Z_OK and with zero avail_out, it must be + // called again after making room in the output buffer because + // there might be more output pending. + do + { + strm.avail_out = buf_len; + strm.next_out = buf_out; + status = ::deflate (&strm, flush); + if (status == Z_STREAM_ERROR) + throw std::runtime_error ("failed to deflate"); + + std::fwrite (buf_out, sizeof (buf_out[0]), + buf_len - strm.avail_out, dest.fp); + if (std::ferror (dest.fp)) + throw std::runtime_error ("failed to write file"); + } + while (strm.avail_out == 0); + + if (strm.avail_in != 0) + throw std::runtime_error ("failed to wrote file"); + } + } + }; + +public: + static const constexpr char* extension = ".gz"; + + static void + zip (const std::string& source_path, const std::string& dest_path) + { + gz::zipper (source_path, dest_path).deflate (); + } +}; +#endif // HAVE_Z + + +template<typename X> +string_vector +xzip (const Array<std::string>& source_patterns, + const std::function<std::string(const std::string&)>& mk_dest_path) +{ + std::list<std::string> dest_paths; + + std::function<void(const std::string&)> walk; + walk = [&walk, &mk_dest_path, &dest_paths] (const std::string& path) -> void + { + const octave::sys::file_stat fs (path); + // is_dir and is_reg will return false if failed to stat. + if (fs.is_dir ()) + { + octave::sys::dir_entry dir (path); + if (dir) + { + // Collect the whole list of filenames first, before recursion + // to avoid issues with infinite loop if the action generates + // files in the same directory (highly likely). + string_vector dirlist = dir.read (); + for (octave_idx_type i = 0; i < dirlist.numel (); i++) + if (dirlist(i) != "." && dirlist(i) != "..") + walk (octave::sys::file_ops::concat (path, dirlist(i))); + } + // Note that we skip any problem with directories. + } + else if (fs.is_reg ()) + { + const std::string dest_path = mk_dest_path (path); + try + { + X::zip (path, dest_path); + } + catch (...) + { + // Error "handling" is not including filename on the output list. + // Also remove created file which maybe was not even created + // in the first place. Note that it is possible for the file + // to exist in the first place and for for X::zip to not have + // clobber it yet but we remove it anyway by design. + octave::sys::unlink (dest_path); + return; + } + dest_paths.push_front (dest_path); + } + // Skip all other file types and errors. + return; + }; + + for (octave_idx_type i = 0; i < source_patterns.numel (); i++) + { + const glob_match pattern (octave::sys::file_ops::tilde_expand (source_patterns(i))); + const string_vector filepaths = pattern.glob (); + for (octave_idx_type j = 0; j < filepaths.numel (); j++) + walk (filepaths(j)); + } + return string_vector (dest_paths); +} + + +template<typename X> +string_vector +xzip (const Array<std::string>& source_patterns) +{ + const std::string ext = X::extension; + const std::function<std::string(const std::string&)> mk_dest_path + = [&ext] (const std::string& source_path) -> std::string + { + return source_path + ext; + }; + return xzip<X> (source_patterns, mk_dest_path); +} + +template<typename X> +string_vector +xzip (const Array<std::string>& source_patterns, const std::string& out_dir) +{ + const std::string ext = X::extension; + const std::function<std::string(const std::string&)> mk_dest_path + = [&out_dir, &ext] (const std::string& source_path) -> std::string + { + const std::string basename = octave::sys::env::base_pathname (source_path); + return octave::sys::file_ops::concat (out_dir, basename + ext); + }; + + // We don't care if mkdir fails. Maybe it failed because it already + // exists, or maybe it can't bre created. If the first, then there's + // nothing to do, if the later, then it will be handled later. Any + // is to be handled by not listing files in the output. + octave::sys::mkdir (out_dir, 0777); + return xzip<X> (source_patterns, mk_dest_path); +} + +template<typename X> +static octave_value_list +xzip (const std::string& func_name, const octave_value_list& args) +{ + const octave_idx_type nargin = args.length (); + if (nargin < 1 || nargin > 2) + print_usage (); + + const Array<std::string> source_patterns + = args(0).xcellstr_value ("%s: FILES must be a character array or cellstr", + func_name.c_str ()); + if (nargin == 1) + return octave_value (Cell (xzip<X> (source_patterns))); + else // nargin == 2 + { + const std::string out_dir = args(1).string_value (); + return octave_value (Cell (xzip<X> (source_patterns, out_dir))); + } +} + +DEFUN_DLD (gzip, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{filelist} =} gzip (@var{files}) +@deftypefnx {} {@var{filelist} =} gzip (@var{files}, @var{dir}) +Compress the list of files and directories specified in @var{files}. + +@var{files} is a character array or cell array of strings. Shell wildcards +in the filename such as @samp{*} or @samp{?} are accepted and expanded. +Each file is compressed separately and a new file with a @file{".gz"} +extension is created. The original files are not modified, but existing +compressed files will be silently overwritten. If a directory is +specified then @code{gzip} recursively compresses all files in the +directory. + +If @var{dir} is defined the compressed files are placed in this directory, +rather than the original directory where the uncompressed file resides. +Note that this does not replicate a directory tree in @var{dir} which may +lead to files overwritting each other if there are multiple files with the +same name. + +If @var{dir} does not exist it is created. + +The optional output @var{filelist} is a list of the compressed files. +@seealso{gunzip, unpack, bzip2, zip, tar} +@end deftypefn */) +{ +#ifndef HAVE_Z + err_disabled_feature ("gzip", "gzip"); +#else + return xzip<gz> ("gzip", args); +#endif +} + +/* +%!error gzip () +%!error gzip ("1", "2", "3") +%!error <FILES must be a character array or cellstr> gzip (1) +*/ + +DEFUN_DLD (bzip2, args, , + doc: /* -*- texinfo -*- +@deftypefn {} {@var{filelist} =} bzip2 (@var{files}) +@deftypefnx {} {@var{filelist} =} bzip2 (@var{files}, @var{dir}) +Compress the list of files specified in @var{files}. + +@var{files} is a character array or cell array of strings. Shell wildcards +in the filename such as @samp{*} or @samp{?} are accepted and expanded. +Each file is compressed separately and a new file with a @file{".bz2"} +extension is created. The original files are not modified, but existing +compressed files will be silently overwritten. + +If @var{dir} is defined the compressed files are placed in this directory, +rather than the original directory where the uncompressed file resides. +Note that this does not replicate a directory tree in @var{dir} which may +lead to files overwritting each other if there are multiple files with the +same name. + +If @var{dir} does not exist it is created. + +The optional output @var{filelist} is a list of the compressed files. +@seealso{bunzip2, unpack, gzip, zip, tar} +@end deftypefn */) +{ +#ifndef HAVE_BZ2 + err_disabled_feature ("bzip2", "bzip2"); +#else + return xzip<bz2> ("bzip2", args); +#endif +} + +// Tests for both gzip/bzip2 and gunzip/bunzip2 +/* + +## Takes a single argument, a function handle for the test. This other +## function must accept two arguments, a directory for the tests, and +## a cell array with zip function, unzip function, and file extension. + +%!function run_test_function (test_function) +%! enabled_zippers = cell (0, 0); +%! if (__octave_config_info__ ().build_features.BZ2) +%! enabled_zippers(1, end+1) = @bzip2; +%! enabled_zippers(2, end) = @bunzip2; +%! enabled_zippers(3, end) = ".bz2"; +%! endif +%! if (__octave_config_info__ ().build_features.Z) +%! enabled_zippers(1, end+1) = @gzip; +%! enabled_zippers(2, end) = @gunzip; +%! enabled_zippers(3, end) = ".gz"; +%! endif +%! +%! for z = enabled_zippers +%! test_dir = tempname (); +%! if (! mkdir (test_dir)) +%! error ("unable to create directory for tests"); +%! endif +%! unwind_protect +%! test_function (test_dir, z) +%! unwind_protect_cleanup +%! confirm_recursive_rmdir (false, "local"); +%! rmdir (test_dir, "s"); +%! end_unwind_protect +%! endfor +%!endfunction + +%!function create_file (fpath, data) +%! fid = fopen (fpath, "wb"); +%! if (fid < 0) +%! error ("unable to open file for writing"); +%! endif +%! if (fwrite (fid, data, class (data)) != numel (data)) +%! error ("unable to write to file"); +%! endif +%! if (fflush (fid) || fclose (fid)) +%! error ("unable to flush or close file"); +%! endif +%!endfunction + +## Test with large files because of varied buffer size +%!function test_large_file (test_dir, z) +%! test_file = tempname (test_dir); +%! create_file (test_file, rand (500000, 1)); +%! md5 = hash ("md5", fileread (test_file)); +%! +%! expected_z_file = [test_file z{3}]; +%! z_files = z{1} (test_file); +%! assert (z_files, {expected_z_file}) +%! +%! unlink (test_file); +%! assert (z{2} (z_files{1}), {test_file}) +%! assert (hash ("md5", fileread (test_file)), md5) +%!endfunction +%!test run_test_function (@test_large_file) + +## Test that xzipped files are rexzipped +%!function test_z_z (test_dir, z) +%! ori_file = tempname (test_dir); +%! create_file (ori_file, rand (100, 1)); +%! md5_ori = hash ("md5", fileread (ori_file)); +%! +%! z_file = [ori_file z{3}]; +%! filelist = z{1} (ori_file); +%! assert (filelist, {z_file}) # check output +%! assert (exist (z_file), 2) # confirm file exists +%! assert (exist (z_file), 2) # and did not remove original file +%! md5_z = hash ("md5", fileread (z_file)); +%! +%! unlink (ori_file); +%! assert (z{2} (z_file), {ori_file}) +%! ## bug #48597 +%! assert (exist (ori_file), 2) # bug #48597 (Xunzip should not remove file) +%! assert (hash ("md5", fileread (ori_file)), md5_ori) +%! +%! ## xzip should dutifully re-xzip files even if they already are zipped +%! z_z_file = [z_file z{3}]; +%! +%! filelist = z{1} (z_file); +%! assert (filelist, {z_z_file}) # check output +%! assert (exist (z_z_file), 2) # confirm file exists +%! assert (exist (z_z_file), 2) # and did not remove original file +%! +%! unlink (z_file); +%! assert (z{2} (z_z_file), {z_file}) +%! assert (hash ("md5", fileread (z_file)), md5_z) +%!endfunction +%!test run_test_function (@test_z_z) + +%!function test_xzip_dir (test_dir, z) +%! fpaths = fullfile (test_dir, {"test1", "test2", "test3"}); +%! z_files = strcat (fpaths, z{3}); +%! md5s = cell (1, 3); +%! for idx = 1:numel(fpaths) +%! create_file (fpaths{idx}, rand (100, 1)); +%! md5s(idx) = hash ("md5", fileread (fpaths{idx})); +%! endfor +%! +%! assert (sort (z{1} ([test_dir filesep()])), z_files(:)) +%! for idx = 1:numel(fpaths) +%! assert (exist (z_files{idx}), 2) +%! unlink (fpaths{idx}); +%! endfor +%! for idx = 1:numel(fpaths) +%! assert (z{2} (z_files{idx}), fpaths{idx}); # bug #48598 +%! assert (hash ("md5", fileread (fpaths{idx})), md5s{idx}) +%! endfor +%!endfunction +%!test run_test_function (@test_xzip_dir) + +%!function test_save_to_dir (test_dir, z) +%! filename = "test-file"; +%! filepath = fullfile (test_dir, filename); +%! create_file (filepath, rand (100, 1)); +%! md5 = hash ("md5", fileread (filepath)); +%! +%! ## test with existing and non-existing directory +%! out_dirs = {tempname (test_dir), tempname (test_dir)}; +%! if (! mkdir (out_dirs{1})) +%! error ("unable to create directory for test"); +%! endif +%! for idx = 1:numel(out_dirs) +%! out_dir = out_dirs{idx}; +%! z_file = fullfile (out_dir, [filename z{3}]); +%! assert (z{1} (filepath, out_dir), {z_file}) +%! assert (exist (z_file, "file"), 2) +%! uz_file = z_file(1:(end-numel(z{3}))); +%! assert (z{2} (z_file), uz_file); # bug #48598 +%! assert (hash ("md5", fileread (uz_file)), md5) +%! endfor +%!endfunction +%!test run_test_function (@test_save_to_dir) +*/