Mercurial > octave
view libinterp/dldfcn/xzip.cc @ 22160:766f934db568
Rewrite gzip and bzip2 in C++ instead of using its applications (bug #43431)
* bzip2.m, gzip.m, __xzip__.m: remove old implementation as m files that
copy all files into a temporary directory and then call gzip or bzip2
application. Add several new tests and remove duplication of existing
tests.
* scripts/miscellaneous/module.mk: unlist removed files.
* xzip.cc: new implementation of bzip2 and gzip functions making direct
use of the libraries in C++. Also add more tests.
* libinterp/dldfcn/module-files: list new file and required flags.
* configure.ac: add check for bzip2 library.
author | Carnë Draug <carandraug@octave.org> |
---|---|
date | Sun, 26 Jun 2016 13:32:03 +0200 |
parents | |
children | 8de49f15e182 |
line wrap: on
line source
// Copyright (C) 2016 Carnë Draug // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. //! Octave interface to the compression and uncompression libraries. /*! This was originally implemented as an m file which directly called bzip2 and gzip applications. This may look simpler but causes some issues (see bug #43431) because we have no control over the output file: - created file is always in the same directory as the original file; - automatically skip files that already have gz/bz2/etc extension; - some olders versions miss the --keep option. In addition, because system() does not have a method that allows passing a list of arguments, there is the issue of having to escape filenames. A solution is to pipe file contents into the applications instead of filenames. However, that solution causes: # missing file header with original file information; # implementing ourselves the recursive transversion of directories; # do the above in a m file which will be slow; # popen2 is frail on windows. */ #if defined (HAVE_CONFIG_H) # include "config.h" #endif #include <cstdio> #include <cstring> #include <string> #include <list> #include <functional> #include <stdexcept> #include <iostream> #include <fstream> #ifdef HAVE_BZLIB_H # include <bzlib.h> #endif #ifdef HAVE_ZLIB_H # include <zlib.h> #endif #include "Array.h" #include "str-vec.h" #include "glob-match.h" #include "file-ops.h" #include "dir-ops.h" #include "file-stat.h" #include "oct-env.h" #include "defun-dld.h" #include "defun-int.h" #include "errwarn.h" class CFile { public: FILE* fp; CFile (const std::string& path, const std::string& mode) { fp = std::fopen (path.c_str (), mode.c_str ()); if (! fp) throw std::runtime_error ("unable to open file"); } ~CFile () { if (std::fclose (fp)) // Not pedantic. If this is dest, maybe it failed to flush // so we should signal this before someone removes the source. throw std::runtime_error ("unable to close file"); } }; #ifdef HAVE_BZ2 class bz2 { private: class zipper { private: int status = BZ_OK; CFile source; CFile dest; BZFILE* bz; public: zipper (const std::string& source_path, const std::string& dest_path) : source (source_path, "rb"), dest (dest_path, "wb") { bz = BZ2_bzWriteOpen (&status, dest.fp, 9, 0, 30); if (status != BZ_OK) throw std::runtime_error ("failed to open bzip2 stream"); } void deflate (void) { const std::size_t buf_len = 8192; char buf[buf_len]; std::size_t n_read; while ((n_read = std::fread (buf, sizeof (buf[0]), buf_len, source.fp)) != 0) { if (std::ferror (source.fp)) throw std::runtime_error ("failed to read from source file"); BZ2_bzWrite (&status, bz, buf, n_read); if (status == BZ_IO_ERROR) throw std::runtime_error ("failed to write or compress"); } if (std::ferror (source.fp)) throw std::runtime_error ("failed to read from source file"); } ~zipper () { int abandon = (status == BZ_IO_ERROR) ? 1 : 0; BZ2_bzWriteClose (&status, bz, abandon, 0, 0); if (status != BZ_OK) throw std::runtime_error ("failed to close bzip2 stream"); } }; public: static const constexpr char* extension = ".bz2"; static void zip (const std::string& source_path, const std::string& dest_path) { bz2::zipper (source_path, dest_path).deflate (); } }; #endif // HAVE_BZL2 // Note about zlib and gzip // // gzip is a format for compressed single files. zlib is a format // designed for in-memory and communication channel applications. // gzip uses the same format internally for the compressed data but // has different headers and trailers. // // zlib is also a library but gzip is not. Very old versions of zlib do // not include functions to create useful gzip headers and trailers: // // Note that you cannot specify special gzip header contents (e.g. // a file name or modification date), nor will inflate tell you what // was in the gzip header. If you need to customize the header or // see what's in it, you can use the raw deflate and inflate // operations and the crc32() function and roll your own gzip // encoding and decoding. Read the gzip RFC 1952 for details of the // header and trailer format. // zlib FAQ // // Recent versions (on which we are already dependent) have deflateInit2() // to do it. We still need to get the right metadata for the header // ourselves though. // // The header is defined in RFC #1952 // GZIP file format specification version 4.3 #ifdef HAVE_Z class gz { private: // Util class to get a non-const char* class uchar_array { public: // Bytef is a typedef for unsigned char unsigned char* p; uchar_array (const std::string& str) { p = new Bytef[str.length () +1]; std::strcpy (reinterpret_cast<char*> (p), str.c_str ()); } ~uchar_array (void) { delete[] p; } }; // This is the really thing that needs to be class gzip_stream : public z_stream { public: gzip_stream () { zalloc = Z_NULL; zfree = Z_NULL; opaque = Z_NULL; } ~gzip_stream () { int status = deflateEnd (this); if (status != Z_OK) throw std::runtime_error ("failed to close zlib stream"); } }; class gzip_header : public gz_header { private: uchar_array basename; public: gzip_header (const std::string& source_path) : basename (octave::sys::env::base_pathname (source_path)) { const octave::sys::file_stat source_stat (source_path); if (! source_stat) throw std::runtime_error ("unable to stat source file"); // time_t may be a signed int in which case it will be a // positive number so it is safe to uLong. Or is it? Can // unix_time really never be negative? time = uLong (source_stat.mtime ().unix_time ()); // If FNAME is set, an original file name is present, // terminated by a zero byte. The name must consist of ISO // 8859-1 (LATIN-1) characters; on operating systems using // EBCDIC or any other character set for file names, the name // must be translated to the ISO LATIN-1 character set. This // is the original name of the file being compressed, with any // directory components removed, and, if the file being // compressed is on a file system with case insensitive names, // forced to lower case. name = basename.p; // If we don't set it to Z_NULL, then it will set FCOMMENT (4th bit) // on the FLG byte, and then write {0, 3} comment. comment = Z_NULL; // Seems to already be the default but we are not taking chances. extra = Z_NULL; // We do not want a CRC for the header. That would be only 2 more // bytes, and maybe it would be a good thing but we want to generate // gz files similar to the default gzip application. hcrc = 0; // OS (Operating System): // 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32) // 1 - Amiga // 2 - VMS (or OpenVMS) // 3 - Unix // 4 - VM/CMS // 5 - Atari TOS // 6 - HPFS filesystem (OS/2, NT) // 7 - Macintosh // 8 - Z-System // 9 - CP/M // 10 - TOPS-20 // 11 - NTFS filesystem (NT) // 12 - QDOS // 13 - Acorn RISCOS // 255 - unknown // // The list is problematic because it mixes OS and filesystem. It // also does not specify whether filesystem relates to source or // destination file. #if defined (__WIN32__) os = 0; // or should it be 11? #elif defined (__APPLE__) os = 7; #else // unix by default? os = 3; #endif } }; class zipper { private: CFile source; CFile dest; gzip_header header; gzip_stream strm = gzip_stream (); public: zipper (const std::string& source_path, const std::string& dest_path) : source (source_path, "rb"), dest (dest_path, "wb"), header (source_path) { } void deflate () { // int deflateInit2 (z_streamp strm, // int level, // compression level (default is 8) // int method, // int windowBits, // 15 (default) + 16 (gzip format) // int memLevel, // memory usage (default is 8) // int strategy); int status = deflateInit2 (&strm, 8, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); if (status != Z_OK) throw std::runtime_error ("failed to open zlib stream"); deflateSetHeader (&strm, &header); const std::size_t buf_len = 8192; unsigned char buf_in[buf_len]; unsigned char buf_out[buf_len]; while ((strm.avail_in = std::fread (buf_in, sizeof (buf_in[0]), buf_len, source.fp)) != 0) { if (std::ferror (source.fp)) throw std::runtime_error ("failed to read source file"); strm.next_in = buf_in; const int flush = std::feof (source.fp) ? Z_FINISH : Z_NO_FLUSH; // If deflate returns Z_OK and with zero avail_out, it must be // called again after making room in the output buffer because // there might be more output pending. do { strm.avail_out = buf_len; strm.next_out = buf_out; status = ::deflate (&strm, flush); if (status == Z_STREAM_ERROR) throw std::runtime_error ("failed to deflate"); std::fwrite (buf_out, sizeof (buf_out[0]), buf_len - strm.avail_out, dest.fp); if (std::ferror (dest.fp)) throw std::runtime_error ("failed to write file"); } while (strm.avail_out == 0); if (strm.avail_in != 0) throw std::runtime_error ("failed to wrote file"); } } }; public: static const constexpr char* extension = ".gz"; static void zip (const std::string& source_path, const std::string& dest_path) { gz::zipper (source_path, dest_path).deflate (); } }; #endif // HAVE_Z template<typename X> string_vector xzip (const Array<std::string>& source_patterns, const std::function<std::string(const std::string&)>& mk_dest_path) { std::list<std::string> dest_paths; std::function<void(const std::string&)> walk; walk = [&walk, &mk_dest_path, &dest_paths] (const std::string& path) -> void { const octave::sys::file_stat fs (path); // is_dir and is_reg will return false if failed to stat. if (fs.is_dir ()) { octave::sys::dir_entry dir (path); if (dir) { // Collect the whole list of filenames first, before recursion // to avoid issues with infinite loop if the action generates // files in the same directory (highly likely). string_vector dirlist = dir.read (); for (octave_idx_type i = 0; i < dirlist.numel (); i++) if (dirlist(i) != "." && dirlist(i) != "..") walk (octave::sys::file_ops::concat (path, dirlist(i))); } // Note that we skip any problem with directories. } else if (fs.is_reg ()) { const std::string dest_path = mk_dest_path (path); try { X::zip (path, dest_path); } catch (...) { // Error "handling" is not including filename on the output list. // Also remove created file which maybe was not even created // in the first place. Note that it is possible for the file // to exist in the first place and for for X::zip to not have // clobber it yet but we remove it anyway by design. octave::sys::unlink (dest_path); return; } dest_paths.push_front (dest_path); } // Skip all other file types and errors. return; }; for (octave_idx_type i = 0; i < source_patterns.numel (); i++) { const glob_match pattern (octave::sys::file_ops::tilde_expand (source_patterns(i))); const string_vector filepaths = pattern.glob (); for (octave_idx_type j = 0; j < filepaths.numel (); j++) walk (filepaths(j)); } return string_vector (dest_paths); } template<typename X> string_vector xzip (const Array<std::string>& source_patterns) { const std::string ext = X::extension; const std::function<std::string(const std::string&)> mk_dest_path = [&ext] (const std::string& source_path) -> std::string { return source_path + ext; }; return xzip<X> (source_patterns, mk_dest_path); } template<typename X> string_vector xzip (const Array<std::string>& source_patterns, const std::string& out_dir) { const std::string ext = X::extension; const std::function<std::string(const std::string&)> mk_dest_path = [&out_dir, &ext] (const std::string& source_path) -> std::string { const std::string basename = octave::sys::env::base_pathname (source_path); return octave::sys::file_ops::concat (out_dir, basename + ext); }; // We don't care if mkdir fails. Maybe it failed because it already // exists, or maybe it can't bre created. If the first, then there's // nothing to do, if the later, then it will be handled later. Any // is to be handled by not listing files in the output. octave::sys::mkdir (out_dir, 0777); return xzip<X> (source_patterns, mk_dest_path); } template<typename X> static octave_value_list xzip (const std::string& func_name, const octave_value_list& args) { const octave_idx_type nargin = args.length (); if (nargin < 1 || nargin > 2) print_usage (); const Array<std::string> source_patterns = args(0).xcellstr_value ("%s: FILES must be a character array or cellstr", func_name.c_str ()); if (nargin == 1) return octave_value (Cell (xzip<X> (source_patterns))); else // nargin == 2 { const std::string out_dir = args(1).string_value (); return octave_value (Cell (xzip<X> (source_patterns, out_dir))); } } DEFUN_DLD (gzip, args, , doc: /* -*- texinfo -*- @deftypefn {} {@var{filelist} =} gzip (@var{files}) @deftypefnx {} {@var{filelist} =} gzip (@var{files}, @var{dir}) Compress the list of files and directories specified in @var{files}. @var{files} is a character array or cell array of strings. Shell wildcards in the filename such as @samp{*} or @samp{?} are accepted and expanded. Each file is compressed separately and a new file with a @file{".gz"} extension is created. The original files are not modified, but existing compressed files will be silently overwritten. If a directory is specified then @code{gzip} recursively compresses all files in the directory. If @var{dir} is defined the compressed files are placed in this directory, rather than the original directory where the uncompressed file resides. Note that this does not replicate a directory tree in @var{dir} which may lead to files overwritting each other if there are multiple files with the same name. If @var{dir} does not exist it is created. The optional output @var{filelist} is a list of the compressed files. @seealso{gunzip, unpack, bzip2, zip, tar} @end deftypefn */) { #ifndef HAVE_Z err_disabled_feature ("gzip", "gzip"); #else return xzip<gz> ("gzip", args); #endif } /* %!error gzip () %!error gzip ("1", "2", "3") %!error <FILES must be a character array or cellstr> gzip (1) */ DEFUN_DLD (bzip2, args, , doc: /* -*- texinfo -*- @deftypefn {} {@var{filelist} =} bzip2 (@var{files}) @deftypefnx {} {@var{filelist} =} bzip2 (@var{files}, @var{dir}) Compress the list of files specified in @var{files}. @var{files} is a character array or cell array of strings. Shell wildcards in the filename such as @samp{*} or @samp{?} are accepted and expanded. Each file is compressed separately and a new file with a @file{".bz2"} extension is created. The original files are not modified, but existing compressed files will be silently overwritten. If @var{dir} is defined the compressed files are placed in this directory, rather than the original directory where the uncompressed file resides. Note that this does not replicate a directory tree in @var{dir} which may lead to files overwritting each other if there are multiple files with the same name. If @var{dir} does not exist it is created. The optional output @var{filelist} is a list of the compressed files. @seealso{bunzip2, unpack, gzip, zip, tar} @end deftypefn */) { #ifndef HAVE_BZ2 err_disabled_feature ("bzip2", "bzip2"); #else return xzip<bz2> ("bzip2", args); #endif } // Tests for both gzip/bzip2 and gunzip/bunzip2 /* ## Takes a single argument, a function handle for the test. This other ## function must accept two arguments, a directory for the tests, and ## a cell array with zip function, unzip function, and file extension. %!function run_test_function (test_function) %! enabled_zippers = cell (0, 0); %! if (__octave_config_info__ ().build_features.BZ2) %! enabled_zippers(1, end+1) = @bzip2; %! enabled_zippers(2, end) = @bunzip2; %! enabled_zippers(3, end) = ".bz2"; %! endif %! if (__octave_config_info__ ().build_features.Z) %! enabled_zippers(1, end+1) = @gzip; %! enabled_zippers(2, end) = @gunzip; %! enabled_zippers(3, end) = ".gz"; %! endif %! %! for z = enabled_zippers %! test_dir = tempname (); %! if (! mkdir (test_dir)) %! error ("unable to create directory for tests"); %! endif %! unwind_protect %! test_function (test_dir, z) %! unwind_protect_cleanup %! confirm_recursive_rmdir (false, "local"); %! rmdir (test_dir, "s"); %! end_unwind_protect %! endfor %!endfunction %!function create_file (fpath, data) %! fid = fopen (fpath, "wb"); %! if (fid < 0) %! error ("unable to open file for writing"); %! endif %! if (fwrite (fid, data, class (data)) != numel (data)) %! error ("unable to write to file"); %! endif %! if (fflush (fid) || fclose (fid)) %! error ("unable to flush or close file"); %! endif %!endfunction ## Test with large files because of varied buffer size %!function test_large_file (test_dir, z) %! test_file = tempname (test_dir); %! create_file (test_file, rand (500000, 1)); %! md5 = hash ("md5", fileread (test_file)); %! %! expected_z_file = [test_file z{3}]; %! z_files = z{1} (test_file); %! assert (z_files, {expected_z_file}) %! %! unlink (test_file); %! assert (z{2} (z_files{1}), {test_file}) %! assert (hash ("md5", fileread (test_file)), md5) %!endfunction %!test run_test_function (@test_large_file) ## Test that xzipped files are rexzipped %!function test_z_z (test_dir, z) %! ori_file = tempname (test_dir); %! create_file (ori_file, rand (100, 1)); %! md5_ori = hash ("md5", fileread (ori_file)); %! %! z_file = [ori_file z{3}]; %! filelist = z{1} (ori_file); %! assert (filelist, {z_file}) # check output %! assert (exist (z_file), 2) # confirm file exists %! assert (exist (z_file), 2) # and did not remove original file %! md5_z = hash ("md5", fileread (z_file)); %! %! unlink (ori_file); %! assert (z{2} (z_file), {ori_file}) %! ## bug #48597 %! assert (exist (ori_file), 2) # bug #48597 (Xunzip should not remove file) %! assert (hash ("md5", fileread (ori_file)), md5_ori) %! %! ## xzip should dutifully re-xzip files even if they already are zipped %! z_z_file = [z_file z{3}]; %! %! filelist = z{1} (z_file); %! assert (filelist, {z_z_file}) # check output %! assert (exist (z_z_file), 2) # confirm file exists %! assert (exist (z_z_file), 2) # and did not remove original file %! %! unlink (z_file); %! assert (z{2} (z_z_file), {z_file}) %! assert (hash ("md5", fileread (z_file)), md5_z) %!endfunction %!test run_test_function (@test_z_z) %!function test_xzip_dir (test_dir, z) %! fpaths = fullfile (test_dir, {"test1", "test2", "test3"}); %! z_files = strcat (fpaths, z{3}); %! md5s = cell (1, 3); %! for idx = 1:numel(fpaths) %! create_file (fpaths{idx}, rand (100, 1)); %! md5s(idx) = hash ("md5", fileread (fpaths{idx})); %! endfor %! %! assert (sort (z{1} ([test_dir filesep()])), z_files(:)) %! for idx = 1:numel(fpaths) %! assert (exist (z_files{idx}), 2) %! unlink (fpaths{idx}); %! endfor %! for idx = 1:numel(fpaths) %! assert (z{2} (z_files{idx}), fpaths{idx}); # bug #48598 %! assert (hash ("md5", fileread (fpaths{idx})), md5s{idx}) %! endfor %!endfunction %!test run_test_function (@test_xzip_dir) %!function test_save_to_dir (test_dir, z) %! filename = "test-file"; %! filepath = fullfile (test_dir, filename); %! create_file (filepath, rand (100, 1)); %! md5 = hash ("md5", fileread (filepath)); %! %! ## test with existing and non-existing directory %! out_dirs = {tempname (test_dir), tempname (test_dir)}; %! if (! mkdir (out_dirs{1})) %! error ("unable to create directory for test"); %! endif %! for idx = 1:numel(out_dirs) %! out_dir = out_dirs{idx}; %! z_file = fullfile (out_dir, [filename z{3}]); %! assert (z{1} (filepath, out_dir), {z_file}) %! assert (exist (z_file, "file"), 2) %! uz_file = z_file(1:(end-numel(z{3}))); %! assert (z{2} (z_file), uz_file); # bug #48598 %! assert (hash ("md5", fileread (uz_file)), md5) %! endfor %!endfunction %!test run_test_function (@test_save_to_dir) */