view libinterp/dldfcn/xzip.cc @ 22160:766f934db568

Rewrite gzip and bzip2 in C++ instead of using its applications (bug #43431) * bzip2.m, gzip.m, __xzip__.m: remove old implementation as m files that copy all files into a temporary directory and then call gzip or bzip2 application. Add several new tests and remove duplication of existing tests. * scripts/miscellaneous/module.mk: unlist removed files. * xzip.cc: new implementation of bzip2 and gzip functions making direct use of the libraries in C++. Also add more tests. * libinterp/dldfcn/module-files: list new file and required flags. * configure.ac: add check for bzip2 library.
author Carnë Draug <carandraug@octave.org>
date Sun, 26 Jun 2016 13:32:03 +0200
parents
children 8de49f15e182
line wrap: on
line source

// Copyright (C) 2016 Carnë Draug
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! Octave interface to the compression and uncompression libraries.
/*!
  This was originally implemented as an m file which directly called
  bzip2 and gzip applications.  This may look simpler but causes some
  issues (see bug #43431) because we have no control over the output
  file:

    - created file is always in the same directory as the original file;
    - automatically skip files that already have gz/bz2/etc extension;
    - some olders versions miss the --keep option.

  In addition, because system() does not have a method that allows
  passing a list of arguments, there is the issue of having to escape
  filenames.

  A solution is to pipe file contents into the applications instead of
  filenames.  However, that solution causes:

    # missing file header with original file information;
    # implementing ourselves the recursive transversion of directories;
    # do the above in a m file which will be slow;
    # popen2 is frail on windows.

*/

#if defined (HAVE_CONFIG_H)
#  include "config.h"
#endif

#include <cstdio>
#include <cstring>

#include <string>
#include <list>
#include <functional>
#include <stdexcept>
#include <iostream>
#include <fstream>

#ifdef HAVE_BZLIB_H
#  include <bzlib.h>
#endif

#ifdef HAVE_ZLIB_H
#  include <zlib.h>
#endif

#include "Array.h"
#include "str-vec.h"
#include "glob-match.h"
#include "file-ops.h"
#include "dir-ops.h"
#include "file-stat.h"
#include "oct-env.h"

#include "defun-dld.h"
#include "defun-int.h"
#include "errwarn.h"

class CFile
{
public:
  FILE* fp;

  CFile (const std::string& path, const std::string& mode)
  {
    fp = std::fopen (path.c_str (), mode.c_str ());
    if (! fp)
      throw std::runtime_error ("unable to open file");
  }

  ~CFile ()
  {
    if (std::fclose (fp))
      // Not pedantic.  If this is dest, maybe it failed to flush
      // so we should signal this before someone removes the source.
      throw std::runtime_error ("unable to close file");
  }
};

#ifdef HAVE_BZ2
class bz2
{
private:
  class zipper
  {
  private:
    int status = BZ_OK;
    CFile source;
    CFile dest;
    BZFILE* bz;

  public:
    zipper (const std::string& source_path, const std::string& dest_path)
      : source (source_path, "rb"), dest (dest_path, "wb")
    {
      bz = BZ2_bzWriteOpen (&status, dest.fp, 9, 0, 30);
      if (status != BZ_OK)
        throw std::runtime_error ("failed to open bzip2 stream");
    }

    void
    deflate (void)
    {
      const std::size_t buf_len = 8192;
      char buf[buf_len];
      std::size_t n_read;
      while ((n_read = std::fread (buf, sizeof (buf[0]), buf_len, source.fp)) != 0)
        {
          if (std::ferror (source.fp))
            throw std::runtime_error ("failed to read from source file");
          BZ2_bzWrite (&status, bz, buf, n_read);
          if (status == BZ_IO_ERROR)
            throw std::runtime_error ("failed to write or compress");
        }
      if (std::ferror (source.fp))
        throw std::runtime_error ("failed to read from source file");
    }

    ~zipper ()
    {
      int abandon = (status == BZ_IO_ERROR) ? 1 : 0;
      BZ2_bzWriteClose (&status, bz, abandon, 0, 0);
      if (status != BZ_OK)
        throw std::runtime_error ("failed to close bzip2 stream");
    }
  };

public:
  static const constexpr char* extension = ".bz2";

  static void
  zip (const std::string& source_path, const std::string& dest_path)
  {
    bz2::zipper (source_path, dest_path).deflate ();
  }

};
#endif // HAVE_BZL2

// Note about zlib and gzip
//
// gzip is a format for compressed single files.  zlib is a format
// designed for in-memory and communication channel applications.
// gzip uses the same format internally for the compressed data but
// has different headers and trailers.
//
// zlib is also a library but gzip is not.  Very old versions of zlib do
// not include functions to create useful gzip headers and trailers:
//
//      Note that you cannot specify special gzip header contents (e.g.
//      a file name or modification date), nor will inflate tell you what
//      was in the gzip header. If you need to customize the header or
//      see what's in it, you can use the raw deflate and inflate
//      operations and the crc32() function and roll your own gzip
//      encoding and decoding. Read the gzip RFC 1952 for details of the
//      header and trailer format.
//                                                          zlib FAQ
//
// Recent versions (on which we are already dependent) have deflateInit2()
// to do it.  We still need to get the right metadata for the header
// ourselves though.
//
// The header is defined in RFC #1952
// GZIP file format specification version 4.3


#ifdef HAVE_Z
class gz
{
private:

  // Util class to get a non-const char*
  class uchar_array
  {
  public:
    // Bytef is a typedef for unsigned char
    unsigned char* p;

    uchar_array (const std::string& str)
    {
      p = new Bytef[str.length () +1];
      std::strcpy (reinterpret_cast<char*> (p), str.c_str ());
    }

    ~uchar_array (void) { delete[] p; }
  };

  // This is the really thing that needs to be 
  class gzip_stream : public z_stream
  {
  public:
    gzip_stream ()
    {
      zalloc = Z_NULL;
      zfree = Z_NULL;
      opaque = Z_NULL;
    }

    ~gzip_stream ()
    {
      int status = deflateEnd (this);
      if (status != Z_OK)
        throw std::runtime_error ("failed to close zlib stream");
    }
  };

  class gzip_header : public gz_header
  {
  private:
    uchar_array basename;

  public:
    gzip_header (const std::string& source_path)
      : basename (octave::sys::env::base_pathname (source_path))
    {
      const octave::sys::file_stat source_stat (source_path);
      if (! source_stat)
        throw std::runtime_error ("unable to stat source file");

      // time_t may be a signed int in which case it will be a
      // positive number so it is safe to uLong.  Or is it?  Can
      // unix_time really never be negative?
      time = uLong (source_stat.mtime ().unix_time ());

      //  If FNAME is set, an original file name is present,
      //  terminated by a zero byte.  The name must consist of ISO
      //  8859-1 (LATIN-1) characters; on operating systems using
      //  EBCDIC or any other character set for file names, the name
      //  must be translated to the ISO LATIN-1 character set.  This
      //  is the original name of the file being compressed, with any
      //  directory components removed, and, if the file being
      //  compressed is on a file system with case insensitive names,
      //  forced to lower case.
      name = basename.p;

      // If we don't set it to Z_NULL, then it will set FCOMMENT (4th bit)
      // on the FLG byte, and then write {0, 3} comment.
      comment = Z_NULL;

      // Seems to already be the default but we are not taking chances.
      extra = Z_NULL;

      // We do not want a CRC for the header.  That would be only 2 more
      // bytes, and maybe it would be a good thing but we want to generate
      // gz files similar to the default gzip application.
      hcrc = 0;

      // OS (Operating System):
      //      0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
      //      1 - Amiga
      //      2 - VMS (or OpenVMS)
      //      3 - Unix
      //      4 - VM/CMS
      //      5 - Atari TOS
      //      6 - HPFS filesystem (OS/2, NT)
      //      7 - Macintosh
      //      8 - Z-System
      //      9 - CP/M
      //     10 - TOPS-20
      //     11 - NTFS filesystem (NT)
      //     12 - QDOS
      //     13 - Acorn RISCOS
      //    255 - unknown
      //
      // The list is problematic because it mixes OS and filesystem.  It
      // also does not specify whether filesystem relates to source or
      // destination file.

#if defined (__WIN32__)
      os = 0; // or should it be 11?
#elif defined (__APPLE__)
      os = 7;
#else // unix by default?
      os = 3;
#endif
    }
  };

  class zipper
  {
  private:
    CFile source;
    CFile dest;
    gzip_header header;
    gzip_stream strm = gzip_stream ();

  public:
    zipper (const std::string& source_path, const std::string& dest_path)
      : source (source_path, "rb"), dest (dest_path, "wb"),
        header (source_path)
    { }

    void
    deflate ()
    {
      // int deflateInit2 (z_streamp strm,
      //                   int  level,      // compression level (default is 8)
      //                   int  method,
      //                   int  windowBits, // 15 (default) + 16 (gzip format)
      //                   int  memLevel,   // memory usage (default is 8)
      //                   int  strategy);

      int status = deflateInit2 (&strm, 8, Z_DEFLATED, 31, 8,
                                 Z_DEFAULT_STRATEGY);
      if (status != Z_OK)
        throw std::runtime_error ("failed to open zlib stream");

      deflateSetHeader (&strm, &header);

      const std::size_t buf_len = 8192;
      unsigned char buf_in[buf_len];
      unsigned char buf_out[buf_len];

      while ((strm.avail_in = std::fread (buf_in, sizeof (buf_in[0]),
                                          buf_len, source.fp)) != 0)
        {
          if (std::ferror (source.fp))
            throw std::runtime_error ("failed to read source file");

          strm.next_in = buf_in;
          const int flush = std::feof (source.fp) ? Z_FINISH : Z_NO_FLUSH;

          // If deflate returns Z_OK and with zero avail_out, it must be
          // called again after making room in the output buffer because
          // there might be more output pending.
          do
            {
              strm.avail_out = buf_len;
              strm.next_out = buf_out;
              status = ::deflate (&strm, flush);
              if (status == Z_STREAM_ERROR)
                throw std::runtime_error ("failed to deflate");

              std::fwrite (buf_out, sizeof (buf_out[0]),
                           buf_len - strm.avail_out, dest.fp);
              if (std::ferror (dest.fp))
                throw std::runtime_error ("failed to write file");
            }
          while (strm.avail_out == 0);

          if (strm.avail_in != 0)
            throw std::runtime_error ("failed to wrote file");
        }
    }
  };

public:
  static const constexpr char* extension = ".gz";

  static void
  zip (const std::string& source_path, const std::string& dest_path)
  {
    gz::zipper (source_path, dest_path).deflate ();
  }
};
#endif // HAVE_Z


template<typename X>
string_vector
xzip (const Array<std::string>& source_patterns,
      const std::function<std::string(const std::string&)>& mk_dest_path)
{
  std::list<std::string> dest_paths;

  std::function<void(const std::string&)> walk;
  walk = [&walk, &mk_dest_path, &dest_paths] (const std::string& path) -> void
  {
    const octave::sys::file_stat fs (path);
    // is_dir and is_reg will return false if failed to stat.
    if (fs.is_dir ())
      {
        octave::sys::dir_entry dir (path);
        if (dir)
          {
            // Collect the whole list of filenames first, before recursion
            // to avoid issues with infinite loop if the action generates
            // files in the same directory (highly likely).
            string_vector dirlist = dir.read ();
            for (octave_idx_type i = 0; i < dirlist.numel (); i++)
              if (dirlist(i) != "." && dirlist(i) != "..")
                walk (octave::sys::file_ops::concat (path, dirlist(i)));
          }
        // Note that we skip any problem with directories.
      }
    else if (fs.is_reg ())
      {
        const std::string dest_path = mk_dest_path (path);
        try
          {
            X::zip (path, dest_path);
          }
        catch (...)
          {
            // Error "handling" is not including filename on the output list.
            // Also remove created file which maybe was not even created
            // in the first place.  Note that it is possible for the file
            // to exist in the first place and for for X::zip to not have
            // clobber it yet but we remove it anyway by design.
            octave::sys::unlink (dest_path);
            return;
          }
        dest_paths.push_front (dest_path);
      }
    // Skip all other file types and errors.
    return;
  };

  for (octave_idx_type i = 0; i < source_patterns.numel (); i++)
    {
      const glob_match pattern (octave::sys::file_ops::tilde_expand (source_patterns(i)));
      const string_vector filepaths = pattern.glob ();
      for (octave_idx_type j = 0; j < filepaths.numel (); j++)
        walk (filepaths(j));
    }
  return string_vector (dest_paths);
}


template<typename X>
string_vector
xzip (const Array<std::string>& source_patterns)
{
  const std::string ext = X::extension;
  const std::function<std::string(const std::string&)> mk_dest_path
    = [&ext] (const std::string& source_path) -> std::string
  {
    return source_path + ext;
  };
  return xzip<X> (source_patterns, mk_dest_path);
}

template<typename X>
string_vector
xzip (const Array<std::string>& source_patterns, const std::string& out_dir)
{
  const std::string ext = X::extension;
  const std::function<std::string(const std::string&)> mk_dest_path
    = [&out_dir, &ext] (const std::string& source_path) -> std::string
  {
    const std::string basename = octave::sys::env::base_pathname (source_path);
    return octave::sys::file_ops::concat (out_dir, basename + ext);
  };

  // We don't care if mkdir fails.  Maybe it failed because it already
  // exists, or maybe it can't bre created.  If the first, then there's
  // nothing to do, if the later, then it will be handled later.  Any
  // is to be handled by not listing files in the output.
  octave::sys::mkdir (out_dir, 0777);
  return xzip<X> (source_patterns, mk_dest_path);
}

template<typename X>
static octave_value_list
xzip (const std::string& func_name, const octave_value_list& args)
{
  const octave_idx_type nargin = args.length ();
  if (nargin < 1 || nargin > 2)
    print_usage ();

  const Array<std::string> source_patterns
    = args(0).xcellstr_value ("%s: FILES must be a character array or cellstr",
                              func_name.c_str ());
  if (nargin == 1)
    return octave_value (Cell (xzip<X> (source_patterns)));
  else // nargin == 2
    {
      const std::string out_dir = args(1).string_value ();
      return octave_value (Cell (xzip<X> (source_patterns, out_dir)));
    }
}

DEFUN_DLD (gzip, args, ,
           doc: /* -*- texinfo -*-
@deftypefn  {} {@var{filelist} =} gzip (@var{files})
@deftypefnx {} {@var{filelist} =} gzip (@var{files}, @var{dir})
Compress the list of files and directories specified in @var{files}.

@var{files} is a character array or cell array of strings.  Shell wildcards
in the filename such as @samp{*} or @samp{?} are accepted and expanded.
Each file is compressed separately and a new file with a @file{".gz"}
extension is created.  The original files are not modified, but existing
compressed files will be silently overwritten.  If a directory is
specified then @code{gzip} recursively compresses all files in the
directory.

If @var{dir} is defined the compressed files are placed in this directory,
rather than the original directory where the uncompressed file resides.
Note that this does not replicate a directory tree in @var{dir} which may
lead to files overwritting each other if there are multiple files with the
same name.

If @var{dir} does not exist it is created.

The optional output @var{filelist} is a list of the compressed files.
@seealso{gunzip, unpack, bzip2, zip, tar}
@end deftypefn */)
{
#ifndef HAVE_Z
  err_disabled_feature ("gzip", "gzip");
#else
  return xzip<gz> ("gzip", args);
#endif
}

/*
%!error gzip ()
%!error gzip ("1", "2", "3")
%!error <FILES must be a character array or cellstr> gzip (1)
*/

DEFUN_DLD (bzip2, args, ,
           doc: /* -*- texinfo -*-
@deftypefn  {} {@var{filelist} =} bzip2 (@var{files})
@deftypefnx {} {@var{filelist} =} bzip2 (@var{files}, @var{dir})
Compress the list of files specified in @var{files}.

@var{files} is a character array or cell array of strings.  Shell wildcards
in the filename such as @samp{*} or @samp{?} are accepted and expanded.
Each file is compressed separately and a new file with a @file{".bz2"}
extension is created.  The original files are not modified, but existing
compressed files will be silently overwritten.

If @var{dir} is defined the compressed files are placed in this directory,
rather than the original directory where the uncompressed file resides.
Note that this does not replicate a directory tree in @var{dir} which may
lead to files overwritting each other if there are multiple files with the
same name.

If @var{dir} does not exist it is created.

The optional output @var{filelist} is a list of the compressed files.
@seealso{bunzip2, unpack, gzip, zip, tar}
@end deftypefn */)
{
#ifndef HAVE_BZ2
  err_disabled_feature ("bzip2", "bzip2");
#else
  return xzip<bz2> ("bzip2", args);
#endif
}

// Tests for both gzip/bzip2 and gunzip/bunzip2
/*

## Takes a single argument, a function handle for the test.  This other
## function must accept two arguments, a directory for the tests, and
## a cell array with zip function, unzip function, and file extension.

%!function run_test_function (test_function)
%!  enabled_zippers = cell (0, 0);
%!  if (__octave_config_info__ ().build_features.BZ2)
%!    enabled_zippers(1, end+1) = @bzip2;
%!    enabled_zippers(2, end) = @bunzip2;
%!    enabled_zippers(3, end) = ".bz2";
%!  endif
%!  if (__octave_config_info__ ().build_features.Z)
%!    enabled_zippers(1, end+1) = @gzip;
%!    enabled_zippers(2, end) = @gunzip;
%!    enabled_zippers(3, end) = ".gz";
%!  endif
%!
%!  for z = enabled_zippers
%!    test_dir = tempname ();
%!    if (! mkdir (test_dir))
%!      error ("unable to create directory for tests");
%!    endif
%!    unwind_protect
%!      test_function (test_dir, z)
%!    unwind_protect_cleanup
%!      confirm_recursive_rmdir (false, "local");
%!      rmdir (test_dir, "s");
%!    end_unwind_protect
%!  endfor
%!endfunction

%!function create_file (fpath, data)
%!  fid = fopen (fpath, "wb");
%!  if (fid < 0)
%!    error ("unable to open file for writing");
%!  endif
%!  if (fwrite (fid, data, class (data)) != numel (data))
%!    error ("unable to write to file");
%!  endif
%!  if (fflush (fid) || fclose (fid))
%!    error ("unable to flush or close file");
%!  endif
%!endfunction

## Test with large files because of varied buffer size
%!function test_large_file (test_dir, z)
%!  test_file = tempname (test_dir);
%!  create_file (test_file, rand (500000, 1));
%!  md5 = hash ("md5", fileread (test_file));
%!
%!  expected_z_file = [test_file z{3}];
%!  z_files = z{1} (test_file);
%!  assert (z_files, {expected_z_file})
%!
%!  unlink (test_file);
%!  assert (z{2} (z_files{1}), {test_file})
%!  assert (hash ("md5", fileread (test_file)), md5)
%!endfunction
%!test run_test_function (@test_large_file)

## Test that xzipped files are rexzipped
%!function test_z_z (test_dir, z)
%!  ori_file = tempname (test_dir);
%!  create_file (ori_file, rand (100, 1));
%!  md5_ori = hash ("md5", fileread (ori_file));
%!
%!  z_file = [ori_file z{3}];
%!  filelist = z{1} (ori_file);
%!  assert (filelist, {z_file}) # check output
%!  assert (exist (z_file), 2) # confirm file exists
%!  assert (exist (z_file), 2) # and did not remove original file
%!  md5_z = hash ("md5", fileread (z_file));
%!
%!  unlink (ori_file);
%!  assert (z{2} (z_file), {ori_file})
%!  ## bug #48597
%!  assert (exist (ori_file), 2) # bug #48597 (Xunzip should not remove file)
%!  assert (hash ("md5", fileread (ori_file)), md5_ori)
%!
%!  ## xzip should dutifully re-xzip files even if they already are zipped
%!  z_z_file = [z_file z{3}];
%!
%!  filelist = z{1} (z_file);
%!  assert (filelist, {z_z_file}) # check output
%!  assert (exist (z_z_file), 2) # confirm file exists
%!  assert (exist (z_z_file), 2) # and did not remove original file
%!
%!  unlink (z_file);
%!  assert (z{2} (z_z_file), {z_file})
%!  assert (hash ("md5", fileread (z_file)), md5_z)
%!endfunction
%!test run_test_function (@test_z_z)

%!function test_xzip_dir (test_dir, z)
%!  fpaths = fullfile (test_dir, {"test1", "test2", "test3"});
%!  z_files = strcat (fpaths, z{3});
%!  md5s = cell (1, 3);
%!  for idx = 1:numel(fpaths)
%!    create_file (fpaths{idx}, rand (100, 1));
%!    md5s(idx) = hash ("md5", fileread (fpaths{idx}));
%!  endfor
%!
%!  assert (sort (z{1} ([test_dir filesep()])), z_files(:))
%!  for idx = 1:numel(fpaths)
%!    assert (exist (z_files{idx}), 2)
%!    unlink (fpaths{idx});
%!  endfor
%!  for idx = 1:numel(fpaths)
%!    assert (z{2} (z_files{idx}), fpaths{idx}); # bug #48598
%!    assert (hash ("md5", fileread (fpaths{idx})), md5s{idx})
%!  endfor
%!endfunction
%!test run_test_function (@test_xzip_dir)

%!function test_save_to_dir (test_dir, z)
%!  filename = "test-file";
%!  filepath = fullfile (test_dir, filename);
%!  create_file (filepath, rand (100, 1));
%!  md5 = hash ("md5", fileread (filepath));
%!
%!  ## test with existing and non-existing directory
%!  out_dirs = {tempname (test_dir), tempname (test_dir)};
%!  if (! mkdir (out_dirs{1}))
%!    error ("unable to create directory for test");
%!  endif
%!  for idx = 1:numel(out_dirs)
%!    out_dir = out_dirs{idx};
%!    z_file = fullfile (out_dir, [filename z{3}]);
%!    assert (z{1} (filepath, out_dir), {z_file})
%!    assert (exist (z_file, "file"), 2)
%!    uz_file = z_file(1:(end-numel(z{3})));
%!    assert (z{2} (z_file), uz_file); # bug #48598
%!    assert (hash ("md5", fileread (uz_file)), md5)
%!  endfor
%!endfunction
%!test run_test_function (@test_save_to_dir)
*/