Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ include_directories(SYSTEM ${MASON_PACKAGE_catch_INCLUDE_DIRS})
mason_use(benchmark VERSION 1.3.0)
include_directories(SYSTEM ${MASON_PACKAGE_benchmark_INCLUDE_DIRS})

mason_use(zlib VERSION 1.2.8)
include_directories(SYSTEM ${MASON_PACKAGE_zlib_INCLUDE_DIRS})
mason_use(libdeflate VERSION e9d1014)
include_directories(SYSTEM ${MASON_PACKAGE_libdeflate_INCLUDE_DIRS})

include_directories("${PROJECT_SOURCE_DIR}/include")

Expand All @@ -49,5 +49,5 @@ file(GLOB BENCH_SOURCES bench/*.cpp)
add_executable(bench-tests ${BENCH_SOURCES})

# link zlib static library to the unit-tests binary so the tests know where to find the zlib impl code
target_link_libraries(unit-tests ${MASON_PACKAGE_zlib_STATIC_LIBS})
target_link_libraries(bench-tests ${MASON_PACKAGE_benchmark_STATIC_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${MASON_PACKAGE_zlib_STATIC_LIBS})
target_link_libraries(unit-tests ${MASON_PACKAGE_libdeflate_STATIC_LIBS})
target_link_libraries(bench-tests ${MASON_PACKAGE_benchmark_STATIC_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${MASON_PACKAGE_libdeflate_STATIC_LIBS})
85 changes: 30 additions & 55 deletions include/gzip/compress.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <gzip/config.hpp>

// zlib
#include <zlib.h>
#include <libdeflate.h>

// std
#include <limits>
Expand All @@ -14,13 +14,27 @@ class Compressor
{
std::size_t max_;
int level_;
struct libdeflate_compressor* compressor_ = nullptr;

public:
Compressor(int level = Z_DEFAULT_COMPRESSION,
Compressor(int level = 6,
std::size_t max_bytes = 2000000000) // by default refuse operation if uncompressed data is > 2GB
: max_(max_bytes),
level_(level)
{
compressor_ = libdeflate_alloc_compressor(level_);
if (!compressor_)
{
throw std::runtime_error("libdeflate_alloc_compressor failed");
}
}

~Compressor()
{
if (compressor_)
{
libdeflate_free_compressor(compressor_);
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noting that this approach (initializing the C struct pointer in the constructor and freeing the memory in the destructor) applies RAII (https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization) to avoid a memory leak.

}

template <typename InputType>
Expand All @@ -41,68 +55,29 @@ class Compressor
throw std::runtime_error("size may use more memory than intended when decompressing");
}

z_stream deflate_s;
deflate_s.zalloc = Z_NULL;
deflate_s.zfree = Z_NULL;
deflate_s.opaque = Z_NULL;
deflate_s.avail_in = 0;
deflate_s.next_in = Z_NULL;

// The windowBits parameter is the base two logarithm of the window size (the size of the history buffer).
// It should be in the range 8..15 for this version of the library.
// Larger values of this parameter result in better compression at the expense of memory usage.
// This range of values also changes the decoding type:
// -8 to -15 for raw deflate
// 8 to 15 for zlib
// (8 to 15) + 16 for gzip
// (8 to 15) + 32 to automatically detect gzip/zlib header (decompression/inflate only)
constexpr int window_bits = 15 + 16; // gzip with windowbits of 15

constexpr int mem_level = 8;
// The memory requirements for deflate are (in bytes):
// (1 << (window_bits+2)) + (1 << (mem_level+9))
// with a default value of 8 for mem_level and our window_bits of 15
// this is 128Kb

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
if (deflateInit2(&deflate_s, level_, Z_DEFLATED, window_bits, mem_level, Z_DEFAULT_STRATEGY) != Z_OK)
std::size_t max_compressed_size = libdeflate_gzip_compress_bound(compressor_, size);
// TODO: sanity check this before allocating
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason for this comment is fear/lack of knowledge on my part. What happens if libdeflate_gzip_compress_bound is buggy and returns a really massive value? Is that possible? Would we end up trying to allocate so much memory the machine would crumble? Probably not possible, but I've also not looked inside libdeflate yet to figure out how much to worry about this.

if (max_compressed_size > output.size())
{
throw std::runtime_error("deflate init failed");
output.resize(max_compressed_size);
}
#pragma GCC diagnostic pop

deflate_s.next_in = reinterpret_cast<z_const Bytef*>(data);
deflate_s.avail_in = static_cast<unsigned int>(size);

std::size_t size_compressed = 0;
do
std::size_t actual_compressed_size = libdeflate_gzip_compress(compressor_,
data,
size,
const_cast<char*>(output.data()),
max_compressed_size);
if (actual_compressed_size == 0)
{
size_t increase = size / 2 + 1024;
if (output.size() < (size_compressed + increase))
{
output.resize(size_compressed + increase);
}
// There is no way we see that "increase" would not fit in an unsigned int,
// hence we use static cast here to avoid -Wshorten-64-to-32 error
deflate_s.avail_out = static_cast<unsigned int>(increase);
deflate_s.next_out = reinterpret_cast<Bytef*>((&output[0] + size_compressed));
// From http://www.zlib.net/zlib_how.html
// "deflate() has a return value that can indicate errors, yet we do not check it here.
// Why not? Well, it turns out that deflate() can do no wrong here."
// Basically only possible error is from deflateInit not working properly
deflate(&deflate_s, Z_FINISH);
size_compressed += (increase - deflate_s.avail_out);
} while (deflate_s.avail_out == 0);

deflateEnd(&deflate_s);
output.resize(size_compressed);
throw std::runtime_error("actual_compressed_size 0");
}
output.resize(actual_compressed_size);
}
};

inline std::string compress(const char* data,
std::size_t size,
int level = Z_DEFAULT_COMPRESSION)
int level = 6)
{
Compressor comp(level);
std::string output;
Expand Down
95 changes: 45 additions & 50 deletions include/gzip/decompress.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <gzip/config.hpp>

// zlib
#include <zlib.h>
#include <libdeflate.h>

// std
#include <limits>
Expand All @@ -13,84 +13,79 @@ namespace gzip {
class Decompressor
{
std::size_t max_;
struct libdeflate_decompressor* decompressor_ = nullptr;

public:
/// Creates a gzip decompressor that owns a libdeflate decompressor handle.
///
/// @param max_bytes  size limit stored in max_
/// @throws std::runtime_error if libdeflate_alloc_decompressor returns a null handle
Decompressor(std::size_t max_bytes = 1000000000) // by default refuse operation if compressed data is > 1GB
    : max_(max_bytes),
      decompressor_(libdeflate_alloc_decompressor())
{
    // Allocation failure is reported by a null handle; surface it as an exception.
    if (decompressor_ == nullptr)
    {
        throw std::runtime_error("libdeflate_alloc_decompressor failed");
    }
}

~Decompressor()
{
if (decompressor_)
{
libdeflate_free_decompressor(decompressor_);
}
}

template <typename OutputType>
void decompress(OutputType& output,
const char* data,
std::size_t size) const
{
z_stream inflate_s;

inflate_s.zalloc = Z_NULL;
inflate_s.zfree = Z_NULL;
inflate_s.opaque = Z_NULL;
inflate_s.avail_in = 0;
inflate_s.next_in = Z_NULL;

// The windowBits parameter is the base two logarithm of the window size (the size of the history buffer).
// It should be in the range 8..15 for this version of the library.
// Larger values of this parameter result in better compression at the expense of memory usage.
// This range of values also changes the decoding type:
// -8 to -15 for raw deflate
// 8 to 15 for zlib
// (8 to 15) + 16 for gzip
// (8 to 15) + 32 to automatically detect gzip/zlib header
constexpr int window_bits = 15 + 32; // auto with windowbits of 15

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
if (inflateInit2(&inflate_s, window_bits) != Z_OK)
{
throw std::runtime_error("inflate init failed");
}
#pragma GCC diagnostic pop
inflate_s.next_in = reinterpret_cast<z_const Bytef*>(data);

#ifdef DEBUG
// Verify if size (long type) input will fit into unsigned int, type used for zlib's avail_in
std::uint64_t size_64 = size * 2;
if (size_64 > std::numeric_limits<unsigned int>::max())
{
inflateEnd(&inflate_s);
throw std::runtime_error("size arg is too large to fit into unsigned int type x2");
}
#endif
if (size > max_ || (size * 2) > max_)
{
inflateEnd(&inflate_s);
throw std::runtime_error("size may use more memory than intended when decompressing");
}
inflate_s.avail_in = static_cast<unsigned int>(size);
std::size_t size_uncompressed = 0;
do

// https://github.com/kaorimatz/libdeflate-ruby/blob/0e33da96cdaad3162f03ec924b25b2f4f2847538/ext/libdeflate/libdeflate_ext.c#L340
// https://github.com/ebiggers/libdeflate/commit/5a9d25a8922e2d74618fba96e56db4fe145510f4
std::size_t actual_size;
std::size_t uncompressed_size_guess = size * 4;
output.reserve(uncompressed_size_guess);
enum libdeflate_result result;
for (;;)
{
std::size_t resize_to = size_uncompressed + 2 * size;
if (resize_to > max_)
{
inflateEnd(&inflate_s);
throw std::runtime_error("size of output string will use more memory then intended when decompressing");
}
output.resize(resize_to);
inflate_s.avail_out = static_cast<unsigned int>(2 * size);
inflate_s.next_out = reinterpret_cast<Bytef*>(&output[0] + size_uncompressed);
int ret = inflate(&inflate_s, Z_FINISH);
if (ret != Z_STREAM_END && ret != Z_OK && ret != Z_BUF_ERROR)
result = libdeflate_gzip_decompress(decompressor_,
data,
size,
const_cast<char*>(output.data()),
output.capacity(), &actual_size);
if (result != LIBDEFLATE_INSUFFICIENT_SPACE)
{
std::string error_msg = inflate_s.msg;
inflateEnd(&inflate_s);
throw std::runtime_error(error_msg);
break;
}
output.reserve((output.capacity() << 1) - output.size());
}

size_uncompressed += (2 * size - inflate_s.avail_out);
} while (inflate_s.avail_out == 0);
inflateEnd(&inflate_s);
output.resize(size_uncompressed);
if (result == LIBDEFLATE_SHORT_OUTPUT)
{
throw std::runtime_error("short output: did not succeed");
}
else if (result == LIBDEFLATE_BAD_DATA)
{
throw std::runtime_error("bad data: did not succeed");
}
else if (result != LIBDEFLATE_SUCCESS)
{
throw std::runtime_error("did not succeed");
}
output.resize(actual_size);
}
};

Expand Down
8 changes: 4 additions & 4 deletions test/test_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ TEST_CASE("round trip compression - gzip")

SECTION("no compression")
{
int level = Z_NO_COMPRESSION;
int level = 0;
std::string compressed_data = gzip::compress(data.data(), data.size());
CHECK(gzip::is_compressed(compressed_data.data(), compressed_data.size()));
std::string new_data = gzip::decompress(compressed_data.data(), compressed_data.size());
Expand All @@ -99,7 +99,7 @@ TEST_CASE("round trip compression - gzip")

SECTION("default compression level")
{
int level = Z_DEFAULT_COMPRESSION;
int level = 6;
std::string compressed_data = gzip::compress(data.data(), data.size());
CHECK(gzip::is_compressed(compressed_data.data(), compressed_data.size()));
std::string new_data = gzip::decompress(compressed_data.data(), compressed_data.size());
Expand All @@ -108,7 +108,7 @@ TEST_CASE("round trip compression - gzip")

SECTION("compression level -- min to max")
{
for (int level = Z_BEST_SPEED; level <= Z_BEST_COMPRESSION; ++level)
for (int level = 1; level <= 9; ++level)
{
std::string compressed_data = gzip::compress(data.data(), data.size());
CHECK(gzip::is_compressed(compressed_data.data(), compressed_data.size()));
Expand All @@ -130,7 +130,7 @@ TEST_CASE("test decompression size limit")
std::istreambuf_iterator<char>());
stream.close();

std::size_t limit = 20 * 1024 * 1024; // 20 Mb
std::size_t limit = 500 * 1024 * 1024; // 500 Mb
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@springmeyer - I've changed the logic to validate the output buffer size rather than the input size, which makes more sense in my opinion.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👌

// file should be about 500 mb uncompressed
gzip::Decompressor decomp(limit);
std::string output;
Expand Down