diff --git a/CMakeLists.txt b/CMakeLists.txt index 9280ae1..41dcb42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,8 +35,8 @@ include_directories(SYSTEM ${MASON_PACKAGE_catch_INCLUDE_DIRS}) mason_use(benchmark VERSION 1.3.0) include_directories(SYSTEM ${MASON_PACKAGE_benchmark_INCLUDE_DIRS}) -mason_use(zlib VERSION 1.2.8) -include_directories(SYSTEM ${MASON_PACKAGE_zlib_INCLUDE_DIRS}) +mason_use(libdeflate VERSION 1.0) +include_directories(SYSTEM ${MASON_PACKAGE_libdeflate_INCLUDE_DIRS}) include_directories("${PROJECT_SOURCE_DIR}/include") @@ -49,5 +49,5 @@ file(GLOB BENCH_SOURCES bench/*.cpp) add_executable(bench-tests ${BENCH_SOURCES}) # link zlib static library to the unit-tests binary so the tests know where to find the zlib impl code -target_link_libraries(unit-tests ${MASON_PACKAGE_zlib_STATIC_LIBS}) -target_link_libraries(bench-tests ${MASON_PACKAGE_benchmark_STATIC_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${MASON_PACKAGE_zlib_STATIC_LIBS}) +target_link_libraries(unit-tests ${MASON_PACKAGE_libdeflate_STATIC_LIBS}) +target_link_libraries(bench-tests ${MASON_PACKAGE_benchmark_STATIC_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${MASON_PACKAGE_libdeflate_STATIC_LIBS}) diff --git a/include/gzip/compress.hpp b/include/gzip/compress.hpp index 2ec56c2..1ff90dd 100644 --- a/include/gzip/compress.hpp +++ b/include/gzip/compress.hpp @@ -1,7 +1,7 @@ #include // zlib -#include +#include // std #include @@ -14,95 +14,65 @@ class Compressor { std::size_t max_; int level_; + struct libdeflate_compressor* compressor_ = nullptr; + // make noncopyable + Compressor(Compressor const&) = delete; + Compressor& operator=(Compressor const&) = delete; public: - Compressor(int level = Z_DEFAULT_COMPRESSION, + Compressor(int level = 6, std::size_t max_bytes = 2000000000) // by default refuse operation if uncompressed data is > 2GB : max_(max_bytes), level_(level) { + compressor_ = libdeflate_alloc_compressor(level_); + if (!compressor_) + { + throw std::runtime_error("libdeflate_alloc_compressor 
failed"); + } } - template - void compress(InputType& output, - const char* data, - std::size_t size) const + ~Compressor() { - -#ifdef DEBUG - // Verify if size input will fit into unsigned int, type used for zlib's avail_in - if (size > std::numeric_limits::max()) + if (compressor_) { - throw std::runtime_error("size arg is too large to fit into unsigned int type"); + libdeflate_free_compressor(compressor_); } -#endif + } + + template + void compress(OutputType& output, + char const* data, + std::size_t size) const + { if (size > max_) { throw std::runtime_error("size may use more memory than intended when decompressing"); } - z_stream deflate_s; - deflate_s.zalloc = Z_NULL; - deflate_s.zfree = Z_NULL; - deflate_s.opaque = Z_NULL; - deflate_s.avail_in = 0; - deflate_s.next_in = Z_NULL; - - // The windowBits parameter is the base two logarithm of the window size (the size of the history buffer). - // It should be in the range 8..15 for this version of the library. - // Larger values of this parameter result in better compression at the expense of memory usage. 
- // This range of values also changes the decoding type: - // -8 to -15 for raw deflate - // 8 to 15 for zlib - // (8 to 15) + 16 for gzip - // (8 to 15) + 32 to automatically detect gzip/zlib header (decompression/inflate only) - constexpr int window_bits = 15 + 16; // gzip with windowbits of 15 - - constexpr int mem_level = 8; - // The memory requirements for deflate are (in bytes): - // (1 << (window_bits+2)) + (1 << (mem_level+9)) - // with a default value of 8 for mem_level and our window_bits of 15 - // this is 128Kb - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wold-style-cast" - if (deflateInit2(&deflate_s, level_, Z_DEFLATED, window_bits, mem_level, Z_DEFAULT_STRATEGY) != Z_OK) + std::size_t max_compressed_size = libdeflate_gzip_compress_bound(compressor_, size); + // TODO: sanity check this before allocating + if (max_compressed_size > output.size()) { - throw std::runtime_error("deflate init failed"); + output.resize(max_compressed_size); } -#pragma GCC diagnostic pop - deflate_s.next_in = reinterpret_cast(data); - deflate_s.avail_in = static_cast(size); - - std::size_t size_compressed = 0; - do + std::size_t actual_compressed_size = libdeflate_gzip_compress(compressor_, + data, + size, + const_cast(output.data()), + max_compressed_size); + if (actual_compressed_size == 0) { - size_t increase = size / 2 + 1024; - if (output.size() < (size_compressed + increase)) - { - output.resize(size_compressed + increase); - } - // There is no way we see that "increase" would not fit in an unsigned int, - // hence we use static cast here to avoid -Wshorten-64-to-32 error - deflate_s.avail_out = static_cast(increase); - deflate_s.next_out = reinterpret_cast((&output[0] + size_compressed)); - // From http://www.zlib.net/zlib_how.html - // "deflate() has a return value that can indicate errors, yet we do not check it here. - // Why not? Well, it turns out that deflate() can do no wrong here." 
- // Basically only possible error is from deflateInit not working properly - deflate(&deflate_s, Z_FINISH); - size_compressed += (increase - deflate_s.avail_out); - } while (deflate_s.avail_out == 0); - - deflateEnd(&deflate_s); - output.resize(size_compressed); + throw std::runtime_error("actual_compressed_size 0"); + } + output.resize(actual_compressed_size); } }; inline std::string compress(const char* data, std::size_t size, - int level = Z_DEFAULT_COMPRESSION) + int level = 6) { Compressor comp(level); std::string output; diff --git a/include/gzip/decompress.hpp b/include/gzip/decompress.hpp index b70670f..2efc214 100644 --- a/include/gzip/decompress.hpp +++ b/include/gzip/decompress.hpp @@ -1,7 +1,7 @@ #include // zlib -#include +#include // std #include @@ -12,85 +12,74 @@ namespace gzip { class Decompressor { - std::size_t max_; + std::size_t const max_; + struct libdeflate_decompressor* decompressor_ = nullptr; + // make noncopyable + Decompressor(Decompressor const&) = delete; + Decompressor& operator=(Decompressor const&) = delete; public: - Decompressor(std::size_t max_bytes = 1000000000) // by default refuse operation if compressed data is > 1GB + Decompressor(std::size_t max_bytes = 2147483648u) // by default refuse operation if required output buffer is > 2GB : max_(max_bytes) { + decompressor_ = libdeflate_alloc_decompressor(); + if (!decompressor_) + { + throw std::runtime_error("libdeflate_alloc_decompressor failed"); + } + } + + ~Decompressor() + { + if (decompressor_) + { + libdeflate_free_decompressor(decompressor_); + } } template void decompress(OutputType& output, - const char* data, + char const* data, std::size_t size) const { - z_stream inflate_s; - - inflate_s.zalloc = Z_NULL; - inflate_s.zfree = Z_NULL; - inflate_s.opaque = Z_NULL; - inflate_s.avail_in = 0; - inflate_s.next_in = Z_NULL; - - // The windowBits parameter is the base two logarithm of the window size (the size of the history buffer). 
- // It should be in the range 8..15 for this version of the library. - // Larger values of this parameter result in better compression at the expense of memory usage. - // This range of values also changes the decoding type: - // -8 to -15 for raw deflate - // 8 to 15 for zlib - // (8 to 15) + 16 for gzip - // (8 to 15) + 32 to automatically detect gzip/zlib header - constexpr int window_bits = 15 + 32; // auto with windowbits of 15 - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wold-style-cast" - if (inflateInit2(&inflate_s, window_bits) != Z_OK) + // https://github.com/kaorimatz/libdeflate-ruby/blob/0e33da96cdaad3162f03ec924b25b2f4f2847538/ext/libdeflate/libdeflate_ext.c#L340 + // https://github.com/ebiggers/libdeflate/commit/5a9d25a8922e2d74618fba96e56db4fe145510f4 + std::size_t actual_size; + std::size_t uncompressed_size_guess = std::min(size * 4, max_); + output.resize(uncompressed_size_guess); + enum libdeflate_result result; + for (;;) { - throw std::runtime_error("inflate init failed"); + result = libdeflate_gzip_decompress(decompressor_, + data, + size, + const_cast(output.data()), + output.size(), &actual_size); + if (result != LIBDEFLATE_INSUFFICIENT_SPACE) + { + break; + } + if (output.size() == max_) + { + throw std::runtime_error("request to resize output buffer can't exceed maximum limit"); + } + std::size_t new_size = (output.size() > max_ / 2) ? max_ : std::min(std::max(output.size() * 2, std::size_t(64)), max_); // always grow: the previous capacity-based formula yields new_size == size() whenever capacity() == size(), so the retry loop never terminated + output.resize(new_size); } -#pragma GCC diagnostic pop - inflate_s.next_in = reinterpret_cast(data); -#ifdef DEBUG - // Verify if size (long type) input will fit into unsigned int, type used for zlib's avail_in - std::uint64_t size_64 = size * 2; - if (size_64 > std::numeric_limits::max()) + if (result == LIBDEFLATE_SHORT_OUTPUT) { - inflateEnd(&inflate_s); - throw std::runtime_error("size arg is too large to fit into unsigned int type x2"); + throw std::runtime_error("short output: did not succeed"); } -#endif - if (size > max_ || (size * 2) > max_) + else 
if (result == LIBDEFLATE_BAD_DATA) { - inflateEnd(&inflate_s); - throw std::runtime_error("size may use more memory than intended when decompressing"); + throw std::runtime_error("bad data: did not succeed"); } - inflate_s.avail_in = static_cast(size); - std::size_t size_uncompressed = 0; - do + else if (result != LIBDEFLATE_SUCCESS) { - std::size_t resize_to = size_uncompressed + 2 * size; - if (resize_to > max_) - { - inflateEnd(&inflate_s); - throw std::runtime_error("size of output string will use more memory then intended when decompressing"); - } - output.resize(resize_to); - inflate_s.avail_out = static_cast(2 * size); - inflate_s.next_out = reinterpret_cast(&output[0] + size_uncompressed); - int ret = inflate(&inflate_s, Z_FINISH); - if (ret != Z_STREAM_END && ret != Z_OK && ret != Z_BUF_ERROR) - { - std::string error_msg = inflate_s.msg; - inflateEnd(&inflate_s); - throw std::runtime_error(error_msg); - } - - size_uncompressed += (2 * size - inflate_s.avail_out); - } while (inflate_s.avail_out == 0); - inflateEnd(&inflate_s); - output.resize(size_uncompressed); + throw std::runtime_error("did not succeed"); + } + output.resize(actual_size); } }; diff --git a/include/gzip/utils.hpp b/include/gzip/utils.hpp index db123d1..affcd64 100644 --- a/include/gzip/utils.hpp +++ b/include/gzip/utils.hpp @@ -5,7 +5,7 @@ namespace gzip { // These live in gzip.hpp because it doesnt need to use deps. 
// Otherwise, they would need to live in impl files if these methods used // zlib structures or functions like inflate/deflate) -inline bool is_compressed(const char* data, std::size_t size) +inline bool is_compressed(const char* data, std::size_t size) noexcept { return size > 2 && ( diff --git a/test/test_io.cpp b/test/test_io.cpp index a245a11..39d0181 100644 --- a/test/test_io.cpp +++ b/test/test_io.cpp @@ -6,13 +6,9 @@ TEST_CASE("successful compress") { std::string data = "hello hello hello hello"; - - SECTION("pointer") - { - const char* pointer = data.data(); - std::string value = gzip::compress(pointer, data.size()); - REQUIRE(!value.empty()); - } + const char* pointer = data.data(); + std::string value = gzip::compress(pointer, data.size()); + REQUIRE(!value.empty()); } TEST_CASE("fail compress - throws max size limit") @@ -25,20 +21,6 @@ TEST_CASE("fail compress - throws max size limit") CHECK_THROWS_WITH(gzip::compress(pointer, l), Catch::Contains("size may use more memory than intended when decompressing")); } -#ifdef DEBUG -TEST_CASE("fail compress - pointer, debug throws int overflow") -{ - std::string data = "hello hello hello hello"; - const char* pointer = data.data(); - // numeric_limit useful for integer conversion - unsigned int i = std::numeric_limits::max(); - // turn int i into a long, so we can add to it safely without overflow - unsigned long l = static_cast(i) + 1; - - CHECK_THROWS_WITH(gzip::compress(pointer, l), Catch::Contains("size arg is too large to fit into unsigned int type")); -} -#endif - TEST_CASE("successful decompress - pointer") { std::string data = "hello hello hello hello"; @@ -49,23 +31,6 @@ TEST_CASE("successful decompress - pointer") REQUIRE(data == value); } -#ifdef DEBUG -TEST_CASE("fail decompress - pointer, debug throws int overflow") -{ - std::string data = "hello hello hello hello"; - const char* pointer = data.data(); - std::string compressed_data = gzip::compress(pointer, data.size()); - const char* 
compressed_pointer = compressed_data.data(); - - // numeric_limit useful for integer conversion - unsigned int i = std::numeric_limits::max(); - // turn int i into a long, so we can add to it safely without overflow - unsigned long l = static_cast(i) + 1; - - CHECK_THROWS_WITH(gzip::decompress(compressed_pointer, l), Catch::Contains("size arg is too large to fit into unsigned int type x2")); -} -#endif - TEST_CASE("invalid decompression") { std::string data("this is a string that should be compressed data"); @@ -90,7 +55,7 @@ TEST_CASE("round trip compression - gzip") SECTION("no compression") { - int level = Z_NO_COMPRESSION; + int level = 0; std::string compressed_data = gzip::compress(data.data(), data.size()); CHECK(gzip::is_compressed(compressed_data.data(), compressed_data.size())); std::string new_data = gzip::decompress(compressed_data.data(), compressed_data.size()); @@ -99,7 +64,7 @@ TEST_CASE("round trip compression - gzip") SECTION("default compression level") { - int level = Z_DEFAULT_COMPRESSION; + int level = 6; std::string compressed_data = gzip::compress(data.data(), data.size()); CHECK(gzip::is_compressed(compressed_data.data(), compressed_data.size())); std::string new_data = gzip::decompress(compressed_data.data(), compressed_data.size()); @@ -108,7 +73,7 @@ TEST_CASE("round trip compression - gzip") SECTION("compression level -- min to max") { - for (int level = Z_BEST_SPEED; level <= Z_BEST_COMPRESSION; ++level) + for (int level = 1; level <= 9; ++level) { std::string compressed_data = gzip::compress(data.data(), data.size()); CHECK(gzip::is_compressed(compressed_data.data(), compressed_data.size())); @@ -130,10 +95,10 @@ TEST_CASE("test decompression size limit") std::istreambuf_iterator()); stream.close(); - std::size_t limit = 20 * 1024 * 1024; // 20 Mb + std::size_t limit = 500 * 1024 * 1024; // 500 Mb // file should be about 500 mb uncompressed gzip::Decompressor decomp(limit); std::string output; CHECK_THROWS(decomp.decompress(output, 
str_compressed.data(), str_compressed.size())); - CHECK(output.size() < limit); + CHECK(output.size() <= limit); }