-
Notifications
You must be signed in to change notification settings - Fork 69
Libdeflate port #25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Libdeflate port #25
Changes from all commits
e08daa7
17e9ace
a2fcbdb
5c0b86f
3cb55a8
32f4c39
da1d85c
72ca105
16c669c
864905e
4b7ec6f
d9e64d1
abfdc49
bd480a6
487927c
4e284d0
15205f2
b4684ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
#include <gzip/config.hpp> | ||
|
||
// zlib | ||
#include <zlib.h> | ||
#include <libdeflate.h> | ||
|
||
// std | ||
#include <limits> | ||
|
@@ -14,95 +14,65 @@ class Compressor | |
{ | ||
std::size_t max_; | ||
int level_; | ||
struct libdeflate_compressor* compressor_ = nullptr; | ||
// make noncopyable | ||
Compressor(Compressor const&) = delete; | ||
Compressor& operator=(Compressor const&) = delete; | ||
|
||
public: | ||
Compressor(int level = Z_DEFAULT_COMPRESSION, | ||
Compressor(int level = 6, | ||
std::size_t max_bytes = 2000000000) // by default refuse operation if uncompressed data is > 2GB | ||
: max_(max_bytes), | ||
level_(level) | ||
{ | ||
compressor_ = libdeflate_alloc_compressor(level_); | ||
if (!compressor_) | ||
{ | ||
throw std::runtime_error("libdeflate_alloc_compressor failed"); | ||
} | ||
} | ||
|
||
template <typename InputType> | ||
void compress(InputType& output, | ||
const char* data, | ||
std::size_t size) const | ||
~Compressor() | ||
{ | ||
|
||
#ifdef DEBUG | ||
// Verify if size input will fit into unsigned int, type used for zlib's avail_in | ||
if (size > std::numeric_limits<unsigned int>::max()) | ||
if (compressor_) | ||
{ | ||
throw std::runtime_error("size arg is too large to fit into unsigned int type"); | ||
libdeflate_free_compressor(compressor_); | ||
} | ||
#endif | ||
} | ||
|
||
template <typename OutputType> | ||
void compress(OutputType& output, | ||
char const* data, | ||
std::size_t size) const | ||
{ | ||
if (size > max_) | ||
{ | ||
throw std::runtime_error("size may use more memory than intended when decompressing"); | ||
} | ||
|
||
z_stream deflate_s; | ||
deflate_s.zalloc = Z_NULL; | ||
deflate_s.zfree = Z_NULL; | ||
deflate_s.opaque = Z_NULL; | ||
deflate_s.avail_in = 0; | ||
deflate_s.next_in = Z_NULL; | ||
|
||
// The windowBits parameter is the base two logarithm of the window size (the size of the history buffer). | ||
// It should be in the range 8..15 for this version of the library. | ||
// Larger values of this parameter result in better compression at the expense of memory usage. | ||
// This range of values also changes the decoding type: | ||
// -8 to -15 for raw deflate | ||
// 8 to 15 for zlib | ||
// (8 to 15) + 16 for gzip | ||
// (8 to 15) + 32 to automatically detect gzip/zlib header (decompression/inflate only) | ||
constexpr int window_bits = 15 + 16; // gzip with windowbits of 15 | ||
|
||
constexpr int mem_level = 8; | ||
// The memory requirements for deflate are (in bytes): | ||
// (1 << (window_bits+2)) + (1 << (mem_level+9)) | ||
// with a default value of 8 for mem_level and our window_bits of 15 | ||
// this is 128Kb | ||
|
||
#pragma GCC diagnostic push | ||
#pragma GCC diagnostic ignored "-Wold-style-cast" | ||
if (deflateInit2(&deflate_s, level_, Z_DEFLATED, window_bits, mem_level, Z_DEFAULT_STRATEGY) != Z_OK) | ||
std::size_t max_compressed_size = libdeflate_gzip_compress_bound(compressor_, size); | ||
artemp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// TODO: sanity check this before allocating | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason for this comment is fear/lack of knowledge on my part. What happens if |
||
if (max_compressed_size > output.size()) | ||
{ | ||
throw std::runtime_error("deflate init failed"); | ||
output.resize(max_compressed_size); | ||
} | ||
#pragma GCC diagnostic pop | ||
|
||
deflate_s.next_in = reinterpret_cast<z_const Bytef*>(data); | ||
deflate_s.avail_in = static_cast<unsigned int>(size); | ||
|
||
std::size_t size_compressed = 0; | ||
do | ||
std::size_t actual_compressed_size = libdeflate_gzip_compress(compressor_, | ||
data, | ||
size, | ||
const_cast<char*>(output.data()), | ||
max_compressed_size); | ||
if (actual_compressed_size == 0) | ||
{ | ||
size_t increase = size / 2 + 1024; | ||
if (output.size() < (size_compressed + increase)) | ||
{ | ||
output.resize(size_compressed + increase); | ||
} | ||
// There is no way we see that "increase" would not fit in an unsigned int, | ||
// hence we use static cast here to avoid -Wshorten-64-to-32 error | ||
deflate_s.avail_out = static_cast<unsigned int>(increase); | ||
deflate_s.next_out = reinterpret_cast<Bytef*>((&output[0] + size_compressed)); | ||
// From http://www.zlib.net/zlib_how.html | ||
// "deflate() has a return value that can indicate errors, yet we do not check it here. | ||
// Why not? Well, it turns out that deflate() can do no wrong here." | ||
// Basically only possible error is from deflateInit not working properly | ||
deflate(&deflate_s, Z_FINISH); | ||
size_compressed += (increase - deflate_s.avail_out); | ||
} while (deflate_s.avail_out == 0); | ||
|
||
deflateEnd(&deflate_s); | ||
output.resize(size_compressed); | ||
throw std::runtime_error("actual_compressed_size 0"); | ||
} | ||
output.resize(actual_compressed_size); | ||
} | ||
}; | ||
|
||
inline std::string compress(const char* data, | ||
std::size_t size, | ||
int level = Z_DEFAULT_COMPRESSION) | ||
int level = 6) | ||
artemp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
Compressor comp(level); | ||
std::string output; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
#include <gzip/config.hpp> | ||
|
||
// zlib | ||
#include <zlib.h> | ||
#include <libdeflate.h> | ||
|
||
// std | ||
#include <limits> | ||
|
@@ -12,85 +12,74 @@ namespace gzip { | |
|
||
class Decompressor | ||
{ | ||
std::size_t max_; | ||
std::size_t const max_; | ||
struct libdeflate_decompressor* decompressor_ = nullptr; | ||
artemp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// make noncopyable | ||
Decompressor(Decompressor const&) = delete; | ||
Decompressor& operator=(Decompressor const&) = delete; | ||
|
||
public: | ||
Decompressor(std::size_t max_bytes = 1000000000) // by default refuse operation if compressed data is > 1GB | ||
Decompressor(std::size_t max_bytes = 2147483648u) // by default refuse operation if required uutput buffer is > 2GB | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @artemp typo: |
||
: max_(max_bytes) | ||
{ | ||
decompressor_ = libdeflate_alloc_decompressor(); | ||
if (!decompressor_) | ||
{ | ||
throw std::runtime_error("libdeflate_alloc_decompressor failed"); | ||
} | ||
} | ||
|
||
~Decompressor() | ||
{ | ||
if (decompressor_) | ||
{ | ||
libdeflate_free_decompressor(decompressor_); | ||
} | ||
} | ||
|
||
template <typename OutputType> | ||
void decompress(OutputType& output, | ||
const char* data, | ||
char const* data, | ||
std::size_t size) const | ||
{ | ||
z_stream inflate_s; | ||
|
||
inflate_s.zalloc = Z_NULL; | ||
inflate_s.zfree = Z_NULL; | ||
inflate_s.opaque = Z_NULL; | ||
inflate_s.avail_in = 0; | ||
inflate_s.next_in = Z_NULL; | ||
|
||
// The windowBits parameter is the base two logarithm of the window size (the size of the history buffer). | ||
// It should be in the range 8..15 for this version of the library. | ||
// Larger values of this parameter result in better compression at the expense of memory usage. | ||
// This range of values also changes the decoding type: | ||
// -8 to -15 for raw deflate | ||
// 8 to 15 for zlib | ||
// (8 to 15) + 16 for gzip | ||
// (8 to 15) + 32 to automatically detect gzip/zlib header | ||
constexpr int window_bits = 15 + 32; // auto with windowbits of 15 | ||
|
||
#pragma GCC diagnostic push | ||
#pragma GCC diagnostic ignored "-Wold-style-cast" | ||
if (inflateInit2(&inflate_s, window_bits) != Z_OK) | ||
// https://github.com/kaorimatz/libdeflate-ruby/blob/0e33da96cdaad3162f03ec924b25b2f4f2847538/ext/libdeflate/libdeflate_ext.c#L340 | ||
// https://github.com/ebiggers/libdeflate/commit/5a9d25a8922e2d74618fba96e56db4fe145510f4 | ||
artemp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
std::size_t actual_size; | ||
std::size_t uncompressed_size_guess = std::min(size * 4, max_); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens when |
||
output.resize(uncompressed_size_guess); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This probably needs to be re-written to avoid needing to guess the size up front. @artemp could you tackle that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
diff --git a/include/gzip/decompress.hpp b/include/gzip/decompress.hpp
index badd315..627e670 100644
--- a/include/gzip/decompress.hpp
+++ b/include/gzip/decompress.hpp
@@ -57,17 +57,23 @@ class Decompressor
// https://github.com/ebiggers/libdeflate/commit/5a9d25a8922e2d74618fba96e56db4fe145510f4
std::size_t actual_size;
std::size_t uncompressed_size_guess = size * 4;
- output.resize(uncompressed_size_guess);
- enum libdeflate_result result = libdeflate_gzip_decompress(decompressor_,
- data,
- size,
- static_cast<void*>(&output[0]),
- uncompressed_size_guess, &actual_size);
- if (result == LIBDEFLATE_INSUFFICIENT_SPACE)
+ output.reserve(uncompressed_size_guess);
+ enum libdeflate_result result;
+ for (;;)
{
- throw std::runtime_error("no space: did not succeed");
+ result = libdeflate_gzip_decompress(decompressor_,
+ data,
+ size,
+ const_cast<char*>(output.data()),
+ output.capacity(), &actual_size);
+ if (result != LIBDEFLATE_INSUFFICIENT_SPACE)
+ {
+ break;
+ }
+ output.reserve((output.capacity() << 1) - output.size());
}
- else if (result == LIBDEFLATE_SHORT_OUTPUT)
+
+ if (result == LIBDEFLATE_SHORT_OUTPUT)
{
throw std::runtime_error("short output: did not succeed");
}
Couple notes:
/cc @springmeyer There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @artemp great fix - can you please commit directly to this branch? But two concerns to keep in mind:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Good point, I'll add conditional exception in the loop if allocation size is getting to big.
Let me investigate and fix this ^ I'm going to push updates and continue work on this branch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
👍 I think you'll likely need an easy way to test larger, varied files and a CLI would allow that. So could you finish #29 to aid your testing of this branch? |
||
enum libdeflate_result result; | ||
for (;;) | ||
{ | ||
throw std::runtime_error("inflate init failed"); | ||
result = libdeflate_gzip_decompress(decompressor_, | ||
data, | ||
size, | ||
const_cast<char*>(output.data()), | ||
output.size(), &actual_size); | ||
if (result != LIBDEFLATE_INSUFFICIENT_SPACE) | ||
{ | ||
break; | ||
} | ||
if (output.size() == max_) | ||
{ | ||
throw std::runtime_error("request to resize output buffer can't exceed maximum limit"); | ||
} | ||
std::size_t new_size = std::min((output.capacity() << 1) - output.size(), max_); | ||
output.resize(new_size); | ||
} | ||
#pragma GCC diagnostic pop | ||
inflate_s.next_in = reinterpret_cast<z_const Bytef*>(data); | ||
|
||
#ifdef DEBUG | ||
// Verify if size (long type) input will fit into unsigned int, type used for zlib's avail_in | ||
std::uint64_t size_64 = size * 2; | ||
if (size_64 > std::numeric_limits<unsigned int>::max()) | ||
if (result == LIBDEFLATE_SHORT_OUTPUT) | ||
{ | ||
inflateEnd(&inflate_s); | ||
throw std::runtime_error("size arg is too large to fit into unsigned int type x2"); | ||
throw std::runtime_error("short output: did not succeed"); | ||
} | ||
#endif | ||
if (size > max_ || (size * 2) > max_) | ||
else if (result == LIBDEFLATE_BAD_DATA) | ||
{ | ||
inflateEnd(&inflate_s); | ||
throw std::runtime_error("size may use more memory than intended when decompressing"); | ||
throw std::runtime_error("bad data: did not succeed"); | ||
} | ||
inflate_s.avail_in = static_cast<unsigned int>(size); | ||
std::size_t size_uncompressed = 0; | ||
do | ||
else if (result != LIBDEFLATE_SUCCESS) | ||
{ | ||
std::size_t resize_to = size_uncompressed + 2 * size; | ||
if (resize_to > max_) | ||
{ | ||
inflateEnd(&inflate_s); | ||
throw std::runtime_error("size of output string will use more memory then intended when decompressing"); | ||
} | ||
output.resize(resize_to); | ||
inflate_s.avail_out = static_cast<unsigned int>(2 * size); | ||
inflate_s.next_out = reinterpret_cast<Bytef*>(&output[0] + size_uncompressed); | ||
int ret = inflate(&inflate_s, Z_FINISH); | ||
if (ret != Z_STREAM_END && ret != Z_OK && ret != Z_BUF_ERROR) | ||
{ | ||
std::string error_msg = inflate_s.msg; | ||
inflateEnd(&inflate_s); | ||
throw std::runtime_error(error_msg); | ||
} | ||
|
||
size_uncompressed += (2 * size - inflate_s.avail_out); | ||
} while (inflate_s.avail_out == 0); | ||
inflateEnd(&inflate_s); | ||
output.resize(size_uncompressed); | ||
throw std::runtime_error("did not succeed"); | ||
} | ||
output.resize(actual_size); | ||
} | ||
}; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Noting that this approach (initialize C struct pointer in constructor + free the memory in the deconstructor) is applying RAII (https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization) to avoid a memory leak.