From 8ebb7a22dae24a4b55435e47398445020889e7ba Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 28 Jun 2025 07:10:07 -0400 Subject: [PATCH 1/9] Add support for multiple zstd frames in decompression --- numcodecs/tests/test_zstd.py | 14 ++++++++++++- numcodecs/zstd.pyx | 39 +++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/numcodecs/tests/test_zstd.py b/numcodecs/tests/test_zstd.py index 04b474df..7891cded 100644 --- a/numcodecs/tests/test_zstd.py +++ b/numcodecs/tests/test_zstd.py @@ -87,7 +87,6 @@ def test_native_functions(): assert Zstd.min_level() == -131072 assert Zstd.max_level() == 22 - def test_streaming_decompression(): # Test input frames with unknown frame content size codec = Zstd() @@ -156,3 +155,16 @@ def zstd_cli_available() -> bool: return not subprocess.run( ["zstd", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL ).returncode + +def test_multi_frame(): + codec = Zstd() + + hello_world = codec.encode(b"Hello world!") + assert codec.decode(hello_world) == b"Hello world!" + assert codec.decode(hello_world*2) == b"Hello world!Hello world!" + + hola = codec.encode(b"Hola ") + mundo = codec.encode(b"Mundo!") + assert codec.decode(hola) == b"Hola " + assert codec.decode(mundo) == b"Mundo!" + assert codec.decode(hola+mundo) == b"Hola Mundo!" 
diff --git a/numcodecs/zstd.pyx b/numcodecs/zstd.pyx index f93da633..0ae4f7f8 100644 --- a/numcodecs/zstd.pyx +++ b/numcodecs/zstd.pyx @@ -68,10 +68,12 @@ cdef extern from "zstd.h": size_t ZSTD_freeDStream(ZSTD_DStream* zds) nogil size_t ZSTD_initDStream(ZSTD_DStream* zds) nogil - cdef long ZSTD_CONTENTSIZE_UNKNOWN - cdef long ZSTD_CONTENTSIZE_ERROR + cdef unsigned long long ZSTD_CONTENTSIZE_UNKNOWN + cdef unsigned long long ZSTD_CONTENTSIZE_ERROR + unsigned long long ZSTD_getFrameContentSize(const void* src, size_t srcSize) nogil + size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) nogil int ZSTD_minCLevel() nogil int ZSTD_maxCLevel() nogil @@ -216,7 +218,11 @@ def decompress(source, dest=None): try: # determine uncompressed size - dest_size = ZSTD_getFrameContentSize(source_ptr, source_size) + try: + dest_size = findTotalContentSize(source_ptr, source_size) + except RuntimeError: + raise RuntimeError('Zstd decompression error: invalid input data') + if dest_size == 0 or dest_size == ZSTD_CONTENTSIZE_ERROR: raise RuntimeError('Zstd decompression error: invalid input data') @@ -353,6 +359,33 @@ cdef stream_decompress(const Py_buffer* source_pb): return dest +cdef findTotalContentSize(const void* source_ptr, size_t source_size): + cdef: + unsigned long long frame_content_size = 0 + unsigned long long total_content_size = 0 + size_t frame_compressed_size = 0 + size_t offset = 0 + + while offset < source_size: + frame_compressed_size = ZSTD_findFrameCompressedSize(source_ptr + offset, source_size - offset) + + if ZSTD_isError(frame_compressed_size): + error = ZSTD_getErrorName(frame_compressed_size) + raise RuntimeError('Could not determine zstd frame size: %s' % error) + + frame_content_size = ZSTD_getFrameContentSize(source_ptr + offset, frame_compressed_size) + + if frame_content_size == ZSTD_CONTENTSIZE_ERROR: + return ZSTD_CONTENTSIZE_ERROR + + if frame_content_size == ZSTD_CONTENTSIZE_UNKNOWN: + return ZSTD_CONTENTSIZE_UNKNOWN + 
total_content_size += frame_content_size + offset += frame_compressed_size + + return total_content_size + class Zstd(Codec): """Codec providing compression using Zstandard. From 28d92edf8d3ef691fb3f274971d99dce8575bdfd Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 28 Jun 2025 07:27:49 -0400 Subject: [PATCH 2/9] Add release notes --- docs/release.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index ac4f851d..5e80861b 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -27,6 +27,8 @@ Improvements By :user:`John Kirkham `, :issue:`723` * All codecs are now pickleable. By :user:`Tom Nicholas `, :issue:`744` +* The Zstandard codec can now decode bytes containing multiple frames. + By :user:`Mark Kittisopikul `, :issue:`757` Fixes ~~~~~ From 5c7fecb8739a02aa73474aef9169eba38414dec1 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 28 Jun 2025 15:37:17 -0400 Subject: [PATCH 3/9] Format with ruff --- numcodecs/tests/test_zstd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/tests/test_zstd.py b/numcodecs/tests/test_zstd.py index 7891cded..a854cf56 100644 --- a/numcodecs/tests/test_zstd.py +++ b/numcodecs/tests/test_zstd.py @@ -161,10 +161,10 @@ def test_multi_frame(): hello_world = codec.encode(b"Hello world!") assert codec.decode(hello_world) == b"Hello world!" - assert codec.decode(hello_world*2) == b"Hello world!Hello world!" + assert codec.decode(hello_world * 2) == b"Hello world!Hello world!" hola = codec.encode(b"Hola ") mundo = codec.encode(b"Mundo!") assert codec.decode(hola) == b"Hola " assert codec.decode(mundo) == b"Mundo!" - assert codec.decode(hola+mundo) == b"Hola Mundo!" + assert codec.decode(hola + mundo) == b"Hola Mundo!" 
From 7835f416532e75a28caafc3a81ddd5b88d371318 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 28 Jun 2025 15:38:01 -0400 Subject: [PATCH 4/9] Address MSVC type errors --- numcodecs/zstd.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/zstd.pyx b/numcodecs/zstd.pyx index 0ae4f7f8..8de8a0ec 100644 --- a/numcodecs/zstd.pyx +++ b/numcodecs/zstd.pyx @@ -359,7 +359,7 @@ cdef stream_decompress(const Py_buffer* source_pb): return dest -cdef findTotalContentSize(const void* source_ptr, size_t source_size): +cdef findTotalContentSize(const char* source_ptr, size_t source_size): cdef: unsigned long long frame_content_size = 0 unsigned long long total_content_size = 0 From 2096a4269d5f186f25ad48de62cb51b0e897ed15 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 28 Jun 2025 15:51:43 -0400 Subject: [PATCH 5/9] Explicitly declare return type of findTotalContentSize --- numcodecs/zstd.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/zstd.pyx b/numcodecs/zstd.pyx index 8de8a0ec..5a65503e 100644 --- a/numcodecs/zstd.pyx +++ b/numcodecs/zstd.pyx @@ -359,7 +359,7 @@ cdef stream_decompress(const Py_buffer* source_pb): return dest -cdef findTotalContentSize(const char* source_ptr, size_t source_size): +cdef unsigned long long findTotalContentSize(const char* source_ptr, size_t source_size): cdef: unsigned long long frame_content_size = 0 unsigned long long total_content_size = 0 From c61ccce2a57d7ca8d0d1e78083c01f1f44f8ed8b Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 14 Jul 2025 03:39:27 -0400 Subject: [PATCH 6/9] Mark multiframe pyzstd tests as now passing --- numcodecs/tests/test_pyzstd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numcodecs/tests/test_pyzstd.py b/numcodecs/tests/test_pyzstd.py index b9dd6db2..7ee6084b 100644 --- a/numcodecs/tests/test_pyzstd.py +++ b/numcodecs/tests/test_pyzstd.py @@ -25,7 +25,6 @@ def test_pyzstd_simple(input): assert 
pyzstd.decompress(z.encode(input)) == input -@pytest.mark.xfail @pytest.mark.parametrize("input", test_data) def test_pyzstd_simple_multiple_frames_decode(input): """ From 786b09cd154182e6a4240949cd2205aab0cf6664 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Mon, 14 Jul 2025 03:43:12 -0400 Subject: [PATCH 7/9] Format with ruff --- numcodecs/tests/test_zstd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numcodecs/tests/test_zstd.py b/numcodecs/tests/test_zstd.py index a854cf56..d745ae6c 100644 --- a/numcodecs/tests/test_zstd.py +++ b/numcodecs/tests/test_zstd.py @@ -87,6 +87,7 @@ def test_native_functions(): assert Zstd.min_level() == -131072 assert Zstd.max_level() == 22 + def test_streaming_decompression(): # Test input frames with unknown frame content size codec = Zstd() @@ -156,6 +157,7 @@ def zstd_cli_available() -> bool: ["zstd", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL ).returncode + def test_multi_frame(): codec = Zstd() From f365747c3ebf0f7de52226b4333ccf3a1e405099 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 16 Jul 2025 22:56:34 -0400 Subject: [PATCH 8/9] Test concatenated frames of known and unknown sizes --- numcodecs/tests/test_zstd.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/numcodecs/tests/test_zstd.py b/numcodecs/tests/test_zstd.py index d745ae6c..a3a926eb 100644 --- a/numcodecs/tests/test_zstd.py +++ b/numcodecs/tests/test_zstd.py @@ -170,3 +170,20 @@ def test_multi_frame(): assert codec.decode(hola) == b"Hola " assert codec.decode(mundo) == b"Mundo!" assert codec.decode(hola + mundo) == b"Hola Mundo!" + + bytes_val = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!' + dec = codec.decode(bytes_val) + dec_expected = b'Hello World!' 
+ assert dec == dec_expected + cli = zstd_cli_available() + if cli: + assert bytes_val == generate_zstd_streaming_bytes(dec_expected) + assert dec_expected == generate_zstd_streaming_bytes(bytes_val, decompress=True) + + # Concatenate frames of known sizes and unknown sizes + # unknown size frame at the end + assert codec.decode(hola + mundo + bytes_val) == b"Hola Mundo!Hello World!" + # unknown size frame at the beginning + assert codec.decode(bytes_val + hola + mundo) == b"Hello World!Hola Mundo!" + # unknown size frame in the middle + assert codec.decode(hola + bytes_val + mundo) == b"Hola Hello World!Mundo!" From d67fba9299caf1c377f3419810efbec9ad8860d1 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 16 Jul 2025 23:01:16 -0400 Subject: [PATCH 9/9] Add docstring for findTotalContentSize --- numcodecs/zstd.pyx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/numcodecs/zstd.pyx b/numcodecs/zstd.pyx index 5a65503e..b3cc19f3 100644 --- a/numcodecs/zstd.pyx +++ b/numcodecs/zstd.pyx @@ -360,6 +360,19 @@ cdef stream_decompress(const Py_buffer* source_pb): return dest cdef unsigned long long findTotalContentSize(const char* source_ptr, size_t source_size): + """Find the total uncompressed content size of all frames in the source buffer + + Parameters + ---------- + source_ptr : Pointer to the beginning of the buffer + source_size : Size of the buffer containing the frame sizes to sum + + Returns + ------- + total_content_size: Sum of the content size of all frames within the source buffer + If any of the frame sizes is unknown, return ZSTD_CONTENTSIZE_UNKNOWN. + If any of the frames causes ZSTD_getFrameContentSize to error, return ZSTD_CONTENTSIZE_ERROR. + """ cdef: unsigned long long frame_content_size = 0 unsigned long long total_content_size = 0