Skip to content

Commit 398fc5a

Browse files
authored
GH-48112: [C++][Parquet] Use more accurate data length estimate when decoding PLAIN BYTE_ARRAY data (#48113)
### Rationale for this change

Avoid reserving too many data bytes when decoding a PLAIN BYTE_ARRAY Parquet column as Arrow Binary or LargeBinary.

### Are these changes tested?

By existing tests.

### Are there any user-facing changes?

No.

* GitHub Issue: #48112

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 277faa9 commit 398fc5a

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

cpp/src/parquet/decoder.cc

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -754,6 +754,12 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
754754
int64_t valid_bits_offset,
755755
typename EncodingTraits<ByteArrayType>::Accumulator* out,
756756
int* out_values_decoded) {
757+
// We're going to decode up to `num_values - null_count` PLAIN values,
758+
// and each value has a 4-byte length header that doesn't count for the
759+
// Arrow binary data length.
760+
int64_t estimated_data_length =
761+
std::max<int64_t>(0, len_ - 4 * (num_values - null_count));
762+
757763
auto visit_binary_helper = [&](auto* helper) {
758764
int values_decoded = 0;
759765

@@ -772,11 +778,12 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
772778
"Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
773779
}
774780
RETURN_NOT_OK(
775-
helper->AppendValue(data_ + 4, value_len,
776-
/*estimated_remaining_data_length=*/len_));
781+
helper->AppendValue(data_ + 4, value_len, estimated_data_length));
777782
auto increment = value_len + 4;
778783
data_ += increment;
779784
len_ -= increment;
785+
estimated_data_length -= value_len;
786+
DCHECK_GE(estimated_data_length, 0);
780787
}
781788
values_decoded += static_cast<int>(run_length);
782789
return Status::OK();
@@ -790,8 +797,8 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType> {
790797
return Status::OK();
791798
};
792799

793-
return DispatchArrowBinaryHelper<ByteArrayType>(out, num_values, len_,
794-
visit_binary_helper);
800+
return DispatchArrowBinaryHelper<ByteArrayType>(
801+
out, num_values, estimated_data_length, visit_binary_helper);
795802
}
796803

797804
template <typename BuilderType>

0 commit comments

Comments
 (0)