Skip to content

Commit ff1f71d

Browse files
AntoinePrvpitrou
andauthored
apacheGH-47895: [C++][Parquet] Add prolog and epilog in unpack (apache#47896)
### Rationale for this change - Simplify the use of `unpack` - Reduce code spread for unpacking integers ### What changes are included in this PR? - epilog: `unpack` extract exactly the required number of values -> change return type to `void`. - prolog: `unpack` can handled non aligned data -> include `bit_offset` in input parameters. - Include prolog/epilog cases in `unpack` tests. - Simplify a roundtrip test from packed -> unpacked -> packed to unpacked -> packed -> unpacked Decoder benchmarks should remain the same (tested linux x86-64). I have not benchmark the `unpack` functions themselves but I don't believe it's relevant since they now do more work. ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: apache#47895 Lead-authored-by: AntoinePrv <[email protected]> Co-authored-by: Antoine Prouvost <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 6a08785 commit ff1f71d

19 files changed

+581
-509
lines changed

cpp/src/arrow/util/bit_run_reader.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t
4545

4646
// Prepare for inversion in NextRun.
4747
// Clear out any preceding bits.
48-
word_ = word_ & ~bit_util::LeastSignificantBitMask(position_);
48+
word_ = word_ & ~bit_util::LeastSignificantBitMask<uint64_t>(position_);
4949
}
5050

5151
#endif

cpp/src/arrow/util/bit_run_reader.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class ARROW_EXPORT BitRunReader {
106106
int64_t start_bit_offset = start_position & 63;
107107
// Invert the word for proper use of CountTrailingZeros and
108108
// clear bits so CountTrailingZeros can do it magic.
109-
word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
109+
word_ = ~word_ & ~bit_util::LeastSignificantBitMask<uint64_t>(start_bit_offset);
110110

111111
// Go forward until the next change from unset to set.
112112
int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
@@ -311,12 +311,12 @@ class BaseSetBitRunReader {
311311
memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
312312
// XXX MostSignificantBitmask
313313
return (bit_util::ToLittleEndian(word) << bit_offset) &
314-
~bit_util::LeastSignificantBitMask(64 - num_bits);
314+
~bit_util::LeastSignificantBitMask<uint64_t>(64 - num_bits);
315315
} else {
316316
memcpy(&word, bitmap_, num_bytes);
317317
bitmap_ += num_bytes;
318318
return (bit_util::ToLittleEndian(word) >> bit_offset) &
319-
bit_util::LeastSignificantBitMask(num_bits);
319+
bit_util::LeastSignificantBitMask<uint64_t>(num_bits);
320320
}
321321
}
322322

cpp/src/arrow/util/bit_stream_utils_internal.h

Lines changed: 12 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
#pragma once
2121

22-
#include <algorithm>
2322
#include <cstdint>
2423
#include <cstring>
2524
#include <type_traits>
@@ -249,110 +248,36 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
249248
return true;
250249
}
251250

252-
namespace detail {
253-
254-
template <typename T>
255-
inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
256-
int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
257-
#ifdef _MSC_VER
258-
# pragma warning(push)
259-
# pragma warning(disable : 4800)
260-
#endif
261-
*v = static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
262-
*bit_offset);
263-
#ifdef _MSC_VER
264-
# pragma warning(pop)
265-
#endif
266-
*bit_offset += num_bits;
267-
if (*bit_offset >= 64) {
268-
*byte_offset += 8;
269-
*bit_offset -= 64;
270-
271-
*buffered_values =
272-
detail::ReadLittleEndianWord(buffer + *byte_offset, max_bytes - *byte_offset);
273-
#ifdef _MSC_VER
274-
# pragma warning(push)
275-
# pragma warning(disable : 4800 4805)
276-
#endif
277-
// Read bits of v that crossed into new buffered_values_
278-
if (ARROW_PREDICT_TRUE(num_bits - *bit_offset < static_cast<int>(8 * sizeof(T)))) {
279-
// if shift exponent(num_bits - *bit_offset) is not less than sizeof(T), *v will not
280-
// change and the following code may cause a runtime error that the shift exponent
281-
// is too large
282-
*v = *v | static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset)
283-
<< (num_bits - *bit_offset));
284-
}
285-
#ifdef _MSC_VER
286-
# pragma warning(pop)
287-
#endif
288-
ARROW_DCHECK_LE(*bit_offset, 64);
289-
}
290-
}
291-
292-
} // namespace detail
293-
294251
template <typename T>
295252
inline bool BitReader::GetValue(int num_bits, T* v) {
296253
return GetBatch(num_bits, v, 1) == 1;
297254
}
298255

299-
namespace internal_bit_reader {
300-
template <typename T>
301-
struct unpack_detect {
302-
using type = std::make_unsigned_t<T>;
303-
};
304-
305-
template <>
306-
struct unpack_detect<bool> {
307-
using type = bool;
308-
};
309-
} // namespace internal_bit_reader
310-
311256
template <typename T>
312257
inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
313-
ARROW_DCHECK(buffer_ != NULL);
314-
ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
258+
constexpr uint64_t kBitsPerByte = 8;
315259

316-
int bit_offset = bit_offset_;
317-
int byte_offset = byte_offset_;
318-
uint64_t buffered_values = buffered_values_;
319-
int max_bytes = max_bytes_;
320-
const uint8_t* buffer = buffer_;
260+
ARROW_DCHECK(buffer_ != NULLPTR);
261+
ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
321262

322263
const int64_t needed_bits = num_bits * static_cast<int64_t>(batch_size);
323-
constexpr uint64_t kBitsPerByte = 8;
324264
const int64_t remaining_bits =
325-
static_cast<int64_t>(max_bytes - byte_offset) * kBitsPerByte - bit_offset;
265+
static_cast<int64_t>(max_bytes_ - byte_offset_) * kBitsPerByte - bit_offset_;
326266
if (remaining_bits < needed_bits) {
327267
batch_size = static_cast<int>(remaining_bits / num_bits);
328268
}
329269

330-
int i = 0;
331-
if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
332-
for (; i < batch_size && bit_offset != 0; ++i) {
333-
detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
334-
&buffered_values);
335-
}
336-
}
337-
338-
using unpack_t = typename internal_bit_reader::unpack_detect<T>::type;
339-
340-
int num_unpacked = ::arrow::internal::unpack(
341-
buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i), batch_size - i, num_bits);
342-
i += num_unpacked;
343-
byte_offset += num_unpacked * num_bits / 8;
344-
345-
buffered_values =
346-
detail::ReadLittleEndianWord(buffer + byte_offset, max_bytes - byte_offset);
270+
if constexpr (std::is_same_v<T, bool>) {
271+
::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
272+
bit_offset_);
347273

348-
for (; i < batch_size; ++i) {
349-
detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
350-
&buffered_values);
274+
} else {
275+
::arrow::internal::unpack(buffer_ + byte_offset_,
276+
reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
277+
num_bits, bit_offset_);
351278
}
352279

353-
bit_offset_ = bit_offset;
354-
byte_offset_ = byte_offset;
355-
buffered_values_ = buffered_values;
280+
Advance(batch_size * num_bits);
356281

357282
return batch_size;
358283
}

cpp/src/arrow/util/bit_util.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,16 @@ constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
113113
constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
114114

115115
// Returns a mask for the bit_index lower order bits.
116-
// Only valid for bit_index in the range [0, 64).
117-
constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
118-
return (static_cast<uint64_t>(1) << bit_index) - 1;
116+
// Valid in the range `[0, 8*sizof(Uint)]` if `kAllowUpperBound`
117+
// otherwise `[0, 8*sizof(Uint)[`
118+
template <typename Uint, bool kAllowUpperBound = false>
119+
constexpr auto LeastSignificantBitMask(Uint bit_index) {
120+
if constexpr (kAllowUpperBound) {
121+
if (bit_index == 8 * sizeof(Uint)) {
122+
return ~Uint{0};
123+
}
124+
}
125+
return (Uint{1} << bit_index) - Uint{1};
119126
}
120127

121128
// Returns 'value' rounded up to the nearest multiple of 'factor'

cpp/src/arrow/util/bitmap_reader.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ class BitmapUInt64Reader {
136136
memcpy(&word, bitmap_, num_bytes);
137137
bitmap_ += num_bytes;
138138
return (bit_util::ToLittleEndian(word) >> bit_offset) &
139-
bit_util::LeastSignificantBitMask(num_bits);
139+
bit_util::LeastSignificantBitMask<uint64_t>(num_bits);
140140
}
141141

142142
const uint8_t* bitmap_;

cpp/src/arrow/util/bpacking.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,19 +50,19 @@ struct UnpackDynamicFunction {
5050
} // namespace
5151

5252
template <typename Uint>
53-
int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
53+
void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
5454
#if defined(ARROW_HAVE_NEON)
55-
return unpack_neon(in, out, batch_size, num_bits);
55+
return unpack_neon(in, out, batch_size, num_bits, bit_offset);
5656
#else
5757
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
58-
return dispatch.func(in, out, batch_size, num_bits);
58+
return dispatch.func(in, out, batch_size, num_bits, bit_offset);
5959
#endif
6060
}
6161

62-
template int unpack<bool>(const uint8_t*, bool*, int, int);
63-
template int unpack<uint8_t>(const uint8_t*, uint8_t*, int, int);
64-
template int unpack<uint16_t>(const uint8_t*, uint16_t*, int, int);
65-
template int unpack<uint32_t>(const uint8_t*, uint32_t*, int, int);
66-
template int unpack<uint64_t>(const uint8_t*, uint64_t*, int, int);
62+
template void unpack<bool>(const uint8_t*, bool*, int, int, int);
63+
template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
64+
template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
65+
template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
66+
template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
6767

6868
} // namespace arrow::internal

cpp/src/arrow/util/bpacking_benchmark.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ namespace arrow::internal {
3333
namespace {
3434

3535
template <typename Int>
36-
using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
36+
using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
3737

3838
/// Get the number of bytes associate with a packing.
3939
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -89,7 +89,7 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
8989
std::vector<Int> unpacked(num_values, 0);
9090

9191
for (auto _ : state) {
92-
unpack(packed_ptr, unpacked.data(), num_values, bit_width);
92+
unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
9393
benchmark::ClobberMemory();
9494
}
9595
state.SetItemsProcessed(num_values * state.iterations());

0 commit comments

Comments
 (0)