Skip to content

Commit 7b219f9

Browse files
zhuqi-lucasalamb
andauthored
perf: speed up StringViewArray gc 1.4 ~5.x faster (#7873)
# Which issue does this PR close? Improve the StringViewArray gc performance # Rationale for this change Improve the StringViewArray gc performance 1. Such as precompute the len and reserve 2. Split function for inlined and not inlined 3. Remove builder and construct ourself # What changes are included in this PR? Improve the StringViewArray gc performance 1. Such as precompute the len and reserve 2. Split function for inlined and not inlined 3. Remove builder and construct ourself # Are these changes tested? Yes # Are there any user-facing changes? No --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 6d11232 commit 7b219f9

File tree

1 file changed

+209
-5
lines changed

1 file changed

+209
-5
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 209 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -473,13 +473,89 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
473473
/// Note: this function does not attempt to canonicalize / deduplicate values. For this
474474
/// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
475475
pub fn gc(&self) -> Self {
476-
let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
476+
// 1) Read basic properties once
477+
let len = self.len(); // number of elements
478+
let nulls = self.nulls().cloned(); // reuse & clone existing null bitmap
479+
480+
// 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
481+
if self.data_buffers().is_empty() {
482+
return unsafe {
483+
GenericByteViewArray::new_unchecked(
484+
self.views().clone(),
485+
vec![], // empty data blocks
486+
nulls,
487+
)
488+
};
489+
}
477490

478-
for v in self.iter() {
479-
builder.append_option(v);
491+
// 2) Calculate total size of all non-inline data and detect if any exists
492+
let total_large = self.total_buffer_bytes_used();
493+
494+
// 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
495+
if total_large == 0 {
496+
// Views are inline-only or all null; just reuse original views and no data blocks
497+
return unsafe {
498+
GenericByteViewArray::new_unchecked(
499+
self.views().clone(),
500+
vec![], // empty data blocks
501+
nulls,
502+
)
503+
};
480504
}
481505

482-
builder.finish()
506+
// 3) Allocate exactly capacity for all non-inline data
507+
let mut data_buf = Vec::with_capacity(total_large);
508+
509+
// 4) Iterate over views and process each inline/non-inline view
510+
let views_buf: Vec<u128> = (0..len)
511+
.map(|i| unsafe { self.copy_view_to_buffer(i, &mut data_buf) })
512+
.collect();
513+
514+
// 5) Wrap up buffers
515+
let data_block = Buffer::from_vec(data_buf);
516+
let views_scalar = ScalarBuffer::from(views_buf);
517+
let data_blocks = vec![data_block];
518+
519+
// SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
520+
unsafe { GenericByteViewArray::new_unchecked(views_scalar, data_blocks, nulls) }
521+
}
522+
523+
/// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
524+
///
525+
/// # Safety
526+
///
527+
/// - `i < self.len()`.
528+
/// - Every element in `self.views()` must currently refer to a valid slice
529+
/// inside one of `self.buffers`.
530+
/// - `data_buf` must be ready to have additional bytes appended.
531+
/// - After this call, the returned view will have its
532+
/// `buffer_index` reset to `0` and its `offset` updated so that it points
533+
/// into the bytes just appended at the end of `data_buf`.
534+
#[inline(always)]
535+
unsafe fn copy_view_to_buffer(&self, i: usize, data_buf: &mut Vec<u8>) -> u128 {
536+
// SAFETY: `i < self.len()` ensures this is in‑bounds.
537+
let raw_view = *self.views().get_unchecked(i);
538+
let mut bv = ByteView::from(raw_view);
539+
540+
// Inline‑small views stay as‑is.
541+
if bv.length <= MAX_INLINE_VIEW_LEN {
542+
raw_view
543+
} else {
544+
// SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
545+
// must both lie within valid ranges for `self.buffers`.
546+
let buffer = self.buffers.get_unchecked(bv.buffer_index as usize);
547+
let start = bv.offset as usize;
548+
let end = start + bv.length as usize;
549+
let slice = buffer.get_unchecked(start..end);
550+
551+
// Copy out‑of‑line data into our single “0” buffer.
552+
let new_offset = data_buf.len() as u32;
553+
data_buf.extend_from_slice(slice);
554+
555+
bv.buffer_index = 0;
556+
bv.offset = new_offset;
557+
bv.into()
558+
}
483559
}
484560

485561
/// Returns the total number of bytes used by all non inlined views in all
@@ -998,7 +1074,11 @@ mod tests {
9981074
Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
9991075
};
10001076
use arrow_buffer::{Buffer, ScalarBuffer};
1001-
use arrow_data::ByteView;
1077+
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
1078+
use rand::prelude::StdRng;
1079+
use rand::{Rng, SeedableRng};
1080+
1081+
const BLOCK_SIZE: u32 = 8;
10021082

10031083
#[test]
10041084
fn try_new_string() {
@@ -1188,6 +1268,130 @@ mod tests {
11881268
check_gc(&array.slice(3, 1));
11891269
}
11901270

1271+
/// 1) Empty array: no elements, expect gc to return empty with no data buffers
1272+
#[test]
1273+
fn test_gc_empty_array() {
1274+
let array = StringViewBuilder::new()
1275+
.with_fixed_block_size(BLOCK_SIZE)
1276+
.finish();
1277+
let gced = array.gc();
1278+
// length and null count remain zero
1279+
assert_eq!(gced.len(), 0);
1280+
assert_eq!(gced.null_count(), 0);
1281+
// no underlying data buffers should be allocated
1282+
assert!(
1283+
gced.data_buffers().is_empty(),
1284+
"Expected no data buffers for empty array"
1285+
);
1286+
}
1287+
1288+
/// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1289+
#[test]
1290+
fn test_gc_all_inline() {
1291+
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1292+
// append many short strings, each exactly INLINE_LEN long
1293+
for _ in 0..100 {
1294+
let s = "A".repeat(MAX_INLINE_VIEW_LEN as usize);
1295+
builder.append_option(Some(&s));
1296+
}
1297+
let array = builder.finish();
1298+
let gced = array.gc();
1299+
// Since all views fit inline, data buffer is empty
1300+
assert_eq!(
1301+
gced.data_buffers().len(),
1302+
0,
1303+
"Should have no data buffers for inline values"
1304+
);
1305+
assert_eq!(gced.len(), 100);
1306+
// verify element-wise equality
1307+
array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1308+
assert_eq!(orig, got, "Inline value mismatch after gc");
1309+
});
1310+
}
1311+
1312+
/// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1313+
#[test]
1314+
fn test_gc_all_large() {
1315+
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1316+
let large_str = "X".repeat(MAX_INLINE_VIEW_LEN as usize + 5);
1317+
// append multiple large strings
1318+
for _ in 0..50 {
1319+
builder.append_option(Some(&large_str));
1320+
}
1321+
let array = builder.finish();
1322+
let gced = array.gc();
1323+
// New data buffers should be populated (one or more blocks)
1324+
assert!(
1325+
!gced.data_buffers().is_empty(),
1326+
"Expected data buffers for large values"
1327+
);
1328+
assert_eq!(gced.len(), 50);
1329+
// verify that every large string emerges unchanged
1330+
array.iter().zip(gced.iter()).for_each(|(orig, got)| {
1331+
assert_eq!(orig, got, "Large view mismatch after gc");
1332+
});
1333+
}
1334+
1335+
/// 4) All null elements: ensure null bitmap handling path is correct
1336+
#[test]
1337+
fn test_gc_all_nulls() {
1338+
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1339+
for _ in 0..20 {
1340+
builder.append_null();
1341+
}
1342+
let array = builder.finish();
1343+
let gced = array.gc();
1344+
// length and null count match
1345+
assert_eq!(gced.len(), 20);
1346+
assert_eq!(gced.null_count(), 20);
1347+
// data buffers remain empty for null-only array
1348+
assert!(
1349+
gced.data_buffers().is_empty(),
1350+
"No data should be stored for nulls"
1351+
);
1352+
}
1353+
1354+
/// 5) Random mix of inline, large, and null values with slicing tests
1355+
#[test]
1356+
fn test_gc_random_mixed_and_slices() {
1357+
let mut rng = StdRng::seed_from_u64(42);
1358+
let mut builder = StringViewBuilder::new().with_fixed_block_size(BLOCK_SIZE);
1359+
// Keep a Vec of original Option<String> for later comparison
1360+
let mut original: Vec<Option<String>> = Vec::new();
1361+
1362+
for _ in 0..200 {
1363+
if rng.random_bool(0.1) {
1364+
// 10% nulls
1365+
builder.append_null();
1366+
original.push(None);
1367+
} else {
1368+
// random length between 0 and twice the inline limit
1369+
let len = rng.random_range(0..(MAX_INLINE_VIEW_LEN * 2));
1370+
let s: String = "A".repeat(len as usize);
1371+
builder.append_option(Some(&s));
1372+
original.push(Some(s));
1373+
}
1374+
}
1375+
1376+
let array = builder.finish();
1377+
// Test multiple slice ranges to ensure offset logic is correct
1378+
for (offset, slice_len) in &[(0, 50), (10, 100), (150, 30)] {
1379+
let sliced = array.slice(*offset, *slice_len);
1380+
let gced = sliced.gc();
1381+
// Build expected slice of Option<&str>
1382+
let expected: Vec<Option<&str>> = original[*offset..(*offset + *slice_len)]
1383+
.iter()
1384+
.map(|opt| opt.as_deref())
1385+
.collect();
1386+
1387+
assert_eq!(gced.len(), *slice_len, "Slice length mismatch");
1388+
// Compare element-wise
1389+
gced.iter().zip(expected.iter()).for_each(|(got, expect)| {
1390+
assert_eq!(got, *expect, "Value mismatch in mixed slice after gc");
1391+
});
1392+
}
1393+
}
1394+
11911395
#[test]
11921396
fn test_eq() {
11931397
let test_data = [

0 commit comments

Comments
 (0)