@@ -473,13 +473,89 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
473
473
/// Note: this function does not attempt to canonicalize / deduplicate values. For this
474
474
/// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
475
475
pub fn gc ( & self ) -> Self {
476
- let mut builder = GenericByteViewBuilder :: < T > :: with_capacity ( self . len ( ) ) ;
476
+ // 1) Read basic properties once
477
+ let len = self . len ( ) ; // number of elements
478
+ let nulls = self . nulls ( ) . cloned ( ) ; // reuse & clone existing null bitmap
479
+
480
+ // 1.5) Fast path: if there are no buffers, just reuse original views and no data blocks
481
+ if self . data_buffers ( ) . is_empty ( ) {
482
+ return unsafe {
483
+ GenericByteViewArray :: new_unchecked (
484
+ self . views ( ) . clone ( ) ,
485
+ vec ! [ ] , // empty data blocks
486
+ nulls,
487
+ )
488
+ } ;
489
+ }
477
490
478
- for v in self . iter ( ) {
479
- builder. append_option ( v) ;
491
+ // 2) Calculate total size of all non-inline data and detect if any exists
492
+ let total_large = self . total_buffer_bytes_used ( ) ;
493
+
494
+ // 2.5) Fast path: if there is no non-inline data, avoid buffer allocation & processing
495
+ if total_large == 0 {
496
+ // Views are inline-only or all null; just reuse original views and no data blocks
497
+ return unsafe {
498
+ GenericByteViewArray :: new_unchecked (
499
+ self . views ( ) . clone ( ) ,
500
+ vec ! [ ] , // empty data blocks
501
+ nulls,
502
+ )
503
+ } ;
480
504
}
481
505
482
- builder. finish ( )
506
+ // 3) Allocate exactly capacity for all non-inline data
507
+ let mut data_buf = Vec :: with_capacity ( total_large) ;
508
+
509
+ // 4) Iterate over views and process each inline/non-inline view
510
+ let views_buf: Vec < u128 > = ( 0 ..len)
511
+ . map ( |i| unsafe { self . copy_view_to_buffer ( i, & mut data_buf) } )
512
+ . collect ( ) ;
513
+
514
+ // 5) Wrap up buffers
515
+ let data_block = Buffer :: from_vec ( data_buf) ;
516
+ let views_scalar = ScalarBuffer :: from ( views_buf) ;
517
+ let data_blocks = vec ! [ data_block] ;
518
+
519
+ // SAFETY: views_scalar, data_blocks, and nulls are correctly aligned and sized
520
+ unsafe { GenericByteViewArray :: new_unchecked ( views_scalar, data_blocks, nulls) }
521
+ }
522
+
523
+ /// Copy the i‑th view into `data_buf` if it refers to an out‑of‑line buffer.
524
+ ///
525
+ /// # Safety
526
+ ///
527
+ /// - `i < self.len()`.
528
+ /// - Every element in `self.views()` must currently refer to a valid slice
529
+ /// inside one of `self.buffers`.
530
+ /// - `data_buf` must be ready to have additional bytes appended.
531
+ /// - After this call, the returned view will have its
532
+ /// `buffer_index` reset to `0` and its `offset` updated so that it points
533
+ /// into the bytes just appended at the end of `data_buf`.
534
+ #[ inline( always) ]
535
+ unsafe fn copy_view_to_buffer ( & self , i : usize , data_buf : & mut Vec < u8 > ) -> u128 {
536
+ // SAFETY: `i < self.len()` ensures this is in‑bounds.
537
+ let raw_view = * self . views ( ) . get_unchecked ( i) ;
538
+ let mut bv = ByteView :: from ( raw_view) ;
539
+
540
+ // Inline‑small views stay as‑is.
541
+ if bv. length <= MAX_INLINE_VIEW_LEN {
542
+ raw_view
543
+ } else {
544
+ // SAFETY: `bv.buffer_index` and `bv.offset..bv.offset+bv.length`
545
+ // must both lie within valid ranges for `self.buffers`.
546
+ let buffer = self . buffers . get_unchecked ( bv. buffer_index as usize ) ;
547
+ let start = bv. offset as usize ;
548
+ let end = start + bv. length as usize ;
549
+ let slice = buffer. get_unchecked ( start..end) ;
550
+
551
+ // Copy out‑of‑line data into our single “0” buffer.
552
+ let new_offset = data_buf. len ( ) as u32 ;
553
+ data_buf. extend_from_slice ( slice) ;
554
+
555
+ bv. buffer_index = 0 ;
556
+ bv. offset = new_offset;
557
+ bv. into ( )
558
+ }
483
559
}
484
560
485
561
/// Returns the total number of bytes used by all non inlined views in all
@@ -998,7 +1074,11 @@ mod tests {
998
1074
Array , BinaryViewArray , GenericBinaryArray , GenericByteViewArray , StringViewArray ,
999
1075
} ;
1000
1076
use arrow_buffer:: { Buffer , ScalarBuffer } ;
1001
- use arrow_data:: ByteView ;
1077
+ use arrow_data:: { ByteView , MAX_INLINE_VIEW_LEN } ;
1078
+ use rand:: prelude:: StdRng ;
1079
+ use rand:: { Rng , SeedableRng } ;
1080
+
1081
+ const BLOCK_SIZE : u32 = 8 ;
1002
1082
1003
1083
#[ test]
1004
1084
fn try_new_string ( ) {
@@ -1188,6 +1268,130 @@ mod tests {
1188
1268
check_gc ( & array. slice ( 3 , 1 ) ) ;
1189
1269
}
1190
1270
1271
+ /// 1) Empty array: no elements, expect gc to return empty with no data buffers
1272
+ #[ test]
1273
+ fn test_gc_empty_array ( ) {
1274
+ let array = StringViewBuilder :: new ( )
1275
+ . with_fixed_block_size ( BLOCK_SIZE )
1276
+ . finish ( ) ;
1277
+ let gced = array. gc ( ) ;
1278
+ // length and null count remain zero
1279
+ assert_eq ! ( gced. len( ) , 0 ) ;
1280
+ assert_eq ! ( gced. null_count( ) , 0 ) ;
1281
+ // no underlying data buffers should be allocated
1282
+ assert ! (
1283
+ gced. data_buffers( ) . is_empty( ) ,
1284
+ "Expected no data buffers for empty array"
1285
+ ) ;
1286
+ }
1287
+
1288
+ /// 2) All inline values (<= INLINE_LEN): capacity-only data buffer, same values
1289
+ #[ test]
1290
+ fn test_gc_all_inline ( ) {
1291
+ let mut builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( BLOCK_SIZE ) ;
1292
+ // append many short strings, each exactly INLINE_LEN long
1293
+ for _ in 0 ..100 {
1294
+ let s = "A" . repeat ( MAX_INLINE_VIEW_LEN as usize ) ;
1295
+ builder. append_option ( Some ( & s) ) ;
1296
+ }
1297
+ let array = builder. finish ( ) ;
1298
+ let gced = array. gc ( ) ;
1299
+ // Since all views fit inline, data buffer is empty
1300
+ assert_eq ! (
1301
+ gced. data_buffers( ) . len( ) ,
1302
+ 0 ,
1303
+ "Should have no data buffers for inline values"
1304
+ ) ;
1305
+ assert_eq ! ( gced. len( ) , 100 ) ;
1306
+ // verify element-wise equality
1307
+ array. iter ( ) . zip ( gced. iter ( ) ) . for_each ( |( orig, got) | {
1308
+ assert_eq ! ( orig, got, "Inline value mismatch after gc" ) ;
1309
+ } ) ;
1310
+ }
1311
+
1312
+ /// 3) All large values (> INLINE_LEN): each must be copied into the new data buffer
1313
+ #[ test]
1314
+ fn test_gc_all_large ( ) {
1315
+ let mut builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( BLOCK_SIZE ) ;
1316
+ let large_str = "X" . repeat ( MAX_INLINE_VIEW_LEN as usize + 5 ) ;
1317
+ // append multiple large strings
1318
+ for _ in 0 ..50 {
1319
+ builder. append_option ( Some ( & large_str) ) ;
1320
+ }
1321
+ let array = builder. finish ( ) ;
1322
+ let gced = array. gc ( ) ;
1323
+ // New data buffers should be populated (one or more blocks)
1324
+ assert ! (
1325
+ !gced. data_buffers( ) . is_empty( ) ,
1326
+ "Expected data buffers for large values"
1327
+ ) ;
1328
+ assert_eq ! ( gced. len( ) , 50 ) ;
1329
+ // verify that every large string emerges unchanged
1330
+ array. iter ( ) . zip ( gced. iter ( ) ) . for_each ( |( orig, got) | {
1331
+ assert_eq ! ( orig, got, "Large view mismatch after gc" ) ;
1332
+ } ) ;
1333
+ }
1334
+
1335
+ /// 4) All null elements: ensure null bitmap handling path is correct
1336
+ #[ test]
1337
+ fn test_gc_all_nulls ( ) {
1338
+ let mut builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( BLOCK_SIZE ) ;
1339
+ for _ in 0 ..20 {
1340
+ builder. append_null ( ) ;
1341
+ }
1342
+ let array = builder. finish ( ) ;
1343
+ let gced = array. gc ( ) ;
1344
+ // length and null count match
1345
+ assert_eq ! ( gced. len( ) , 20 ) ;
1346
+ assert_eq ! ( gced. null_count( ) , 20 ) ;
1347
+ // data buffers remain empty for null-only array
1348
+ assert ! (
1349
+ gced. data_buffers( ) . is_empty( ) ,
1350
+ "No data should be stored for nulls"
1351
+ ) ;
1352
+ }
1353
+
1354
+ /// 5) Random mix of inline, large, and null values with slicing tests
1355
+ #[ test]
1356
+ fn test_gc_random_mixed_and_slices ( ) {
1357
+ let mut rng = StdRng :: seed_from_u64 ( 42 ) ;
1358
+ let mut builder = StringViewBuilder :: new ( ) . with_fixed_block_size ( BLOCK_SIZE ) ;
1359
+ // Keep a Vec of original Option<String> for later comparison
1360
+ let mut original: Vec < Option < String > > = Vec :: new ( ) ;
1361
+
1362
+ for _ in 0 ..200 {
1363
+ if rng. random_bool ( 0.1 ) {
1364
+ // 10% nulls
1365
+ builder. append_null ( ) ;
1366
+ original. push ( None ) ;
1367
+ } else {
1368
+ // random length between 0 and twice the inline limit
1369
+ let len = rng. random_range ( 0 ..( MAX_INLINE_VIEW_LEN * 2 ) ) ;
1370
+ let s: String = "A" . repeat ( len as usize ) ;
1371
+ builder. append_option ( Some ( & s) ) ;
1372
+ original. push ( Some ( s) ) ;
1373
+ }
1374
+ }
1375
+
1376
+ let array = builder. finish ( ) ;
1377
+ // Test multiple slice ranges to ensure offset logic is correct
1378
+ for ( offset, slice_len) in & [ ( 0 , 50 ) , ( 10 , 100 ) , ( 150 , 30 ) ] {
1379
+ let sliced = array. slice ( * offset, * slice_len) ;
1380
+ let gced = sliced. gc ( ) ;
1381
+ // Build expected slice of Option<&str>
1382
+ let expected: Vec < Option < & str > > = original[ * offset..( * offset + * slice_len) ]
1383
+ . iter ( )
1384
+ . map ( |opt| opt. as_deref ( ) )
1385
+ . collect ( ) ;
1386
+
1387
+ assert_eq ! ( gced. len( ) , * slice_len, "Slice length mismatch" ) ;
1388
+ // Compare element-wise
1389
+ gced. iter ( ) . zip ( expected. iter ( ) ) . for_each ( |( got, expect) | {
1390
+ assert_eq ! ( got, * expect, "Value mismatch in mixed slice after gc" ) ;
1391
+ } ) ;
1392
+ }
1393
+ }
1394
+
1191
1395
#[ test]
1192
1396
fn test_eq ( ) {
1193
1397
let test_data = [
0 commit comments