@@ -23,14 +23,12 @@ use std::mem::{size_of, size_of_val, take};
23
23
use std:: sync:: Arc ;
24
24
25
25
use arrow:: array:: {
26
- make_array, new_empty_array, Array , ArrayRef , AsArray , BooleanArray , ListArray ,
27
- StructArray ,
26
+ new_empty_array, Array , ArrayRef , AsArray , BooleanArray , ListArray , StructArray ,
28
27
} ;
29
28
use arrow:: compute:: { filter, SortOptions } ;
30
29
use arrow:: datatypes:: { DataType , Field , FieldRef , Fields } ;
31
30
32
31
use datafusion_common:: cast:: as_list_array;
33
- use datafusion_common:: scalar:: copy_array_data;
34
32
use datafusion_common:: utils:: {
35
33
compare_rows, get_row_at_idx, take_function_args, SingleRowListArrayBuilder ,
36
34
} ;
@@ -335,11 +333,7 @@ impl Accumulator for ArrayAggAccumulator {
335
333
} ;
336
334
337
335
if !val. is_empty ( ) {
338
- // The ArrayRef might be holding a reference to its original input buffer, so
339
- // storing it here directly copied/compacted avoids over accounting memory
340
- // not used here.
341
- self . values
342
- . push ( make_array ( copy_array_data ( & val. to_data ( ) ) ) ) ;
336
+ self . values . push ( val)
343
337
}
344
338
345
339
Ok ( ( ) )
@@ -398,7 +392,18 @@ impl Accumulator for ArrayAggAccumulator {
398
392
+ self
399
393
. values
400
394
. iter ( )
401
- . map ( |arr| arr. get_array_memory_size ( ) )
395
+ // Each ArrayRef might be just a reference to a bigger array, and many
396
+ // ArrayRefs here might be referencing exactly the same array, so if we
397
+ // were to call `arr.get_array_memory_size()`, we would be double-counting
398
+ // the same underlying data many times.
399
+ //
400
+ // Instead, we do an approximation by estimating how much memory each
401
+ // ArrayRef would occupy if its underlying data was fully owned by this
402
+ // accumulator.
403
+ //
404
+ // Note that this is just an estimation, but the reality is that this
405
+ // accumulator might not own any data.
406
+ . map ( |arr| arr. to_data ( ) . get_slice_memory_size ( ) . unwrap_or_default ( ) )
402
407
. sum :: < usize > ( )
403
408
+ self . datatype . size ( )
404
409
- size_of_val ( & self . datatype )
@@ -1064,8 +1069,7 @@ mod tests {
1064
1069
acc2. update_batch ( & [ data ( [ "b" , "c" , "a" ] ) ] ) ?;
1065
1070
acc1 = merge ( acc1, acc2) ?;
1066
1071
1067
- // without compaction, the size is 2652.
1068
- assert_eq ! ( acc1. size( ) , 732 ) ;
1072
+ assert_eq ! ( acc1. size( ) , 266 ) ;
1069
1073
1070
1074
Ok ( ( ) )
1071
1075
}
0 commit comments