Skip to content
Merged
65 changes: 64 additions & 1 deletion arrow-row/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3026,13 +3026,16 @@ mod tests {
}
}

// Convert rows produced from convert_columns().
// Note: validate_utf8 is set to false since Row is initialized through empty_rows()
let back = converter.convert_rows(&rows).unwrap();
for (actual, expected) in back.iter().zip(&arrays) {
actual.to_data().validate_full().unwrap();
dictionary_eq(actual, expected)
}

// Check that we can convert
// Check that we can convert rows into ByteArray and then parse, convert it back to array
// Note: validate_utf8 is set to true since Row is initialized through RowParser
let rows = rows.try_into_binary().expect("reasonable size");
let parser = converter.parser();
let back = converter
Expand Down Expand Up @@ -3163,4 +3166,64 @@ mod tests {
Ok(_) => panic!("Expected NotYetImplemented error for map data type"),
}
}

#[test]
fn test_values_buffer_smaller_when_utf8_validation_disabled() {
fn get_values_buffer_len(col: ArrayRef) -> (usize, usize) {
// 1. Convert cols into rows
let converter = RowConverter::new(vec![SortField::new(DataType::Utf8View)]).unwrap();

// 2a. Convert rows into colsa (validate_utf8 = false)
let rows = converter.convert_columns(&[col]).unwrap();
let converted = converter.convert_rows(&rows).unwrap();
let unchecked_values_len = converted[0].as_string_view().data_buffers()[0].len();

// 2b. Convert rows into cols (validate_utf8 = true since Row is initialized through RowParser)
let rows = rows.try_into_binary().expect("reasonable size");
let parser = converter.parser();
let converted = converter
.convert_rows(rows.iter().map(|b| parser.parse(b.expect("valid bytes"))))
.unwrap();
let checked_values_len = converted[0].as_string_view().data_buffers()[0].len();
(unchecked_values_len, checked_values_len)
}

// Case1. StringViewArray with inline strings
let col = Arc::new(StringViewArray::from_iter([
Some("hello"), // short(5)
None, // null
Some("short"), // short(5)
Some("tiny"), // short(4)
])) as ArrayRef;

let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
// Since there are no long (>12) strings, len of values buffer is 0
assert_eq!(unchecked_values_len, 0);
// When utf8 validation enabled, values buffer includes inline strings (5+5+4)
assert_eq!(checked_values_len, 14);

// Case2. StringViewArray with long(>12) strings
let col = Arc::new(StringViewArray::from_iter([
Some("this is a very long string over 12 bytes"),
Some("another long string to test the buffer"),
])) as ArrayRef;

let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
// Since there are no inline strings, expected length of values buffer is the same
assert!(unchecked_values_len > 0);
assert_eq!(unchecked_values_len, checked_values_len);

// Case3. StringViewArray with both short and long strings
let col = Arc::new(StringViewArray::from_iter([
Some("tiny"), // 4 (short)
Some("thisisexact13"), // 13 (long)
None,
Some("short"), // 5 (short)
])) as ArrayRef;

let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col);
// Since there is single long string, len of values buffer is 13
assert_eq!(unchecked_values_len, 13);
assert!(checked_values_len > unchecked_values_len);
}
}
32 changes: 27 additions & 5 deletions arrow-row/src/variable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use arrow_array::builder::BufferBuilder;
use arrow_array::*;
use arrow_buffer::bit_util::ceil;
use arrow_buffer::MutableBuffer;
use arrow_data::ArrayDataBuilder;
use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
use arrow_schema::{DataType, SortOptions};
use builder::make_view;

Expand Down Expand Up @@ -249,9 +249,10 @@ pub fn decode_binary<I: OffsetSizeTrait>(
fn decode_binary_view_inner(
rows: &mut [&[u8]],
options: SortOptions,
check_utf8: bool,
validate_utf8: bool,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to keep the naming consistent with

pub unsafe fn decode_string_view(
rows: &mut [&[u8]],
options: SortOptions,
validate_utf8: bool,

) -> BinaryViewArray {
let len = rows.len();
let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize;

let mut null_count = 0;

Expand All @@ -261,13 +262,29 @@ fn decode_binary_view_inner(
valid
});

let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
let values_capacity = if validate_utf8 {
// Capacity for all long and short strings
rows.iter().map(|row| decoded_len(row, options)).sum()
} else {
// Capacity for all long strings plus room for one short string
rows.iter().fold(0, |acc, row| {
let len = decoded_len(row, options);
if len > inline_str_max_len {
acc + len
} else {
acc
}
}) + inline_str_max_len
};
let mut values = MutableBuffer::new(values_capacity);
let mut views = BufferBuilder::<u128>::new(len);

let mut views = BufferBuilder::<u128>::new(len);
for row in rows {
let start_offset = values.len();
let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
// Measure string length via change in values buffer.
// Used to check if decoded value should be truncated (short string) when validate_utf8 is false
let decoded_len = values.len() - start_offset;
if row[0] == null_sentinel(options) {
debug_assert_eq!(offset, 1);
debug_assert_eq!(start_offset, values.len());
Expand All @@ -282,11 +299,16 @@ fn decode_binary_view_inner(

let view = make_view(val, 0, start_offset as u32);
views.append(view);

// truncate inline string in values buffer if validate_utf8 is false
if !validate_utf8 && decoded_len <= inline_str_max_len {
values.truncate(start_offset);
}
}
*row = &row[offset..];
}

if check_utf8 {
if validate_utf8 {
// the values contains all data, no matter if it is short or long
// we can validate utf8 in one go.
std::str::from_utf8(values.as_slice()).unwrap();
Expand Down
7 changes: 7 additions & 0 deletions arrow/benches/row_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use arrow::row::{RowConverter, SortField};
use arrow::util::bench_util::{
create_boolean_array, create_dict_from_values, create_primitive_array,
create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len,
create_string_view_array_with_max_len,
};
use arrow_array::types::Int32Type;
use arrow_array::Array;
Expand Down Expand Up @@ -125,6 +126,12 @@ fn row_bench(c: &mut Criterion) {
let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
do_bench(c, "4096 string view(100, 0.5)", cols);

let cols = vec![Arc::new(create_string_view_array_with_max_len(4096, 0., 100)) as ArrayRef];
do_bench(c, "4096 string view(1..100, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_max_len(4096, 0.5, 100)) as ArrayRef];
do_bench(c, "4096 string view(1..100, 0.5)", cols);

let cols = vec![Arc::new(create_string_dict_array::<Int32Type>(4096, 0., 10)) as ArrayRef];
do_bench(c, "4096 string_dictionary(10, 0)", cols);

Expand Down
Loading