Skip to content

Commit 7089786

Browse files
[Variant] Avoid collecting offset iterator (apache#7934)
# Which issue does this PR close? We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. - Closes apache#7901 . # Rationale for this change Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. # What changes are included in this PR? There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. # Are these changes tested? We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? # Are there any user-facing changes? If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out. --------- Signed-off-by: codephage2020 <[email protected]>
1 parent d809f19 commit 7089786

File tree

2 files changed

+72
-44
lines changed

2 files changed

+72
-44
lines changed

parquet-variant/src/variant/metadata.rs

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -234,44 +234,58 @@ impl<'m> VariantMetadata<'m> {
234234
self.header.first_offset_byte() as _..self.first_value_byte as _,
235235
)?;
236236

237-
let offsets =
238-
map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
239-
240237
// Verify the string values in the dictionary are UTF-8 encoded strings.
241238
let value_buffer =
242239
string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
243240

241+
let mut offsets_iter = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
242+
let mut current_offset = offsets_iter.next().unwrap_or(0);
243+
244244
if self.header.is_sorted {
245245
// Validate the dictionary values are unique and lexicographically sorted
246246
//
247247
// Since we use the offsets to access dictionary values, this also validates
248248
// offsets are in-bounds and monotonically increasing
249-
let are_dictionary_values_unique_and_sorted = (1..offsets.len())
250-
.map(|i| {
251-
let field_range = offsets[i - 1]..offsets[i];
252-
value_buffer.get(field_range)
253-
})
254-
.is_sorted_by(|a, b| match (a, b) {
255-
(Some(a), Some(b)) => a < b,
256-
_ => false,
257-
});
258-
259-
if !are_dictionary_values_unique_and_sorted {
260-
return Err(ArrowError::InvalidArgumentError(
261-
"dictionary values are not unique and ordered".to_string(),
262-
));
249+
let mut prev_value: Option<&str> = None;
250+
251+
for next_offset in offsets_iter {
252+
if next_offset <= current_offset {
253+
return Err(ArrowError::InvalidArgumentError(
254+
"offsets not monotonically increasing".to_string(),
255+
));
256+
}
257+
258+
let current_value =
259+
value_buffer
260+
.get(current_offset..next_offset)
261+
.ok_or_else(|| {
262+
ArrowError::InvalidArgumentError("offset out of bounds".to_string())
263+
})?;
264+
265+
if let Some(prev_val) = prev_value {
266+
if current_value <= prev_val {
267+
return Err(ArrowError::InvalidArgumentError(
268+
"dictionary values are not unique and ordered".to_string(),
269+
));
270+
}
271+
}
272+
273+
prev_value = Some(current_value);
274+
current_offset = next_offset;
263275
}
264276
} else {
265277
// Validate offsets are in-bounds and monotonically increasing
266278
//
267279
// Since shallow validation ensures the first and last offsets are in bounds,
268280
// we can also verify all offsets are in-bounds by checking if
269281
// offsets are monotonically increasing
270-
let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
271-
if !are_offsets_monotonic {
272-
return Err(ArrowError::InvalidArgumentError(
273-
"offsets not monotonically increasing".to_string(),
274-
));
282+
for next_offset in offsets_iter {
283+
if next_offset <= current_offset {
284+
return Err(ArrowError::InvalidArgumentError(
285+
"offsets not monotonically increasing".to_string(),
286+
));
287+
}
288+
current_offset = next_offset;
275289
}
276290
}
277291

parquet-variant/src/variant/object.rs

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -217,23 +217,31 @@ impl<'m, 'v> VariantObject<'m, 'v> {
217217
self.header.field_ids_start_byte() as _..self.first_field_offset_byte as _,
218218
)?;
219219

220-
let field_ids = map_bytes_to_offsets(field_id_buffer, self.header.field_id_size)
221-
.collect::<Vec<_>>();
222-
220+
let mut field_ids_iter =
221+
map_bytes_to_offsets(field_id_buffer, self.header.field_id_size);
223222
// Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted
224223
if self.metadata.is_sorted() {
225224
// Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names
226225
// are lexicographically sorted by their field id ordering
227-
if !field_ids.is_sorted() {
228-
return Err(ArrowError::InvalidArgumentError(
229-
"field names not sorted".to_string(),
230-
));
231-
}
226+
let dictionary_size = self.metadata.dictionary_size();
227+
228+
if let Some(mut current_id) = field_ids_iter.next() {
229+
for next_id in field_ids_iter {
230+
if current_id >= dictionary_size {
231+
return Err(ArrowError::InvalidArgumentError(
232+
"field id is not valid".to_string(),
233+
));
234+
}
235+
236+
if next_id <= current_id {
237+
return Err(ArrowError::InvalidArgumentError(
238+
"field names not sorted".to_string(),
239+
));
240+
}
241+
current_id = next_id;
242+
}
232243

233-
// Since field ids are sorted, if the last field is smaller than the dictionary size,
234-
// we also know all field ids are smaller than the dictionary size and in-bounds.
235-
if let Some(&last_field_id) = field_ids.last() {
236-
if last_field_id >= self.metadata.dictionary_size() {
244+
if current_id >= dictionary_size {
237245
return Err(ArrowError::InvalidArgumentError(
238246
"field id is not valid".to_string(),
239247
));
@@ -244,16 +252,22 @@ impl<'m, 'v> VariantObject<'m, 'v> {
244252
// to check lexicographical order
245253
//
246254
// Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds
247-
let are_field_names_sorted = field_ids
248-
.iter()
249-
.map(|&i| self.metadata.get(i))
250-
.collect::<Result<Vec<_>, _>>()?
251-
.is_sorted();
252-
253-
if !are_field_names_sorted {
254-
return Err(ArrowError::InvalidArgumentError(
255-
"field names not sorted".to_string(),
256-
));
255+
let mut current_field_name = match field_ids_iter.next() {
256+
Some(field_id) => Some(self.metadata.get(field_id)?),
257+
None => None,
258+
};
259+
260+
for field_id in field_ids_iter {
261+
let next_field_name = self.metadata.get(field_id)?;
262+
263+
if let Some(current_name) = current_field_name {
264+
if next_field_name <= current_name {
265+
return Err(ArrowError::InvalidArgumentError(
266+
"field names not sorted".to_string(),
267+
));
268+
}
269+
}
270+
current_field_name = Some(next_field_name);
257271
}
258272
}
259273

0 commit comments

Comments (0)