Skip to content

Commit a9707ae

Browse files
authored
Allow file_offset and total_compressed_size thrift fields in RowGroupMetaData
These are from the newer Parquet thrift version that was added to the current Arrow parquet implementation (in this branch).
1 parent b6c25a9 commit a9707ae

File tree

1 file changed

+8
-15
lines changed

1 file changed

+8
-15
lines changed

parquet/src/file/metadata.rs

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,9 @@ pub struct RowGroupMetaData {
217217
num_rows: i64,
218218
total_byte_size: i64,
219219
schema_descr: SchemaDescPtr,
220+
/// Cube: We'll roundtrip file_offset from thrift, but we always originate it as None. It was
221+
/// not present in the older Parquet RowGroupMetaData definition.
222+
file_offset: Option<i64>,
220223
/// Ordinal position of this row group in file
221224
ordinal: Option<i16>,
222225
}
@@ -280,37 +283,25 @@ impl RowGroupMetaData {
280283
let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?;
281284
columns.push(cc);
282285
}
283-
// Notably, the function to_thrift, below, doesn't write these fields, and RowGroupMetadata doesn't have them.
284-
if rg.file_offset.is_some() {
285-
return Err(ParquetError::NYI(
286-
"Parsing RowGroup file_offset fields is not yet implemented".to_string(),
287-
));
288-
}
289-
if rg.total_compressed_size.is_some() {
290-
return Err(ParquetError::NYI(
291-
"Parsing RowGroup total_compressed_size fields is not yet implemented"
292-
.to_string(),
293-
));
294-
}
295286
Ok(RowGroupMetaData {
296287
columns,
297288
num_rows,
298289
total_byte_size,
290+
file_offset: rg.file_offset,
299291
schema_descr,
300292
ordinal: rg.ordinal,
301293
})
302294
}
303295

304296
/// Method to convert to Thrift.
305297
pub fn to_thrift(&self) -> RowGroup {
306-
// TODO: Understand file_offset and total_compressed_size fields.
307298
RowGroup {
308299
columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
309300
total_byte_size: self.total_byte_size,
310301
num_rows: self.num_rows,
311302
sorting_columns: None,
312-
file_offset: None,
313-
total_compressed_size: None,
303+
file_offset: self.file_offset,
304+
total_compressed_size: Some(self.compressed_size()),
314305
ordinal: self.ordinal,
315306
}
316307
}
@@ -375,6 +366,8 @@ impl RowGroupMetaDataBuilder {
375366
columns: self.columns,
376367
num_rows: self.num_rows,
377368
total_byte_size: self.total_byte_size,
369+
// Cube: This is where we originate the None value for the optional file_offset field.
370+
file_offset: None,
378371
schema_descr: self.schema_descr,
379372
ordinal: self.ordinal,
380373
})

0 commit comments

Comments
 (0)