-
Notifications
You must be signed in to change notification settings - Fork 1k
[Parquet] Account for FileDecryptor in ParquetMetaData heap size calculation #8671
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
a6f033e
ec03919
f58565e
a53eb7b
0c8ab49
955ad16
4ce0a66
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor, TAG_LEN} | |
| use crate::encryption::modules::{ModuleType, create_footer_aad, create_module_aad}; | ||
| use crate::errors::{ParquetError, Result}; | ||
| use crate::file::column_crypto_metadata::ColumnCryptoMetaData; | ||
| use crate::file::metadata::HeapSize; | ||
| use std::borrow::Cow; | ||
| use std::collections::HashMap; | ||
| use std::fmt::Formatter; | ||
|
|
@@ -271,6 +272,12 @@ struct ExplicitDecryptionKeys { | |
| column_keys: HashMap<String, Vec<u8>>, | ||
| } | ||
|
|
||
| impl HeapSize for ExplicitDecryptionKeys { | ||
| fn heap_size(&self) -> usize { | ||
| self.footer_key.heap_size() + self.column_keys.heap_size() | ||
| } | ||
| } | ||
|
|
||
| #[derive(Clone)] | ||
| enum DecryptionKeys { | ||
| Explicit(ExplicitDecryptionKeys), | ||
|
|
@@ -290,6 +297,19 @@ impl PartialEq for DecryptionKeys { | |
| } | ||
| } | ||
|
|
||
| impl HeapSize for DecryptionKeys { | ||
| fn heap_size(&self) -> usize { | ||
| match self { | ||
| Self::Explicit(keys) => keys.heap_size(), | ||
| Self::ViaRetriever(_) => { | ||
| // The retriever is a user-defined type we don't control, | ||
| // so we can't determine the heap size. | ||
|
Comment on lines
+305
to
+306
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed in #8472, we could potentially add a new trait method to allow a key retriever to provide a heap size later. |
||
| 0 | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// `FileDecryptionProperties` hold keys and AAD data required to decrypt a Parquet file. | ||
| /// | ||
| /// When reading Arrow data, the `FileDecryptionProperties` should be included in the | ||
|
|
@@ -334,6 +354,11 @@ pub struct FileDecryptionProperties { | |
| footer_signature_verification: bool, | ||
| } | ||
|
|
||
| impl HeapSize for FileDecryptionProperties { | ||
| fn heap_size(&self) -> usize { | ||
| self.keys.heap_size() + self.aad_prefix.heap_size() | ||
| } | ||
| } | ||
| impl FileDecryptionProperties { | ||
| /// Returns a new [`FileDecryptionProperties`] builder that will use the provided key to | ||
| /// decrypt footer metadata. | ||
|
|
@@ -547,6 +572,21 @@ impl PartialEq for FileDecryptor { | |
| } | ||
| } | ||
|
|
||
| /// Estimate the size in bytes required for the file decryptor. | ||
| /// This is important to track the memory usage of cached Parquet meta data, | ||
| /// and is used via [`crate::file::metadata::ParquetMetaData::memory_size`]. | ||
| /// Note that when a [`KeyRetriever`] is used, its heap size won't be included | ||
| /// and the result will be an underestimate. | ||
| /// If the [`FileDecryptionProperties`] are shared between multiple files then the | ||
| /// heap size may also be an overestimate. | ||
|
Comment on lines
+575
to
+581
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
| impl HeapSize for FileDecryptor { | ||
| fn heap_size(&self) -> usize { | ||
| self.decryption_properties.heap_size() | ||
| + (Arc::clone(&self.footer_decryptor) as Arc<dyn HeapSize>).heap_size() | ||
| + self.file_aad.heap_size() | ||
| } | ||
| } | ||
|
|
||
| impl FileDecryptor { | ||
| pub(crate) fn new( | ||
| decryption_properties: &Arc<FileDecryptionProperties>, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -287,11 +287,17 @@ impl ParquetMetaData { | |
| /// | ||
| /// 4. Does not include any allocator overheads | ||
| pub fn memory_size(&self) -> usize { | ||
| #[cfg(feature = "encryption")] | ||
| let encryption_size = self.file_decryptor.heap_size(); | ||
| #[cfg(not(feature = "encryption"))] | ||
| let encryption_size = 0usize; | ||
|
|
||
| std::mem::size_of::<Self>() | ||
| + self.file_metadata.heap_size() | ||
| + self.row_groups.heap_size() | ||
| + self.column_index.heap_size() | ||
| + self.offset_index.heap_size() | ||
| + encryption_size | ||
| } | ||
|
|
||
| /// Override the column index | ||
|
|
@@ -1875,10 +1881,9 @@ mod tests { | |
| .build(); | ||
|
|
||
| #[cfg(not(feature = "encryption"))] | ||
| let base_expected_size = 2248; | ||
| let base_expected_size = 2766; | ||
| #[cfg(feature = "encryption")] | ||
| // Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472 | ||
| let base_expected_size = 2416; | ||
| let base_expected_size = 2934; | ||
|
|
||
| assert_eq!(parquet_meta.memory_size(), base_expected_size); | ||
|
|
||
|
|
@@ -1907,16 +1912,90 @@ mod tests { | |
| .build(); | ||
|
|
||
| #[cfg(not(feature = "encryption"))] | ||
| let bigger_expected_size = 2674; | ||
| let bigger_expected_size = 3192; | ||
|
Comment on lines
-1910
to
+1915
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So sad to see this increase so much. Truth hurts 😢 |
||
| #[cfg(feature = "encryption")] | ||
| // Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472 | ||
| let bigger_expected_size = 2842; | ||
| let bigger_expected_size = 3360; | ||
|
|
||
| // more set fields means more memory usage | ||
| assert!(bigger_expected_size > base_expected_size); | ||
| assert_eq!(parquet_meta.memory_size(), bigger_expected_size); | ||
| } | ||
|
|
||
| #[test] | ||
| #[cfg(feature = "encryption")] | ||
| fn test_memory_size_with_decryptor() { | ||
| use crate::encryption::decrypt::FileDecryptionProperties; | ||
| use crate::file::metadata::thrift::encryption::AesGcmV1; | ||
|
|
||
| let schema_descr = get_test_schema_descr(); | ||
|
|
||
| let columns = schema_descr | ||
| .columns() | ||
| .iter() | ||
| .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build()) | ||
| .collect::<Result<Vec<_>>>() | ||
| .unwrap(); | ||
| let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()) | ||
| .set_num_rows(1000) | ||
| .set_column_metadata(columns) | ||
| .build() | ||
| .unwrap(); | ||
| let row_group_meta = vec![row_group_meta]; | ||
|
|
||
| let version = 2; | ||
| let num_rows = 1000; | ||
| let aad_file_unique = vec![1u8; 8]; | ||
| let aad_prefix = vec![2u8; 8]; | ||
| let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 { | ||
| aad_prefix: Some(aad_prefix.clone()), | ||
| aad_file_unique: Some(aad_file_unique.clone()), | ||
| supply_aad_prefix: Some(true), | ||
| }); | ||
| let footer_key_metadata = Some(vec![3u8; 8]); | ||
| let file_metadata = | ||
| FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None) | ||
| .with_encryption_algorithm(Some(encryption_algorithm)) | ||
| .with_footer_signing_key_metadata(footer_key_metadata.clone()); | ||
|
|
||
| let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone()) | ||
| .set_row_groups(row_group_meta.clone()) | ||
| .build(); | ||
|
|
||
| let base_expected_size = 2058; | ||
| assert_eq!(parquet_meta_data.memory_size(), base_expected_size); | ||
|
|
||
| let footer_key = "0123456789012345".as_bytes(); | ||
| let column_key = "1234567890123450".as_bytes(); | ||
| let mut decryption_properties_builder = | ||
| FileDecryptionProperties::builder(footer_key.to_vec()) | ||
| .with_aad_prefix(aad_prefix.clone()); | ||
| for column in schema_descr.columns() { | ||
| decryption_properties_builder = decryption_properties_builder | ||
| .with_column_key(&column.path().string(), column_key.to_vec()); | ||
| } | ||
| let decryption_properties = decryption_properties_builder.build().unwrap(); | ||
| let decryptor = FileDecryptor::new( | ||
| &decryption_properties, | ||
| footer_key_metadata.as_deref(), | ||
| aad_file_unique, | ||
| aad_prefix, | ||
| ) | ||
| .unwrap(); | ||
|
|
||
| let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone()) | ||
| .set_row_groups(row_group_meta.clone()) | ||
| .set_file_decryptor(Some(decryptor)) | ||
| .build(); | ||
|
|
||
| let expected_size_with_decryptor = 3072; | ||
| assert!(expected_size_with_decryptor > base_expected_size); | ||
|
|
||
| assert_eq!( | ||
| parquet_meta_data.memory_size(), | ||
| expected_size_with_decryptor | ||
| ); | ||
| } | ||
|
|
||
| /// Returns sample schema descriptor so we can create column metadata. | ||
| fn get_test_schema_descr() -> SchemaDescPtr { | ||
| let schema = SchemaType::group_type_builder("schema") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -845,7 +845,9 @@ pub struct ColumnDescriptor { | |
|
|
||
| impl HeapSize for ColumnDescriptor { | ||
| fn heap_size(&self) -> usize { | ||
| self.primitive_type.heap_size() + self.path.heap_size() | ||
| // Don't include the heap size of primitive_type, this is already | ||
| // accounted for via SchemaDescriptor::schema | ||
| self.path.heap_size() | ||
|
Comment on lines
+848
to
+850
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🚀 |
||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I looked into changing
FileDecryptorto hold aVec<u8>for the footer key instead of anArc<dyn BlockDecryptor>to simplify the heap size calculation, as mentioned in #8472 (comment). But this decreased read speed by about 10% in a small test case, and also increased the memory usage.After looking more closely at the
LessSafeKeyimplementation, it doesn't appear to hold any heap allocated memory.I think it's fine to assume the heap size is going to stay zero. The ring crate has an
allocfeature that isn't required for theaeadmodule, so it would be a big change for this to start allocating.