From a6f033e0ee4e001cc8120368df0f08d89898db35 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 9 Oct 2025 15:36:35 -0700 Subject: [PATCH 1/7] checkpoint --- parquet/src/encryption/ciphers.rs | 9 ++++++++- parquet/src/encryption/decrypt.rs | 29 +++++++++++++++++++++++++++++ parquet/src/file/metadata/memory.rs | 11 +++++++++++ parquet/src/file/metadata/mod.rs | 6 ++++++ 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index faff28f8acff..295881dd5a60 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -18,6 +18,7 @@ use crate::errors::ParquetError; use crate::errors::ParquetError::General; use crate::errors::Result; +use crate::file::metadata::HeapSize; use ring::aead::{AES_128_GCM, Aad, LessSafeKey, NonceSequence, UnboundKey}; use ring::rand::{SecureRandom, SystemRandom}; use std::fmt::Debug; @@ -27,7 +28,7 @@ pub(crate) const NONCE_LEN: usize = 12; pub(crate) const TAG_LEN: usize = 16; pub(crate) const SIZE_LEN: usize = 4; -pub(crate) trait BlockDecryptor: Debug + Send + Sync { +pub(crate) trait BlockDecryptor: Debug + Send + Sync + HeapSize { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result>; fn compute_plaintext_tag(&self, aad: &[u8], plaintext: &[u8]) -> Result>; @@ -50,6 +51,12 @@ impl RingGcmBlockDecryptor { } } +impl HeapSize for RingGcmBlockDecryptor { + fn heap_size(&self) -> usize { + 0 // FIXME(ets): how to even approximate this??? + } +} + impl BlockDecryptor for RingGcmBlockDecryptor { fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result> { let mut result = Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN); diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index b5374066dfc3..545b0741c6ec 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -21,6 +21,7 @@ use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor, TAG_LEN} use crate::encryption::modules::{ModuleType, create_footer_aad, create_module_aad}; use crate::errors::{ParquetError, Result}; use crate::file::column_crypto_metadata::ColumnCryptoMetaData; +use crate::file::metadata::HeapSize; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Formatter; @@ -271,6 +272,12 @@ struct ExplicitDecryptionKeys { column_keys: HashMap>, } +impl HeapSize for ExplicitDecryptionKeys { + fn heap_size(&self) -> usize { + self.footer_key.heap_size() + self.column_keys.heap_size() + } +} + #[derive(Clone)] enum DecryptionKeys { Explicit(ExplicitDecryptionKeys), @@ -290,6 +297,15 @@ impl PartialEq for DecryptionKeys { } } +impl HeapSize for DecryptionKeys { + fn heap_size(&self) -> usize { + match self { + Self::Explicit(keys) => keys.heap_size(), + Self::ViaRetriever(_) => 0, // FIXME(ets): how to deal with this??? + } + } +} + /// `FileDecryptionProperties` hold keys and AAD data required to decrypt a Parquet file. /// /// When reading Arrow data, the `FileDecryptionProperties` should be included in the @@ -334,6 +350,11 @@ pub struct FileDecryptionProperties { footer_signature_verification: bool, } +impl HeapSize for FileDecryptionProperties { + fn heap_size(&self) -> usize { + self.keys.heap_size() + self.aad_prefix.heap_size() + } +} impl FileDecryptionProperties { /// Returns a new [`FileDecryptionProperties`] builder that will use the provided key to /// decrypt footer metadata. @@ -547,6 +568,14 @@ impl PartialEq for FileDecryptor { } } +impl HeapSize for FileDecryptor { + fn heap_size(&self) -> usize { + self.decryption_properties.heap_size() + + self.footer_decryptor.heap_size() + + self.file_aad.heap_size() + } +} + impl FileDecryptor { pub(crate) fn new( decryption_properties: &Arc, diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 98ce5736ae1d..871cfdb35a64 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -28,6 +28,7 @@ use crate::file::page_index::column_index::{ }; use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; use crate::file::statistics::{Statistics, ValueStatistics}; +use std::collections::HashMap; use std::sync::Arc; /// Trait for calculating the size of various containers @@ -50,6 +51,16 @@ impl HeapSize for Vec { } } +impl HeapSize for HashMap { + fn heap_size(&self) -> usize { + let key_size = std::mem::size_of::(); + let val_size = std::mem::size_of::(); + (self.capacity() * (key_size + val_size)) + + self.keys().map(|k| k.heap_size()).sum::() + + self.values().map(|v| v.heap_size()).sum::() + } +} + impl HeapSize for Arc { fn heap_size(&self) -> usize { self.as_ref().heap_size() diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 763025fe142b..124284968251 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -287,11 +287,17 @@ impl ParquetMetaData { /// /// 4. Does not include any allocator overheads pub fn memory_size(&self) -> usize { + #[cfg(feature = "encryption")] + let encryption_size = self.file_decryptor.heap_size(); + #[cfg(not(feature = "encryption"))] + let encryption_size = 0usize; + std::mem::size_of::() + self.file_metadata.heap_size() + self.row_groups.heap_size() + self.column_index.heap_size() + self.offset_index.heap_size() + + encryption_size } /// Override the column index From ec0391978f43f6718660eba6566aafa71cfa8c3d Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 20 Oct 2025 15:25:17 +1300 Subject: [PATCH 2/7] Format and update comments --- parquet/src/encryption/ciphers.rs | 3 ++- parquet/src/encryption/decrypt.rs | 13 ++++++++++++- parquet/src/file/metadata/memory.rs | 6 +++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/parquet/src/encryption/ciphers.rs b/parquet/src/encryption/ciphers.rs index 295881dd5a60..a94c72dcd5ec 100644 --- a/parquet/src/encryption/ciphers.rs +++ b/parquet/src/encryption/ciphers.rs @@ -53,7 +53,8 @@ impl RingGcmBlockDecryptor { impl HeapSize for RingGcmBlockDecryptor { fn heap_size(&self) -> usize { - 0 // FIXME(ets): how to even approximate this??? + // Ring's LessSafeKey doesn't allocate on the heap + 0 } } diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index 545b0741c6ec..488f100ce4bb 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -301,7 +301,11 @@ impl HeapSize for DecryptionKeys { fn heap_size(&self) -> usize { match self { Self::Explicit(keys) => keys.heap_size(), - Self::ViaRetriever(_) => 0, // FIXME(ets): how to deal with this??? + Self::ViaRetriever(_) => { + // The retriever is a user-defined type we don't control, + // so we can't determine the heap size. + 0 + } } } } @@ -568,6 +572,13 @@ impl PartialEq for FileDecryptor { } } +/// Estimate the size in bytes required for the file decryptor. +/// This is important to track the memory usage of cached Parquet meta data, +/// and is used via [`crate::file::metadata::ParquetMetaData::memory_size`]. +/// Note that when a [`KeyRetriever`] is used, its heap size won't be included +/// and the result will be an underestimate. +/// If the [`FileDecryptionProperties`] are shared between multiple files then the +/// heap size may also be an overestimate. impl HeapSize for FileDecryptor { fn heap_size(&self) -> usize { self.decryption_properties.heap_size() diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 871cfdb35a64..2f6ee7e07475 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -55,9 +55,9 @@ impl HeapSize for HashMap { fn heap_size(&self) -> usize { let key_size = std::mem::size_of::(); let val_size = std::mem::size_of::(); - (self.capacity() * (key_size + val_size)) + - self.keys().map(|k| k.heap_size()).sum::() + - self.values().map(|v| v.heap_size()).sum::() + (self.capacity() * (key_size + val_size)) + + self.keys().map(|k| k.heap_size()).sum::() + + self.values().map(|v| v.heap_size()).sum::() } } From f58565e5c858884bfc881974b507a438c087c42f Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Mon, 20 Oct 2025 16:52:14 +1300 Subject: [PATCH 3/7] Add unit test --- parquet/src/file/metadata/mod.rs | 77 +++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 124284968251..38f0bedd14c7 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1883,7 +1883,6 @@ mod tests { #[cfg(not(feature = "encryption"))] let base_expected_size = 2248; #[cfg(feature = "encryption")] - // Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472 let base_expected_size = 2416; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1915,7 +1914,6 @@ mod tests { #[cfg(not(feature = "encryption"))] let bigger_expected_size = 2674; #[cfg(feature = "encryption")] - // Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472 let bigger_expected_size = 2842; // more set fields means more memory usage @@ -1923,6 +1921,81 @@ mod tests { assert_eq!(parquet_meta.memory_size(), bigger_expected_size); } + #[test] + #[cfg(feature = "encryption")] + fn test_memory_size_with_decryptor() { + use crate::encryption::decrypt::FileDecryptionProperties; + use crate::file::metadata::thrift::encryption::AesGcmV1; + + let schema_descr = get_test_schema_descr(); + + let columns = schema_descr + .columns() + .iter() + .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build()) + .collect::>>() + .unwrap(); + let row_group_meta = RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(1000) + .set_column_metadata(columns) + .build() + .unwrap(); + let row_group_meta = vec![row_group_meta]; + + let version = 2; + let num_rows = 1000; + let aad_file_unique = vec![1u8; 8]; + let aad_prefix = vec![2u8; 8]; + let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 { + aad_prefix: Some(aad_prefix.clone()), + aad_file_unique: Some(aad_file_unique.clone()), + supply_aad_prefix: Some(true), + }); + let footer_key_metadata = Some(vec![3u8; 8]); + let file_metadata = + FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None) + .with_encryption_algorithm(Some(encryption_algorithm)) + .with_footer_signing_key_metadata(footer_key_metadata.clone()); + + let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone()) + .set_row_groups(row_group_meta.clone()) + .build(); + + let base_expected_size = 1540; + assert_eq!(parquet_meta_data.memory_size(), base_expected_size); + + let footer_key = "0123456789012345".as_bytes(); + let column_key = "1234567890123450".as_bytes(); + let mut decryption_properties_builder = + FileDecryptionProperties::builder(footer_key.to_vec()) + .with_aad_prefix(aad_prefix.clone()); + for column in schema_descr.columns() { + decryption_properties_builder = decryption_properties_builder + .with_column_key(&column.path().string(), column_key.to_vec()); + } + let decryption_properties = decryption_properties_builder.build().unwrap(); + let decryptor = FileDecryptor::new( + &decryption_properties, + footer_key_metadata.as_deref(), + aad_file_unique, + aad_prefix, + ) + .unwrap(); + + let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone()) + .set_row_groups(row_group_meta.clone()) + .set_file_decryptor(Some(decryptor)) + .build(); + + let expected_size_with_decryptor = 1806; + assert!(expected_size_with_decryptor > base_expected_size); + + assert_eq!( + parquet_meta_data.memory_size(), + expected_size_with_decryptor + ); + } + /// Returns sample schema descriptor so we can create column metadata. fn get_test_schema_descr() -> SchemaDescPtr { let schema = SchemaType::group_type_builder("schema") From a53eb7be94770a84051ede486fefb626025af177 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 22 Oct 2025 14:42:08 +1300 Subject: [PATCH 4/7] More accurate HashMap heap size calculation --- parquet/src/file/metadata/memory.rs | 38 ++++++++++++++++++++++++++--- parquet/src/file/metadata/mod.rs | 2 +- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 2f6ee7e07475..c823467c0c79 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -53,9 +53,41 @@ impl HeapSize for Vec { impl HeapSize for HashMap { fn heap_size(&self) -> usize { - let key_size = std::mem::size_of::(); - let val_size = std::mem::size_of::(); - (self.capacity() * (key_size + val_size)) + let capacity = self.capacity(); + if capacity == 0 { + return 0; + } + + // HashMap doesn't provide a way to get its heap size, so this is an approximation based on + // the behavior of hashbrown::HashMap as at version 0.16.0, and may become inaccurate + // if the implementation changes. + let key_val_size = std::mem::size_of::<(K, V)>(); + // Overhead for the control tags group, which may be smaller depending on architecture + let group_size = 16; + // 1 byte of metadata stored per bucket. + let metadata_size = 1; + + // Compute the number of buckets for the capacity. Based on hashbrown's capacity_to_buckets + let buckets = if capacity < 15 { + let min_cap = match key_val_size { + 0..=1 => 14, + 2..=3 => 7, + _ => 3, + }; + let cap = min_cap.max(capacity); + if cap < 4 { + 4 + } else if cap < 8 { + 8 + } else { + 16 + } + } else { + (capacity.saturating_mul(8) / 7).next_power_of_two() + }; + + group_size + + (buckets * (key_val_size + metadata_size)) + self.keys().map(|k| k.heap_size()).sum::() + self.values().map(|v| v.heap_size()).sum::() } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 38f0bedd14c7..27f027cdd042 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1987,7 +1987,7 @@ mod tests { .set_file_decryptor(Some(decryptor)) .build(); - let expected_size_with_decryptor = 1806; + let expected_size_with_decryptor = 1874; assert!(expected_size_with_decryptor > base_expected_size); assert_eq!( From 0c8ab496ee125839a869a8fa0e4c4866025a2334 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 22 Oct 2025 19:43:53 +1300 Subject: [PATCH 5/7] Fix HeapSize implementation for Arc --- parquet/src/file/metadata/memory.rs | 3 ++- parquet/src/file/metadata/mod.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index c823467c0c79..31ff0a95f933 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -95,7 +95,8 @@ impl HeapSize for HashMap { impl HeapSize for Arc { fn heap_size(&self) -> usize { - self.as_ref().heap_size() + // Arc stores weak and strong counts on the heap alongside an instance of T + 2 * std::mem::size_of::() + std::mem::size_of::() + self.as_ref().heap_size() } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 27f027cdd042..9fddef43e370 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1881,9 +1881,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let base_expected_size = 2248; + let base_expected_size = 2992; #[cfg(feature = "encryption")] - let base_expected_size = 2416; + let base_expected_size = 3160; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1912,9 +1912,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 2674; + let bigger_expected_size = 3418; #[cfg(feature = "encryption")] - let bigger_expected_size = 2842; + let bigger_expected_size = 3586; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); @@ -1961,7 +1961,7 @@ mod tests { .set_row_groups(row_group_meta.clone()) .build(); - let base_expected_size = 1540; + let base_expected_size = 2284; assert_eq!(parquet_meta_data.memory_size(), base_expected_size); let footer_key = "0123456789012345".as_bytes(); @@ -1987,7 +1987,7 @@ mod tests { .set_file_decryptor(Some(decryptor)) .build(); - let expected_size_with_decryptor = 1874; + let expected_size_with_decryptor = 2738; assert!(expected_size_with_decryptor > base_expected_size); assert_eq!( From 955ad16e6e60692b5a185bd6881a6e601d909f38 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Oct 2025 13:45:40 +1300 Subject: [PATCH 6/7] Fix decryptor size not being included --- parquet/src/encryption/decrypt.rs | 2 +- parquet/src/file/metadata/memory.rs | 8 ++++++++ parquet/src/file/metadata/mod.rs | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/parquet/src/encryption/decrypt.rs b/parquet/src/encryption/decrypt.rs index 488f100ce4bb..0066523419de 100644 --- a/parquet/src/encryption/decrypt.rs +++ b/parquet/src/encryption/decrypt.rs @@ -582,7 +582,7 @@ impl PartialEq for FileDecryptor { impl HeapSize for FileDecryptor { fn heap_size(&self) -> usize { self.decryption_properties.heap_size() - + self.footer_decryptor.heap_size() + + (Arc::clone(&self.footer_decryptor) as Arc).heap_size() + self.file_aad.heap_size() } } diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 31ff0a95f933..11536bbbd41e 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -100,6 +100,14 @@ impl HeapSize for Arc { } } +impl HeapSize for Arc { + fn heap_size(&self) -> usize { + 2 * std::mem::size_of::() + + std::mem::size_of_val(self.as_ref()) + + self.as_ref().heap_size() + } +} + impl HeapSize for Box { fn heap_size(&self) -> usize { std::mem::size_of::() + self.as_ref().heap_size() diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 9fddef43e370..b231ad666c02 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1987,7 +1987,7 @@ mod tests { .set_file_decryptor(Some(decryptor)) .build(); - let expected_size_with_decryptor = 2738; + let expected_size_with_decryptor = 3298; assert!(expected_size_with_decryptor > base_expected_size); assert_eq!( From 4ce0a66d60e6a0a248f724c4f4d7b62ae643a4cf Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 24 Oct 2025 14:49:45 +1300 Subject: [PATCH 7/7] Fix double counting heap sie of type pointers in column descriptors --- parquet/src/file/metadata/mod.rs | 12 ++++++------ parquet/src/schema/types.rs | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index b231ad666c02..7022bd61c44d 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1881,9 +1881,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let base_expected_size = 2992; + let base_expected_size = 2766; #[cfg(feature = "encryption")] - let base_expected_size = 3160; + let base_expected_size = 2934; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1912,9 +1912,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 3418; + let bigger_expected_size = 3192; #[cfg(feature = "encryption")] - let bigger_expected_size = 3586; + let bigger_expected_size = 3360; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); @@ -1961,7 +1961,7 @@ mod tests { .set_row_groups(row_group_meta.clone()) .build(); - let base_expected_size = 2284; + let base_expected_size = 2058; assert_eq!(parquet_meta_data.memory_size(), base_expected_size); let footer_key = "0123456789012345".as_bytes(); @@ -1987,7 +1987,7 @@ mod tests { .set_file_decryptor(Some(decryptor)) .build(); - let expected_size_with_decryptor = 3298; + let expected_size_with_decryptor = 3072; assert!(expected_size_with_decryptor > base_expected_size); assert_eq!( diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs index 1ae37d0a462f..378a71b90249 100644 --- a/parquet/src/schema/types.rs +++ b/parquet/src/schema/types.rs @@ -845,7 +845,9 @@ pub struct ColumnDescriptor { impl HeapSize for ColumnDescriptor { fn heap_size(&self) -> usize { - self.primitive_type.heap_size() + self.path.heap_size() + // Don't include the heap size of primitive_type, this is already + // accounted for via SchemaDescriptor::schema + self.path.heap_size() } }