Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion parquet/src/encryption/ciphers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use crate::errors::ParquetError;
use crate::errors::ParquetError::General;
use crate::errors::Result;
use crate::file::metadata::HeapSize;
use ring::aead::{AES_128_GCM, Aad, LessSafeKey, NonceSequence, UnboundKey};
use ring::rand::{SecureRandom, SystemRandom};
use std::fmt::Debug;
Expand All @@ -27,7 +28,7 @@ pub(crate) const NONCE_LEN: usize = 12;
pub(crate) const TAG_LEN: usize = 16;
pub(crate) const SIZE_LEN: usize = 4;

pub(crate) trait BlockDecryptor: Debug + Send + Sync {
pub(crate) trait BlockDecryptor: Debug + Send + Sync + HeapSize {
fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result<Vec<u8>>;

fn compute_plaintext_tag(&self, aad: &[u8], plaintext: &[u8]) -> Result<Vec<u8>>;
Expand All @@ -50,6 +51,13 @@ impl RingGcmBlockDecryptor {
}
}

impl HeapSize for RingGcmBlockDecryptor {
    /// Heap memory owned by this decryptor, in bytes.
    fn heap_size(&self) -> usize {
        // ring's `LessSafeKey` keeps its key material inline rather than on
        // the heap (the `aead` module does not require ring's `alloc`
        // feature), so this type owns no heap allocations.
        0
    }
}

impl BlockDecryptor for RingGcmBlockDecryptor {
fn decrypt(&self, length_and_ciphertext: &[u8], aad: &[u8]) -> Result<Vec<u8>> {
let mut result = Vec::with_capacity(length_and_ciphertext.len() - SIZE_LEN - NONCE_LEN);
Expand Down
40 changes: 40 additions & 0 deletions parquet/src/encryption/decrypt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::encryption::ciphers::{BlockDecryptor, RingGcmBlockDecryptor, TAG_LEN}
use crate::encryption::modules::{ModuleType, create_footer_aad, create_module_aad};
use crate::errors::{ParquetError, Result};
use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
use crate::file::metadata::HeapSize;
use std::borrow::Cow;
use std::collections::HashMap;
use std::fmt::Formatter;
Expand Down Expand Up @@ -271,6 +272,12 @@ struct ExplicitDecryptionKeys {
column_keys: HashMap<String, Vec<u8>>,
}

impl HeapSize for ExplicitDecryptionKeys {
    /// Heap memory owned by the explicitly supplied key material.
    fn heap_size(&self) -> usize {
        let footer = self.footer_key.heap_size();
        let columns = self.column_keys.heap_size();
        footer + columns
    }
}

#[derive(Clone)]
enum DecryptionKeys {
Explicit(ExplicitDecryptionKeys),
Expand All @@ -290,6 +297,19 @@ impl PartialEq for DecryptionKeys {
}
}

impl HeapSize for DecryptionKeys {
    /// Heap memory owned by the decryption keys, in bytes.
    fn heap_size(&self) -> usize {
        match self {
            Self::Explicit(keys) => keys.heap_size(),
            // The retriever is a user-defined type we don't control,
            // so we can't determine the heap size.
            Self::ViaRetriever(_) => 0,
        }
    }
}

/// `FileDecryptionProperties` hold keys and AAD data required to decrypt a Parquet file.
///
/// When reading Arrow data, the `FileDecryptionProperties` should be included in the
Expand Down Expand Up @@ -334,6 +354,11 @@ pub struct FileDecryptionProperties {
footer_signature_verification: bool,
}

impl HeapSize for FileDecryptionProperties {
    /// Heap memory owned by the decryption properties, in bytes.
    fn heap_size(&self) -> usize {
        // `footer_signature_verification` is a plain bool and owns no heap
        // memory, so only the keys and the optional AAD prefix contribute.
        self.keys.heap_size() + self.aad_prefix.heap_size()
    }
}
impl FileDecryptionProperties {
/// Returns a new [`FileDecryptionProperties`] builder that will use the provided key to
/// decrypt footer metadata.
Expand Down Expand Up @@ -547,6 +572,21 @@ impl PartialEq for FileDecryptor {
}
}

/// Estimate the size in bytes required for the file decryptor.
/// This is important to track the memory usage of cached Parquet meta data,
/// and is used via [`crate::file::metadata::ParquetMetaData::memory_size`].
/// Note that when a [`KeyRetriever`] is used, its heap size won't be included
/// and the result will be an underestimate.
/// If the [`FileDecryptionProperties`] are shared between multiple files then the
/// heap size may also be an overestimate.
Comment on lines +575 to +581
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

impl HeapSize for FileDecryptor {
fn heap_size(&self) -> usize {
self.decryption_properties.heap_size()
+ (Arc::clone(&self.footer_decryptor) as Arc<dyn HeapSize>).heap_size()
+ self.file_aad.heap_size()
}
}

impl FileDecryptor {
pub(crate) fn new(
decryption_properties: &Arc<FileDecryptionProperties>,
Expand Down
54 changes: 53 additions & 1 deletion parquet/src/file/metadata/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use crate::file::page_index::column_index::{
};
use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation};
use crate::file::statistics::{Statistics, ValueStatistics};
use std::collections::HashMap;
use std::sync::Arc;

/// Trait for calculating the size of various containers
Expand All @@ -50,9 +51,60 @@ impl<T: HeapSize> HeapSize for Vec<T> {
}
}

impl<K: HeapSize, V: HeapSize> HeapSize for HashMap<K, V> {
    /// Approximate heap memory owned by this map, in bytes.
    ///
    /// `HashMap` doesn't expose its allocation size, so this mirrors the
    /// table layout of hashbrown 0.16.0 (the std implementation) and may
    /// drift if that implementation changes.
    fn heap_size(&self) -> usize {
        let capacity = self.capacity();
        if capacity == 0 {
            // An empty map with no capacity has no heap allocation at all.
            return 0;
        }

        // Entries are stored as (K, V) pairs in the table's buckets.
        let entry_size = std::mem::size_of::<(K, V)>();
        // One control-tag group is always allocated; may be smaller on some
        // architectures, so this is an upper-bound approximation.
        const GROUP_SIZE: usize = 16;
        // hashbrown stores one metadata (control) byte per bucket.
        const TAG_BYTES_PER_BUCKET: usize = 1;

        // Bucket count for the capacity, following hashbrown's
        // `capacity_to_buckets`: small tables are rounded up to a minimum
        // that depends on entry size; larger tables keep a 7/8 load factor.
        let buckets = if capacity >= 15 {
            (capacity.saturating_mul(8) / 7).next_power_of_two()
        } else {
            let min_cap = match entry_size {
                0..=1 => 14,
                2..=3 => 7,
                _ => 3,
            };
            match min_cap.max(capacity) {
                0..=3 => 4,
                4..=7 => 8,
                _ => 16,
            }
        };

        // Add the heap memory owned by the keys and values themselves.
        let owned: usize = self
            .iter()
            .map(|(k, v)| k.heap_size() + v.heap_size())
            .sum();

        GROUP_SIZE + buckets * (entry_size + TAG_BYTES_PER_BUCKET) + owned
    }
}

impl<T: HeapSize> HeapSize for Arc<T> {
    /// Heap memory owned by this `Arc`, in bytes.
    fn heap_size(&self) -> usize {
        // The Arc allocation holds the strong and weak counts followed by an
        // instance of T, plus whatever T itself owns on the heap.
        let refcounts = 2 * std::mem::size_of::<usize>();
        let inner = std::mem::size_of::<T>();
        refcounts + inner + self.as_ref().heap_size()
    }
}

impl HeapSize for Arc<dyn HeapSize> {
    /// Heap memory owned by this type-erased `Arc`, in bytes.
    fn heap_size(&self) -> usize {
        let value: &dyn HeapSize = self.as_ref();
        // Refcounts + the concrete value's size (recovered dynamically via
        // `size_of_val` since the static type is erased) + its owned heap.
        2 * std::mem::size_of::<usize>() + std::mem::size_of_val(value) + value.heap_size()
    }
}

Expand Down
91 changes: 85 additions & 6 deletions parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,11 +287,17 @@ impl ParquetMetaData {
///
/// 4. Does not include any allocator overheads
pub fn memory_size(&self) -> usize {
    // The file decryptor only exists when the encryption feature is on;
    // without it there is nothing encryption-related to count.
    #[cfg(feature = "encryption")]
    let encryption_size = self.file_decryptor.heap_size();
    #[cfg(not(feature = "encryption"))]
    let encryption_size = 0usize;

    let heap = self.file_metadata.heap_size()
        + self.row_groups.heap_size()
        + self.column_index.heap_size()
        + self.offset_index.heap_size();
    std::mem::size_of::<Self>() + heap + encryption_size
}

/// Override the column index
Expand Down Expand Up @@ -1875,10 +1881,9 @@ mod tests {
.build();

#[cfg(not(feature = "encryption"))]
let base_expected_size = 2248;
let base_expected_size = 2766;
#[cfg(feature = "encryption")]
// Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472
let base_expected_size = 2416;
let base_expected_size = 2934;

assert_eq!(parquet_meta.memory_size(), base_expected_size);

Expand Down Expand Up @@ -1907,16 +1912,90 @@ mod tests {
.build();

#[cfg(not(feature = "encryption"))]
let bigger_expected_size = 2674;
let bigger_expected_size = 3192;
Comment on lines -1910 to +1915
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So sad to see this increase so much. Truth hurts 😢

#[cfg(feature = "encryption")]
// Not as accurate as it should be: https://github.com/apache/arrow-rs/issues/8472
let bigger_expected_size = 2842;
let bigger_expected_size = 3360;

// more set fields means more memory usage
assert!(bigger_expected_size > base_expected_size);
assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
}

#[test]
#[cfg(feature = "encryption")]
fn test_memory_size_with_decryptor() {
    use crate::encryption::decrypt::FileDecryptionProperties;
    use crate::file::metadata::thrift::encryption::AesGcmV1;

    let schema_descr = get_test_schema_descr();

    // Minimal column chunk metadata for each column of the test schema.
    let columns = schema_descr
        .columns()
        .iter()
        .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
        .collect::<Result<Vec<_>>>()
        .unwrap();
    let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
        .set_num_rows(1000)
        .set_column_metadata(columns)
        .build()
        .unwrap();
    let row_group_meta = vec![row_group_meta];

    // File metadata carrying an AES-GCM encryption algorithm so the
    // metadata is representative of an encrypted file.
    let version = 2;
    let num_rows = 1000;
    let aad_file_unique = vec![1u8; 8];
    let aad_prefix = vec![2u8; 8];
    let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
        aad_prefix: Some(aad_prefix.clone()),
        aad_file_unique: Some(aad_file_unique.clone()),
        supply_aad_prefix: Some(true),
    });
    let footer_key_metadata = Some(vec![3u8; 8]);
    let file_metadata =
        FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
            .with_encryption_algorithm(Some(encryption_algorithm))
            .with_footer_signing_key_metadata(footer_key_metadata.clone());

    // Baseline: metadata without a file decryptor attached.
    let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
        .set_row_groups(row_group_meta.clone())
        .build();

    // NOTE: expected sizes are pinned to the current HeapSize accounting and
    // must be updated in lockstep when that accounting changes.
    let base_expected_size = 2058;
    assert_eq!(parquet_meta_data.memory_size(), base_expected_size);

    // Build decryption properties with a footer key and a per-column key
    // for every column in the schema.
    let footer_key = "0123456789012345".as_bytes();
    let column_key = "1234567890123450".as_bytes();
    let mut decryption_properties_builder =
        FileDecryptionProperties::builder(footer_key.to_vec())
            .with_aad_prefix(aad_prefix.clone());
    for column in schema_descr.columns() {
        decryption_properties_builder = decryption_properties_builder
            .with_column_key(&column.path().string(), column_key.to_vec());
    }
    let decryption_properties = decryption_properties_builder.build().unwrap();
    let decryptor = FileDecryptor::new(
        &decryption_properties,
        footer_key_metadata.as_deref(),
        aad_file_unique,
        aad_prefix,
    )
    .unwrap();

    // Same metadata with the decryptor attached; its heap usage must now
    // be reflected in memory_size().
    let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
        .set_row_groups(row_group_meta.clone())
        .set_file_decryptor(Some(decryptor))
        .build();

    let expected_size_with_decryptor = 3072;
    // The decryptor can only add to the footprint, never shrink it.
    assert!(expected_size_with_decryptor > base_expected_size);

    assert_eq!(
        parquet_meta_data.memory_size(),
        expected_size_with_decryptor
    );
}

/// Returns sample schema descriptor so we can create column metadata.
fn get_test_schema_descr() -> SchemaDescPtr {
let schema = SchemaType::group_type_builder("schema")
Expand Down
4 changes: 3 additions & 1 deletion parquet/src/schema/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,9 @@ pub struct ColumnDescriptor {

impl HeapSize for ColumnDescriptor {
    /// Heap memory owned by this column descriptor, in bytes.
    fn heap_size(&self) -> usize {
        // `primitive_type` is deliberately excluded: its allocation is
        // already accounted for via `SchemaDescriptor::schema`, and counting
        // it here would double-count shared schema memory.
        self.path.heap_size()
    }
}

Expand Down
Loading