-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Open
Description
Describe the enhancement requested
I am having trouble using PyArrow to decrypt a Parquet file that was encrypted with the C++ Arrow library.
In C++, I encrypt the file with one key and no metadata.
// Convert vector<uint8_t> to string for the FileEncryptionProperties::Builder
std::string keyStr(reinterpret_cast<const char*>(key.data()), key.size());
auto fileEncryptionProps = parquet::FileEncryptionProperties::Builder(keyStr)
.algorithm(parquet::ParquetCipher::AES_GCM_V1)
->build();
const auto props = parquet::WriterProperties::Builder()
.encryption(fileEncryptionProps)
->build();
const auto fields = convertSchema(table->schema());
const auto schemaNode = std::static_pointer_cast<parquet::schema::GroupNode>(
parquet::schema::GroupNode::Make(
"schema", parquet::Repetition::REQUIRED, fields));
auto schema =
std::static_pointer_cast<parquet::schema::GroupNode>(schemaNode);
// Open output file
std::shared_ptr<arrow::io::FileOutputStream> outFile;
auto result = arrow::io::FileOutputStream::Open(outputFilePath);
if (!result.ok()) {
throw std::runtime_error("Failed to open output file: " + outputFilePath);
}
outFile = result.ValueOrDie();
auto parquetStreamWriter = make_unique<parquet::StreamWriter>(
parquet::ParquetFileWriter::Open(outFile, schema, props));
return writeToStreamWriter(table, *(parquetStreamWriter.get()));
I can decrypt the file in C++ using the same key and no metadata.
const std::string keyStr(key.begin(), key.end());
auto decryptionProps =
parquet::FileDecryptionProperties::Builder().footer_key(keyStr)->build();
auto readerProps = parquet::ReaderProperties();
readerProps.file_decryption_properties(decryptionProps);
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(outputPath, false, readerProps);
// Read the file content into an arrow table actualTable using reader
std::shared_ptr<arrow::Table> actualTable;
std::unique_ptr<parquet::arrow::FileReader> arrowReader;
auto status = parquet::arrow::FileReader::Make(
arrow::default_memory_pool(), std::move(reader), &arrowReader);
status = arrowReader->ReadTable(&actualTable);
However, I can't find an API in PyArrow to do the equivalent.
In Python, I can do the following, but it requires setting extra dummy key metadata on the C++ side (i.e., this code won't decrypt a file encrypted with the C++ approach shown at the beginning of this post).
import pyarrow.parquet.encryption as pe
# Create a simple KMS client that returns our DEK
class SimpleKmsClient(pe.KmsClient):
    """Minimal KMS client that hands back a fixed data encryption key.

    ``unwrap_key`` returns the name ``dek`` regardless of its arguments;
    ``dek`` is not defined in this snippet and must be defined elsewhere
    before ``unwrap_key`` is called.
    """

    def __init__(self):
        pe.KmsClient.__init__(self)

    def unwrap_key(self, wrapped_key, master_key_identifier):
        # Both arguments are ignored: the same DEK is returned for every call.
        return dek

    def wrap_key(self, key_bytes, master_key_identifier):
        # Wrapping is deliberately unsupported; only unwrap_key returns a value.
        raise NotImplementedError("wrap_key not needed for decryption")
# Create KMS factory
def kms_factory(kms_connection_configuration):
    """Return a new SimpleKmsClient; the connection configuration is ignored."""
    return SimpleKmsClient()
# KMS connection and decryption settings, each constructed with no arguments
kms_connection_config = pe.KmsConnectionConfig()
decryption_config = pe.DecryptionConfiguration()
# Crypto factory that is handed kms_factory as its KMS client factory
crypto_factory = pe.CryptoFactory(kms_factory)
# Build the file decryption properties from the two configurations
file_decryption_props = crypto_factory.file_decryption_properties(kms_connection_config, decryption_config)
table = pq.read_table(path, decryption_properties=file_decryption_props)
My question is: Is there any Python API to decrypt using only one DEK and no metadata?
Component(s)
Python