This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push: new 11158ba8cf Support writing encrypted Parquet files with plaintext footers (#7439) 11158ba8cf is described below commit 11158ba8cfc471354ed14d04c88c9c631143459a Author: Rok Mihevc <r...@mihevc.org> AuthorDate: Thu May 8 12:59:00 2025 +0200 Support writing encrypted Parquet files with plaintext footers (#7439) * Initial commit * Lint and clippy * Plaintext layout is different to encrypted one * Lint and expected memory size at decryption * Apply suggestions from code review Co-authored-by: Adam Reeve <adre...@gmail.com> * Review feedback * Lint * Update parquet/tests/encryption/encryption.rs Co-authored-by: Adam Reeve <adre...@gmail.com> * Review feedback * Initial commit * Encrypt plaintext to extract nonce and footer * fix * Add encryption algorithm to file_metadata before writing * fix * Apply suggestions from code review Co-authored-by: Adam Reeve <adre...@gmail.com> * Fix * lint * Move get_footer_encryption_algorithm into MetadataObjectWriter * Fix * Review feedback * Avoid unwraps in file_crypto_metada method * Minor tidy * Test wrting and reading with a different footer key --------- Co-authored-by: Adam Reeve <adre...@gmail.com> --- parquet/src/encryption/encrypt.rs | 27 ++++++++++++++- parquet/src/file/metadata/writer.rs | 62 +++++++++++++++++++++++----------- parquet/src/file/writer.rs | 6 ---- parquet/tests/encryption/encryption.rs | 52 ++++++++++++++++++++++------ 4 files changed, 111 insertions(+), 36 deletions(-) diff --git a/parquet/src/encryption/encrypt.rs b/parquet/src/encryption/encrypt.rs index 9a801434c0..c8d3ffc0ee 100644 --- a/parquet/src/encryption/encrypt.rs +++ b/parquet/src/encryption/encrypt.rs @@ -17,7 +17,9 @@ //! Configuration and utilities for Parquet Modular Encryption -use crate::encryption::ciphers::{BlockEncryptor, RingGcmBlockEncryptor}; +use crate::encryption::ciphers::{ + BlockEncryptor, RingGcmBlockEncryptor, NONCE_LEN, SIZE_LEN, TAG_LEN, +}; use crate::errors::{ParquetError, Result}; use crate::file::column_crypto_metadata::{ColumnCryptoMetaData, EncryptionWithColumnKey}; use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; @@ -374,6 +376,29 @@ pub(crate) fn encrypt_object<T: TSerializable, W: Write>( Ok(()) } +pub(crate) fn write_signed_plaintext_object<T: TSerializable, W: Write>( + object: &T, + encryptor: &mut Box<dyn BlockEncryptor>, + sink: &mut W, + module_aad: &[u8], +) -> Result<()> { + let mut buffer: Vec<u8> = vec![]; + { + let mut protocol = TCompactOutputProtocol::new(&mut buffer); + object.write_to_out_protocol(&mut protocol)?; + } + sink.write_all(&buffer)?; + buffer = encryptor.encrypt(buffer.as_ref(), module_aad)?; + + // Format of encrypted buffer is: [ciphertext size, nonce, ciphertext, authentication tag] + let nonce = &buffer[SIZE_LEN..SIZE_LEN + NONCE_LEN]; + let tag = &buffer[buffer.len() - TAG_LEN..]; + sink.write_all(nonce)?; + sink.write_all(tag)?; + + Ok(()) +} + /// Encrypt a Thrift serializable object to a byte vector pub(crate) fn encrypt_object_to_vec<T: TSerializable>( object: &T, diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs index c1fc413144..a01ad5d881 100644 --- a/parquet/src/file/metadata/writer.rs +++ b/parquet/src/file/metadata/writer.rs @@ -16,17 +16,21 @@ // under the License. #[cfg(feature = "encryption")] -use crate::encryption::encrypt::{encrypt_object, encrypt_object_to_vec, FileEncryptor}; -#[cfg(feature = "encryption")] -use crate::encryption::modules::{create_footer_aad, create_module_aad, ModuleType}; +use crate::encryption::{ + encrypt::{ + encrypt_object, encrypt_object_to_vec, write_signed_plaintext_object, FileEncryptor, + }, + modules::{create_footer_aad, create_module_aad, ModuleType}, +}; #[cfg(feature = "encryption")] use crate::errors::ParquetError; use crate::errors::Result; use crate::file::metadata::{KeyValue, ParquetMetaData}; use crate::file::page_index::index::Index; use crate::file::writer::{get_file_magic, TrackedWrite}; +use crate::format::EncryptionAlgorithm; #[cfg(feature = "encryption")] -use crate::format::{AesGcmV1, ColumnCryptoMetaData, EncryptionAlgorithm}; +use crate::format::{AesGcmV1, ColumnCryptoMetaData}; use crate::format::{ColumnChunk, ColumnIndex, FileMetaData, OffsetIndex, RowGroup}; use crate::schema::types; use crate::schema::types::{SchemaDescPtr, SchemaDescriptor, TypePtr}; @@ -149,7 +153,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> { schema: types::to_thrift(self.schema.as_ref())?, created_by: self.created_by.clone(), column_orders, - encryption_algorithm: None, + encryption_algorithm: self.object_writer.get_footer_encryption_algorithm(), footer_signing_key_metadata: None, }; @@ -474,6 +478,10 @@ impl MetadataObjectWriter { pub fn get_file_magic(&self) -> &[u8; 4] { get_file_magic() } + + fn get_footer_encryption_algorithm(&self) -> Option<EncryptionAlgorithm> { + None + } } /// Implementations of [`MetadataObjectWriter`] methods that rely on encryption being enabled @@ -503,6 +511,11 @@ impl MetadataObjectWriter { let mut encryptor = file_encryptor.get_footer_encryptor()?; encrypt_object(file_metadata, &mut encryptor, &mut sink, &aad) } + Some(file_encryptor) if file_metadata.encryption_algorithm.is_some() => { + let aad = create_footer_aad(file_encryptor.file_aad())?; + let mut encryptor = file_encryptor.get_footer_encryptor()?; + write_signed_plaintext_object(file_metadata, &mut encryptor, &mut sink, &aad) + } _ => Self::write_object(file_metadata, &mut sink), } } @@ -622,25 +635,36 @@ impl MetadataObjectWriter { } } - fn file_crypto_metadata( - file_encryptor: &FileEncryptor, - ) -> Result<crate::format::FileCryptoMetaData> { - let properties = file_encryptor.properties(); - let supply_aad_prefix = properties + fn get_footer_encryption_algorithm(&self) -> Option<EncryptionAlgorithm> { + if let Some(file_encryptor) = &self.file_encryptor { + return Some(Self::encryption_algorithm_from_encryptor(file_encryptor)); + } + None + } + + fn encryption_algorithm_from_encryptor(file_encryptor: &FileEncryptor) -> EncryptionAlgorithm { + let supply_aad_prefix = file_encryptor + .properties() .aad_prefix() - .map(|_| !properties.store_aad_prefix()); - let encryption_algorithm = AesGcmV1 { - aad_prefix: if properties.store_aad_prefix() { - properties.aad_prefix().cloned() - } else { - None - }, + .map(|_| !file_encryptor.properties().store_aad_prefix()); + let aad_prefix = if file_encryptor.properties().store_aad_prefix() { + file_encryptor.properties().aad_prefix().cloned() + } else { + None + }; + EncryptionAlgorithm::AESGCMV1(AesGcmV1 { + aad_prefix, aad_file_unique: Some(file_encryptor.aad_file_unique().clone()), supply_aad_prefix, - }; + }) + } + fn file_crypto_metadata( + file_encryptor: &FileEncryptor, + ) -> Result<crate::format::FileCryptoMetaData> { + let properties = file_encryptor.properties(); Ok(crate::format::FileCryptoMetaData { - encryption_algorithm: EncryptionAlgorithm::AESGCMV1(encryption_algorithm), + encryption_algorithm: Self::encryption_algorithm_from_encryptor(file_encryptor), key_metadata: properties.footer_key_metadata().cloned(), }) } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 18e357ebc2..0298d8a51d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -212,12 +212,6 @@ impl<W: Write + Send> SerializedFileWriter<W> { if let Some(file_encryption_properties) = &properties.file_encryption_properties { file_encryption_properties.validate_encrypted_column_names(schema_descriptor)?; - if !file_encryption_properties.encrypt_footer() { - return Err(general_err!( - "Writing encrypted files with plaintext footers is not supported yet" - )); - } - Ok(Some(Arc::new(FileEncryptor::new( file_encryption_properties.clone(), )?))) diff --git a/parquet/tests/encryption/encryption.rs b/parquet/tests/encryption/encryption.rs index da263a5df6..134e3383b3 100644 --- a/parquet/tests/encryption/encryption.rs +++ b/parquet/tests/encryption/encryption.rs @@ -433,18 +433,23 @@ fn test_write_non_uniform_encryption() { read_and_roundtrip_to_encrypted_file(&path, decryption_properties, file_encryption_properties); } -// todo: currently we raise if writing with plaintext footer, but we should support it -// for uniform and non-uniform encryption (see https://github.com/apache/arrow-rs/issues/7320) #[test] fn test_write_uniform_encryption_plaintext_footer() { let testdata = arrow::util::test_util::parquet_test_data(); - let path = format!("{testdata}/encrypt_columns_and_footer.parquet.encrypted"); + let path = format!("{testdata}/encrypt_columns_plaintext_footer.parquet.encrypted"); let footer_key = b"0123456789012345".to_vec(); // 128bit/16 + let wrong_footer_key = b"0000000000000000".to_vec(); // 128bit/16 let column_1_key = b"1234567890123450".to_vec(); let column_2_key = b"1234567890123451".to_vec(); let decryption_properties = FileDecryptionProperties::builder(footer_key.clone()) + .with_column_key("double_field", column_1_key.clone()) + .with_column_key("float_field", column_2_key.clone()) + .build() + .unwrap(); + + let wrong_decryption_properties = FileDecryptionProperties::builder(wrong_footer_key) .with_column_key("double_field", column_1_key) .with_column_key("float_field", column_2_key) .build() @@ -455,26 +460,53 @@ fn test_write_uniform_encryption_plaintext_footer() { .build() .unwrap(); + // Try writing plaintext footer and then reading it with the correct footer key + read_and_roundtrip_to_encrypted_file( + &path, + decryption_properties.clone(), + file_encryption_properties.clone(), + ); + + // Try writing plaintext footer and then reading it with the wrong footer key + let temp_file = tempfile::tempfile().unwrap(); + + // read example data let file = File::open(path).unwrap(); let options = ArrowReaderOptions::default() .with_file_decryption_properties(decryption_properties.clone()); let metadata = ArrowReaderMetadata::load(&file, options.clone()).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options).unwrap(); + let batch_reader = builder.build().unwrap(); + let batches = batch_reader + .collect::<parquet::errors::Result<Vec<RecordBatch>, _>>() + .unwrap(); + + // write example data let props = WriterProperties::builder() .with_file_encryption_properties(file_encryption_properties) .build(); - let temp_file = tempfile::tempfile().unwrap(); - let writer = ArrowWriter::try_new( + let mut writer = ArrowWriter::try_new( temp_file.try_clone().unwrap(), metadata.schema().clone(), Some(props), - ); - assert!(writer.is_err()); - assert_eq!( - writer.unwrap_err().to_string(), - "Parquet error: Writing encrypted files with plaintext footers is not supported yet" ) + .unwrap(); + for batch in batches { + writer.write(&batch).unwrap(); + } + writer.close().unwrap(); + + // Try reading plaintext footer and with the wrong footer key + let options = + ArrowReaderOptions::default().with_file_decryption_properties(wrong_decryption_properties); + let result = ArrowReaderMetadata::load(&temp_file, options.clone()); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .starts_with("Parquet error: Footer signature verification failed. Computed: [")); } #[test]