Re: [PR] [WIP] Upgrade to arrow/parquet 57.0.0 [datafusion]

via GitHub Thu, 23 Oct 2025 09:38:41 -0700


alamb commented on code in PR #17888:
URL: https://github.com/apache/datafusion/pull/17888#discussion_r2455906664



##########
datafusion-cli/src/main.rs:
##########
@@ -497,7 +497,7 @@ mod tests {
         
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
         | filename                                                    | 
row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | 
column_id | file_offset | num_values | path_in_schema | type  | stats_min | 
stats_max | stats_null_count | stats_distinct_count | stats_min_value | 
stats_max_value | compression | encodings                    | 
index_page_offset | dictionary_page_offset | data_page_offset | 
total_compressed_size | total_uncompressed_size |
         
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
-        | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0      
      | 2                  | 1                     | 123             | 0        
 | 125         | 4          | "f0.list.item" | INT64 | 1         | 4         | 
0                |                      | 1               | 4               | 
SNAPPY      | [RLE_DICTIONARY, PLAIN, RLE] |                   | 4              
        | 46               | 121                   | 123                     |
+        | ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0      
      | 2                  | 1                     | 123             | 0        
 | 125         | 4          | "f0.list.item" | INT64 | 1         | 4         | 
0                |                      | 1               | 4               | 
SNAPPY      | [PLAIN, RLE, RLE_DICTIONARY] |                   | 4              
        | 46               | 121                   | 123                     |

Review Comment:
   this is due to https://github.com/apache/arrow-rs/pull/8587 which change the 
order the encodings are displayed



##########
datafusion-examples/examples/parquet_encrypted.rs:
##########
@@ -100,7 +101,8 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), 
DataFusionError> {
 // Setup encryption and decryption properties
 fn setup_encryption(
     parquet_df: &DataFrame,
-) -> Result<(FileEncryptionProperties, FileDecryptionProperties), 
DataFusionError> {
+) -> Result<(Arc<FileEncryptionProperties>, Arc<FileDecryptionProperties>), 
DataFusionError>

Review Comment:
   The arrow upstream APIs now use Arc instead of raw objects, see 
https://github.com/apache/arrow-rs/pull/8470



##########
datafusion/common/src/encryption.rs:
##########
@@ -24,38 +24,10 @@ pub use 
parquet::encryption::decrypt::FileDecryptionProperties;
 pub use parquet::encryption::encrypt::FileEncryptionProperties;
 
 #[cfg(not(feature = "parquet_encryption"))]
-#[derive(Default, Debug)]
+#[derive(Default, Clone, Debug)]
 pub struct FileDecryptionProperties;
 #[cfg(not(feature = "parquet_encryption"))]
-#[derive(Default, Debug)]
+#[derive(Default, Clone, Debug)]
 pub struct FileEncryptionProperties;
 
 pub use crate::config::{ConfigFileDecryptionProperties, 
ConfigFileEncryptionProperties};
-
-#[cfg(feature = "parquet_encryption")]
-pub fn map_encryption_to_config_encryption(

Review Comment:
   These methods are redundant and were not used, so I removed them



##########
datafusion-examples/examples/flight/flight_client.rs:
##########
@@ -34,7 +35,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let testdata = datafusion::test_util::parquet_test_data();
 
     // Create Flight client
-    let mut client = 
FlightServiceClient::connect("http://localhost:50051";).await?;
+    let endpoint = Endpoint::new("http://localhost:50051";)?;

Review Comment:
   this is required due to the tonic upgrade



##########
datafusion-cli/src/main.rs:
##########
@@ -592,9 +592,9 @@ mod tests {
         
+-----------------------------------+-----------------+---------------------+------+------------------+
         | filename                          | file_size_bytes | 
metadata_size_bytes | hits | extra            |
         
+-----------------------------------+-----------------+---------------------+------+------------------+
-        | alltypes_plain.parquet            | 1851            | 10181          
     | 2    | page_index=false |
-        | alltypes_tiny_pages.parquet       | 454233          | 881418         
     | 2    | page_index=true  |
-        | lz4_raw_compressed_larger.parquet | 380836          | 2939           
     | 2    | page_index=false |
+        | alltypes_plain.parquet            | 1851            | 6957           
     | 2    | page_index=false |

Review Comment:
   The thrift-remodel has made the in-memory footprint of `ParquetMetaData` 
significantly smaller, likely due to a more efficient representation in memory 
for PageIndex
   
   (huge thanks to @etseidl )



##########
datafusion/common/src/dfschema.rs:
##########
@@ -1417,7 +1417,7 @@ mod tests {
     fn from_qualified_schema_into_arrow_schema() -> Result<()> {
         let schema = DFSchema::try_from_qualified_schema("t1", 
&test_schema_1())?;
         let arrow_schema = schema.as_arrow();
-        insta::assert_snapshot!(arrow_schema, @r#"Field { name: "c0", 
data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "c1", data_type: Boolean, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }"#);
+        insta::assert_snapshot!(arrow_schema.to_string(), @r#"Field { "c0": 
nullable Boolean }, Field { "c1": nullable Boolean }"#);

Review Comment:
   due to improvement in displaying Fields



##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -1144,18 +1144,14 @@ mod tests {
 
         // 325 pages in int_col
         assert_eq!(int_col_offset.len(), 325);
-        match int_col_index {
-            Index::INT32(index) => {
-                assert_eq!(index.indexes.len(), 325);
-                for min_max in index.clone().indexes {
-                    assert!(min_max.min.is_some());
-                    assert!(min_max.max.is_some());
-                    assert!(min_max.null_count.is_some());
-                }
-            }
-            _ => {
-                error!("fail to read page index.")
-            }
+        let ColumnIndexMetaData::INT32(index) = int_col_index else {

Review Comment:
   The PageIndex structures have now changed



##########
datafusion/common/src/pyarrow.rs:
##########
@@ -52,11 +52,11 @@ impl FromPyArrow for ScalarValue {
 }
 
 impl ToPyArrow for ScalarValue {
-    fn to_pyarrow(&self, py: Python) -> PyResult<PyObject> {
+    fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {

Review Comment:
   due to pyo3 upgrade



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [WIP] Upgrade to arrow/parquet 57.0.0 [datafusion]

Reply via email to