nuno-faria commented on code in PR #17888:
URL: https://github.com/apache/datafusion/pull/17888#discussion_r2422745900


##########
datafusion/core/tests/parquet/filter_pushdown.rs:
##########
@@ -632,7 +632,7 @@ async fn predicate_cache_pushdown_default() -> 
datafusion_common::Result<()> {
 #[tokio::test]
 async fn predicate_cache_pushdown_disable() -> datafusion_common::Result<()> {
     // Can disable the cache even with filter pushdown by setting the size to 
0. In this case we
-    // expect the inner records are reported but no records are read from the 
cache
+    //  no records are read from the cache and no metrics are reported

Review Comment:
   Thanks @alamb for handling the upgrade.
   I think we could add a test confirming that 
`datafusion.execution.parquet.max_predicate_cache_size` now works as expected 
by analyzing the explain output.
   
   Here is a potential example:
   
   ```rust
   #[tokio::test]
   async fn test_disable_predicate_cache() {
       let mut parquet_options = TableParquetOptions::new();
       parquet_options.global.data_page_row_count_limit = 1;
       parquet_options.global.write_batch_size = 1;
   
       let tempdir = TempDir::new_in(Path::new(".")).unwrap();
       let path = tempdir.path().to_str().unwrap();
   
       let ctx = SessionContext::new();
       ctx.sql("select i from generate_series(1, 1000) t(i)")
           .await
           .unwrap()
           .write_parquet(path, DataFrameWriteOptions::new(), 
Some(parquet_options))
           .await
           .unwrap();
   
       let regex = Regex::new(r"bytes_scanned=(\d+)").ok().unwrap();
   
       let config = SessionConfig::new()
           .set_bool("datafusion.execution.parquet.pushdown_filters", true);
       let ctx = SessionContext::new_with_config(config);
   
       // default: predicate cache is enabled
       ctx.register_parquet("t", path, ParquetReadOptions::new())
           .await
           .unwrap();
       let plan = ctx
           .sql("select * from t where i = 123")
           .await
           .unwrap()
           .explain(false, true)
           .unwrap()
           .to_string()
           .await
           .unwrap();
       let captures = regex.captures(&plan).unwrap();
       let bytes_scanned_default =
           captures.get(1).unwrap().as_str().parse::<usize>().unwrap();
   
       // disabling the predicate cache by setting the limit to 0
       ctx.sql("set datafusion.execution.parquet.max_predicate_cache_size = 0")
           .await
           .unwrap()
           .collect()
           .await
           .unwrap();
       ctx.deregister_table("t").unwrap();
       ctx.register_parquet("t", path, ParquetReadOptions::new())
           .await
           .unwrap();
       let plan = ctx
           .sql("select * from t where i = 123")
           .await
           .unwrap()
           .explain(false, true)
           .unwrap()
           .to_string()
           .await
           .unwrap();
       let captures = regex.captures(&plan).unwrap();
       let bytes_scanned_cache_disabled =
           captures.get(1).unwrap().as_str().parse::<usize>().unwrap();
   
       // with the cache disabled, fewer data pages should be retrieved (the 
predicate cache can
       // retrieve multiple data pages when their size is less than batch_size)
       assert_eq!(bytes_scanned_default, 31405);
       assert_eq!(bytes_scanned_cache_disabled, 1691);
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to