alamb commented on code in PR #8003:
URL: https://github.com/apache/arrow-rs/pull/8003#discussion_r2231849766


##########
parquet/src/arrow/arrow_reader/metrics.rs:
##########
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [ArrowReaderMetrics] for collecting metrics about the Arrow reader
+
+use std::sync::atomic::AtomicUsize;
+use std::sync::Arc;
+
+/// This enum represents the state of Arrow reader metrics collection.

Review Comment:
   This is a new API I am proposing to help write end-to-end tests and report 
out on the status of the Parquet reader. The first actual use case is reporting 
how many rows are read from the cache vs. how many are not.



##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -296,6 +302,44 @@ impl<T> ArrowReaderBuilder<T> {
             ..self
         }
     }
+
+    /// Specify metrics collection during reading
+    ///
+    /// To access the metrics, create an [`ArrowReaderMetrics`] and pass a
+    /// clone of the provided metrics to the builder.
+    ///
+    /// For example:

Review Comment:
   This shows how the metrics API is used.



##########
parquet/tests/arrow_reader/predicate_cache.rs:
##########
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Test for predicate cache in Parquet Arrow reader
+
+use arrow::array::ArrayRef;
+use arrow::array::Int64Array;
+use arrow::compute::and;
+use arrow::compute::kernels::cmp::{gt, lt};
+use arrow_array::cast::AsArray;
+use arrow_array::types::Int64Type;
+use arrow_array::{RecordBatch, StringViewArray};
+use bytes::Bytes;
+use futures::future::BoxFuture;
+use futures::{FutureExt, StreamExt};
+use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics;
+use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, 
RowFilter};
+use parquet::arrow::arrow_reader::{ArrowReaderBuilder, 
ParquetRecordBatchReaderBuilder};
+use parquet::arrow::async_reader::AsyncFileReader;
+use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, 
ProjectionMask};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
+use parquet::file::properties::WriterProperties;
+use std::ops::Range;
+use std::sync::Arc;
+use std::sync::LazyLock;
+
+#[tokio::test]
+async fn test_default_read() {
+    // The cache is not used without predicates, so we expect 0 records read 
from cache
+    let test = 
ParquetPredicateCacheTest::new().with_expected_records_read_from_cache(0);
+    let sync_builder = test.sync_builder();
+    test.run_sync(sync_builder);
+    let async_builder = test.async_builder().await;
+    test.run_async(async_builder).await;
+}
+
+// Fails until https://github.com/apache/arrow-rs/pull/7850 is merged
+#[ignore]
+#[tokio::test]
+async fn test_async_cache_with_filters() {
+    let test = 
ParquetPredicateCacheTest::new().with_expected_records_read_from_cache(49);
+    let async_builder = test.async_builder().await;
+    let async_builder = test.add_project_ab_and_filter_b(async_builder);
+    test.run_async(async_builder).await;
+}
+
+#[tokio::test]
+async fn test_sync_cache_with_filters() {
+    let test = ParquetPredicateCacheTest::new()
+        // The sync reader does not use the cache. See 
https://github.com/apache/arrow-rs/issues/8000
+        .with_expected_records_read_from_cache(0);
+
+    let sync_builder = test.sync_builder();
+    let sync_builder = test.add_project_ab_and_filter_b(sync_builder);
+    test.run_sync(sync_builder);
+}
+
+/*
+
+#[tokio::test]
+async fn test_cache_disabled_with_filters() {
+    // expect no records to be read from cache, because the cache is disabled
+    let test = 
ParquetPredicateCacheTest::new().with_expected_records_read_from_cache(0);

Review Comment:
   This test is commented out here; it is enabled as part of https://github.com/apache/arrow-rs/pull/7850



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to