This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8ceb3b8a79 ARROW-17483: [Python] Support Expression filters in
non-legacy ParquetDataset/read_table (#14011)
8ceb3b8a79 is described below
commit 8ceb3b8a793f378bc73aa4dd24d98e81c66cdfb0
Author: Miles Granger <[email protected]>
AuthorDate: Tue Sep 13 09:26:58 2022 +0200
ARROW-17483: [Python] Support Expression filters in non-legacy
ParquetDataset/read_table (#14011)
Authored-by: Miles Granger <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/parquet/core.py | 9 ++++---
python/pyarrow/tests/parquet/test_dataset.py | 38 ++++++++++++++++------------
2 files changed, 28 insertions(+), 19 deletions(-)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index fd51829f50..4eb02ddeab 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1803,6 +1803,9 @@ Examples
raise NotImplementedError("split_row_groups not yet implemented")
if filters is not None:
+ if hasattr(filters, "cast"):
+ raise TypeError(
+ "Expressions as filter not supported for legacy dataset")
filters = _check_filters(filters)
self._filter(filters)
@@ -2338,9 +2341,9 @@ class _ParquetDatasetV2:
if decryption_properties is not None:
read_options.update(decryption_properties=decryption_properties)
- # map filters to Expressions
- self._filters = filters
- self._filter_expression = filters and _filters_to_expression(filters)
+ self._filter_expression = None
+ if filters is not None:
+ self._filter_expression = _filters_to_expression(filters)
# map old filesystems to new one
if filesystem is not None:
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index 463d282f08..fe31fe7931 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -23,6 +23,7 @@ import numpy as np
import pytest
import pyarrow as pa
+import pyarrow.compute as pc
from pyarrow import fs
from pyarrow.filesystem import LocalFileSystem
from pyarrow.tests import util
@@ -556,7 +557,14 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset):
@pytest.mark.pandas
@parametrize_legacy_dataset
-def test_filters_read_table(tempdir, use_legacy_dataset):
+@pytest.mark.parametrize("filters",
+ ([('integers', '<', 3)],
+ [[('integers', '<', 3)]],
+ pc.field('integers') < 3,
+ pc.field('nested', 'a') < 3,
+ pc.field('nested', 'b').cast(pa.int64()) < 3))
+@pytest.mark.parametrize("read", (pq.read_table, pq.read_pandas))
+def test_filters_read_table(tempdir, use_legacy_dataset, filters, read):
# test that filters keyword is passed through in read_table
fs = LocalFileSystem._get_instance()
base_path = tempdir
@@ -565,29 +573,27 @@ def test_filters_read_table(tempdir, use_legacy_dataset):
partition_spec = [
['integers', integer_keys],
]
- N = 5
+ N = len(integer_keys)
df = pd.DataFrame({
'index': np.arange(N),
'integers': np.array(integer_keys, dtype='i4'),
- }, columns=['index', 'integers'])
+ 'nested': np.array([{'a': i, 'b': str(i)} for i in range(N)])
+ })
_generate_partition_directories(fs, base_path, partition_spec, df)
- table = pq.read_table(
- base_path, filesystem=fs, filters=[('integers', '<', 3)],
- use_legacy_dataset=use_legacy_dataset)
- assert table.num_rows == 3
-
- table = pq.read_table(
- base_path, filesystem=fs, filters=[[('integers', '<', 3)]],
- use_legacy_dataset=use_legacy_dataset)
- assert table.num_rows == 3
+ kwargs = dict(filesystem=fs, filters=filters,
+ use_legacy_dataset=use_legacy_dataset)
- table = pq.read_pandas(
- base_path, filters=[('integers', '<', 3)],
- use_legacy_dataset=use_legacy_dataset)
- assert table.num_rows == 3
+ # Using Expression in legacy dataset not supported
+ if use_legacy_dataset and isinstance(filters, pc.Expression):
+ msg = "Expressions as filter not supported for legacy dataset"
+ with pytest.raises(TypeError, match=msg):
+ read(base_path, **kwargs)
+ else:
+ table = read(base_path, **kwargs)
+ assert table.num_rows == 3
@pytest.mark.pandas