This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 631fa0a6b7 GH-45457: [Python] Add `pyarrow.ArrayStatistics` (#45550)
631fa0a6b7 is described below
commit 631fa0a6b780109194e7f1c318bd685ec7efd6e8
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Feb 25 22:25:52 2025 +0900
GH-45457: [Python] Add `pyarrow.ArrayStatistics` (#45550)
### Rationale for this change
Apache Arrow C++ can attach statistics read from Apache Parquet data to an
`arrow::Array`. Adding Python bindings for this feature lets Python users
read the attached statistics as well.
### What changes are included in this PR?
* Add `pyarrow.ArrayStatistics`.
* Add the `pyarrow.Array.statistics` property.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #45457
Lead-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
python/pyarrow/array.pxi | 112 ++++++++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 22 +++++
python/pyarrow/lib.pxd | 8 ++
python/pyarrow/tests/parquet/test_parquet_file.py | 19 ++++
4 files changed, 161 insertions(+)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 91770a5219..b738dc04b0 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -21,6 +21,9 @@ import os
import warnings
from cython import sizeof
+cdef extern from "<variant>" namespace "std":  # std::variant accessors used by ArrayStatistics._get_value
+ c_bool holds_alternative[T](...)  # std::holds_alternative<T>(variant)
+ T get[T](...)  # std::get<T>(variant)
cdef _sequence_to_array(object sequence, object mask, object size,
DataType type, CMemoryPool* pool, c_bool from_pandas):
@@ -704,6 +707,101 @@ def _restore_array(data):
return pyarrow_wrap_array(MakeArray(ad))
+cdef class ArrayStatistics(_Weakrefable):
+ """
+ Statistics attached to an array: null/distinct counts and min/max values.
+ """
+
+ def __init__(self):  # direct construction is rejected; Array.statistics uses __new__() + init()
+ raise TypeError(f"Do not call {self.__class__.__name__}'s constructor "
+ "directly")
+
+ cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics):
+ self.sp_statistics = sp_statistics  # keep the shared C++ statistics object that the properties read
+
+ def __repr__(self):
+ return (f"arrow.ArrayStatistics<null_count={self.null_count}, "
+ f"distinct_count={self.distinct_count}, min={self.min}, "
+ f"is_min_exact={self.is_min_exact}, max={self.max}, "
+ f"is_max_exact={self.is_max_exact}>")
+
+ @property
+ def null_count(self):
+ """
+ The number of nulls, or None if it is not set.
+ """
+ null_count = self.sp_statistics.get().null_count
+ # We'll be able to simplify this after
+ # https://github.com/cython/cython/issues/6692 is solved.
+ if null_count.has_value():
+ return null_count.value()
+ else:
+ return None
+
+ @property
+ def distinct_count(self):
+ """
+ The number of distinct values, or None if it is not set.
+ """
+ distinct_count = self.sp_statistics.get().distinct_count
+ # We'll be able to simplify this after
+ # https://github.com/cython/cython/issues/6692 is solved.
+ if distinct_count.has_value():
+ return distinct_count.value()
+ else:
+ return None
+
+ @property
+ def min(self):
+ """
+ The minimum value, or None if it is not set.
+ """
+ return self._get_value(self.sp_statistics.get().min)
+
+ @property
+ def is_min_exact(self):
+ """
+ Whether the minimum value is an exact value or not.
+ """
+ return self.sp_statistics.get().is_min_exact
+
+ @property
+ def max(self):
+ """
+ The maximum value, or None if it is not set.
+ """
+ return self._get_value(self.sp_statistics.get().max)
+
+ @property
+ def is_max_exact(self):
+ """
+ Whether the maximum value is an exact value or not.
+ """
+ return self.sp_statistics.get().is_max_exact
+
+ cdef _get_value(self, const optional[CArrayStatisticsValueType]&
optional_value):
+ """
+ Get a raw value from
+ std::optional<arrow::ArrayStatistics::ValueType> data (None if empty).
+
+ arrow::ArrayStatistics::ValueType is
+ std::variant<bool, int64_t, uint64_t, double, std::string>.
+ """
+ if not optional_value.has_value():
+ return None
+ value = optional_value.value()
+ if holds_alternative[c_bool](value):  # probe each variant alternative in declaration order
+ return get[c_bool](value)
+ elif holds_alternative[int64_t](value):
+ return get[int64_t](value)
+ elif holds_alternative[uint64_t](value):
+ return get[uint64_t](value)
+ elif holds_alternative[double](value):
+ return get[double](value)
+ else:  # only std::string remains among the five alternatives
+ return get[c_string](value)  # NOTE(review): presumably surfaces as Python bytes, not str - confirm
+
+
cdef class _PandasConvertible(_Weakrefable):
def to_pandas(
@@ -2099,6 +2197,20 @@ cdef class Array(_PandasConvertible):
if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU:
raise NotImplementedError("Implemented only for data on CPU
device")
+ @property
+ def statistics(self):
+ """
+ Statistics of the array, or None if no statistics are attached.
+ """
+ cdef ArrayStatistics stat
+ sp_stat = self.sp_array.get().statistics()
+ if sp_stat.get() == nullptr:  # the C++ array carries no statistics object
+ return None
+ else:
+ stat = ArrayStatistics.__new__(ArrayStatistics)  # bypass __init__, which raises TypeError
+ stat.init(sp_stat)
+ return stat
+
cdef _array_like_to_pandas(obj, options, types_mapper):
cdef:
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index d4e34e0a84..556696e344 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -101,6 +101,16 @@ cdef extern from "arrow/util/future.h" namespace "arrow"
nogil:
CStatus status()
+cdef extern from "<variant>" namespace "std" nogil:  # variant type held by CArrayStatistics.min / .max
+ cdef cppclass CArrayStatisticsValueType" std::variant<bool, int64_t,
uint64_t, double, std::string>":
+ CArrayStatisticsValueType()  # one constructor per variant alternative follows
+ CArrayStatisticsValueType(c_bool)
+ CArrayStatisticsValueType(int64_t)
+ CArrayStatisticsValueType(uint64_t)
+ CArrayStatisticsValueType(double)
+ CArrayStatisticsValueType(c_string)
+
+
cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef enum Type" arrow::Type::type":
_Type_NA" arrow::Type::NA"
@@ -188,6 +198,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool is_primitive(Type type)
c_bool is_numeric(Type type)
+ cdef cppclass CArrayStatistics" arrow::ArrayStatistics":  # C++ type wrapped by pyarrow.ArrayStatistics
+ optional[int64_t] null_count  # an empty optional is exposed to Python as None
+ optional[int64_t] distinct_count  # an empty optional is exposed to Python as None
+ optional[CArrayStatisticsValueType] min  # variant of bool/int64/uint64/double/string
+ c_bool is_min_exact
+ optional[CArrayStatisticsValueType] max  # variant of bool/int64/uint64/double/string
+ c_bool is_max_exact
+
+ c_bool Equals(const CArrayStatistics& statistics) const  # declared here; no caller is visible in this patch
+
cdef cppclass CArrayData" arrow::ArrayData":
shared_ptr[CDataType] type
int64_t length
@@ -251,6 +271,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CDeviceAllocationType device_type()
CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]&
to) const
+ const shared_ptr[CArrayStatistics]& statistics() const  # pointer may be null; Array.statistics checks for nullptr
+
shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data)
CResult[shared_ptr[CArray]] MakeArrayOfNull(
const shared_ptr[CDataType]& type, int64_t length, CMemoryPool* pool)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 892c974ab1..0b2dedad50 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -261,6 +261,14 @@ cdef class Scalar(_Weakrefable):
cdef inline shared_ptr[CScalar] unwrap(self) nogil
+cdef class ArrayStatistics(_Weakrefable):  # declaration only; the methods are defined in array.pxi
+ cdef:
+ shared_ptr[CArrayStatistics] sp_statistics  # C++ statistics object read by the Python properties
+
+ cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics)
except *  # NOTE(review): the definition in array.pxi omits this clause - confirm both agree under the supported Cython versions
+ cdef _get_value(self, const optional[CArrayStatisticsValueType]&
optional_value)
+
+
cdef class _PandasConvertible(_Weakrefable):
pass
diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py
b/python/pyarrow/tests/parquet/test_parquet_file.py
index 93097a1afa..ae8a16e874 100644
--- a/python/pyarrow/tests/parquet/test_parquet_file.py
+++ b/python/pyarrow/tests/parquet/test_parquet_file.py
@@ -334,3 +334,22 @@ def test_parquet_file_with_filesystem(s3_example_fs,
use_uri):
assert f.read() == table
assert not f.closed
assert f.closed
+
+
+def test_read_statistics():  # a chunk read back from Parquet exposes ArrayStatistics
+ table = pa.table({"value": pa.array([-1, None, 3])})  # one null, min -1, max 3
+ buf = io.BytesIO()
+ _write_table(table, buf)
+ buf.seek(0)  # rewind so the same buffer can be read back
+
+ statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
+ assert statistics.null_count == 1
+ assert statistics.distinct_count is None  # no distinct count is reported for this data
+ assert statistics.min == -1
+ assert statistics.is_min_exact
+ assert statistics.max == 3
+ assert statistics.is_max_exact
+ assert repr(statistics) == ("arrow.ArrayStatistics<"  # pins the exact __repr__ format
+ "null_count=1, distinct_count=None, "
+ "min=-1, is_min_exact=True, "
+ "max=3, is_max_exact=True>")