This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4ed5a14969 GH-43797: [C++] Attach `arrow::ArrayStatistics` to
`arrow::ArrayData` (#43801)
4ed5a14969 is described below
commit 4ed5a149695644fe364466eabcae38d8dabfc090
Author: Sutou Kouhei <[email protected]>
AuthorDate: Tue Sep 3 10:13:48 2024 +0900
GH-43797: [C++] Attach `arrow::ArrayStatistics` to `arrow::ArrayData`
(#43801)
### Rationale for this change
If we can attach associated statistics to an array via `ArrayData`, we can
use it in later processes such as query planning.
If `ArrayData`, rather than `Array`, has statistics, we can use the statistics in
compute kernels.
There was a concern that associated `arrow::ArrayStatistics` may be
outdated if `arrow::ArrayData` is mutated after attaching
`arrow::ArrayStatistics`. But `arrow::ArrayData` isn't mutable after the first
population. So `arrow::ArrayStatistics` will not be outdated. We can require
mutators to take responsibility for statistics.
### What changes are included in this PR?
* Add `arrow::ArrayData::statistics`
* Add `arrow::Array::statistics()` to get statistics attached in
`arrow::ArrayData`
This doesn't provide a new `arrow::ArrayData` constructor
(`arrow::ArrayData::Make()`) that accepts `arrow::ArrayStatistics`. We can
change `arrow::ArrayData::statistics` after we create `arrow::ArrayData`.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
`arrow::Array::statistics()` is a new public API.
* GitHub Issue: #43797
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/array/array_base.h | 8 +++
cpp/src/arrow/array/array_test.cc | 126 ++++++++++++++++++++++++++++++++++++++
cpp/src/arrow/array/data.cc | 3 +
cpp/src/arrow/array/data.h | 24 +++++++-
4 files changed, 159 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h
index 716ae07220..e4af67d7e5 100644
--- a/cpp/src/arrow/array/array_base.h
+++ b/cpp/src/arrow/array/array_base.h
@@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }
+ /// \brief Return the statistics attached to this Array
+ ///
+ /// This just returns the `statistics` member of the underlying ArrayData
+ /// object which backs this Array. Nothing is computed here.
+ ///
+ /// \return std::shared_ptr<ArrayStatistics> (null if no statistics are attached)
+ std::shared_ptr<ArrayStatistics> statistics() const { return
data_->statistics; }
+
protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
diff --git a/cpp/src/arrow/array/array_test.cc
b/cpp/src/arrow/array/array_test.cc
index 32806d9d2e..73e0c69243 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
}
}
+class TestArrayDataStatistics : public ::testing::Test {  // fixture: int32 ArrayData with statistics attached
+ public:
+ void SetUp() {
+ valids_ = {1, 0, 1, 1};
+ null_count_ = std::count(valids_.begin(), valids_.end(), 0);  // zeros in valids_ are counted as nulls
+ null_buffer_ = *internal::BytesToBits(valids_);
+ values_ = {1, 0, 3, -4};
+ min_ = *std::min_element(values_.begin(), values_.end());  // taken over all of values_, regardless of valids_
+ max_ = *std::max_element(values_.begin(), values_.end());
+ values_buffer_ = Buffer::FromVector(values_);
+ data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_,
values_buffer_},
+ null_count_);
+ data_->statistics = std::make_shared<ArrayStatistics>();  // statistics are attached after Make()
+ data_->statistics->null_count = null_count_;
+ data_->statistics->min = min_;
+ data_->statistics->is_min_exact = true;
+ data_->statistics->max = max_;
+ data_->statistics->is_max_exact = true;
+ }
+
+ protected:
+ std::vector<uint8_t> valids_;
+ size_t null_count_;
+ std::shared_ptr<Buffer> null_buffer_;
+ std::vector<int32_t> values_;
+ int64_t min_;  // stored as int64_t; the tests check for the int64_t alternative of min/max
+ int64_t max_;
+ std::shared_ptr<Buffer> values_buffer_;
+ std::shared_ptr<ArrayData> data_;
+};
+
+TEST_F(TestArrayDataStatistics, MoveConstructor) {  // statistics must be present after move construction
+ ArrayData copied_data(*data_);  // the move source is a copy, not the fixture's data_ itself
+ ArrayData moved_data(std::move(copied_data));
+
+ ASSERT_TRUE(moved_data.statistics->null_count.has_value());
+ ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+
+ ASSERT_TRUE(moved_data.statistics->min.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
+ ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
+ ASSERT_TRUE(moved_data.statistics->is_min_exact);
+
+ ASSERT_TRUE(moved_data.statistics->max.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
+ ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
+ ASSERT_TRUE(moved_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyConstructor) {  // statistics must be present after copy construction
+ ArrayData copied_data(*data_);
+
+ ASSERT_TRUE(copied_data.statistics->null_count.has_value());
+ ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+
+ ASSERT_TRUE(copied_data.statistics->min.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
+ ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
+ ASSERT_TRUE(copied_data.statistics->is_min_exact);
+
+ ASSERT_TRUE(copied_data.statistics->max.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
+ ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
+ ASSERT_TRUE(copied_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, MoveAssignment) {  // statistics must be present after move assignment
+ ArrayData copied_data(*data_);  // the move source is a copy, not the fixture's data_ itself
+ ArrayData moved_data;
+ moved_data = std::move(copied_data);
+
+ ASSERT_TRUE(moved_data.statistics->null_count.has_value());
+ ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+
+ ASSERT_TRUE(moved_data.statistics->min.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
+ ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
+ ASSERT_TRUE(moved_data.statistics->is_min_exact);
+
+ ASSERT_TRUE(moved_data.statistics->max.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
+ ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
+ ASSERT_TRUE(moved_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyAssignment) {  // statistics must be present after copy assignment
+ ArrayData copied_data;
+ copied_data = *data_;
+
+ ASSERT_TRUE(copied_data.statistics->null_count.has_value());
+ ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+
+ ASSERT_TRUE(copied_data.statistics->min.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
+ ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
+ ASSERT_TRUE(copied_data.statistics->is_min_exact);
+
+ ASSERT_TRUE(copied_data.statistics->max.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
+ ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
+ ASSERT_TRUE(copied_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyTo) {  // statistics must be present on the result of CopyTo()
+ ASSERT_OK_AND_ASSIGN(auto copied_data,
+ data_->CopyTo(arrow::default_cpu_memory_manager()));
+
+ ASSERT_TRUE(copied_data->statistics->null_count.has_value());
+ ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());
+
+ ASSERT_TRUE(copied_data->statistics->min.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
+ ASSERT_EQ(min_, std::get<int64_t>(copied_data->statistics->min.value()));
+ ASSERT_TRUE(copied_data->statistics->is_min_exact);
+
+ ASSERT_TRUE(copied_data->statistics->max.has_value());
+
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->max.value()));
+ ASSERT_EQ(max_, std::get<int64_t>(copied_data->statistics->max.value()));
+ ASSERT_TRUE(copied_data->statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, Slice) {  // the result of Slice() must have no statistics attached
+ auto sliced_data = data_->Slice(0, 1);
+ ASSERT_FALSE(sliced_data->statistics);
+}
+
template <typename PType>
class TestPrimitiveArray : public ::testing::Test {
public:
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 83eeb56c49..8e29297a8c 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -165,6 +165,8 @@ Result<std::shared_ptr<ArrayData>> CopyToImpl(const
ArrayData& data,
ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to,
copy_fn));
}
+ output->statistics = data.statistics;
+
return output;
}
} // namespace
@@ -195,6 +197,7 @@ std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off,
int64_t len) const {
} else {
copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
}
+ copy->statistics = nullptr;
return copy;
}
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index e0508fe698..1e6ee9a1d3 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -24,6 +24,7 @@
#include <utility>
#include <vector>
+#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
@@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
- dictionary(std::move(other.dictionary)) {
+ dictionary(std::move(other.dictionary)),
+ statistics(std::move(other.statistics)) {
SetNullCount(other.null_count);
}
@@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
- dictionary(other.dictionary) {
+ dictionary(other.dictionary),
+ statistics(other.statistics) {
SetNullCount(other.null_count);
}
@@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
+ statistics = std::move(other.statistics);
return *this;
}
@@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
+ statistics = other.statistics;
return *this;
}
@@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData {
}
/// \brief Construct a zero-copy slice of the data with the given offset and
length
+ ///
+ /// The associated `ArrayStatistics` is always discarded in a sliced
+ /// `ArrayData`, because the `ArrayStatistics` of the original
+ /// `ArrayData` may be invalid for the sliced `ArrayData`. If you want
+ /// to reuse the statistics of the original `ArrayData`, you need to
+ /// attach them to the sliced `ArrayData` yourself.
+ ///
+ /// If the specified slice range is the same as the range of the original
+ /// `ArrayData`, the statistics of the original `ArrayData` could be
+ /// reused, because the slice has the same data as the original
+ /// `ArrayData`. However, the associated `ArrayStatistics` is discarded
+ /// in this case too. Use `Copy()` instead in that case.
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
/// \brief Input-checking variant of Slice
@@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData {
// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;
+
+ // The statistics for this Array.
+ std::shared_ptr<ArrayStatistics> statistics;
};
/// \brief A non-owning Buffer reference