wgtmac commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1251803477
##########
cpp/src/parquet/statistics.cc:
##########
@@ -463,6 +463,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
public:
using T = typename DType::c_type;
+ // This constructor would likely be called by ColumnWriter to create the
+ // statistics collector during write.
Review Comment:
```suggestion
// Create an empty stats.
```
##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +556,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType>
{
void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
+ // Merge always runs when Merge builder's page statistics
+ // into column chunk statistics, so it usually has null count.
if (other.HasNullCount()) {
this->statistics_.null_count += other.null_count();
+ } else {
+ this->has_null_count_ = false;
}
- if (other.HasDistinctCount()) {
- this->statistics_.distinct_count += other.distinct_count();
- }
+ // Distinct count cannot be merged.
Review Comment:
```suggestion
// Clear has_distinct_count_ as distinct count cannot be merged.
```
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
ASSERT_NO_FATAL_FAILURE(this->TestEquals());
}
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+ void SetUp() override {
+ TestStatistics<TestType>::SetUp();
+ this->SetUpSchema(Repetition::OPTIONAL);
+ }
+
+ std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+ const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2) {
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+ chunk_statistics->Merge(stats1);
+ chunk_statistics->Merge(stats2);
+ return chunk_statistics;
+ }
+
+ void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+ const TypedStatistics<TestType>& stats2,
+ const
std::function<void(TypedStatistics<TestType>*)>& fn) {
+ fn(BuildMergedStatistics(stats1, stats2).get());
+ fn(BuildMergedStatistics(stats2, stats1).get());
+ }
+
+ // Distinct count should set to false when Merge is called.
+ void TestMergeDistinctCount() {
+ // Create a statistics object with distinct count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ EncodedStatistics encoded_statistics1;
+ statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+ /*num_values=*/1000));
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ }
+
+ // Create a statistics object with distinct count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics2;
+ {
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+ /*num_values=*/1000));
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ }
+
+ VerifyMergedStatistics(*statistics1, *statistics2,
+ [](TypedStatistics<TestType>* merged_statistics) {
+
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+ });
+ }
+
+ // If a page is all nulls or nan, min and max should be set to null.
+ // And the all-null page being merged with a page having min-max,
+ // the result should have min-max.
+ void TestMergeMinMax() {
+ this->GenerateData(1000);
+ // Create a statistics object without min-max.
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+ statistics1->Update(this->values_ptr_, /*num_values=*/0,
+ /*null_count*/ this->values_.size());
+ auto encoded_stats1 = statistics1->Encode();
+ EXPECT_FALSE(statistics1->HasMinMax());
+ EXPECT_FALSE(encoded_stats1.has_min);
+ EXPECT_FALSE(encoded_stats1.has_max);
+ }
+ // Create a statistics object with min-max.
+ std::shared_ptr<TypedStatistics<TestType>> statistics2;
+ {
+ statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+ statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+ auto encoded_stats2 = statistics2->Encode();
+ EXPECT_TRUE(statistics2->HasMinMax());
+ EXPECT_TRUE(encoded_stats2.has_min);
+ EXPECT_TRUE(encoded_stats2.has_max);
+ }
+ VerifyMergedStatistics(*statistics1, *statistics2,
+ [](TypedStatistics<TestType>* merged_statistics) {
+ EXPECT_TRUE(merged_statistics->HasMinMax());
+ EXPECT_TRUE(merged_statistics->Encode().has_min);
+ EXPECT_TRUE(merged_statistics->Encode().has_max);
+ });
+ }
+
+ // Default statistics should have null_count even if no nulls is written.
+ // However, if statistics read from thrift, it might not have null_count,
+ // and page merge with it should also not have null_count.
+ void TestMergeNullCount() {
+ this->GenerateData(1000);
+
+ // Page should have null-count even if no nulls
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+ statistics1->Update(this->values_ptr_,
/*num_values=*/this->values_.size(),
+ /*null_count*/ 0);
Review Comment:
```suggestion
/*null_count=*/ 0);
```
##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
ASSERT_NO_FATAL_FAILURE(this->TestEquals());
}
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+ void SetUp() override {
+ TestStatistics<TestType>::SetUp();
+ this->SetUpSchema(Repetition::OPTIONAL);
+ }
+
+ std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+ const TypedStatistics<TestType>& stats1, const
TypedStatistics<TestType>& stats2) {
+ auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+ chunk_statistics->Merge(stats1);
+ chunk_statistics->Merge(stats2);
+ return chunk_statistics;
+ }
+
+ void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+ const TypedStatistics<TestType>& stats2,
+ const
std::function<void(TypedStatistics<TestType>*)>& fn) {
+ fn(BuildMergedStatistics(stats1, stats2).get());
+ fn(BuildMergedStatistics(stats2, stats1).get());
+ }
+
+ // Distinct count should set to false when Merge is called.
+ void TestMergeDistinctCount() {
+ // Create a statistics object with distinct count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics1;
+ {
+ EncodedStatistics encoded_statistics1;
+ statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+ /*num_values=*/1000));
+ EXPECT_FALSE(statistics1->HasDistinctCount());
+ }
+
+ // Create a statistics object with distinct count.
+ std::shared_ptr<TypedStatistics<TestType>> statistics2;
+ {
+ EncodedStatistics encoded_statistics2;
+ encoded_statistics2.has_distinct_count = true;
+ encoded_statistics2.distinct_count = 500;
+ statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+ Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+ /*num_values=*/1000));
+ EXPECT_TRUE(statistics2->HasDistinctCount());
+ }
+
+ VerifyMergedStatistics(*statistics1, *statistics2,
+ [](TypedStatistics<TestType>* merged_statistics) {
+
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+ });
+ }
+
+ // If a page is all nulls or nan, min and max should be set to null.
+ // And the all-null page being merged with a page having min-max,
+ // the result should have min-max.
Review Comment:
```suggestion
// If all values in a page are null or nan, its stats should not set
min-max.
// Merging its stats with another page having good min-max stats should not
// drop the valid min-max from the latter page.
```
##########
cpp/src/parquet/statistics.cc:
##########
@@ -627,7 +638,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
T min_;
T max_;
::arrow::MemoryPool* pool_;
- int64_t num_values_ = 0; // # of non-null values.
+ // Number of non-null values.
+ // num_values_ would be reliable when has_null_count_.
+ // If statistics is created from a page thrift statistics, and statistics
+ // doesn't have null_count, `num_values_` may include null values.
Review Comment:
```suggestion
// Number of non-null values.
// Please note that num_values_ is reliable when has_null_count_ is set.
// When has_null_count_ is not set, e.g. a page statistics created from
// a statistics thrift message which doesn't have the optional null_count,
// `num_values_` may include null values.
```
##########
cpp/src/parquet/statistics.cc:
##########
@@ -609,9 +618,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
}
if (HasNullCount()) {
s.set_null_count(this->null_count());
+ // num_values_ is reliable and it means number of non-null values.
+ s.all_null_value = num_values_ == 0;
}
- // num_values_ is reliable and it means number of non-null values.
- s.all_null_value = num_values_ == 0;
+ // Currently, distinct count would not be written.
+ // So, not call set_distinct_count.
Review Comment:
Move these comment to line 567 above?
##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +556,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType>
{
void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
+ // Merge always runs when Merge builder's page statistics
+ // into column chunk statistics, so it usually has null count.
Review Comment:
Additionally, I don't think we need to add comment above. We don't know how
external users call `Statistics::Merge()`. Rather we can comment that it tries
to maintain every field at its best effort.
##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +556,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType>
{
void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
+ // Merge always runs when Merge builder's page statistics
+ // into column chunk statistics, so it usually has null count.
Review Comment:
If Merge is called twice with other.HasNullCount()=false and then
other.HasNullCount()=true, here may have problem. Why not checking
this->has_null_count_ first? Probably we need add a test case to cover the
above case?
##########
cpp/src/parquet/statistics.cc:
##########
@@ -552,12 +556,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType>
{
void Merge(const TypedStatistics<DType>& other) override {
this->num_values_ += other.num_values();
+ // Merge always runs when Merge builder's page statistics
+ // into column chunk statistics, so it usually has null count.
if (other.HasNullCount()) {
this->statistics_.null_count += other.null_count();
+ } else {
+ this->has_null_count_ = false;
}
- if (other.HasDistinctCount()) {
- this->statistics_.distinct_count += other.distinct_count();
- }
+ // Distinct count cannot be merged.
+ has_distinct_count_ = false;
+ // If !other.HasMinMax, might be all-nulls or nulls and nan,
+ // so, not clear `this->has_min_max_` here.
Review Comment:
```suggestion
// Do not clear min/max here if the other side does not provide
// min/max which may happen when other is an empty stats or all
// its values are null and/or NaN.
```
##########
cpp/src/parquet/statistics.cc:
##########
@@ -648,6 +664,16 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
this->num_values_ = 0;
}
+ void ResetHasFlags() {
+ this->has_min_max_ = false;
+ // writer will write `distinct_count` to EncodedStatistics.
+ // So, disable it when reset.
+ this->has_distinct_count_ = false;
+ // writer will always write `null_count` to EncodedStatistics.
+ // So, enable it when reset.
+ this->has_null_count_ = true;
Review Comment:
```suggestion
// has_min_max_ will only be set when it meets any valid value.
this->has_min_max_ = false;
// has_distinct_count_ will only be set once SetDistinctCount()
// is called because distinct count calculation is not cheap and
// disabled by default.
this->has_distinct_count_ = false;
// Null count calculation is cheap and enabled by default.
this->has_null_count_ = true;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]