wgtmac commented on code in PR #35989:
URL: https://github.com/apache/arrow/pull/35989#discussion_r1252492049


##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics read from thrift, it might not have null_count,
+  // and page merge with it should also not have null_count.
+  void TestMergeNullCount() {
+    this->GenerateData(1000);
+
+    // Page should have null-count even if no nulls
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, 
/*num_values=*/this->values_.size(),
+                          /*null_count=*/0);
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_TRUE(statistics1->HasNullCount());
+      EXPECT_EQ(0, statistics1->null_count());
+      EXPECT_TRUE(statistics1->Encode().has_null_count);
+    }
+    // Merge with null-count should also have null count
+    VerifyMergedStatistics(*statistics1, *statistics1,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasNullCount());
+                             EXPECT_EQ(0, merged_statistics->null_count());
+                             auto encoded = merged_statistics->Encode();
+                             EXPECT_TRUE(encoded.has_null_count);
+                             EXPECT_EQ(0, encoded.null_count);
+                           });
+
+    // When loaded from thrift, might not have null count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_null_count = false;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics2->Encode().has_null_count);
+      EXPECT_FALSE(statistics2->HasNullCount());
+    }
+
+    // Merge without null-count should not have null count
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasNullCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+                           });
+  }
+
+  // statistics.all_null_value is used to build the page index.
+  // If statistics doesn't have null count, all_null_value should be false.
+  void TestNotHasNullValue() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_null_count = false;
+    auto statistics1 = Statistics::Make(this->schema_.Column(0), 
&encoded_statistics1,
+                                        /*num_values=*/1000);
+    auto s1 = 
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+    EXPECT_FALSE(s1->HasNullCount());
+    auto encoded = s1->Encode();
+    EXPECT_FALSE(encoded.all_null_value);

Review Comment:
   Check other attributes of `encoded` as well?



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics read from thrift, it might not have null_count,
+  // and page merge with it should also not have null_count.
+  void TestMergeNullCount() {
+    this->GenerateData(1000);
+
+    // Page should have null-count even if no nulls
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, 
/*num_values=*/this->values_.size(),
+                          /*null_count=*/0);
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_TRUE(statistics1->HasNullCount());
+      EXPECT_EQ(0, statistics1->null_count());
+      EXPECT_TRUE(statistics1->Encode().has_null_count);
+    }
+    // Merge with null-count should also have null count
+    VerifyMergedStatistics(*statistics1, *statistics1,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasNullCount());
+                             EXPECT_EQ(0, merged_statistics->null_count());
+                             auto encoded = merged_statistics->Encode();
+                             EXPECT_TRUE(encoded.has_null_count);
+                             EXPECT_EQ(0, encoded.null_count);
+                           });
+
+    // When loaded from thrift, might not have null count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_null_count = false;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics2->Encode().has_null_count);
+      EXPECT_FALSE(statistics2->HasNullCount());
+    }
+
+    // Merge without null-count should not have null count
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasNullCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+                           });
+  }
+
+  // statistics.all_null_value is used to build the page index.
+  // If statistics doesn't have null count, all_null_value should be false.
+  void TestNotHasNullValue() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_null_count = false;
+    auto statistics1 = Statistics::Make(this->schema_.Column(0), 
&encoded_statistics1,
+                                        /*num_values=*/1000);
+    auto s1 = 
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+    EXPECT_FALSE(s1->HasNullCount());
+    auto encoded = s1->Encode();
+    EXPECT_FALSE(encoded.all_null_value);
+  }
+};
+
+TYPED_TEST_SUITE(TestStatisticsHasFlag, Types);
+
+TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, TestMergeMinMax) {

Review Comment:
   ```suggestion
   TYPED_TEST(TestStatisticsHasFlag, MergeMinMax) {
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());

Review Comment:
   ```suggestion
                                 const 
std::function<void(TypedStatistics<TestType>*)>& test_fn) {
       ASSERT_NO_FATAL_FAILURE(test_fn(MergeStatistics(stats1, stats2).get()));
       ASSERT_NO_FATAL_FAILURE(test_fn(MergeStatistics(stats2, stats1).get()));
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {

Review Comment:
   It would be good if we can test all flags in these cases even when the main 
purpose of a specific case is not for those flags.



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(

Review Comment:
   ```suggestion
     std::shared_ptr<TypedStatistics<TestType>> MergeStatistics(
   ```



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count*/ this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics read from thrift, it might not have null_count,
+  // and page merge with it should also not have null_count.
+  void TestMergeNullCount() {
+    this->GenerateData(1000);
+
+    // Page should have null-count even if no nulls
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, 
/*num_values=*/this->values_.size(),
+                          /*null_count=*/0);
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_TRUE(statistics1->HasNullCount());
+      EXPECT_EQ(0, statistics1->null_count());
+      EXPECT_TRUE(statistics1->Encode().has_null_count);
+    }
+    // Merge with null-count should also have null count
+    VerifyMergedStatistics(*statistics1, *statistics1,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasNullCount());
+                             EXPECT_EQ(0, merged_statistics->null_count());
+                             auto encoded = merged_statistics->Encode();
+                             EXPECT_TRUE(encoded.has_null_count);
+                             EXPECT_EQ(0, encoded.null_count);
+                           });
+
+    // When loaded from thrift, might not have null count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_null_count = false;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics2->Encode().has_null_count);
+      EXPECT_FALSE(statistics2->HasNullCount());
+    }
+
+    // Merge without null-count should not have null count
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasNullCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+                           });
+  }
+
+  // statistics.all_null_value is used to build the page index.
+  // If statistics doesn't have null count, all_null_value should be false.
+  void TestNotHasNullValue() {
+    EncodedStatistics encoded_statistics1;
+    encoded_statistics1.has_null_count = false;
+    auto statistics1 = Statistics::Make(this->schema_.Column(0), 
&encoded_statistics1,
+                                        /*num_values=*/1000);
+    auto s1 = 
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics1);
+    EXPECT_FALSE(s1->HasNullCount());
+    auto encoded = s1->Encode();
+    EXPECT_FALSE(encoded.all_null_value);
+  }
+};
+
+TYPED_TEST_SUITE(TestStatisticsHasFlag, Types);
+
+TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, TestMergeMinMax) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeMinMax());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, NotHasNullValues) {
+  ASSERT_NO_FATAL_FAILURE(this->TestNotHasNullValue());

Review Comment:
   ```suggestion
   TYPED_TEST(TestStatisticsHasFlag, MissingNullCount) {
     ASSERT_NO_FATAL_FAILURE(this->TestMissingNullCount());
   ```



##########
cpp/src/parquet/statistics.cc:
##########
@@ -609,9 +618,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     }
     if (HasNullCount()) {
       s.set_null_count(this->null_count());
+      // num_values_ is reliable and it means number of non-null values.
+      s.all_null_value = num_values_ == 0;
     }
-    // num_values_ is reliable and it means number of non-null values.
-    s.all_null_value = num_values_ == 0;
+    // Currently, distinct count would not be written.
+    // So, not call set_distinct_count.

Review Comment:
   Sorry my bad! Probably change the comment into a `TODO` or `FIXME`? 
Something like:
   
   // TODO: distinct count is not encoded for now.



##########
cpp/src/parquet/statistics_test.cc:
##########
@@ -590,6 +593,173 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> BuildMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(const TypedStatistics<TestType>& stats1,
+                              const TypedStatistics<TestType>& stats2,
+                              const 
std::function<void(TypedStatistics<TestType>*)>& fn) {
+    fn(BuildMergedStatistics(stats1, stats2).get());
+    fn(BuildMergedStatistics(stats2, stats1).get());

Review Comment:
   CMIW, directly calling `.get()` on the shared_ptr may introduce lifetime 
issue if the returned shared_ptr object is released before fn uses the raw 
pointer.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to