This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 60fdc25583 GH-34351: [C++][Parquet] Statistics: add detail 
documentation and tiny optimization (#35989)
60fdc25583 is described below

commit 60fdc25583cb391871cd7dd67372e8d9df9f0f45
Author: mwish <[email protected]>
AuthorDate: Fri Jul 7 01:10:30 2023 +0800

    GH-34351: [C++][Parquet] Statistics: add detail documentation and tiny 
optimization (#35989)
    
    
    
    ### Rationale for this change
    
    ### What changes are included in this PR?
    
    1. This patch does some tiny optimizations on Parquet C++ Statistics. It 
does:
    
    ```
    For min-max, using std::string. Because assume the case like that:
    EncodedStatistics c1;
    // do some operations
    EncodedStatistics c2 = c1;
    c2.set_max("dasdasdassd");
    After c2 set, c1 would be set too. So I use std::string here.
    ```
    
    2. Force clear ndv count during merging, and set `has_distinct_count_ = 
false`, and add some comments
    3. Add some specification in Statistics API
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    * Closes: #34351
    
    Lead-authored-by: mwish <[email protected]>
    Co-authored-by: mwish <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Co-authored-by: Gang Wu <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 c_glib/test/parquet/test-statistics.rb |   2 +-
 cpp/src/parquet/statistics.cc          |  59 +++++++---
 cpp/src/parquet/statistics.h           |  27 ++---
 cpp/src/parquet/statistics_test.cc     | 191 +++++++++++++++++++++++++++++++++
 4 files changed, 249 insertions(+), 30 deletions(-)

diff --git a/c_glib/test/parquet/test-statistics.rb 
b/c_glib/test/parquet/test-statistics.rb
index 4f21ebf00c..0367084c88 100644
--- a/c_glib/test/parquet/test-statistics.rb
+++ b/c_glib/test/parquet/test-statistics.rb
@@ -51,7 +51,7 @@ class TestParquetStatistics < Test::Unit::TestCase
 
   test("#has_n_distinct_values?") do
     assert do
-      @statistics.has_n_distinct_values?
+      not @statistics.has_n_distinct_values?
     end
   end
 
diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc
index aa6df7e32a..a10fc2a4cf 100644
--- a/cpp/src/parquet/statistics.cc
+++ b/cpp/src/parquet/statistics.cc
@@ -463,6 +463,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
  public:
   using T = typename DType::c_type;
 
+  // Create an empty stats.
   TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
       : descr_(descr),
         pool_(pool),
@@ -471,10 +472,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     auto comp = Comparator::Make(descr);
     comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
     TypedStatisticsImpl::Reset();
-    has_null_count_ = true;
-    has_distinct_count_ = true;
   }
 
+  // Create stats from provided values.
   TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t 
null_count,
                       int64_t distinct_count)
       : pool_(default_memory_pool()),
@@ -482,24 +482,29 @@ class TypedStatisticsImpl : public TypedStatistics<DType> 
{
         max_buffer_(AllocateBuffer(pool_, 0)) {
     TypedStatisticsImpl::IncrementNumValues(num_values);
     TypedStatisticsImpl::IncrementNullCount(null_count);
-    IncrementDistinctCount(distinct_count);
+    SetDistinctCount(distinct_count);
 
     Copy(min, &min_, min_buffer_.get());
     Copy(max, &max_, max_buffer_.get());
     has_min_max_ = true;
   }
 
+  // Create stats from a thrift Statistics object.
   TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& 
encoded_min,
                       const std::string& encoded_max, int64_t num_values,
                       int64_t null_count, int64_t distinct_count, bool 
has_min_max,
                       bool has_null_count, bool has_distinct_count, 
MemoryPool* pool)
       : TypedStatisticsImpl(descr, pool) {
     TypedStatisticsImpl::IncrementNumValues(num_values);
-    if (has_null_count_) {
+    if (has_null_count) {
       TypedStatisticsImpl::IncrementNullCount(null_count);
+    } else {
+      has_null_count_ = false;
     }
     if (has_distinct_count) {
-      IncrementDistinctCount(distinct_count);
+      SetDistinctCount(distinct_count);
+    } else {
+      has_distinct_count_ = false;
     }
 
     if (!encoded_min.empty()) {
@@ -541,9 +546,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void Reset() override {
     ResetCounts();
-    has_min_max_ = false;
-    has_distinct_count_ = false;
-    has_null_count_ = false;
+    ResetHasFlags();
   }
 
   void SetMinMax(const T& arg_min, const T& arg_max) override {
@@ -552,12 +555,18 @@ class TypedStatisticsImpl : public TypedStatistics<DType> 
{
 
   void Merge(const TypedStatistics<DType>& other) override {
     this->num_values_ += other.num_values();
+    // null_count is always valid when merging page statistics into
+    // column chunk statistics.
     if (other.HasNullCount()) {
       this->statistics_.null_count += other.null_count();
+    } else {
+      this->has_null_count_ = false;
     }
-    if (other.HasDistinctCount()) {
-      this->statistics_.distinct_count += other.distinct_count();
-    }
+    // Clear has_distinct_count_ as distinct count cannot be merged.
+    has_distinct_count_ = false;
+    // Do not clear min/max here if the other side does not provide
+    // min/max which may happen when other is an empty stats or all
+    // its values are null and/or NaN.
     if (other.HasMinMax()) {
       SetMinMax(other.min(), other.max());
     }
@@ -609,9 +618,10 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     }
     if (HasNullCount()) {
       s.set_null_count(this->null_count());
+      // num_values_ is reliable and it means number of non-null values.
+      s.all_null_value = num_values_ == 0;
     }
-    // num_values_ is reliable and it means number of non-null values.
-    s.all_null_value = num_values_ == 0;
+    // TODO (GH-36505): distinct count is not encoded for now.
     return s;
   }
 
@@ -627,7 +637,12 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
   T min_;
   T max_;
   ::arrow::MemoryPool* pool_;
-  int64_t num_values_ = 0;  // # of non-null values.
+  // Number of non-null values.
+  // Please note that num_values_ is reliable when has_null_count_ is set.
+  // When has_null_count_ is not set, e.g. a page statistics created from
+  // a statistics thrift message which doesn't have the optional null_count,
+  // `num_values_` may include null values.
+  int64_t num_values_ = 0;
   EncodedStatistics statistics_;
   std::shared_ptr<TypedComparator<DType>> comparator_;
   std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
@@ -637,8 +652,9 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
 
   void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
 
-  void IncrementDistinctCount(int64_t n) {
-    statistics_.distinct_count += n;
+  void SetDistinctCount(int64_t n) {
+    // distinct count can only be "set", and cannot be incremented.
+    statistics_.distinct_count = n;
     has_distinct_count_ = true;
   }
 
@@ -648,6 +664,17 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
     this->num_values_ = 0;
   }
 
+  void ResetHasFlags() {
+    // has_min_max_ will only be set when it meets any valid value.
+    this->has_min_max_ = false;
+    // has_distinct_count_ will only be set once SetDistinctCount()
+    // is called because distinct count calculation is not cheap and
+    // disabled by default.
+    this->has_distinct_count_ = false;
+    // Null count calculation is cheap and enabled by default.
+    this->has_null_count_ = true;
+  }
+
   void SetMinMaxPair(std::pair<T, T> min_max) {
     // CleanStatistic can return a nullopt in case of erroneous values, e.g. 
NaN
     auto maybe_min_max = CleanStatistic(min_max);
diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h
index 3f168a938f..ae6c1ca29b 100644
--- a/cpp/src/parquet/statistics.h
+++ b/cpp/src/parquet/statistics.h
@@ -117,17 +117,16 @@ std::shared_ptr<TypedComparator<DType>> 
MakeComparator(const ColumnDescriptor* d
 // ----------------------------------------------------------------------
 
 /// \brief Structure represented encoded statistics to be written to
-/// and from Parquet serialized metadata
+/// and read from Parquet serialized metadata.
 class PARQUET_EXPORT EncodedStatistics {
-  std::shared_ptr<std::string> max_, min_;
+  std::string max_, min_;
   bool is_signed_ = false;
 
  public:
-  EncodedStatistics()
-      : max_(std::make_shared<std::string>()), 
min_(std::make_shared<std::string>()) {}
+  EncodedStatistics() = default;
 
-  const std::string& max() const { return *max_; }
-  const std::string& min() const { return *min_; }
+  const std::string& max() const { return max_; }
+  const std::string& min() const { return min_; }
 
   int64_t null_count = 0;
   int64_t distinct_count = 0;
@@ -149,11 +148,13 @@ class PARQUET_EXPORT EncodedStatistics {
   // the true minimum for aggregations and there is no way to mark that a
   // value has been truncated and is a lower bound and not in the page.
   void ApplyStatSizeLimits(size_t length) {
-    if (max_->length() > length) {
+    if (max_.length() > length) {
       has_max = false;
+      max_.clear();
     }
-    if (min_->length() > length) {
+    if (min_.length() > length) {
       has_min = false;
+      min_.clear();
     }
   }
 
@@ -165,14 +166,14 @@ class PARQUET_EXPORT EncodedStatistics {
 
   void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
 
-  EncodedStatistics& set_max(const std::string& value) {
-    *max_ = value;
+  EncodedStatistics& set_max(std::string value) {
+    max_ = std::move(value);
     has_max = true;
     return *this;
   }
 
-  EncodedStatistics& set_min(const std::string& value) {
-    *min_ = value;
+  EncodedStatistics& set_min(std::string value) {
+    min_ = std::move(value);
     has_min = true;
     return *this;
   }
@@ -329,7 +330,7 @@ class TypedStatistics : public Statistics {
   /// null count is determined from the indices)
   virtual void IncrementNullCount(int64_t n) = 0;
 
-  /// \brief Increments the number ov values directly
+  /// \brief Increments the number of values directly
   /// The same note on IncrementNullCount applies here
   virtual void IncrementNumValues(int64_t n) = 0;
 };
diff --git a/cpp/src/parquet/statistics_test.cc 
b/cpp/src/parquet/statistics_test.cc
index 8b9a42aa18..76667f0029 100644
--- a/cpp/src/parquet/statistics_test.cc
+++ b/cpp/src/parquet/statistics_test.cc
@@ -346,6 +346,9 @@ class TestStatistics : public PrimitiveTypedTest<TestType> {
     ASSERT_EQ(this->values_.size(), statistics->num_values());
 
     statistics->Reset();
+    ASSERT_TRUE(statistics->HasNullCount());
+    ASSERT_FALSE(statistics->HasMinMax());
+    ASSERT_FALSE(statistics->HasDistinctCount());
     ASSERT_EQ(0, statistics->null_count());
     ASSERT_EQ(0, statistics->num_values());
     ASSERT_EQ(0, statistics->distinct_count());
@@ -590,6 +593,178 @@ TYPED_TEST(TestNumericStatistics, Equals) {
   ASSERT_NO_FATAL_FAILURE(this->TestEquals());
 }
 
+template <typename TestType>
+class TestStatisticsHasFlag : public TestStatistics<TestType> {
+ public:
+  void SetUp() override {
+    TestStatistics<TestType>::SetUp();
+    this->SetUpSchema(Repetition::OPTIONAL);
+  }
+
+  std::shared_ptr<TypedStatistics<TestType>> MergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2) {
+    auto chunk_statistics = MakeStatistics<TestType>(this->schema_.Column(0));
+    chunk_statistics->Merge(stats1);
+    chunk_statistics->Merge(stats2);
+    return chunk_statistics;
+  }
+
+  void VerifyMergedStatistics(
+      const TypedStatistics<TestType>& stats1, const 
TypedStatistics<TestType>& stats2,
+      const std::function<void(TypedStatistics<TestType>*)>& test_fn) {
+    ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats1, stats2).get()));
+    ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get()));
+  }
+
+  // Distinct count should set to false when Merge is called.
+  void TestMergeDistinctCount() {
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      EncodedStatistics encoded_statistics1;
+      statistics1 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics1,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics1->HasDistinctCount());
+    }
+
+    // Create a statistics object with distinct count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_distinct_count = true;
+      encoded_statistics2.distinct_count = 500;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_TRUE(statistics2->HasDistinctCount());
+    }
+
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             
EXPECT_FALSE(merged_statistics->HasDistinctCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_distinct_count);
+                           });
+  }
+
+  // If all values in a page are null or nan, its stats should not set min-max.
+  // Merging its stats with another page having good min-max stats should not
+  // drop the valid min-max from the latter page.
+  void TestMergeMinMax() {
+    this->GenerateData(1000);
+    // Create a statistics object without min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, /*num_values=*/0,
+                          /*null_count=*/this->values_.size());
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_FALSE(statistics1->HasMinMax());
+      EXPECT_FALSE(encoded_stats1.has_min);
+      EXPECT_FALSE(encoded_stats1.has_max);
+    }
+    // Create a statistics object with min-max.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      statistics2 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics2->Update(this->values_ptr_, this->values_.size(), 0);
+      auto encoded_stats2 = statistics2->Encode();
+      EXPECT_TRUE(statistics2->HasMinMax());
+      EXPECT_TRUE(encoded_stats2.has_min);
+      EXPECT_TRUE(encoded_stats2.has_max);
+    }
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasMinMax());
+                             EXPECT_TRUE(merged_statistics->Encode().has_min);
+                             EXPECT_TRUE(merged_statistics->Encode().has_max);
+                           });
+  }
+
+  // Default statistics should have null_count even if no nulls is written.
+  // However, if statistics is created from thrift message, it might not
+  // have null_count. Merging statistics from such page will result in an
+  // invalid null_count as well.
+  void TestMergeNullCount() {
+    this->GenerateData(/*num_values=*/1000);
+
+    // Page should have null-count even if no nulls
+    std::shared_ptr<TypedStatistics<TestType>> statistics1;
+    {
+      statistics1 = MakeStatistics<TestType>(this->schema_.Column(0));
+      statistics1->Update(this->values_ptr_, 
/*num_values=*/this->values_.size(),
+                          /*null_count=*/0);
+      auto encoded_stats1 = statistics1->Encode();
+      EXPECT_TRUE(statistics1->HasNullCount());
+      EXPECT_EQ(0, statistics1->null_count());
+      EXPECT_TRUE(statistics1->Encode().has_null_count);
+    }
+    // Merge with null-count should also have null count
+    VerifyMergedStatistics(*statistics1, *statistics1,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_TRUE(merged_statistics->HasNullCount());
+                             EXPECT_EQ(0, merged_statistics->null_count());
+                             auto encoded = merged_statistics->Encode();
+                             EXPECT_TRUE(encoded.has_null_count);
+                             EXPECT_EQ(0, encoded.null_count);
+                           });
+
+    // When loaded from thrift, might not have null count.
+    std::shared_ptr<TypedStatistics<TestType>> statistics2;
+    {
+      EncodedStatistics encoded_statistics2;
+      encoded_statistics2.has_null_count = false;
+      statistics2 = std::dynamic_pointer_cast<TypedStatistics<TestType>>(
+          Statistics::Make(this->schema_.Column(0), &encoded_statistics2,
+                           /*num_values=*/1000));
+      EXPECT_FALSE(statistics2->Encode().has_null_count);
+      EXPECT_FALSE(statistics2->HasNullCount());
+    }
+
+    // Merge without null-count should not have null count
+    VerifyMergedStatistics(*statistics1, *statistics2,
+                           [](TypedStatistics<TestType>* merged_statistics) {
+                             EXPECT_FALSE(merged_statistics->HasNullCount());
+                             
EXPECT_FALSE(merged_statistics->Encode().has_null_count);
+                           });
+  }
+
+  // statistics.all_null_value is used to build the page index.
+  // If statistics doesn't have null count, all_null_value should be false.
+  void TestMissingNullCount() {
+    EncodedStatistics encoded_statistics;
+    encoded_statistics.has_null_count = false;
+    auto statistics = Statistics::Make(this->schema_.Column(0), 
&encoded_statistics,
+                                       /*num_values=*/1000);
+    auto typed_stats = 
std::dynamic_pointer_cast<TypedStatistics<TestType>>(statistics);
+    EXPECT_FALSE(typed_stats->HasNullCount());
+    auto encoded = typed_stats->Encode();
+    EXPECT_FALSE(encoded.all_null_value);
+    EXPECT_FALSE(encoded.has_null_count);
+    EXPECT_FALSE(encoded.has_distinct_count);
+    EXPECT_FALSE(encoded.has_min);
+    EXPECT_FALSE(encoded.has_max);
+  }
+};
+
+TYPED_TEST_SUITE(TestStatisticsHasFlag, Types);
+
+TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MergeMinMax) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMergeMinMax());
+}
+
+TYPED_TEST(TestStatisticsHasFlag, MissingNullCount) {
+  ASSERT_NO_FATAL_FAILURE(this->TestMissingNullCount());
+}
+
 // Helper for basic statistics tests below
 void AssertStatsSet(const ApplicationVersion& version,
                     std::shared_ptr<parquet::WriterProperties> props,
@@ -1212,5 +1387,21 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) {
   ASSERT_EQ(0x0b, stats->EncodeMax()[0]);
 }
 
+TEST(TestEncodedStatistics, CopySafe) {
+  EncodedStatistics encoded_statistics;
+  encoded_statistics.set_max("abc");
+  encoded_statistics.has_max = true;
+
+  encoded_statistics.set_min("abc");
+  encoded_statistics.has_min = true;
+
+  EncodedStatistics copy_statistics = encoded_statistics;
+  copy_statistics.set_max("abcd");
+  copy_statistics.set_min("a");
+
+  EXPECT_EQ("abc", encoded_statistics.min());
+  EXPECT_EQ("abc", encoded_statistics.max());
+}
+
 }  // namespace test
 }  // namespace parquet

Reply via email to