HuaHuaY commented on code in PR #49527:
URL: https://github.com/apache/arrow/pull/49527#discussion_r2989216942


##########
cpp/src/parquet/file_writer.cc:
##########
@@ -198,6 +202,22 @@ class RowGroupSerializer : public RowGroupWriter::Contents 
{
     return total_compressed_bytes_written;
   }
 
+  RowGroupWriter::BufferedStats EstimatedBufferedStats() const override {
+    RowGroupWriter::BufferedStats stats;
+    if (closed_) {
+      return stats;
+    }
+    for (size_t i = 0; i < column_writers_.size(); i++) {
+      if (column_writers_[i]) {
+        stats.def_level_bytes += 
column_writers_[i]->estimated_buffered_def_level_bytes();
+        stats.rep_level_bytes += 
column_writers_[i]->estimated_buffered_rep_level_bytes();
+        stats.value_bytes += 
column_writers_[i]->estimated_buffered_value_bytes();
+        stats.dict_bytes += 
column_writers_[i]->estimated_buffered_dict_bytes();
+      }
+    }

Review Comment:
   ```suggestion
       for (const auto& column_writer : column_writers_) {
         if (column_writer) {
           stats.def_level_bytes += 
column_writer->estimated_buffered_def_level_bytes();
           stats.rep_level_bytes += 
column_writer->estimated_buffered_rep_level_bytes();
           stats.value_bytes += column_writer->estimated_buffered_value_bytes();
           stats.dict_bytes += column_writer->estimated_buffered_dict_bytes();
         }
       }
   ```
   I suggest using a range-based for loop instead.



##########
cpp/src/parquet/file_writer.h:
##########
@@ -99,6 +111,9 @@ class PARQUET_EXPORT RowGroupWriter {
   int64_t total_compressed_bytes() const;
   /// \brief total compressed bytes written by the page writer
   int64_t total_compressed_bytes_written() const;
+  /// \brief Estimated sizes of buffered data (levels, values, dict) not yet
+  /// written to a page.

Review Comment:
   ```suggestion
     /// written to pages.
   ```
   Ditto: this summarizes all column writers in the row group, so it covers multiple pages.



##########
cpp/src/parquet/file_writer.h:
##########
@@ -36,6 +36,15 @@ static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 
'E'};
 
 class PARQUET_EXPORT RowGroupWriter {
  public:
+  // Estimated uncompressed byte sizes of data buffered by column writers
+  // that have not yet been serialized into data pages.

Review Comment:
   ```suggestion
     // that have not yet been serialized into pages.
   ```
   There may be some dict pages in column writers, not just data pages.



##########
cpp/src/parquet/file_writer.h:
##########
@@ -58,6 +67,9 @@ class PARQUET_EXPORT RowGroupWriter {
     virtual int64_t total_compressed_bytes() const = 0;
     /// \brief total compressed bytes written by the page writer
     virtual int64_t total_compressed_bytes_written() const = 0;
+    /// \brief Estimated sizes of buffered data (levels, values, dict) not yet
+    /// written to a page.

Review Comment:
   ```suggestion
       /// written to pages.
   ```
   This is a summary of all column writers in the row group, so it covers multiple pages.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Reply via email to