mapleFU commented on code in PR #41362:
URL: https://github.com/apache/arrow/pull/41362#discussion_r1594920996


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1675,42 +1675,50 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
   //
   // \return Number of records delimited
   int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+    if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ == 
levels_written_)) {
+      *values_seen = 0;
+      return 0;
+    }
     int64_t values_to_read = 0;
     int64_t records_read = 0;
-
-    const int16_t* def_levels = this->def_levels() + levels_position_;
-    const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
+    const int16_t* const rep_levels = this->rep_levels();
+    const int16_t* const def_levels = this->def_levels();
     ARROW_DCHECK_GT(this->max_rep_level_, 0);
-
-    // Count logical records and number of values to read
-    while (levels_position_ < levels_written_) {
-      const int16_t rep_level = *rep_levels++;
-      if (rep_level == 0) {
-        // If at_record_start_ is true, we are seeing the start of a record
-        // for the second time, such as after repeated calls to
-        // DelimitRecords. In this case we must continue until we find
-        // another record start or exhausting the ColumnChunk
-        if (!at_record_start_) {
-          // We've reached the end of a record; increment the record count.
-          ++records_read;
-          if (records_read == num_records) {
-            // We've found the number of records we were looking for. Set
-            // at_record_start_ to true and break
-            at_record_start_ = true;
-            break;
-          }
-        }
-      }
+    // If at_record_start_ is true, we are seeing the start of a record
+    // for the second time, such as after repeated calls to
+    // DelimitRecords. In this case we must continue until we find
+    // another record start or exhausting the ColumnChunk
+    if (at_record_start_) {
+      ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
+      values_to_read += def_levels[levels_position_] == this->max_def_level_;
+      ++levels_position_;
       // We have decided to consume the level at this position; therefore we
       // must advance until we find another record boundary
       at_record_start_ = false;
+    }
 
-      const int16_t def_level = *def_levels++;
-      if (def_level == this->max_def_level_) {
-        ++values_to_read;
+    // Count logical records and number of non-null values to read

Review Comment:
   With:
   
   ```
   diff --git a/cpp/src/parquet/column_reader.cc 
b/cpp/src/parquet/column_reader.cc
   index d3f918186..cddb9f79d 100644
   --- a/cpp/src/parquet/column_reader.cc
   +++ b/cpp/src/parquet/column_reader.cc
   @@ -1699,13 +1699,13 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
    
        // Count logical records and number of non-null values to read
        ARROW_DCHECK(!at_record_start_);
   +    int64_t begin_levels_position = levels_position_;
        while (levels_position_ < levels_written_) {
   -      int64_t stride =
   -          std::min(levels_written_ - levels_position_, num_records - 
records_read);
   +      const int64_t stride = 1;
          const int64_t position_end = levels_position_ + stride;
          for (int64_t i = levels_position_; i < position_end; ++i) {
            records_read += rep_levels[i] == 0;
   -        values_to_read += def_levels[i] == this->max_def_level_;
   +        // values_to_read += def_levels[i] == this->max_def_level_;
          }
          levels_position_ = position_end;
          if (records_read == num_records) {
   @@ -1716,11 +1716,11 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
            at_record_start_ = true;
            // Remove last value if we have reaches the end of the record
            levels_position_ = levels_position_ - 1;
   -        values_to_read -= def_levels[levels_position_] == 
this->max_def_level_;
   +        // values_to_read -= def_levels[levels_position_] == 
this->max_def_level_;
            break;
          }
        }
   -    *values_seen = values_to_read;
   +    *values_seen = std::count(def_levels + begin_levels_position, 
def_levels + levels_position_, this->max_def_level_);
        return records_read;
      }
    
   diff --git a/testing b/testing
   index 735ae7128..ad82a736c 160000
   --- a/testing
   +++ b/testing
   @@ -1 +1 @@
   -Subproject commit 735ae7128d571398dd798d7ff004adebeb342883
   ```
   
   The result is:
   
   ```
   RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:1              
        4492581 ns      4482167 ns          156 bytes_per_second=618.928M/s 
items_per_second=285.576M/s
   RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:0              
        8848493 ns      8791543 ns           81 bytes_per_second=315.546M/s 
items_per_second=145.594M/s
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to