pitrou commented on code in PR #46886:
URL: https://github.com/apache/arrow/pull/46886#discussion_r2167037078


##########
cpp/src/parquet/decoder.cc:
##########
@@ -347,22 +384,29 @@ int PlainDecoder<DType>::DecodeArrow(
 
   constexpr int value_size = static_cast<int>(sizeof(value_type));
   int values_decoded = num_values - null_count;
-  if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+  if (ARROW_PREDICT_FALSE(this->len_ < value_size * values_decoded)) {
     ParquetException::EofException();
   }
 
-  PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+  const uint8_t* data = this->data_;
 
-  VisitNullBitmapInline(
-      valid_bits, valid_bits_offset, num_values, null_count,
-      [&]() {
-        builder->UnsafeAppend(SafeLoadAs<value_type>(data_));
-        data_ += sizeof(value_type);
-      },
-      [&]() { builder->UnsafeAppendNull(); });
-
-  num_values_ -= values_decoded;
-  len_ -= sizeof(value_type) * values_decoded;
+  PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+  PARQUET_THROW_NOT_OK(
+      VisitBitRuns(valid_bits, valid_bits_offset, num_values,

Review Comment:
   It's the worst case indeed, but even the slowdown is very small in 
`null_probability:50` benchmarks:
   ```
   
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   Non-regressions: (8)
   
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                                                                    benchmark   
     baseline       contender  change %                                         
                                                                                
                                                                                
      counters
              BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:50 
180.961 MiB/sec 523.042 MiB/sec   189.037            {'family_index': 11, 
'per_family_instance_index': 2, 'run_name': 
'BM_ReadColumnPlain<true,Float16LogicalType>/null_probability:50', 
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7}
    BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:50 
187.537 MiB/sec 515.843 MiB/sec   175.061  {'family_index': 13, 
'per_family_instance_index': 2, 'run_name': 
'BM_ReadColumnByteStreamSplit<true,Float16LogicalType>/null_probability:50', 
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 6}
                     BM_ReadBinaryColumn/null_probability:50/unique_values:32 
505.814 MiB/sec 532.305 MiB/sec     5.237                   {'family_index': 
14, 'per_family_instance_index': 3, 'run_name': 
'BM_ReadBinaryColumn/null_probability:50/unique_values:32', 'repetitions': 1, 
'repetition_index': 0, 'threads': 1, 'iterations': 3}
                 BM_ReadBinaryViewColumn/null_probability:50/unique_values:32 
723.390 MiB/sec 718.730 MiB/sec    -0.644               {'family_index': 15, 
'per_family_instance_index': 3, 'run_name': 
'BM_ReadBinaryViewColumn/null_probability:50/unique_values:32', 'repetitions': 
1, 'repetition_index': 0, 'threads': 1, 'iterations': 3}
                 BM_ReadBinaryViewColumn/null_probability:50/unique_values:-1 
929.657 MiB/sec 915.051 MiB/sec    -1.571               {'family_index': 15, 
'per_family_instance_index': 6, 'run_name': 
'BM_ReadBinaryViewColumn/null_probability:50/unique_values:-1', 'repetitions': 
1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
                     BM_ReadBinaryColumn/null_probability:50/unique_values:-1 
657.236 MiB/sec 634.225 MiB/sec    -3.501                   {'family_index': 
14, 'per_family_instance_index': 6, 'run_name': 
'BM_ReadBinaryColumn/null_probability:50/unique_values:-1', 'repetitions': 1, 
'repetition_index': 0, 'threads': 1, 'iterations': 3}
       BM_ReadBinaryColumnDeltaByteArray/null_probability:50/unique_values:-1 
659.450 MiB/sec 635.794 MiB/sec    -3.587     {'family_index': 16, 
'per_family_instance_index': 2, 'run_name': 
'BM_ReadBinaryColumnDeltaByteArray/null_probability:50/unique_values:-1', 
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
   BM_ReadBinaryViewColumnDeltaByteArray/null_probability:50/unique_values:-1 
953.573 MiB/sec 915.133 MiB/sec    -4.031 {'family_index': 17, 
'per_family_instance_index': 2, 'run_name': 
'BM_ReadBinaryViewColumnDeltaByteArray/null_probability:50/unique_values:-1', 
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4}
   
   
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   Regressions: (1)
   
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                                                benchmark      baseline     
contender  change %                                                             
                                                                                
                                              counters
   BM_ReadColumnPlain<true,Int32Type>/null_probability:50 1.130 GiB/sec 1.050 
GiB/sec    -7.094 {'family_index': 9, 'per_family_instance_index': 2, 
'run_name': 'BM_ReadColumnPlain<true,Int32Type>/null_probability:50', 
'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 20}
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to