Re: [PR] GH-47112: [Parquet][C++] Rle BitPacked parser [arrow]

via GitHub Thu, 04 Sep 2025 10:16:48 -0700


pitrou commented on code in PR #47294:
URL: https://github.com/apache/arrow/pull/47294#discussion_r2322162945



##########
cpp/src/arrow/util/rle_encoding_test.cc:
##########
@@ -553,47 +853,178 @@ TEST(BitRle, Overflow) {
   }
 }
 
-template <typename Type>
-void CheckRoundTripSpaced(const Array& data, int bit_width) {
+/// Check RleBitPacked encoding/decoding round trip.
+///
+/// \tparam kSpaced If set to false, treat Nulls in the input array as regular 
data.
+/// \tparam kParts The number of parts in which the data will be decoded.
+///         For number greater than one, this ensure that the decoder 
intermediary state
+///         is valid.
+template <typename Type, bool kSpaced, int32_t kParts>
+void CheckRoundTrip(const Array& data, int bit_width,
+                    std::shared_ptr<FloatArray> dict = {}) {
   using ArrayType = typename TypeTraits<Type>::ArrayType;
-  using T = typename Type::c_type;
+  using value_type = typename Type::c_type;
 
-  int num_values = static_cast<int>(data.length());
-  int buffer_size = RleEncoder::MaxBufferSize(bit_width, num_values);
+  int const data_size = static_cast<int>(data.length());
+  int const data_values_count =
+      static_cast<int>(data.length() - kSpaced * data.null_count());
+  int const buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, 
data_size);
+  ASSERT_GE(kParts, 1);
+  ASSERT_LE(kParts, data_size);
 
-  const T* values = static_cast<const ArrayType&>(data).raw_values();
+  const value_type* data_values = static_cast<const 
ArrayType&>(data).raw_values();
 
+  // Encode the data into ``buffer`` using the encoder.
   std::vector<uint8_t> buffer(buffer_size);
-  RleEncoder encoder(buffer.data(), buffer_size, bit_width);
-  for (int i = 0; i < num_values; ++i) {
-    if (data.IsValid(i)) {
-      if (!encoder.Put(static_cast<uint64_t>(values[i]))) {
-        FAIL() << "Encoding failed";
-      }
+  RleBitPackedEncoder encoder(buffer.data(), buffer_size, bit_width);
+  int32_t encoded_values_size = 0;
+  for (int i = 0; i < data_size; ++i) {
+    // Depending on kSpaced we treat nulls as regular values.
+    if (data.IsValid(i) || !kSpaced) {
+      bool success = encoder.Put(static_cast<uint64_t>(data_values[i]));
+      ASSERT_TRUE(success) << "Encoding failed in pos " << i;
+      ++encoded_values_size;
     }
   }
-  int encoded_size = encoder.Flush();
+  int encoded_byte_size = encoder.Flush();
+  ASSERT_EQ(encoded_values_size, data_values_count)
+      << "All values input were not encoded successfully by the encoder";
+
+  // On to verify batch read
+  RleBitPackedDecoder<value_type> decoder(buffer.data(), encoded_byte_size, 
bit_width);
+  // We will only use one of them depending on whether this is a dictonnary 
tests
+  std::vector<float> dict_read;
+  std::vector<value_type> values_read;
+  if (dict) {
+    dict_read.resize(data_size);
+  } else {
+    values_read.resize(data_size);
+  }
 
-  // Verify batch read
-  RleDecoder decoder(buffer.data(), encoded_size, bit_width);
-  std::vector<T> values_read(num_values);
+  // We will read the data in kParts calls to make sure intermediate states 
are valid
+  int32_t actual_read_count = 0;
+  int32_t requested_read_count = 0;
+  while (requested_read_count < data_size) {
+    auto const remaining = data_size - requested_read_count;
+    auto to_read = data_size / kParts;
+    if (remaining / to_read == 1) {
+      to_read = remaining;
+    }
 
-  if (num_values != decoder.GetBatchSpaced(
-                        num_values, static_cast<int>(data.null_count()),
-                        data.null_bitmap_data(), data.offset(), 
values_read.data())) {
-    FAIL();
-  }
+    auto read = 0;
+    if constexpr (kSpaced) {
+      // We need to slice the input array get the proper null count and bitmap
+      auto data_remaining = data.Slice(requested_read_count, to_read);
+
+      if (dict) {
+        auto* out = dict_read.data() + requested_read_count;
+        read = decoder.GetBatchWithDictSpaced(
+            dict->raw_values(), static_cast<int32_t>(dict->length()), out, 
to_read,
+            static_cast<int32_t>(data_remaining->null_count()),
+            data_remaining->null_bitmap_data(), data_remaining->offset());
+      } else {
+        auto* out = values_read.data() + requested_read_count;
+        read = decoder.GetBatchSpaced(
+            to_read, static_cast<int32_t>(data_remaining->null_count()),
+            data_remaining->null_bitmap_data(), data_remaining->offset(), out);
+      }
+    } else {
+      if (dict) {
+        auto* out = dict_read.data() + requested_read_count;
+        read = decoder.GetBatchWithDict(
+            dict->raw_values(), static_cast<int32_t>(dict->length()), out, 
to_read);
+      } else {
+        auto* out = values_read.data() + requested_read_count;
+        read = decoder.GetBatch(out, to_read);
+      }
+    }
+    ASSERT_EQ(read, to_read) << "Decoder did not read as many values as 
requested";
 
-  for (int64_t i = 0; i < num_values; ++i) {
-    if (data.IsValid(i)) {
-      if (values_read[i] != values[i]) {
-        FAIL() << "Index " << i << " read " << values_read[i] << " but should 
be "
-               << values[i];
+    actual_read_count += read;
+    requested_read_count += to_read;
+  }
+  EXPECT_EQ(requested_read_count, data_size) << "This test logic is wrong";
+  EXPECT_EQ(actual_read_count, data_size) << "Total number of values read is 
off";
+
+  // Verify the round trip: encoded-decoded values must equal the original one
+  for (int64_t i = 0; i < data_size; ++i) {
+    if (data.IsValid(i) || !kSpaced) {
+      if (dict) {
+        EXPECT_EQ(dict_read.at(i), dict->Value(data_values[i]))
+            << "Encoded then decoded and mapped value at position " << i << " 
("
+            << values_read[i] << ") differs from original value (" << 
data_values[i]
+            << " mapped to " << dict->Value(data_values[i]) << ")";
+      } else {
+        EXPECT_EQ(values_read.at(i), data_values[i])
+            << "Encoded then decoded value at position " << i << " (" << 
values_read.at(i)
+            << ") differs from original value (" << data_values[i] << ")";
       }
     }
   }
 }
 
+template <typename T>
+struct DataTestRleBitPackedRandomPart {
+  using value_type = T;
+
+  value_type max;
+  int32_t size;
+  double null_probability;
+};
+
+template <typename T>
+struct DataTestRleBitPackedRepeatPart {
+  using value_type = T;
+
+  value_type value;
+  int32_t size;
+};
+
+template <typename T>
+struct DataTestRleBitPackedNullPart {
+  using value_type = T;
+
+  int32_t size;
+};
+
+template <typename T>
+struct DataTestRleBitPacked {
+  using value_type = T;
+  using ArrowType = typename arrow::CTypeTraits<value_type>::ArrowType;
+  using RandomPart = DataTestRleBitPackedRandomPart<value_type>;
+  using RepeatPart = DataTestRleBitPackedRepeatPart<value_type>;
+  using NullPart = DataTestRleBitPackedNullPart<value_type>;
+
+  std::vector<std::variant<RandomPart, RepeatPart, NullPart>> parts;
+  int32_t bit_width;
+
+  std::shared_ptr<::arrow::Array> MakeArray(
+      ::arrow::random::RandomArrayGenerator& rand) const {
+    std::vector<std::shared_ptr<::arrow::Array>> arrays = {};
+
+    for (auto const& dyn_part : parts) {
+      if (auto* part = std::get_if<RandomPart>(&dyn_part)) {
+        auto arr = rand.Numeric<ArrowType>(part->size, /* min= */ 
value_type(0),
+                                           part->max, part->null_probability);
+        arrays.push_back(std::move(arr));
+
+      } else if (auto* part = std::get_if<RepeatPart>(&dyn_part)) {
+        auto scalar = ::arrow::MakeScalar(part->value);
+        arrays.push_back(::arrow::MakeArrayFromScalar(*scalar, 
part->size).ValueOrDie());

Review Comment:
   Note you may use `EXPECT_OK_AND_ASSIGN` instead of `ValueOrDie` (but it's 
probably not necessary here).
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-47112: [Parquet][C++] Rle BitPacked parser [arrow]

Reply via email to