mapleFU commented on code in PR #14341:
URL: https://github.com/apache/arrow/pull/14341#discussion_r1285715935


##########
cpp/src/parquet/test_util.h:
##########
@@ -783,8 +789,33 @@ inline void GenerateData<ByteArray>(int num_values, 
ByteArray* out,
   random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
 }
 
+template <typename T>
+inline void GeneratePrefixedData(int num_values, T* out, std::vector<uint8_t>* 
heap,
+                                 double prefixed_probability);
+
+template <>
+inline void GeneratePrefixedData(int num_values, ByteArray* out,
+                                 std::vector<uint8_t>* heap,
+                                 double prefixed_probability) {
+  // seed the prng so failure is deterministic
+  int max_byte_array_len = 12;
+  heap->resize(num_values * max_byte_array_len);
+  prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, 
max_byte_array_len,

Review Comment:
   ```suggestion
     prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), out, /*min_size=*/2, /*max_size=*/max_byte_array_len,
   ```



##########
cpp/src/parquet/test_util.h:
##########
@@ -783,8 +789,33 @@ inline void GenerateData<ByteArray>(int num_values, 
ByteArray* out,
   random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
 }
 
+template <typename T>
+inline void GeneratePrefixedData(int num_values, T* out, std::vector<uint8_t>* 
heap,
+                                 double prefixed_probability);
+
+template <>
+inline void GeneratePrefixedData(int num_values, ByteArray* out,
+                                 std::vector<uint8_t>* heap,
+                                 double prefixed_probability) {
+  // seed the prng so failure is deterministic
+  int max_byte_array_len = 12;
+  heap->resize(num_values * max_byte_array_len);
+  prefixed_random_byte_array(num_values, 0, heap->data(), out, 2, 
max_byte_array_len,
+                             prefixed_probability);
+}
+
 static constexpr int kGenerateDataFLBALength = 8;
 
+template <>
+inline void GeneratePrefixedData<FLBA>(int num_values, FLBA* out,
+                                       std::vector<uint8_t>* heap,
+                                       double prefixed_probability) {
+  // seed the prng so failure is deterministic
+  heap->resize(num_values * kGenerateDataFLBALength);
+  prefixed_random_byte_array(num_values, 0, heap->data(), 
kGenerateDataFLBALength, out,

Review Comment:
   ```suggestion
     prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), kGenerateDataFLBALength, out,
   ```



##########
cpp/src/parquet/encoding.cc:
##########
@@ -3037,11 +3083,242 @@ class RleBooleanDecoder : public DecoderImpl, virtual 
public BooleanDecoder {
 // ----------------------------------------------------------------------
 // DELTA_BYTE_ARRAY
 
-class DeltaByteArrayDecoder : public DecoderImpl,
-                              virtual public TypedDecoder<ByteArrayType> {
+/// Delta Byte Array encoding also known as incremental encoding or front 
compression:
+/// for each element in a sequence of strings, store the prefix length of the 
previous
+/// entry plus the suffix.
+///
+/// This is stored as a sequence of delta-encoded prefix lengths 
(DELTA_BINARY_PACKED),
+/// followed by the suffixes encoded as delta length byte arrays
+/// (DELTA_LENGTH_BYTE_ARRAY).
+
+// ----------------------------------------------------------------------
+// DeltaByteArrayEncoder
+
+template <typename DType>
+class DeltaByteArrayEncoder : public EncoderImpl, virtual public 
TypedEncoder<DType> {
  public:
-  explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
+  using T = typename DType::c_type;
+
+  explicit DeltaByteArrayEncoder(const ColumnDescriptor* descr,
                                  MemoryPool* pool = 
::arrow::default_memory_pool())
+      : EncoderImpl(descr, Encoding::DELTA_BYTE_ARRAY, pool),
+        sink_(pool),
+        prefix_length_encoder_(nullptr, pool),
+        suffix_encoder_(nullptr, pool),
+        last_value_(""),
+        kEmpty(ByteArray(0, reinterpret_cast<const uint8_t*>(""))) {}
+
+  std::shared_ptr<Buffer> FlushValues() override;
+
+  int64_t EstimatedDataEncodedSize() override {
+    return prefix_length_encoder_.EstimatedDataEncodedSize() +
+           suffix_encoder_.EstimatedDataEncodedSize();
+  }
+
+  using TypedEncoder<DType>::Put;
+
+  void Put(const ::arrow::Array& values) override;
+
+  void Put(const T* buffer, int num_values) override;
+
+  void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+                 int64_t valid_bits_offset) override {
+    if (valid_bits != NULLPTR) {
+      PARQUET_ASSIGN_OR_THROW(
+          buffer_, ::arrow::AllocateBuffer(num_values * sizeof(T), 
this->memory_pool()));

Review Comment:
   1. `buffer_` is reallocated every time `PutSpaced` is called.
   2. When `FlushValues` is called, is `buffer_` cleared?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to