pitrou commented on a change in pull request #10627:
URL: https://github.com/apache/arrow/pull/10627#discussion_r690153087



##########
File path: cpp/src/arrow/util/bit_stream_utils.h
##########
@@ -418,14 +450,59 @@ inline bool BitReader::GetVlqInt(uint32_t* v) {
 }
 
 inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-  auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
-  return PutVlqInt((u_v << 1) ^ (u_v >> 31));
+  uint32_t u_v = ::arrow::util::SafeCopy<uint32_t>(v);
+  u_v = (u_v << 1) ^ static_cast<uint32_t>(v >> 31);
+  return PutVlqInt(u_v);

Review comment:
       Can you explain why these implementations were changed?

##########
File path: cpp/src/parquet/types.h
##########
@@ -678,7 +678,7 @@ struct type_traits<Type::INT64> {
   using value_type = int64_t;
 
   static constexpr int value_byte_size = 8;
-  static constexpr const char* printf_code = "ld";
+  static constexpr const char* printf_code = "lld";

Review comment:
       This should really be a conditional, e.g.:
   ```c++
     static constexpr const char* printf_code = (sizeof(long) == 64) ? "ld" : 
"lld";
   ```

##########
File path: cpp/src/arrow/util/bpacking64_codegen.py
##########
@@ -0,0 +1,131 @@
+#!/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script is modified from its original version in GitHub. Original source:
+# 
https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py
+
+# Usage:
+#   python bpacking64_codegen.py > bpacking64_default.h
+
+def howmany(bit):
+    """ how many values are we going to pack? """
+    return 32
+
+
+def howmanywords(bit):
+    return (howmany(bit) * bit + 63)//64
+
+
+def howmanybytes(bit):
+    return (howmany(bit) * bit + 7)//8
+
+
+print('''// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file was generated by script which is modified from its original 
version in GitHub.
+// Original source:
+// 
https://github.com/lemire/FrameOfReference/blob/master/scripts/turbopacking64.py
+// The original copyright notice follows.
+
+// This code is released under the
+// Apache License Version 2.0 http://www.apache.org/licenses/.
+// (c) Daniel Lemire 2013
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace internal {
+''')
+
+
+print("inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) {")
+print("  for(int k = 0; k < {0} ; k += 1) {{".format(howmany(0)))
+print("    out[k] = 0;")
+print("  }")
+print("  return in;")
+print("}")
+
+for bit in range(1, 65):
+    print("")
+    print(
+        "inline const uint8_t* unpack{0}_64(const uint8_t* in, uint64_t* out) 
{{".format(bit))
+
+    if(bit < 64):
+        print("  const uint64_t mask = {0}ULL;".format((1 << bit)-1))
+    maskstr = " & mask"
+    if (bit == 64):
+        maskstr = ""  # no need
+
+    for k in range(howmanywords(bit)-1):
+        print("  uint64_t w{0} = util::SafeLoadAs<uint64_t>(in);".format(k))
+        print("  w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k))
+        print("  in += 8;".format(k))
+    k = howmanywords(bit) - 1
+    if (bit % 2 == 0):
+        print("  uint64_t w{0} = util::SafeLoadAs<uint64_t>(in);".format(k))
+        print("  w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k))
+        print("  in += 8;".format(k))
+    else:
+        print("  uint64_t w{0} = util::SafeLoadAs<uint32_t>(in);".format(k))

Review comment:
       Ok, is this because DELTA_BINARY_PACKED outputs values in multiples of 
32? Slightly annoying :-/

##########
File path: cpp/src/arrow/util/bit_stream_utils.h
##########
@@ -418,14 +450,59 @@ inline bool BitReader::GetVlqInt(uint32_t* v) {
 }
 
 inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
-  auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
-  return PutVlqInt((u_v << 1) ^ (u_v >> 31));
+  uint32_t u_v = ::arrow::util::SafeCopy<uint32_t>(v);
+  u_v = (u_v << 1) ^ static_cast<uint32_t>(v >> 31);
+  return PutVlqInt(u_v);
 }
 
 inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
   uint32_t u;
   if (!GetVlqInt(&u)) return false;
-  *v = ::arrow::util::SafeCopy<int32_t>((u >> 1) ^ (u << 31));
+  u = (u >> 1) ^ (~(u & 1) + 1);
+  *v = ::arrow::util::SafeCopy<int32_t>(u);
+  return true;
+}
+
+inline bool BitWriter::PutVlqInt(uint64_t v) {
+  bool result = true;
+  while ((v & 0xFFFFFFFFFFFFFF80ULL) != 0ULL) {
+    result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);

Review comment:
       Let's keep this code as-is.

##########
File path: cpp/src/arrow/util/bit_util_test.cc
##########
@@ -1939,24 +1939,77 @@ TEST(BitUtil, RoundUpToPowerOf2) {
 #undef U64
 #undef S64
 
-static void TestZigZag(int32_t v) {
+static void TestZigZag(int32_t v, uint8_t* buffer_expect) {

Review comment:
       Can pass a `std::array<uint8_t, 5>` to avoid having to declare the 
buffers out of line.

##########
File path: cpp/src/arrow/util/bit_util_test.cc
##########
@@ -1939,24 +1939,77 @@ TEST(BitUtil, RoundUpToPowerOf2) {
 #undef U64
 #undef S64
 
-static void TestZigZag(int32_t v) {
+static void TestZigZag(int32_t v, uint8_t* buffer_expect) {
   uint8_t buffer[BitUtil::BitReader::kMaxVlqByteLength] = {};
   BitUtil::BitWriter writer(buffer, sizeof(buffer));
   BitUtil::BitReader reader(buffer, sizeof(buffer));
   writer.PutZigZagVlqInt(v);
+  EXPECT_EQ(buffer_expect[0], buffer[0]);
+  EXPECT_EQ(buffer_expect[1], buffer[1]);
+  EXPECT_EQ(buffer_expect[2], buffer[2]);
+  EXPECT_EQ(buffer_expect[3], buffer[3]);
+  EXPECT_EQ(buffer_expect[4], buffer[4]);
   int32_t result;
   EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
   EXPECT_EQ(v, result);
 }
 
 TEST(BitStreamUtil, ZigZag) {
-  TestZigZag(0);
-  TestZigZag(1);
-  TestZigZag(1234);
-  TestZigZag(-1);
-  TestZigZag(-1234);
-  TestZigZag(std::numeric_limits<int32_t>::max());
-  TestZigZag(-std::numeric_limits<int32_t>::max());
+  uint8_t buffer_expect0[5] = {0, 0, 0, 0, 0};
+  uint8_t buffer_expect1[5] = {2, 0, 0, 0, 0};
+  uint8_t buffer_expect2[5] = {164, 19, 0, 0, 0};
+  uint8_t buffer_expect3[5] = {1, 0, 0, 0, 0};
+  uint8_t buffer_expect4[5] = {163, 19, 0, 0, 0};
+  uint8_t buffer_expect5[5] = {254, 255, 255, 255, 15};
+  uint8_t buffer_expect6[5] = {253, 255, 255, 255, 15};
+  uint8_t buffer_expect7[5] = {255, 255, 255, 255, 15};
+  TestZigZag(0, buffer_expect0);

Review comment:
       This could be `TestZigZag(0, {0, 0, 0, 0, 0})` if `TestZigZag` took a 
`std::array` or `std::vector`.

##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -2107,73 +2105,99 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual 
public TypedDecoder<DTyp
   }
 
  private:
-  void InitBlock() {
-    // The number of values per block.
-    uint32_t block_size;
-    if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&num_mini_blocks_)) 
ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&values_current_block_)) {
+  void InitHeader() {
+    if (!decoder_.GetVlqInt(&values_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&mini_blocks_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&total_value_count_)) {
       ParquetException::EofException();
     }
     if (!decoder_.GetZigZagVlqInt(&last_value_)) 
ParquetException::EofException();
 
-    delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
-    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+    delta_bit_widths_ = AllocateBuffer(pool_, mini_blocks_per_block_);
+    values_per_mini_block_ = values_per_block_ / mini_blocks_per_block_;
+    if (values_per_mini_block_ % 8 != 0) {
+      throw ParquetException("miniBlockSize must be multiple of 8, but it's " +

Review comment:
       Why 8? According to the 
[spec](https://github.com/apache/parquet-format/blob/master/Encodings.md#delta-encoding-delta_binary_packed--5),
 it should be a multiple of 32.

##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -48,10 +48,9 @@
 #include "arrow/util/future.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/range.h"
-
+#include "gtest/gtest.h"

Review comment:
       Nit: can you keep the inclusion order as is?
   

##########
File path: cpp/src/arrow/util/bpacking64_codegen.py
##########
@@ -0,0 +1,131 @@
+#!/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script is modified from its original version in GitHub. Original source:
+# 
https://github.com/lemire/FrameOfReference/blob/146948b6058a976bc7767262ad3a2ce201486b93/scripts/turbopacking64.py
+
+# Usage:
+#   python bpacking64_codegen.py > bpacking64_default.h
+
+def howmany(bit):
+    """ how many values are we going to pack? """
+    return 32
+
+
+def howmanywords(bit):
+    return (howmany(bit) * bit + 63)//64
+
+
+def howmanybytes(bit):
+    return (howmany(bit) * bit + 7)//8
+
+
+print('''// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file was generated by script which is modified from its original 
version in GitHub.
+// Original source:
+// 
https://github.com/lemire/FrameOfReference/blob/master/scripts/turbopacking64.py
+// The original copyright notice follows.
+
+// This code is released under the
+// Apache License Version 2.0 http://www.apache.org/licenses/.
+// (c) Daniel Lemire 2013
+
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace internal {
+''')
+
+
+print("inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) {")
+print("  for(int k = 0; k < {0} ; k += 1) {{".format(howmany(0)))
+print("    out[k] = 0;")
+print("  }")
+print("  return in;")
+print("}")
+
+for bit in range(1, 65):
+    print("")
+    print(
+        "inline const uint8_t* unpack{0}_64(const uint8_t* in, uint64_t* out) 
{{".format(bit))
+
+    if(bit < 64):
+        print("  const uint64_t mask = {0}ULL;".format((1 << bit)-1))
+    maskstr = " & mask"
+    if (bit == 64):
+        maskstr = ""  # no need
+
+    for k in range(howmanywords(bit)-1):
+        print("  uint64_t w{0} = util::SafeLoadAs<uint64_t>(in);".format(k))
+        print("  w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k))
+        print("  in += 8;".format(k))
+    k = howmanywords(bit) - 1
+    if (bit % 2 == 0):
+        print("  uint64_t w{0} = util::SafeLoadAs<uint64_t>(in);".format(k))
+        print("  w{0} = arrow::BitUtil::FromLittleEndian(w{0});".format(k))
+        print("  in += 8;".format(k))
+    else:
+        print("  uint64_t w{0} = util::SafeLoadAs<uint32_t>(in);".format(k))

Review comment:
       I find this implementation weird. Why does it not unpack 64 entries at a 
time? I'm afraid this may make writing the SIMD implementations more difficult.

##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -2107,73 +2105,99 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual 
public TypedDecoder<DTyp
   }
 
  private:
-  void InitBlock() {
-    // The number of values per block.
-    uint32_t block_size;
-    if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&num_mini_blocks_)) 
ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&values_current_block_)) {
+  void InitHeader() {
+    if (!decoder_.GetVlqInt(&values_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&mini_blocks_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&total_value_count_)) {
       ParquetException::EofException();
     }
     if (!decoder_.GetZigZagVlqInt(&last_value_)) 
ParquetException::EofException();
 
-    delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
-    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+    delta_bit_widths_ = AllocateBuffer(pool_, mini_blocks_per_block_);
+    values_per_mini_block_ = values_per_block_ / mini_blocks_per_block_;
+    if (values_per_mini_block_ % 8 != 0) {
+      throw ParquetException("miniBlockSize must be multiple of 8, but it's " +
+                             std::to_string(values_per_mini_block_));
+    }
+
+    mini_block_idx_ = mini_blocks_per_block_;
+    values_current_mini_block_ = 0;
+  }
 
+  void InitBlock() {
     if (!decoder_.GetZigZagVlqInt(&min_delta_)) 
ParquetException::EofException();
-    for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
+
+    // readBitWidthsForMiniBlocks

Review comment:
       What does this comment mean? Would you like to write it in plain English?

##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -2107,73 +2105,99 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual 
public TypedDecoder<DTyp
   }
 
  private:
-  void InitBlock() {
-    // The number of values per block.
-    uint32_t block_size;
-    if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&num_mini_blocks_)) 
ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&values_current_block_)) {
+  void InitHeader() {
+    if (!decoder_.GetVlqInt(&values_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&mini_blocks_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&total_value_count_)) {
       ParquetException::EofException();
     }
     if (!decoder_.GetZigZagVlqInt(&last_value_)) 
ParquetException::EofException();
 
-    delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
-    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+    delta_bit_widths_ = AllocateBuffer(pool_, mini_blocks_per_block_);
+    values_per_mini_block_ = values_per_block_ / mini_blocks_per_block_;
+    if (values_per_mini_block_ % 8 != 0) {
+      throw ParquetException("miniBlockSize must be multiple of 8, but it's " +
+                             std::to_string(values_per_mini_block_));
+    }
+
+    mini_block_idx_ = mini_blocks_per_block_;
+    values_current_mini_block_ = 0;
+  }
 
+  void InitBlock() {
     if (!decoder_.GetZigZagVlqInt(&min_delta_)) 
ParquetException::EofException();
-    for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
+
+    // readBitWidthsForMiniBlocks
+    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+    for (uint32_t i = 0; i < mini_blocks_per_block_; ++i) {
       if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
         ParquetException::EofException();
       }
     }
-    values_per_mini_block_ = block_size / num_mini_blocks_;
     mini_block_idx_ = 0;
     delta_bit_width_ = bit_width_data[0];
     values_current_mini_block_ = values_per_mini_block_;
   }
 
-  template <typename T>
   int GetInternal(T* buffer, int max_values) {
     max_values = std::min(max_values, this->num_values_);
-    const uint8_t* bit_width_data = delta_bit_widths_->data();
-    for (int i = 0; i < max_values; ++i) {
+    DCHECK_LE(static_cast<uint32_t>(max_values), total_value_count_);
+    int i = 0;
+    while (i < max_values) {
       if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
         ++mini_block_idx_;
-        if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
-          delta_bit_width_ = bit_width_data[mini_block_idx_];
+        if (mini_block_idx_ < mini_blocks_per_block_) {
+          delta_bit_width_ = delta_bit_widths_->data()[mini_block_idx_];
           values_current_mini_block_ = values_per_mini_block_;
         } else {
+          // mini_block_idx_ > mini_blocks_per_block_ only if last_value_ is 
stored in
+          // header

Review comment:
       I don't understand this comment. `last_value_` is always stored in 
header, right?
   

##########
File path: cpp/src/parquet/encoding.cc
##########
@@ -2107,73 +2105,99 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual 
public TypedDecoder<DTyp
   }
 
  private:
-  void InitBlock() {
-    // The number of values per block.
-    uint32_t block_size;
-    if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&num_mini_blocks_)) 
ParquetException::EofException();
-    if (!decoder_.GetVlqInt(&values_current_block_)) {
+  void InitHeader() {
+    if (!decoder_.GetVlqInt(&values_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&mini_blocks_per_block_)) 
ParquetException::EofException();
+    if (!decoder_.GetVlqInt(&total_value_count_)) {
       ParquetException::EofException();
     }
     if (!decoder_.GetZigZagVlqInt(&last_value_)) 
ParquetException::EofException();
 
-    delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
-    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+    delta_bit_widths_ = AllocateBuffer(pool_, mini_blocks_per_block_);
+    values_per_mini_block_ = values_per_block_ / mini_blocks_per_block_;
+    if (values_per_mini_block_ % 8 != 0) {
+      throw ParquetException("miniBlockSize must be multiple of 8, but it's " +
+                             std::to_string(values_per_mini_block_));
+    }
+
+    mini_block_idx_ = mini_blocks_per_block_;
+    values_current_mini_block_ = 0;
+  }
 
+  void InitBlock() {
     if (!decoder_.GetZigZagVlqInt(&min_delta_)) 
ParquetException::EofException();
-    for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
+
+    // readBitWidthsForMiniBlocks
+    uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+    for (uint32_t i = 0; i < mini_blocks_per_block_; ++i) {
       if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
         ParquetException::EofException();
       }
     }
-    values_per_mini_block_ = block_size / num_mini_blocks_;
     mini_block_idx_ = 0;
     delta_bit_width_ = bit_width_data[0];
     values_current_mini_block_ = values_per_mini_block_;
   }
 
-  template <typename T>
   int GetInternal(T* buffer, int max_values) {
     max_values = std::min(max_values, this->num_values_);
-    const uint8_t* bit_width_data = delta_bit_widths_->data();
-    for (int i = 0; i < max_values; ++i) {
+    DCHECK_LE(static_cast<uint32_t>(max_values), total_value_count_);
+    int i = 0;
+    while (i < max_values) {
       if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
         ++mini_block_idx_;
-        if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
-          delta_bit_width_ = bit_width_data[mini_block_idx_];
+        if (mini_block_idx_ < mini_blocks_per_block_) {
+          delta_bit_width_ = delta_bit_widths_->data()[mini_block_idx_];
           values_current_mini_block_ = values_per_mini_block_;
         } else {
+          // mini_block_idx_ > mini_blocks_per_block_ only if last_value_ is 
stored in
+          // header
+          if (ARROW_PREDICT_FALSE(mini_block_idx_ > mini_blocks_per_block_)) {

Review comment:
       Instead of this cryptic condition, can you add a boolean member 
indicating that decoding hasn't started yet?

##########
File path: cpp/src/parquet/reader_test.cc
##########
@@ -18,8 +18,10 @@
 #include <fcntl.h>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+
 #include <cstdint>
 #include <cstdlib>
+#include <fstream>

Review comment:
       Hmm... nothing else changed in this file. Is this necessary?

##########
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##########
@@ -4056,5 +4055,32 @@ TEST(TestArrowWriteDictionaries, NestedSubfield) {
   ::arrow::AssertTablesEqual(*table, *actual);
 }
 
+TEST(TestArrowReadDeltaEncoding, DeltaBinaryPacked) {
+  auto file = test::get_data_file("delta_binary_packed.parquet");
+  auto expect_file = test::get_data_file("delta_binary_packed_expect.csv");
+  auto pool = ::arrow::default_memory_pool();
+  std::unique_ptr<FileReader> parquet_reader;
+  std::shared_ptr<::arrow::Table> table;
+  ASSERT_OK(
+      FileReader::Make(pool, ParquetFileReader::OpenFile(file, false), 
&parquet_reader));
+  ASSERT_OK(parquet_reader->ReadTable(&table));
+
+  ASSERT_OK_AND_ASSIGN(auto input_file, 
::arrow::io::ReadableFile::Open(expect_file));
+  auto convert_options = ::arrow::csv::ConvertOptions::Defaults();

Review comment:
       The CSV reader is only available if Arrow is compiled with 
`-DARROW_CSV=ON`. You'll have to make this test conditional on this somehow.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to