kou commented on code in PR #14147:
URL: https://github.com/apache/arrow/pull/14147#discussion_r975997169
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,81 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ auto decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
+
+ for (int i = 0; i < max_values; ++i) {
+ if (!decoder_->Get(&val)) {
+ throw ParquetException("Unable to parse bits for position (0 based) :
" +
+ std::to_string(i) + " (corrupt data page?)");
+ }
+ if (val) {
+ buffer[i] = true;
+ } else {
+ buffer[i] = false;
+ }
+ }
+ num_values_ -= max_values;
+ return max_values;
+ }
+
+ int Decode(uint8_t* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ if (decoder_->GetBatch(buffer, max_values) != max_values) {
Review Comment:
It seems that `bit_reader_->GetBatch(1, buffer, max_values)` with
`T=uint8_t` reads 1 bit of data and writes 8 bits of data to `buffer`.
See also:
*
https://github.com/apache/arrow/blob/8daa7a4ed5629c0020dadf7325a6b523bdfc62e9/cpp/src/arrow/util/bit_stream_utils.h#L337
*
https://github.com/apache/arrow/blob/8daa7a4ed5629c0020dadf7325a6b523bdfc62e9/cpp/src/arrow/util/bit_stream_utils.h#L279
I wanted to confirm this behavior with a debugger, but it seems that this
method isn't used in tests...
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,80 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > (uint32_t)(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ const uint8_t* decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
Review Comment:
Thanks.
I re-read the implementation and understand that `RleDecoder` reads 1 bit in
this case and writes the read value to `buffer` with a cast. (1 bit boolean value
-> `static_cast<int>(1)` for `true`/`static_cast<int>(0)` for `false` in this
case.) So we can use any type for `val`. But I think that `bool` is better than
`int` because we want to return `bool` values.
How about this?
```diff
diff --git a/cpp/src/arrow/util/endian.h b/cpp/src/arrow/util/endian.h
index f77077f809..d428287882 100644
--- a/cpp/src/arrow/util/endian.h
+++ b/cpp/src/arrow/util/endian.h
@@ -122,28 +122,28 @@ static inline void ByteSwap(void* dst, const void*
src, int len) {
#if ARROW_LITTLE_ENDIAN
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T ToBigEndian(T value) {
return ByteSwap(value);
}
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T ToLittleEndian(T value) {
return value;
}
#else
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T ToBigEndian(T value) {
return value;
}
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T ToLittleEndian(T value) {
return ByteSwap(value);
}
@@ -153,28 +153,28 @@ static inline T ToLittleEndian(T value) {
#if ARROW_LITTLE_ENDIAN
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T FromBigEndian(T value) {
return ByteSwap(value);
}
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T FromLittleEndian(T value) {
return value;
}
#else
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T FromBigEndian(T value) {
return value;
}
template <typename T, typename = internal::EnableIfIsOneOf<
T, int64_t, uint64_t, int32_t, uint32_t, int16_t,
uint16_t,
- uint8_t, int8_t, float, double>>
+ uint8_t, int8_t, float, double, bool>>
static inline T FromLittleEndian(T value) {
return ByteSwap(value);
}
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 98626c9791..b9257aab22 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2386,18 +2386,8 @@ class RleBooleanDecoder : public DecoderImpl, virtual
public BooleanDecoder {
int Decode(bool* buffer, int max_values) override {
max_values = std::min(max_values, num_values_);
- int val = 0;
-
- for (int i = 0; i < max_values; ++i) {
- if (!decoder_->Get(&val)) {
- throw ParquetException("Unable to parse bits for position (0 based)
: " +
- std::to_string(i) + " (corrupt data page?)");
- }
- if (val) {
- buffer[i] = true;
- } else {
- buffer[i] = false;
- }
+ if (decoder_->GetBatch(buffer, max_values) != max_values) {
+ ParquetException::EofException();
}
num_values_ -= max_values;
return max_values;
```
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2355,6 +2355,81 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
std::shared_ptr<ResizableBuffer> buffered_data_;
};
+// ----------------------------------------------------------------------
+// RLE_BOOLEAN_DECODER
+
+class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
+ public:
+ explicit RleBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::RLE) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ uint32_t num_bytes = 0;
+
+ if (len < 4) {
+ throw ParquetException("Received invalid length : " +
std::to_string(len) +
+ " (corrupt data page?)");
+ }
+ // Load the first 4 bytes in little-endian, which indicates the length
+ num_bytes =
+
::arrow::bit_util::ToLittleEndian(::arrow::util::SafeLoadAs<uint32_t>(data));
+ if (num_bytes < 0 || num_bytes > static_cast<uint32_t>(len - 4)) {
+ throw ParquetException("Received invalid number of bytes : " +
+ std::to_string(num_bytes) + " (corrupt data
page?)");
+ }
+
+ auto decoder_data = data + 4;
+ decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data,
num_bytes,
+ /*bit_width=*/1);
+ }
+
+ int Decode(bool* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ int val = 0;
+
+ for (int i = 0; i < max_values; ++i) {
+ if (!decoder_->Get(&val)) {
+ throw ParquetException("Unable to parse bits for position (0 based) :
" +
+ std::to_string(i) + " (corrupt data page?)");
+ }
+ if (val) {
+ buffer[i] = true;
+ } else {
+ buffer[i] = false;
+ }
+ }
+ num_values_ -= max_values;
+ return max_values;
+ }
+
+ int Decode(uint8_t* buffer, int max_values) override {
+ max_values = std::min(max_values, num_values_);
+ if (decoder_->GetBatch(buffer, max_values) != max_values) {
Review Comment:
I applied the following patch
```diff
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 98626c9791..f3e498ac9d 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2404,6 +2404,7 @@ class RleBooleanDecoder : public DecoderImpl, virtual
public BooleanDecoder {
}
int Decode(uint8_t* buffer, int max_values) override {
+ throw ParquetException("XXX");
max_values = std::min(max_values, num_values_);
if (decoder_->GetBatch(buffer, max_values) != max_values) {
ParquetException::EofException();
```
and ran `debug/parquet-reader-test` but no exception was raised:
<details>
```console
$ debug/parquet-reader-test
Running main() from ./googletest/src/gtest_main.cc
[==========] Running 82 tests from 27 test suites.
[----------] Global test environment set-up.
[----------] 10 tests from TestPrimitiveReader
[ RUN ] TestPrimitiveReader.TestInt32FlatRequired
[ OK ] TestPrimitiveReader.TestInt32FlatRequired (22 ms)
[ RUN ] TestPrimitiveReader.TestInt32FlatOptional
[ OK ] TestPrimitiveReader.TestInt32FlatOptional (5 ms)
[ RUN ] TestPrimitiveReader.TestInt32FlatRepeated
[ OK ] TestPrimitiveReader.TestInt32FlatRepeated (6 ms)
[ RUN ] TestPrimitiveReader.TestInt32FlatRequiredSkip
[ OK ] TestPrimitiveReader.TestInt32FlatRequiredSkip (0 ms)
[ RUN ] TestPrimitiveReader.TestReadValuesMissing
[ OK ] TestPrimitiveReader.TestReadValuesMissing (0 ms)
[ RUN ] TestPrimitiveReader.TestRepetitionLvlBytesWithMaxRepetitionZero
[ OK ] TestPrimitiveReader.TestRepetitionLvlBytesWithMaxRepetitionZero
(0 ms)
[ RUN ] TestPrimitiveReader.TestReadValuesMissingWithDictionary
[ OK ] TestPrimitiveReader.TestReadValuesMissingWithDictionary (0 ms)
[ RUN ] TestPrimitiveReader.TestDictionaryEncodedPages
[ OK ] TestPrimitiveReader.TestDictionaryEncodedPages (0 ms)
[ RUN ] TestPrimitiveReader.TestDictionaryEncodedPagesWithExposeEncoding
[ OK ]
TestPrimitiveReader.TestDictionaryEncodedPagesWithExposeEncoding (0 ms)
[ RUN ]
TestPrimitiveReader.TestNonDictionaryEncodedPagesWithExposeEncoding
[ OK ]
TestPrimitiveReader.TestNonDictionaryEncodedPagesWithExposeEncoding (0 ms)
[----------] 10 tests from TestPrimitiveReader (35 ms total)
[----------] 2 tests from TestColumnReader
[ RUN ] TestColumnReader.DefLevelsToBitmap
[ OK ] TestColumnReader.DefLevelsToBitmap (0 ms)
[ RUN ] TestColumnReader.DefLevelsToBitmapPowerOfTwo
[ OK ] TestColumnReader.DefLevelsToBitmapPowerOfTwo (0 ms)
[----------] 2 tests from TestColumnReader (0 ms total)
[----------] 1 test from GreaterThanBitmap
[ RUN ] GreaterThanBitmap.GeneratesExpectedBitmasks
[ OK ] GreaterThanBitmap.GeneratesExpectedBitmasks (0 ms)
[----------] 1 test from GreaterThanBitmap (0 ms total)
[----------] 1 test from DefLevelsToBitmap
[ RUN ] DefLevelsToBitmap.WithRepetitionLevelFiltersOutEmptyListValues
[ OK ] DefLevelsToBitmap.WithRepetitionLevelFiltersOutEmptyListValues
(0 ms)
[----------] 1 test from DefLevelsToBitmap (0 ms total)
[----------] 5 tests from NestedListTest/0, where TypeParam =
parquet::internal::RepDefLevelConverter<int>
[ RUN ] NestedListTest/0.OuterMostTest
[ OK ] NestedListTest/0.OuterMostTest (0 ms)
[ RUN ] NestedListTest/0.MiddleListTest
[ OK ] NestedListTest/0.MiddleListTest (0 ms)
[ RUN ] NestedListTest/0.InnerMostListTest
[ OK ] NestedListTest/0.InnerMostListTest (0 ms)
[ RUN ] NestedListTest/0.SimpleLongList
[ OK ] NestedListTest/0.SimpleLongList (0 ms)
[ RUN ] NestedListTest/0.TestOverflow
[ OK ] NestedListTest/0.TestOverflow (0 ms)
[----------] 5 tests from NestedListTest/0 (0 ms total)
[----------] 5 tests from NestedListTest/1, where TypeParam =
parquet::internal::RepDefLevelConverter<long>
[ RUN ] NestedListTest/1.OuterMostTest
[ OK ] NestedListTest/1.OuterMostTest (0 ms)
[ RUN ] NestedListTest/1.MiddleListTest
[ OK ] NestedListTest/1.MiddleListTest (0 ms)
[ RUN ] NestedListTest/1.InnerMostListTest
[ OK ] NestedListTest/1.InnerMostListTest (0 ms)
[ RUN ] NestedListTest/1.SimpleLongList
[ OK ] NestedListTest/1.SimpleLongList (0 ms)
[ RUN ] NestedListTest/1.TestOverflow
[ OK ] NestedListTest/1.TestOverflow (0 ms)
[----------] 5 tests from NestedListTest/1 (0 ms total)
[----------] 1 test from TestOnlyExtractBitsSoftware
[ RUN ] TestOnlyExtractBitsSoftware.BasicTest
[ OK ] TestOnlyExtractBitsSoftware.BasicTest (0 ms)
[----------] 1 test from TestOnlyExtractBitsSoftware (0 ms total)
[----------] 2 tests from TestFlatScanner/0, where TypeParam =
parquet::PhysicalType<(parquet::Type::type)1>
[ RUN ] TestFlatScanner/0.TestPlainScanner
[ OK ] TestFlatScanner/0.TestPlainScanner (2 ms)
[ RUN ] TestFlatScanner/0.TestDictScanner
[ OK ] TestFlatScanner/0.TestDictScanner (2 ms)
[----------] 2 tests from TestFlatScanner/0 (4 ms total)
[----------] 2 tests from TestFlatScanner/1, where TypeParam =
parquet::PhysicalType<(parquet::Type::type)2>
[ RUN ] TestFlatScanner/1.TestPlainScanner
[ OK ] TestFlatScanner/1.TestPlainScanner (2 ms)
[ RUN ] TestFlatScanner/1.TestDictScanner
[ OK ] TestFlatScanner/1.TestDictScanner (2 ms)
[----------] 2 tests from TestFlatScanner/1 (4 ms total)
[----------] 2 tests from TestFlatScanner/2, where TypeParam =
parquet::PhysicalType<(parquet::Type::type)3>
[ RUN ] TestFlatScanner/2.TestPlainScanner
[ OK ] TestFlatScanner/2.TestPlainScanner (2 ms)
[ RUN ] TestFlatScanner/2.TestDictScanner
[ OK ] TestFlatScanner/2.TestDictScanner (2 ms)
[----------] 2 tests from TestFlatScanner/2 (5 ms total)
[----------] 2 tests from TestFlatScanner/3, where TypeParam =
parquet::PhysicalType<(parquet::Type::type)4>
[ RUN ] TestFlatScanner/3.TestPlainScanner
[ OK ] TestFlatScanner/3.TestPlainScanner (2 ms)
[ RUN ] TestFlatScanner/3.TestDictScanner
[ OK ] TestFlatScanner/3.TestDictScanner (2 ms)
[----------] 2 tests from TestFlatScanner/3 (4 ms total)
[----------] 2 tests from TestFlatScanner/4, where TypeParam =
parquet::PhysicalType<(parquet::Type::type)5>
[ RUN ] TestFlatScanner/4.TestPlainScanner
[ OK ] TestFlatScanner/4.TestPlainScanner (2 ms)
[ RUN ] TestFlatScanner/4.TestDictScanner
[ OK ] TestFlatScanner/4.TestDictScanner (2 ms)
[----------] 2 tests from TestFlatScanner/4 (4 ms total)
[----------] 2 tests from TestFlatScanner/5, where TypeParam =
parquet::PhysicalType<(parquet::Type::type)6>
[ RUN ] TestFlatScanner/5.TestPlainScanner
[ OK ] TestFlatScanner/5.TestPlainScanner (2 ms)
[ RUN ] TestFlatScanner/5.TestDictScanner
[ OK ] TestFlatScanner/5.TestDictScanner (2 ms)
[----------] 2 tests from TestFlatScanner/5 (5 ms total)
[----------] 1 test from TestBooleanFlatScanner
[ RUN ] TestBooleanFlatScanner.TestPlainScanner
[ OK ] TestBooleanFlatScanner.TestPlainScanner (2 ms)
[----------] 1 test from TestBooleanFlatScanner (2 ms total)
[----------] 6 tests from TestFLBAFlatScanner
[ RUN ] TestFLBAFlatScanner.TestPlainScanner
[ OK ] TestFLBAFlatScanner.TestPlainScanner (2 ms)
[ RUN ] TestFLBAFlatScanner.TestDictScanner
[ OK ] TestFLBAFlatScanner.TestDictScanner (2 ms)
[ RUN ] TestFLBAFlatScanner.TestPlainDictScanner
[ OK ] TestFLBAFlatScanner.TestPlainDictScanner (2 ms)
[ RUN ] TestFLBAFlatScanner.TestSmallBatch
[ OK ] TestFLBAFlatScanner.TestSmallBatch (0 ms)
[ RUN ] TestFLBAFlatScanner.TestDescriptorAPI
[ OK ] TestFLBAFlatScanner.TestDescriptorAPI (0 ms)
[ RUN ] TestFLBAFlatScanner.TestFLBAPrinterNext
[ OK ] TestFLBAFlatScanner.TestFLBAPrinterNext (0 ms)
[----------] 6 tests from TestFLBAFlatScanner (9 ms total)
[----------] 2 tests from TestBooleanRLE
[ RUN ] TestBooleanRLE.TestBooleanScanner
[ OK ] TestBooleanRLE.TestBooleanScanner (0 ms)
[ RUN ] TestBooleanRLE.TestBatchRead
[ OK ] TestBooleanRLE.TestBatchRead (0 ms)
[----------] 2 tests from TestBooleanRLE (1 ms total)
[----------] 2 tests from TestTextDeltaLengthByteArray
[ RUN ] TestTextDeltaLengthByteArray.TestTextScanner
[ OK ] TestTextDeltaLengthByteArray.TestTextScanner (0 ms)
[ RUN ] TestTextDeltaLengthByteArray.TestBatchRead
[ OK ] TestTextDeltaLengthByteArray.TestBatchRead (0 ms)
[----------] 2 tests from TestTextDeltaLengthByteArray (1 ms total)
[----------] 9 tests from TestAllTypesPlain
[ RUN ] TestAllTypesPlain.NoopConstructDestruct
[ OK ] TestAllTypesPlain.NoopConstructDestruct (0 ms)
[ RUN ] TestAllTypesPlain.RowGroupMetaData
[ OK ] TestAllTypesPlain.RowGroupMetaData (0 ms)
[ RUN ] TestAllTypesPlain.TestBatchRead
[ OK ] TestAllTypesPlain.TestBatchRead (0 ms)
[ RUN ] TestAllTypesPlain.RowGroupColumnBoundchecking
[ OK ] TestAllTypesPlain.RowGroupColumnBoundchecking (0 ms)
[ RUN ] TestAllTypesPlain.TestFlatScannerInt32
[ OK ] TestAllTypesPlain.TestFlatScannerInt32 (0 ms)
[ RUN ] TestAllTypesPlain.TestSetScannerBatchSize
[ OK ] TestAllTypesPlain.TestSetScannerBatchSize (0 ms)
[ RUN ] TestAllTypesPlain.DebugPrintWorks
[ OK ] TestAllTypesPlain.DebugPrintWorks (0 ms)
[ RUN ] TestAllTypesPlain.ColumnSelection
[ OK ] TestAllTypesPlain.ColumnSelection (0 ms)
[ RUN ] TestAllTypesPlain.ColumnSelectionOutOfRange
[ OK ] TestAllTypesPlain.ColumnSelectionOutOfRange (0 ms)
[----------] 9 tests from TestAllTypesPlain (2 ms total)
[----------] 1 test from TestLocalFile
[ RUN ] TestLocalFile.OpenWithMetadata
[ OK ] TestLocalFile.OpenWithMetadata (1 ms)
[----------] 1 test from TestLocalFile (1 ms total)
[----------] 1 test from TestFileReaderAdHoc
[ RUN ] TestFileReaderAdHoc.NationDictTruncatedDataPage
[ OK ] TestFileReaderAdHoc.NationDictTruncatedDataPage (1 ms)
[----------] 1 test from TestFileReaderAdHoc (1 ms total)
[----------] 1 test from TestDumpWithLocalFile
[ RUN ] TestDumpWithLocalFile.DumpOutput
[ OK ] TestDumpWithLocalFile.DumpOutput (0 ms)
[----------] 1 test from TestDumpWithLocalFile (0 ms total)
[----------] 1 test from TestJSONWithLocalFile
[ RUN ] TestJSONWithLocalFile.JSONOutput
[ OK ] TestJSONWithLocalFile.JSONOutput (0 ms)
[----------] 1 test from TestJSONWithLocalFile (0 ms total)
[----------] 3 tests from TestFileReader
[ RUN ] TestFileReader.BufferedReadsWithDictionary
[ OK ] TestFileReader.BufferedReadsWithDictionary (2 ms)
[ RUN ] TestFileReader.BufferedReads
[ OK ] TestFileReader.BufferedReads (14 ms)
[ RUN ] TestFileReader.TestOpenErrors
[ OK ] TestFileReader.TestOpenErrors (1 ms)
[----------] 3 tests from TestFileReader (18 ms total)
[----------] 7 tests from TestStreamReader
[ RUN ] TestStreamReader.DefaultConstructed
[ OK ] TestStreamReader.DefaultConstructed (28 ms)
[ RUN ] TestStreamReader.TypeChecking
[ OK ] TestStreamReader.TypeChecking (28 ms)
[ RUN ] TestStreamReader.ValueChecking
[ OK ] TestStreamReader.ValueChecking (34 ms)
[ RUN ] TestStreamReader.ReadRequiredFieldAsOptionalField
[ OK ] TestStreamReader.ReadRequiredFieldAsOptionalField (35 ms)
[ RUN ] TestStreamReader.SkipRows
[ OK ] TestStreamReader.SkipRows (36 ms)
[ RUN ] TestStreamReader.SkipAllRows
[ OK ] TestStreamReader.SkipAllRows (27 ms)
[ RUN ] TestStreamReader.SkipColumns
[ OK ] TestStreamReader.SkipColumns (45 ms)
[----------] 7 tests from TestStreamReader (235 ms total)
[----------] 2 tests from TestOptionalFields
[ RUN ] TestOptionalFields.ValueChecking
[ OK ] TestOptionalFields.ValueChecking (41 ms)
[ RUN ] TestOptionalFields.ReadOptionalFieldAsRequiredField
[ OK ] TestOptionalFields.ReadOptionalFieldAsRequiredField (55 ms)
[----------] 2 tests from TestOptionalFields (97 ms total)
[----------] 3 tests from TestReadingDataFiles
[ RUN ] TestReadingDataFiles.AllTypesPlain
[ OK ] TestReadingDataFiles.AllTypesPlain (0 ms)
[ RUN ] TestReadingDataFiles.Int32Decimal
[ OK ] TestReadingDataFiles.Int32Decimal (0 ms)
[ RUN ] TestReadingDataFiles.Int64Decimal
[ OK ] TestReadingDataFiles.Int64Decimal (0 ms)
[----------] 3 tests from TestReadingDataFiles (1 ms total)
[----------] 6 tests from Lz4CodecTests/TestCodec
[ RUN ]
Lz4CodecTests/TestCodec.SmallFileMetadataAndValues/LegacyLZ4Hadoop
[ OK ]
Lz4CodecTests/TestCodec.SmallFileMetadataAndValues/LegacyLZ4Hadoop (0 ms)
[ RUN ]
Lz4CodecTests/TestCodec.SmallFileMetadataAndValues/LegacyLZ4NonHadoop
[ OK ]
Lz4CodecTests/TestCodec.SmallFileMetadataAndValues/LegacyLZ4NonHadoop (0 ms)
[ RUN ] Lz4CodecTests/TestCodec.SmallFileMetadataAndValues/LZ4Raw
[ OK ] Lz4CodecTests/TestCodec.SmallFileMetadataAndValues/LZ4Raw (0 ms)
[ RUN ] Lz4CodecTests/TestCodec.LargeFileValues/LegacyLZ4Hadoop
[ OK ] Lz4CodecTests/TestCodec.LargeFileValues/LegacyLZ4Hadoop (0 ms)
[ RUN ] Lz4CodecTests/TestCodec.LargeFileValues/LegacyLZ4NonHadoop
/home/kou/work/cpp/arrow.kou/cpp/src/parquet/reader_test.cc:950: Skipped
Larger data file not available for this codec
[ SKIPPED ] Lz4CodecTests/TestCodec.LargeFileValues/LegacyLZ4NonHadoop (0
ms)
[ RUN ] Lz4CodecTests/TestCodec.LargeFileValues/LZ4Raw
[ OK ] Lz4CodecTests/TestCodec.LargeFileValues/LZ4Raw (0 ms)
[----------] 6 tests from Lz4CodecTests/TestCodec (2 ms total)
[----------] Global test environment tear-down
[==========] 82 tests from 27 test suites ran. (440 ms total)
[ PASSED ] 81 tests.
[ SKIPPED ] 1 test, listed below:
[ SKIPPED ] Lz4CodecTests/TestCodec.LargeFileValues/LegacyLZ4NonHadoop
```
</details>
Which test uses this method?
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -127,6 +127,89 @@ void CheckRowGroupMetadata(const RowGroupMetaData*
rg_metadata,
}
}
+class TestBooleanRLE : public ::testing::Test {
+ public:
+ void SetUp() {
+ reader_ =
ParquetFileReader::OpenFile(data_file("rle_boolean_encoding.parquet"));
+ }
+
+ void TearDown() {}
+
+ protected:
+ std::unique_ptr<ParquetFileReader> reader_;
+};
+
+TEST_F(TestBooleanRLE, TestBooleanScanner) {
+ auto group = reader_->RowGroup(0);
+
+ // column 0, id
+ auto scanner = std::make_shared<BoolScanner>(group->Column(0));
+
+ bool val = false;
+ bool is_null = false;
+ for (int i = 0; i < 8; i++) {
+ ASSERT_TRUE(scanner->HasNext());
+ ASSERT_TRUE(scanner->NextValue(&val, &is_null));
+
+ // For this file, 3rd index value is null
+ if (i == 2) {
+ ASSERT_TRUE(is_null);
+ } else {
+ ASSERT_FALSE(is_null);
+ }
+ }
+
+ ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->NextValue(&val, &is_null));
+}
+
+TEST_F(TestBooleanRLE, TestBatchRead) {
+ auto group = reader_->RowGroup(0);
+
+ // column 0, id
+ auto col = std::dynamic_pointer_cast<BoolReader>(group->Column(0));
+
+ // This file only has 8 rows
+ ASSERT_EQ(8, reader_->metadata()->num_rows());
+ // This file only has 1 row group
+ ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 106 bytes
+ ASSERT_EQ(106, reader_->metadata()->size());
+ // This row group must have 8 rows
+ ASSERT_EQ(8, group->metadata()->num_rows());
+
+ // Check if the column is encoded with RLE
+ auto col_chunk = group->metadata()->ColumnChunk(0);
+ ASSERT_TRUE(std::find(col_chunk->encodings().begin(),
col_chunk->encodings().end(),
+ Encoding::RLE) != col_chunk->encodings().end());
+
+ // Assert column has values to be read
+ ASSERT_TRUE(col->HasNext());
+ int64_t curr_batch_read = 0;
+
+ const int16_t batch_size = 8;
+ int16_t def_levels[batch_size];
+ int16_t rep_levels[batch_size];
+ bool values[batch_size - 1];
+
+ auto levels_read =
+ col->ReadBatch(batch_size, def_levels, rep_levels, values,
&curr_batch_read);
+ ASSERT_EQ(batch_size, levels_read);
+
+ // Since one value is a null value, expect batches read to be one less than
indicated
+ // batch_size
+ ASSERT_EQ(batch_size - 1, curr_batch_read);
+
+ // 3rd index is null value
+ ASSERT_THAT(def_levels, testing::ElementsAre(1, 1, 0, 1, 1, 1, 1, 1));
+
+ // Validate inserted data is as expected
+ ASSERT_THAT(values, testing::ElementsAre(1, 0, 1, 1, 0, 0, 1));
Review Comment:
I used `gdb` and watched `RleDecoder::NextCounts()`. All calls use the
`is_literal` clause:
https://github.com/apache/arrow/blob/8daa7a4ed5629c0020dadf7325a6b523bdfc62e9/cpp/src/arrow/util/rle_encoding.h#L653-L656
It seems that the test input has only the `<bit-packed-run>` case.
Do all boolean values (1/0) use `<bit-packed-run>` because they can be
represented with 1 bit?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]