This is an automated email from the ASF dual-hosted git repository.
suxiaogang223 pushed a commit to branch refact_reader_branch
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/refact_reader_branch by this
push:
new 97dc0f1ed3b [fix](be) Decode parquet dictionary page directly
97dc0f1ed3b is described below
commit 97dc0f1ed3b224f35eaa5297b967678d987bf2c2
Author: Socrates <[email protected]>
AuthorDate: Fri May 29 10:06:01 2026 +0800
[fix](be) Decode parquet dictionary page directly
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary: Avoid relying on the Arrow Parquet
ColumnReader::ReadDictionary API, which is not available, by reading the
dictionary page directly and decoding PLAIN-encoded BYTE_ARRAY and
FIXED_LEN_BYTE_ARRAY dictionaries for row group pruning.
### Release note
None
### Check List (For Author)
- Test: Manual test
- Ran build-support/clang-format.sh on parquet_statistics.cpp.
- Ran git diff --check.
- Fedora DEBUG BE build is rerun after this fix.
- Behavior changed: No
- Does this need documentation: No
---
be/src/format/new_parquet/parquet_statistics.cpp | 43 +++++++++++++++++-------
1 file changed, 31 insertions(+), 12 deletions(-)
diff --git a/be/src/format/new_parquet/parquet_statistics.cpp
b/be/src/format/new_parquet/parquet_statistics.cpp
index b7a4ad9b096..17c4e4911f5 100644
--- a/be/src/format/new_parquet/parquet_statistics.cpp
+++ b/be/src/format/new_parquet/parquet_statistics.cpp
@@ -19,7 +19,8 @@
#include <parquet/api/reader.h>
#include <parquet/api/schema.h>
-#include <parquet/column_reader.h>
+#include <parquet/column_page.h>
+#include <parquet/encoding.h>
#include <parquet/statistics.h>
#include <parquet/types.h>
@@ -219,28 +220,39 @@ bool read_dictionary_words(::parquet::ParquetFileReader*
file_reader, int row_gr
if (page_reader == nullptr) {
return false;
}
- auto column_reader =
- ::parquet::ColumnReader::Make(column_schema.descriptor,
std::move(page_reader));
- if (column_reader == nullptr) {
- return false;
- }
- int32_t dictionary_length = 0;
- const void* dictionary = nullptr;
+ std::shared_ptr<::parquet::Page> page;
try {
- dictionary = column_reader->ReadDictionary(&dictionary_length);
+ page = page_reader->NextPage();
} catch (const ::parquet::ParquetException&) {
return false;
} catch (const std::exception&) {
return false;
}
- if (dictionary == nullptr || dictionary_length <= 0) {
+ if (page == nullptr || page->type() !=
::parquet::PageType::DICTIONARY_PAGE) {
+ return false;
+ }
+ const auto* dictionary_page = static_cast<const
::parquet::DictionaryPage*>(page.get());
+ if (dictionary_page->encoding() != ::parquet::Encoding::PLAIN &&
+ dictionary_page->encoding() != ::parquet::Encoding::PLAIN_DICTIONARY) {
+ return false;
+ }
+ const int32_t dictionary_length = dictionary_page->num_values();
+ if (dictionary_length <= 0) {
return false;
}
+ const auto* dictionary_data = dictionary_page->data();
+ const int dictionary_size = dictionary_page->size();
dict_words->values.reserve(static_cast<size_t>(dictionary_length));
if (column_schema.descriptor->physical_type() ==
::parquet::Type::BYTE_ARRAY) {
- const auto* byte_array_values = reinterpret_cast<const
::parquet::ByteArray*>(dictionary);
+ auto decoder = ::parquet::MakeTypedDecoder<::parquet::ByteArrayType>(
+ ::parquet::Encoding::PLAIN, column_schema.descriptor);
+ decoder->SetData(dictionary_length, dictionary_data, dictionary_size);
+ std::vector<::parquet::ByteArray>
byte_array_values(static_cast<size_t>(dictionary_length));
+ if (decoder->Decode(byte_array_values.data(), dictionary_length) !=
dictionary_length) {
+ return false;
+ }
for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) {
dict_words->values.emplace_back(
reinterpret_cast<const
char*>(byte_array_values[dict_idx].ptr),
@@ -254,7 +266,14 @@ bool read_dictionary_words(::parquet::ParquetFileReader*
file_reader, int row_gr
if (type_length <= 0) {
return false;
}
- const auto* flba_values = reinterpret_cast<const
::parquet::FixedLenByteArray*>(dictionary);
+ auto decoder =
::parquet::MakeTypedDecoder<::parquet::FLBAType>(::parquet::Encoding::PLAIN,
+
column_schema.descriptor);
+ decoder->SetData(dictionary_length, dictionary_data, dictionary_size);
+ std::vector<::parquet::FixedLenByteArray> flba_values(
+ static_cast<size_t>(dictionary_length));
+ if (decoder->Decode(flba_values.data(), dictionary_length) !=
dictionary_length) {
+ return false;
+ }
for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) {
dict_words->values.emplace_back(
reinterpret_cast<const char*>(flba_values[dict_idx].ptr),
type_length);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]