This is an automated email from the ASF dual-hosted git repository.

suxiaogang223 pushed a commit to branch refact_reader_branch
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/refact_reader_branch by this push:
     new 97dc0f1ed3b [fix](be) Decode parquet dictionary page directly
97dc0f1ed3b is described below

commit 97dc0f1ed3b224f35eaa5297b967678d987bf2c2
Author: Socrates <[email protected]>
AuthorDate: Fri May 29 10:06:01 2026 +0800

    [fix](be) Decode parquet dictionary page directly
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary: Avoid relying on the unavailable Arrow Parquet
    ColumnReader::ReadDictionary API by reading the dictionary page directly
    and decoding PLAIN-encoded byte array and fixed-length byte array
    dictionaries for row group pruning.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test: Manual test
    
        - Ran build-support/clang-format.sh on parquet_statistics.cpp.
    
        - Ran git diff --check.
    
        - Re-ran the Fedora DEBUG BE build after this fix.
    
    - Behavior changed: No
    
    - Does this need documentation: No
---
 be/src/format/new_parquet/parquet_statistics.cpp | 43 +++++++++++++++++-------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/be/src/format/new_parquet/parquet_statistics.cpp 
b/be/src/format/new_parquet/parquet_statistics.cpp
index b7a4ad9b096..17c4e4911f5 100644
--- a/be/src/format/new_parquet/parquet_statistics.cpp
+++ b/be/src/format/new_parquet/parquet_statistics.cpp
@@ -19,7 +19,8 @@
 
 #include <parquet/api/reader.h>
 #include <parquet/api/schema.h>
-#include <parquet/column_reader.h>
+#include <parquet/column_page.h>
+#include <parquet/encoding.h>
 #include <parquet/statistics.h>
 #include <parquet/types.h>
 
@@ -219,28 +220,39 @@ bool read_dictionary_words(::parquet::ParquetFileReader* 
file_reader, int row_gr
     if (page_reader == nullptr) {
         return false;
     }
-    auto column_reader =
-            ::parquet::ColumnReader::Make(column_schema.descriptor, 
std::move(page_reader));
-    if (column_reader == nullptr) {
-        return false;
-    }
 
-    int32_t dictionary_length = 0;
-    const void* dictionary = nullptr;
+    std::shared_ptr<::parquet::Page> page;
     try {
-        dictionary = column_reader->ReadDictionary(&dictionary_length);
+        page = page_reader->NextPage();
     } catch (const ::parquet::ParquetException&) {
         return false;
     } catch (const std::exception&) {
         return false;
     }
-    if (dictionary == nullptr || dictionary_length <= 0) {
+    if (page == nullptr || page->type() != 
::parquet::PageType::DICTIONARY_PAGE) {
+        return false;
+    }
+    const auto* dictionary_page = static_cast<const 
::parquet::DictionaryPage*>(page.get());
+    if (dictionary_page->encoding() != ::parquet::Encoding::PLAIN &&
+        dictionary_page->encoding() != ::parquet::Encoding::PLAIN_DICTIONARY) {
+        return false;
+    }
+    const int32_t dictionary_length = dictionary_page->num_values();
+    if (dictionary_length <= 0) {
         return false;
     }
+    const auto* dictionary_data = dictionary_page->data();
+    const int dictionary_size = dictionary_page->size();
 
     dict_words->values.reserve(static_cast<size_t>(dictionary_length));
     if (column_schema.descriptor->physical_type() == 
::parquet::Type::BYTE_ARRAY) {
-        const auto* byte_array_values = reinterpret_cast<const 
::parquet::ByteArray*>(dictionary);
+        auto decoder = ::parquet::MakeTypedDecoder<::parquet::ByteArrayType>(
+                ::parquet::Encoding::PLAIN, column_schema.descriptor);
+        decoder->SetData(dictionary_length, dictionary_data, dictionary_size);
+        std::vector<::parquet::ByteArray> 
byte_array_values(static_cast<size_t>(dictionary_length));
+        if (decoder->Decode(byte_array_values.data(), dictionary_length) != 
dictionary_length) {
+            return false;
+        }
         for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) {
             dict_words->values.emplace_back(
                     reinterpret_cast<const 
char*>(byte_array_values[dict_idx].ptr),
@@ -254,7 +266,14 @@ bool read_dictionary_words(::parquet::ParquetFileReader* 
file_reader, int row_gr
         if (type_length <= 0) {
             return false;
         }
-        const auto* flba_values = reinterpret_cast<const 
::parquet::FixedLenByteArray*>(dictionary);
+        auto decoder = 
::parquet::MakeTypedDecoder<::parquet::FLBAType>(::parquet::Encoding::PLAIN,
+                                                                        
column_schema.descriptor);
+        decoder->SetData(dictionary_length, dictionary_data, dictionary_size);
+        std::vector<::parquet::FixedLenByteArray> flba_values(
+                static_cast<size_t>(dictionary_length));
+        if (decoder->Decode(flba_values.data(), dictionary_length) != 
dictionary_length) {
+            return false;
+        }
         for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) {
             dict_words->values.emplace_back(
                     reinterpret_cast<const char*>(flba_values[dict_idx].ptr), 
type_length);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to