This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 706c716a51d branch-3.1: [fix](be) fix parquet file reader not updating page index when processing it #52228 (#52783)
706c716a51d is described below

commit 706c716a51df16a155c7d2f95afd952ce6683657
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Jul 5 20:23:44 2025 +0800

    branch-3.1: [fix](be) fix parquet file reader not updating page index when processing it #52228 (#52783)
    
    Cherry-picked from #52228
    
    Co-authored-by: SWEI <z_s...@outlook.com>
    Co-authored-by: zengsiwei <zengsi...@kuaishou.com>
    Co-authored-by: suxiaogang223 <suxiaogang...@icloud.com>
---
 be/src/common/config.cpp                            |   2 +-
 be/src/vec/exec/format/parquet/vparquet_reader.cpp  |   2 +-
 .../parquet/small_2rowgroup.parquet                 | Bin 0 -> 13059 bytes
 .../hive/test_hive_parquet_skip_page.out            | Bin 31833 -> 31867 bytes
 .../hive/test_hive_parquet_skip_page.groovy         |  12 +++++++++++-
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 1c9ed17700a..c603d7d8e7c 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1472,7 +1472,7 @@ DEFINE_mInt64(compaction_batch_size, "-1");
 // If set to false, the parquet reader will not use page index to filter data.
 // This is only for debug purpose, in case sometimes the page index
 // filter wrong data.
-DEFINE_mBool(enable_parquet_page_index, "false");
+DEFINE_mBool(enable_parquet_page_index, "true");
 
 DEFINE_mBool(ignore_not_found_file_in_external_table, "true");
 
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index be149991759..a38031a668f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -863,7 +863,7 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group,
             // use the union row range
             skipped_row_ranges.emplace_back(skipped_row_range);
         }
-        _col_offsets.emplace(parquet_col_id, offset_index);
+        _col_offsets[parquet_col_id] = offset_index;
     }
     if (skipped_row_ranges.empty()) {
         read_whole_row_group();
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet
new file mode 100644
index 00000000000..dcd05f5e28e
Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet differ
diff --git a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out
index 6c869dbc789..a8973479e42 100644
Binary files a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out and b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out differ
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
index ebdbedf139d..0bded7d820a 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
@@ -94,11 +94,21 @@ suite("test_hive_parquet_skip_page", "p0,external,hive,external_docker,external_
         return;
     }
 
+    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+    def hdfs_port = context.config.otherConfigs.get("hive2HdfsPort")
+    def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+    def hdfsUserName = "doris"
+    def uri = "${defaultFS}" + "/user/doris/preinstalled_data/parquet/small_2rowgroup.parquet"
+    qt_small_2rowgroup """ select * from HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",
+                "format" = "parquet") where a = 1024 or a = 4049
+                order by a;"""
+
     for (String hivePrefix : ["hive2", "hive3"]) {
         try {
             String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
             String catalog_name = "${hivePrefix}_test_parquet_skip_page"
-            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
 
             sql """drop catalog if exists ${catalog_name}"""
             sql """create catalog if not exists ${catalog_name} properties (


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to