This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 706c716a51d branch-3.1: [fix](be) fix parquet file reader not updating page index when processing it #52228 (#52783)
706c716a51d is described below

commit 706c716a51df16a155c7d2f95afd952ce6683657
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Jul 5 20:23:44 2025 +0800

    branch-3.1: [fix](be) fix parquet file reader not updating page index when processing it #52228 (#52783)

    Cherry-picked from #52228

    Co-authored-by: SWEI <z_s...@outlook.com>
    Co-authored-by: zengsiwei <zengsi...@kuaishou.com>
    Co-authored-by: suxiaogang223 <suxiaogang...@icloud.com>
---
 be/src/common/config.cpp                           |   2 +-
 be/src/vec/exec/format/parquet/vparquet_reader.cpp |   2 +-
 .../parquet/small_2rowgroup.parquet                | Bin 0 -> 13059 bytes
 .../hive/test_hive_parquet_skip_page.out           | Bin 31833 -> 31867 bytes
 .../hive/test_hive_parquet_skip_page.groovy        |  12 +++++++++++-
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 1c9ed17700a..c603d7d8e7c 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1472,7 +1472,7 @@ DEFINE_mInt64(compaction_batch_size, "-1");
 // If set to false, the parquet reader will not use page index to filter data.
 // This is only for debug purpose, in case sometimes the page index
 // filter wrong data.
-DEFINE_mBool(enable_parquet_page_index, "false");
+DEFINE_mBool(enable_parquet_page_index, "true");
 
 DEFINE_mBool(ignore_not_found_file_in_external_table, "true");
 
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index be149991759..a38031a668f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -863,7 +863,7 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group,
             // use the union row range
             skipped_row_ranges.emplace_back(skipped_row_range);
         }
-        _col_offsets.emplace(parquet_col_id, offset_index);
+        _col_offsets[parquet_col_id] = offset_index;
     }
     if (skipped_row_ranges.empty()) {
         read_whole_row_group();
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet
new file mode 100644
index 00000000000..dcd05f5e28e
Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet differ
diff --git a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out
index 6c869dbc789..a8973479e42 100644
Binary files a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out and b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out differ
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
index ebdbedf139d..0bded7d820a 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
@@ -94,11 +94,21 @@ suite("test_hive_parquet_skip_page", "p0,external,hive,external_docker,external_
         return;
     }
 
+    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+    def hdfs_port = context.config.otherConfigs.get("hive2HdfsPort")
+    def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+    def hdfsUserName = "doris"
+    def uri = "${defaultFS}" + "/user/doris/preinstalled_data/parquet/small_2rowgroup.parquet"
+    qt_small_2rowgroup """ select * from HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",
+                "format" = "parquet") where a = 1024 or a = 4049
+                order by a;"""
+
     for (String hivePrefix : ["hive2", "hive3"]) {
         try {
             String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
             String catalog_name = "${hivePrefix}_test_parquet_skip_page"
-            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
            sql """drop catalog if exists ${catalog_name}"""
            sql """create catalog if not exists ${catalog_name} properties (
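A note on the one-line change in vparquet_reader.cpp: emplace on a std::map or
std::unordered_map does nothing when the key is already present, so once
_process_page_index had stored an offset index for a given parquet column id,
later row groups could not replace it and page-index filtering kept reading the
stale entry; plain subscript assignment always overwrites. Below is a minimal
standalone sketch of that difference (the int key and std::string value are
placeholders for illustration only, not the actual _col_offsets types in Doris):

    // Standalone illustration of emplace vs. operator[] assignment.
    // This is not Doris code; the container here is a stand-in for _col_offsets.
    #include <cassert>
    #include <map>
    #include <string>

    int main() {
        std::map<int, std::string> col_offsets;

        // First row group: the key is new, so emplace inserts it.
        col_offsets.emplace(0, "offset_index_rg0");

        // Second row group: emplace is a no-op because key 0 already exists,
        // so the value from the first row group is silently kept.
        col_offsets.emplace(0, "offset_index_rg1");
        assert(col_offsets[0] == "offset_index_rg0");

        // operator[] assignment (the form used in the fix) always overwrites.
        col_offsets[0] = "offset_index_rg1";
        assert(col_offsets[0] == "offset_index_rg1");
        return 0;
    }

With C++17, insert_or_assign(key, value) would give the same overwrite behavior
as the subscript assignment chosen in the fix.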
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org