This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new dd24c4102d5 branch-4.1:[fix](iceberg) Fill row-id hidden column for
format-version 1 tables. (#64068)
dd24c4102d5 is described below
commit dd24c4102d5f5ad9a66d1bdb44c3e4b1f37a610e
Author: daidai <[email protected]>
AuthorDate: Thu Jun 4 09:53:45 2026 +0800
branch-4.1:[fix](iceberg) Fill row-id hidden column for format-version 1
tables. (#64068)
### What problem does this PR solve?
Problem Summary:
case : `iceberg-v1-parquet` + `show_hidden_columns = true`
fix be core :
```
F20260602 12:14:52.396889 56230 vparquet_group_reader.cpp:787] Check
failed: block->rows() == col.column->size() block rows = 1 , column rows = 0,
col name = __DORIS_ICEBERG_ROWID_COL__
*** Check failure stack trace: ***
@ 0x55b231819706 google::LogMessageFatal::~LogMessageFatal()
@ 0x55b22ac6505d doris::RowGroupReader::_do_lazy_read()
@ 0x55b22ac515cc doris::RowGroupReader::next_batch()
@ 0x55b22abb951c doris::ParquetReader::get_next_block()
@ 0x55b22ab414da doris::IcebergTableReader::get_next_block_inner()
@ 0x55b22a1510a9 doris::TableFormatReader::get_next_block()
@ 0x55b22a0e40da doris::FileScanner::_get_block_wrapped()
@ 0x55b22a0e2997 doris::FileScanner::_get_block_impl()
@ 0x55b22a168696 doris::Scanner::get_block()
@ 0x55b22a1671f3 doris::Scanner::get_block_after_projects()
@ 0x55b22a6cdc73 doris::ScannerScheduler::_scanner_scan()
@ 0x55b22a6d6ab4
_ZNSt17_Function_handlerIFbvEZZN5doris16ScannerScheduler6submitESt10shared_ptrINS1_14ScannerContextEES3_INS1_8ScanTaskEEENK3$_0clEvEUlvE_E9_M_invokeERKSt9_Any_data
@ 0x55b22a6d6236 doris::ScannerSplitRunner::process_for()
@ 0x55b22a6c1a85 doris::PrioritizedSplitRunner::process()
@ 0x55b22a6898b5 doris::TimeSharingTaskExecutor::_dispatch_thread()
@ 0x55b22e3a4b72 doris::Thread::supervise_thread()
@ 0x55b216635d27 asan_thread_start()
@ 0x7f25a6654ac3 (unknown)
@ 0x7f25a66e68d0 (unknown)
@ (nil) (unknown)
```
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/format/table/iceberg_reader.cpp | 18 ++-
.../test_iceberg_v1_rowid_hidden_column.groovy | 175 +++++++++++++++++++++
2 files changed, 188 insertions(+), 5 deletions(-)
diff --git a/be/src/format/table/iceberg_reader.cpp
b/be/src/format/table/iceberg_reader.cpp
index 5e1ee2d6cb2..5b90cac648d 100644
--- a/be/src/format/table/iceberg_reader.cpp
+++ b/be/src/format/table/iceberg_reader.cpp
@@ -205,16 +205,19 @@ Status IcebergTableReader::init_row_filters() {
}
const auto& table_desc = _range.table_format_params.iceberg_params;
- const auto& version = table_desc.format_version;
- if (version < MIN_SUPPORT_DELETE_FILES_VERSION) {
- return Status::OK();
- }
auto* parquet_reader =
dynamic_cast<ParquetReader*>(_file_format_reader.get());
auto* orc_reader = dynamic_cast<OrcReader*>(_file_format_reader.get());
// Initialize file information for $row_id generation
- // Extract from table_desc which contains current file's metadata
+ // Extract from table_desc which contains current file's metadata.
+ // NOTE: row-id generation only needs the data file path / partition info
/ row positions,
+ // which are independent of delete-file support, so it MUST be set up
before the
+ // format-version gate below. The FE adds the hidden
__DORIS_ICEBERG_ROWID_COL__ column
+ // whenever show_hidden_columns is on, regardless of format version (see
+ // IcebergExternalTable.getFullSchema). If a v1 table selects this column
we still have to
+ // fill it; otherwise it stays empty while the other columns are filtered
down, tripping the
+ // `block->rows() == col.column->size()` check in
RowGroupReader::_do_lazy_read.
if (_need_row_id_column) {
std::string file_path = table_desc.original_file_path;
int32_t partition_spec_id = 0;
@@ -237,6 +240,11 @@ Status IcebergTableReader::init_row_filters() {
<< ", partition_spec_id: " << partition_spec_id;
}
+ const auto& version = table_desc.format_version;
+ if (version < MIN_SUPPORT_DELETE_FILES_VERSION) {
+ return Status::OK();
+ }
+
std::vector<TIcebergDeleteFileDesc> position_delete_files;
std::vector<TIcebergDeleteFileDesc> equality_delete_files;
std::vector<TIcebergDeleteFileDesc> deletion_vector_files;
diff --git
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_v1_rowid_hidden_column.groovy
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v1_rowid_hidden_column.groovy
new file mode 100644
index 00000000000..bc8774b3160
--- /dev/null
+++
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_v1_rowid_hidden_column.groovy
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression for a BE crash when reading the hidden Iceberg row-id column on a
+// format-version 1 table:
+//
+// F ... vparquet_group_reader.cpp:787 Check failed: block->rows() ==
col.column->size()
+// block rows = 1 , column rows = 0, col name =
__DORIS_ICEBERG_ROWID_COL__
+//
+// The FE adds the hidden __DORIS_ICEBERG_ROWID_COL__ column whenever
+// show_hidden_columns=true, regardless of the Iceberg format version
+// (IcebergExternalTable.getFullSchema). The BE, however, only enabled row-id
+// generation for format-version >= 2, because
IcebergTableReader::init_row_filters
+// set up the row-id params *after* the delete-files version gate
+// (version < MIN_SUPPORT_DELETE_FILES_VERSION returns early). On a v1 table a
+// SELECT * therefore left the row-id column empty while every other column was
+// filtered/filled to the surviving row count, tripping the consistency check
in
+// RowGroupReader::_do_lazy_read.
+//
+// The fix moves the row-id setup ahead of the version gate, so this test must
+// return correct results (and not crash the BE) for v1 tables.
+suite("test_iceberg_v1_rowid_hidden_column",
"p0,external,iceberg,external_docker,external_docker_iceberg") {
+ String enabled = context.config.otherConfigs.get("enableIcebergTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("Iceberg test is disabled")
+ return
+ }
+
+ String catalogName = "test_iceberg_v1_rowid_hidden_column"
+ String dbName = "test_iceberg_v1_rowid_db"
+ String restPort = context.config.otherConfigs.get("iceberg_rest_uri_port")
+ String minioPort = context.config.otherConfigs.get("iceberg_minio_port")
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ String endpoint = "http://${externalEnvIp}:${minioPort}"
+
+ def descColumns = { rows -> rows.collect { row ->
row[0].toString().toLowerCase() } }
+
+ sql """drop catalog if exists ${catalogName}"""
+ sql """
+ create catalog if not exists ${catalogName} properties (
+ "type" = "iceberg",
+ "iceberg.catalog.type" = "rest",
+ "uri" = "http://${externalEnvIp}:${restPort}",
+ "s3.access_key" = "admin",
+ "s3.secret_key" = "password",
+ "s3.endpoint" = "${endpoint}",
+ "s3.region" = "us-east-1"
+ )
+ """
+
+ sql """switch ${catalogName}"""
+ sql """create database if not exists ${dbName}"""
+ sql """use ${dbName}"""
+ sql """set enable_fallback_to_original_planner = false"""
+ sql """set show_hidden_columns = false"""
+
+ // Run the full set of repro queries against one v1 table (3 visible
columns: id, s, age).
+ def runChecks = { tbl, format ->
+ sql """
+ insert into ${tbl} values
+ (1, 'a', 25),
+ (2, 'a', 30),
+ (3, 'b', 35),
+ (4, 'c', 40)
+ """
+
+ // 1. Default visibility: the row-id hidden column is not exposed.
+ sql """set show_hidden_columns = false"""
+ def visible = sql """select * from ${tbl} order by id"""
+ assertEquals(4, visible.size())
+ assertEquals(3, visible[0].size())
+
+ // 2. show_hidden_columns=true exposes only
__DORIS_ICEBERG_ROWID_COL__ on a v1 table
+ // (v3 row lineage columns must NOT appear).
+ sql """set show_hidden_columns = true"""
+ def hiddenCols = descColumns(sql("""desc ${tbl}"""))
+ log.info("v1 desc with show_hidden_columns=true (${format}, ${tbl}) ->
${hiddenCols}")
+ assertTrue(hiddenCols.any { it.contains("doris_iceberg_rowid") },
+ "show_hidden_columns should expose __DORIS_ICEBERG_ROWID_COL__
for v1 table, got ${hiddenCols}")
+ assertTrue(!hiddenCols.contains("_row_id"),
+ "v1 table must not expose v3 row lineage column _row_id, got
${hiddenCols}")
+ assertTrue(!hiddenCols.contains("_last_updated_sequence_number"),
+ "v1 table must not expose v3 row lineage column
_last_updated_sequence_number, got ${hiddenCols}")
+
+ // 3. Crash repro: SELECT * with a predicate on the string column that
filters rows.
+ // Before the fix the BE aborted here (row-id column left at 0
rows).
+ def byString = sql """select * from ${tbl} where s = 'a' order by id"""
+ log.info("v1 SELECT * where s='a' (${format}, ${tbl}) -> ${byString}")
+ assertEquals(2, byString.size())
+ assertEquals(4, byString[0].size()) // id, s, age,
__DORIS_ICEBERG_ROWID_COL__
+ assertEquals(1, byString[0][0].toString().toInteger())
+ assertEquals(2, byString[1][0].toString().toInteger())
+ assertTrue(byString[0][byString[0].size() - 1] != null,
+ "row-id column must be populated, got null: ${byString[0]}")
+
+ // 4. Crash repro: predicate on a data column (exercises lazy
materialization, the path
+ // in the reported stack) with other lazy columns in the projection.
+ def byData = sql """select * from ${tbl} where id > 1 order by id"""
+ log.info("v1 SELECT * where id>1 (${format}, ${tbl}) -> ${byData}")
+ assertEquals(3, byData.size())
+ assertEquals(4, byData[0].size())
+ assertEquals([2, 3, 4], byData.collect { it[0].toString().toInteger()
})
+
+ // 5. Full scan with the hidden column also returns every row.
+ def all = sql """select * from ${tbl} order by id"""
+ assertEquals(4, all.size())
+ assertEquals(4, all[0].size())
+
+ sql """set show_hidden_columns = false"""
+ }
+
+ def formats = ["parquet", "orc"]
+ try {
+ formats.each { format ->
+ String unpartitioned = "test_iceberg_v1_rowid_${format}"
+ String partitioned = "test_iceberg_v1_rowid_part_${format}"
+ log.info("Run v1 row-id hidden-column test with format ${format}")
+ try {
+ // Unpartitioned v1 table: row-id with partition_spec_id = 0 /
empty partition data.
+ sql """drop table if exists ${unpartitioned}"""
+ sql """
+ create table ${unpartitioned} (
+ id int,
+ s string,
+ age int
+ ) engine=iceberg
+ properties (
+ "format-version" = "1",
+ "write.format.default" = "${format}"
+ )
+ """
+ runChecks(unpartitioned, format)
+
+ // Partitioned v1 table mirroring the original crash (a
Spark-written partitioned
+ // v1 table), so the row-id struct's partition_spec_id /
partition_data are exercised.
+ sql """drop table if exists ${partitioned}"""
+ sql """
+ create table ${partitioned} (
+ id int,
+ s string,
+ age int
+ ) engine=iceberg
+ partition by list (s) ()
+ properties (
+ "format-version" = "1",
+ "write.format.default" = "${format}"
+ )
+ """
+ runChecks(partitioned, format)
+ } finally {
+ sql """set show_hidden_columns = false"""
+ sql """drop table if exists ${partitioned}"""
+ sql """drop table if exists ${unpartitioned}"""
+ }
+ }
+ } finally {
+ sql """set show_hidden_columns = false"""
+ sql """drop database if exists ${dbName} force"""
+ sql """drop catalog if exists ${catalogName}"""
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]