This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6d5ebe0dd4c [fix](iceberg)fix iceberg rewrite_data_file fail when
table had been updated. (#61112)
6d5ebe0dd4c is described below
commit 6d5ebe0dd4c27a03fddc122682887041926ab558
Author: daidai <[email protected]>
AuthorDate: Mon Mar 9 12:36:57 2026 +0800
[fix](iceberg)fix iceberg rewrite_data_file fail when table had been
updated. (#61112)
### What problem does this PR solve?
Related PR: #56413
Problem Summary:
Fix Iceberg `rewrite_data_files` failing when the table has been updated.
---
.../create_preinstalled_scripts/iceberg/run27.sql | 40 ++++++++++++++
.../datasource/iceberg/IcebergTransaction.java | 6 +++
.../action/test_iceberg_rewrite_data_files.out | 18 +++++++
.../action/test_iceberg_rewrite_data_files.groovy | 63 ++++++++++++++++++++++
4 files changed, 127 insertions(+)
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
new file mode 100644
index 00000000000..7c364fa7971
--- /dev/null
+++
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
@@ -0,0 +1,40 @@
+use demo.test_db;
+
+
+create table if not exists test_rewrite_data_with_update (
+ id INT,
+ name STRING
+)
+USING iceberg
+TBLPROPERTIES (
+ 'format-version' = '2',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+
+INSERT INTO test_rewrite_data_with_update VALUES
+(1, 'a'),(2, 'b'),(3, 'c');
+
+update test_rewrite_data_with_update set name = "bb" where id = 1;
+
+
+
+create table if not exists test_rewrite_data_with_delete (
+ id INT,
+ name STRING
+)
+USING iceberg
+TBLPROPERTIES (
+ 'format-version' = '2',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+
+INSERT INTO test_rewrite_data_with_delete VALUES
+(1, 'a'),(2, 'b'),(3, 'c');
+
+delete from test_rewrite_data_with_delete where id = 1;
\ No newline at end of file
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
index c6473ca64dd..e6a59fd78bc 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
@@ -73,6 +73,7 @@ public class IcebergTransaction implements Transaction {
private String branchName;
// Rewrite operation support
+ long startingSnapshotId = -1L; // Track the starting snapshot ID for
rewrite operations
private final List<DataFile> filesToDelete = Lists.newArrayList();
private final List<DataFile> filesToAdd = Lists.newArrayList();
private boolean isRewriteMode = false;
@@ -133,6 +134,9 @@ public class IcebergTransaction implements Transaction {
// create and start the iceberg transaction
this.table = IcebergUtils.getIcebergTable(dorisTable);
+ // Capture the starting snapshot ID for validation during
rewrite commit
+ this.startingSnapshotId = table.currentSnapshot().snapshotId();
+
// For rewrite operations, we work directly on the main table
// No branch information needed
this.transaction = table.newTransaction();
@@ -198,6 +202,8 @@ public class IcebergTransaction implements Transaction {
RewriteFiles rewriteFiles = transaction.newRewrite();
+ rewriteFiles = rewriteFiles.validateFromSnapshot(startingSnapshotId);
+
// For rewrite operations, we work directly on the main table
rewriteFiles =
rewriteFiles.scanManifestsWith(ops.getThreadPoolWithPreAuth());
diff --git
a/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
b/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
index 0c449ed9b44..6110e2e5aae 100644
---
a/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
+++
b/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
@@ -68,3 +68,21 @@
8 EAST 280.00 2024-01-02
9 EAST 380.00 2024-01-02
+-- !before_rewrite_update --
+1 bb
+2 b
+3 c
+
+-- !after_rewrite_update --
+1 bb
+2 b
+3 c
+
+-- !before_rewrite_delete --
+2 b
+3 c
+
+-- !after_rewrite_delete --
+2 b
+3 c
+
diff --git
a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
index 67433722c77..623fdd16bc3 100644
---
a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
+++
b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
@@ -493,4 +493,67 @@ suite("test_iceberg_rewrite_data_files", "p0,external") {
logger.info("Specific partition rewrite test completed successfully")
+ //
=====================================================================================
+ // Test Case 4: Rewrite data files for merge-on-read update table
+ //
+ // Tables `test_rewrite_data_with_update` and
`test_rewrite_data_with_delete`
+ // are pre-created in Docker initialization (run27.sql) using Spark SQL
with
+ // format-version = 2 and merge-on-read enabled for delete/update/merge.
+ //
+ // This case verifies that executing rewrite_data_files on a table that has
+ // already performed UPDATE operations (implemented via delete + insert)
does
+ // not change the logical query results.
+ //
=====================================================================================
+ logger.info("Starting rewrite_data_files test for merge-on-read UPDATE
table")
+
+ def table_name_update = "test_rewrite_data_with_update"
+
+ // Verify data before rewrite: id = 1 should have been updated to 'bb'
+ qt_before_rewrite_update """SELECT id, name FROM ${table_name_update}
ORDER BY id"""
+
+ def rewriteResultUpdate = sql """
+ ALTER TABLE ${catalog_name}.${db_name}.${table_name_update}
+ EXECUTE rewrite_data_files(
+ "target-file-size-bytes" = "10485760",
+ "min-input-files" = "1"
+ )
+ """
+ logger.info("Rewrite data files result for update table:
${rewriteResultUpdate}")
+
+ // Verify data after rewrite (logical rows should remain the same)
+ qt_after_rewrite_update """SELECT id, name FROM ${table_name_update} ORDER
BY id"""
+
+ def totalUpdateRecords = sql """SELECT COUNT(*) FROM
${table_name_update}"""
+ assertTrue(totalUpdateRecords[0][0] == 3, "Update table should still have
3 logical records after rewrite")
+
+ //
=====================================================================================
+ // Test Case 5: Rewrite data files for merge-on-read delete table
+ //
+ // This case verifies that executing rewrite_data_files on a table that has
+ // already performed DELETE operations does not resurrect deleted rows and
+ // keeps the logical result set unchanged.
+ //
=====================================================================================
+ logger.info("Starting rewrite_data_files test for merge-on-read DELETE
table")
+
+ def table_name_delete = "test_rewrite_data_with_delete"
+
+ // Verify data before rewrite: row with id = 1 should have been deleted
+ qt_before_rewrite_delete """SELECT id, name FROM ${table_name_delete}
ORDER BY id"""
+
+ def rewriteResultDelete = sql """
+ ALTER TABLE ${catalog_name}.${db_name}.${table_name_delete}
+ EXECUTE rewrite_data_files(
+ "target-file-size-bytes" = "10485760",
+ "min-input-files" = "1"
+ )
+ """
+ logger.info("Rewrite data files result for delete table:
${rewriteResultDelete}")
+
+ // Verify data after rewrite (deleted rows should not reappear)
+ qt_after_rewrite_delete """SELECT id, name FROM ${table_name_delete} ORDER
BY id"""
+
+ def totalDeleteRecords = sql """SELECT COUNT(*) FROM
${table_name_delete}"""
+ assertTrue(totalDeleteRecords[0][0] == 2, "Delete table should still have
2 logical records after rewrite")
+
+ logger.info("Merge-on-read update/delete rewrite_data_files tests
completed successfully")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]