nastra commented on code in PR #13245: URL: https://github.com/apache/iceberg/pull/13245#discussion_r2179246847
########## spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java: ########## @@ -538,6 +542,75 @@ public void testBinPackWithDeletes() throws IOException { assertThat(actualRecords).as("7 rows are removed").hasSize(total - 7); } + @TestTemplate + public void removeOrphanedDVsFromDeleteManifest() throws Exception { + assumeThat(formatVersion).isGreaterThanOrEqualTo(3); + Table table = createTable(); + int numDataFiles = 5; + // 100 / 5 = 20 records per data file + writeRecords(numDataFiles, 100); + int numPositionsToDelete = 10; + table.refresh(); + List<DataFile> dataFiles = TestHelpers.dataFiles(table); + assertThat(dataFiles).hasSize(numDataFiles); + + RowDelta rowDelta = table.newRowDelta(); + for (DataFile dataFile : dataFiles) { + writeDV(table, dataFile.partition(), dataFile.location(), numPositionsToDelete) + .forEach(rowDelta::addDeletes); + } + + rowDelta.commit(); + + Set<DeleteFile> deleteFiles = TestHelpers.deleteFiles(table); + assertThat(deleteFiles).hasSize(numDataFiles); + + for (ManifestFile manifestFile : table.currentSnapshot().deleteManifests(table.io())) { + Set<String> validDataFilePaths = + TestHelpers.dataFiles(table).stream() + .map(ContentFile::location) + .collect(Collectors.toSet()); + ManifestReader<DeleteFile> reader = + ManifestFiles.readDeleteManifest( + manifestFile, table.io(), ((BaseTable) table).operations().current().specsById()); + for (DeleteFile deleteFile : reader) { + // make sure there are no orphaned DVs + assertThat(validDataFilePaths).contains(deleteFile.referencedDataFile()); + } + } + + // all data files should be rewritten. Set MIN_INPUT_FILES > to the number of data files so that + // compaction is only triggered when the delete ratio of >= 30% is hit + RewriteDataFiles.Result result = + SparkActions.get(spark) + .rewriteDataFiles(table) + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "10") + .option(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "0") + .execute(); + + assertThat(result.rewrittenDataFilesCount()).isEqualTo(numDataFiles); + assertThat(result.removedDeleteFilesCount()).isEqualTo(numDataFiles); + + table.refresh(); + assertThat(TestHelpers.dataFiles(table)).hasSize(1); + assertThat(TestHelpers.deleteFiles(table)).isEmpty(); + + Set<String> validDataFilePaths = Review Comment: yes this is needed. Even if we assert that `deleteFiles` is empty, there could still be orphaned DVs in the delete manifests, which in fact was the case without propagating orphaned DVs -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org