ashishkumar50 commented on code in PR #10306:
URL: https://github.com/apache/ozone/pull/10306#discussion_r3332694982
##########
hadoop-ozone/iceberg/src/main/java/org/apache/hadoop/ozone/iceberg/RewriteTablePathOzoneAction.java:
##########
@@ -675,4 +695,188 @@ private static RewriteResult<DeleteFile>
writeDeleteManifest(
throw new RuntimeIOException(e);
}
}
+
+ static class OzonePositionDeleteReaderWriter implements
RewriteTablePathUtil.PositionDeleteReaderWriter {
+ @Override
+ public CloseableIterable<Record> reader(
+ InputFile inputFile, FileFormat format, PartitionSpec spec) {
+ return positionDeletesReader(inputFile, format, spec);
+ }
+
+ @Override
+ public PositionDeleteWriter<Record> writer(
+ OutputFile outputFile,
+ FileFormat format,
+ PartitionSpec spec,
+ StructLike partition,
+ Schema rowSchema)
+ throws IOException {
+ return positionDeletesWriter(outputFile, format, spec, partition,
rowSchema);
+ }
+ }
+
+ private void rewritePositionDeletes(Set<DeleteFile> toRewrite) {
+ /*
+ * NOTE: Rewriting position delete files updates embedded data file paths,
which changes the
+ * resulting file size. This causes a metadata mismatch in the manifests:
+ *
+ * 1. Dependency: Manifests MUST be rewritten first because they are the
source of truth used to identify which
+ * position delete files exist and need processing.
+ * 2. Issue: Because manifests are written before the delete files are
updated, the'file_size_in_bytes' field
+ * in the manifest reflects the original size, not the new size.
+ * 3. Impact: Some catalogs (e.g., REST catalogs like Polaris) will fail
to read these files as the reader uses
+ * the stale size from the manifest.
+ *
+ * This is a known Iceberg limitation being addressed by the Iceberg
community. Once that fix is available
Review Comment:
You can add this limitation in ozone-iceberg doc as well.
##########
hadoop-ozone/iceberg/src/main/java/org/apache/hadoop/ozone/iceberg/RewriteTablePathOzoneAction.java:
##########
@@ -675,4 +695,188 @@ private static RewriteResult<DeleteFile>
writeDeleteManifest(
throw new RuntimeIOException(e);
}
}
+
+ static class OzonePositionDeleteReaderWriter implements
RewriteTablePathUtil.PositionDeleteReaderWriter {
+ @Override
+ public CloseableIterable<Record> reader(
+ InputFile inputFile, FileFormat format, PartitionSpec spec) {
+ return positionDeletesReader(inputFile, format, spec);
+ }
+
+ @Override
+ public PositionDeleteWriter<Record> writer(
+ OutputFile outputFile,
+ FileFormat format,
+ PartitionSpec spec,
+ StructLike partition,
+ Schema rowSchema)
+ throws IOException {
+ return positionDeletesWriter(outputFile, format, spec, partition,
rowSchema);
+ }
+ }
+
+ private void rewritePositionDeletes(Set<DeleteFile> toRewrite) {
+ /*
+ * NOTE: Rewriting position delete files updates embedded data file paths,
which changes the
+ * resulting file size. This causes a metadata mismatch in the manifests:
+ *
+ * 1. Dependency: Manifests MUST be rewritten first because they are the
source of truth used to identify which
+ * position delete files exist and need processing.
+ * 2. Issue: Because manifests are written before the delete files are
updated, the'file_size_in_bytes' field
Review Comment:
nit: the 'file_size_in_bytes'
##########
hadoop-ozone/iceberg/src/test/java/org/apache/hadoop/ozone/iceberg/TestRewriteTablePathOzoneAction.java:
##########
@@ -366,6 +393,110 @@ void statsFileCopyPlanReturnsBeforeToAfterPathPairs() {
Pair.of("before-1.stats", "after-1.stats"),
Pair.of("before-2.stats", "after-2.stats")), copyPlan);
}
+
+ @Test
+ void rejectsTablesWithPartitionStatistics() {
+ TableMetadata baseMetadata = ((HasTableOperations)
table).operations().current();
+ long snapshotId = baseMetadata.currentSnapshot().snapshotId();
+ PartitionStatisticsFile statsFile =
Mockito.mock(PartitionStatisticsFile.class);
+ Mockito.when(statsFile.snapshotId()).thenReturn(snapshotId);
+ Mockito.when(statsFile.path()).thenReturn(sourcePrefix +
"/metadata/dummy.stats");
+ Mockito.when(statsFile.fileSizeInBytes()).thenReturn(100L);
+ TableMetadata metadataWithStats = TableMetadata.buildFrom(baseMetadata)
+ .setPartitionStatistics(statsFile)
+ .build();
+
+ TableOperations ops = ((HasTableOperations) table).operations();
+ ops.commit(baseMetadata, metadataWithStats);
+
+ RewriteTablePath action = new RewriteTablePathOzoneAction(table)
+ .rewriteLocationPrefix(sourcePrefix, targetPrefix)
+ .stagingLocation(stagingDir + "/");
+
+ IllegalArgumentException exception =
assertThrows(IllegalArgumentException.class, action::execute);
+ assertThat(exception).hasMessageContaining("Partition statistics files are
not supported yet.");
+ }
+
+ @Test
+ public void positionDeletesReaderUnsupportedFormat() {
+ InputFile mockInput = Mockito.mock(InputFile.class);
+ Mockito.when(mockInput.location()).thenReturn("s3://bucket/test.txt");
+ PartitionSpec spec = PartitionSpec.unpartitioned();
+ FileFormat mockUnsupportedFormat = Mockito.mock(FileFormat.class);
+ Mockito.when(mockUnsupportedFormat.toString()).thenReturn("txt");
+
+ UnsupportedOperationException exception =
assertThrows(UnsupportedOperationException.class,
+ () -> RewriteTablePathOzoneAction.positionDeletesReader(mockInput,
mockUnsupportedFormat, spec));
+
+ assertThat(exception).hasMessageContaining("Unsupported file format: txt");
+ }
+
+ @Test
+ public void positionDeletesWriterUnsupportedFormat() {
+ OutputFile mockOutput = Mockito.mock(OutputFile.class);
+ Mockito.when(mockOutput.location()).thenReturn("s3://bucket/test.txt");
+ PartitionSpec spec = PartitionSpec.unpartitioned();
+ FileFormat mockUnsupportedFormat = Mockito.mock(FileFormat.class);
+ Mockito.when(mockUnsupportedFormat.toString()).thenReturn("txt");
+
+ UnsupportedOperationException exception =
assertThrows(UnsupportedOperationException.class,
+ () -> RewriteTablePathOzoneAction.positionDeletesWriter(
+ mockOutput, mockUnsupportedFormat, spec, null, null));
+
+ assertThat(exception).hasMessageContaining("Unsupported file format: txt");
+ }
+
+ @ParameterizedTest
+ @EnumSource(value = FileFormat.class, names = {"AVRO", "ORC"})
Review Comment:
Add PARQUET as well?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]