Re: [PR] Core: Interface based DataFile reader and writer API [iceberg]

via GitHub Wed, 24 Sep 2025 22:28:21 -0700


stevenzwu commented on code in PR #12298:
URL: https://github.com/apache/iceberg/pull/12298#discussion_r2377014431



##########
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkAppenderFactory.java:
##########
@@ -55,12 +53,11 @@ public class FlinkAppenderFactory implements 
FileAppenderFactory<RowData>, Seria
   private final PartitionSpec spec;
   private final int[] equalityFieldIds;
   private final Schema eqDeleteRowSchema;
-  private final Schema posDeleteRowSchema;
   private final Table table;
 
   private RowType eqDeleteFlinkSchema = null;
-  private RowType posDeleteFlinkSchema = null;
 
+  @Deprecated

Review Comment:
   nit: add Javadoc stating the version in which this will be removed, like other deprecation notices



##########
core/src/main/java/org/apache/iceberg/deletes/EqualityDeleteWriter.java:
##########
@@ -52,12 +52,44 @@ public EqualityDeleteWriter(
       EncryptionKeyMetadata keyMetadata,
       SortOrder sortOrder,
       int... equalityFieldIds) {
+    this(
+        appender,
+        format,
+        location,
+        spec,
+        partition,
+        keyMetadata != null ? keyMetadata.buffer() : null,
+        sortOrder,
+        equalityFieldIds);
+  }
+
+  public EqualityDeleteWriter(EqualityDeleteWriter<T> wrapped) {
+    this(
+        wrapped.appender,
+        wrapped.format,
+        wrapped.location,
+        wrapped.spec,
+        wrapped.partition,
+        wrapped.keyMetadata,
+        wrapped.sortOrder,
+        wrapped.equalityFieldIds);
+  }
+
+  private EqualityDeleteWriter(

Review Comment:
   This refactoring doesn't seem to be required. It also doesn't seem to save 
any lines of code.



##########
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java:
##########
@@ -65,111 +54,26 @@ class FlinkFileWriterFactory extends 
BaseFileWriterFactory<RowData> implements S
     super(
         table,
         dataFileFormat,
+        RowData.class,
         dataSchema,
         dataSortOrder,
         deleteFileFormat,
         equalityFieldIds,
         equalityDeleteRowSchema,
         equalityDeleteSortOrder,
-        positionDeleteRowSchema);
-
-    this.dataFlinkType = dataFlinkType;
-    this.equalityDeleteFlinkType = equalityDeleteFlinkType;
-    this.positionDeleteFlinkType = positionDeleteFlinkType;
+        ImmutableMap.of(),
+        dataFlinkType == null ? FlinkSchemaUtil.convert(dataSchema) : 
dataFlinkType,
+        equalityDeleteFlinkType == null

Review Comment:
   nit: the nested ternary operator is a bit hard to read. Maybe change it to a 
small method with if-else code?



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java:
##########
@@ -293,4 +217,48 @@ SparkFileWriterFactory build() {
           writeProperties);
     }
   }
+
+  private static StructType calculateSparkTypeForDelete(StructType sparkType, 
Schema schema) {
+    if (sparkType != null) {
+      // The delete types need to have the correct metadata columns.
+      if (sparkType.fields().length < 3) {
+        return PATH_POS_TYPE;
+      } else {
+        StructField rowField =
+            
sparkType.fields()[sparkType.fieldIndex(MetadataColumns.DELETE_FILE_ROW_FIELD_NAME)];
+        return new StructType(
+            new StructField[] {
+              new StructField(
+                  MetadataColumns.DELETE_FILE_PATH.name(),
+                  DataTypes.StringType,
+                  false,
+                  Metadata.empty()),
+              new StructField(
+                  MetadataColumns.DELETE_FILE_POS.name(),
+                  DataTypes.LongType,
+                  false,
+                  Metadata.empty()),
+              new StructField(
+                  MetadataColumns.DELETE_FILE_ROW_FIELD_NAME,
+                  rowField.dataType(),
+                  false,
+                  Metadata.empty())
+            });
+      }
+    } else if (schema != null) {
+      return SparkSchemaUtil.convert(schema);
+    } else {
+      return null;
+    }
+  }
+
+  private static StructType calculateSparkType(StructType sparkType, Schema 
schema) {
+    if (sparkType != null) {
+      return sparkType;
+    } else if (schema != null) {
+      return SparkSchemaUtil.convert(schema);
+    } else {
+      return null;

Review Comment:
   This should throw an exception. One of the two args should be non-null. That 
will also match the old behavior.



##########
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java:
##########
@@ -293,4 +217,48 @@ SparkFileWriterFactory build() {
           writeProperties);
     }
   }
+
+  private static StructType calculateSparkTypeForDelete(StructType sparkType, 
Schema schema) {

Review Comment:
   this doesn't seem to be used.



##########
parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java:
##########
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.parquet;
+
+import java.util.Map;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.io.FormatModel;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.io.OutputFile;
+import org.apache.parquet.schema.MessageType;
+
+public class ParquetFormatModel<D, S, F> implements FormatModel<D, S> {
+  private final Class<D> type;
+  private final ReaderFunction<D> readerFunction;
+  private final BatchReaderFunction<D, F> batchReaderFunction;
+  private final WriterFunction<D, S> writerFunction;
+
+  private ParquetFormatModel(
+      Class<D> type,
+      ReaderFunction<D> readerFunction,
+      BatchReaderFunction<D, F> batchReaderFunction,
+      WriterFunction<D, S> writerFunction) {
+    this.type = type;
+    this.readerFunction = readerFunction;
+    this.batchReaderFunction = batchReaderFunction;
+    this.writerFunction = writerFunction;
+  }
+
+  public ParquetFormatModel(Class<D> type) {

Review Comment:
   Is there any value in providing this constructor where the reader and writer 
functions are null? Is the constructed object usable?



##########
core/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java:
##########
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.data;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Map;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SortOrder;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.deletes.EqualityDeleteWriter;
+import org.apache.iceberg.deletes.PositionDeleteWriter;
+import org.apache.iceberg.encryption.EncryptedOutputFile;
+import org.apache.iceberg.encryption.EncryptionKeyMetadata;
+import org.apache.iceberg.io.DataWriter;
+import org.apache.iceberg.io.FileWriterFactory;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+
+/**
+ * A base writer factory to be extended by query engine integrations.
+ *
+ * @param <T> type of the engine specific records

Review Comment:
   nit: simplify as `<T> row type`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Core: Interface based DataFile reader and writer API [iceberg]

Reply via email to