Re: [PR] Spark: Support writing shredded variant in Iceberg-Spark [iceberg]

via GitHub Sat, 04 Apr 2026 18:03:36 -0700


aihuaxu commented on code in PR #14297:
URL: https://github.com/apache/iceberg/pull/14297#discussion_r3036220924



##########
parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java:
##########
@@ -0,0 +1,523 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.parquet;
+
+import java.math.BigDecimal;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.variants.PhysicalType;
+import org.apache.iceberg.variants.VariantArray;
+import org.apache.iceberg.variants.VariantObject;
+import org.apache.iceberg.variants.VariantPrimitive;
+import org.apache.iceberg.variants.VariantValue;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.schema.Types;
+
+/**
+ * Analyzes variant data across buffered rows to determine an optimal 
shredding schema.
+ *
+ * <p>Determinism contract: for a given set of variant values (regardless of 
row arrival order),
+ * this analyzer produces the same shredded schema.
+ *
+ * <ul>
+ *   <li>Object fields use a TreeMap, so field ordering is alphabetical and 
deterministic.
+ *   <li>Type selection picks the most common type with explicit tie-break 
priority (see
+ *       TIE_BREAK_PRIORITY), not enum ordinal.
+ *   <li>Integer types (INT8/16/32/64) and decimal types (DECIMAL4/8/16) are 
each promoted to the
+ *       widest observed before competing with other types.
+ *   <li>Fields below {@code MIN_FIELD_FREQUENCY} are pruned. Above {@code 
MAX_SHREDDED_FIELDS}, the
+ *       most frequent are kept with alphabetical tie-breaking.
+ *   <li>Recursion into nested objects/arrays stops at {@code 
MAX_SHREDDING_DEPTH} (default 50).
+ *   <li>New struct fields are not tracked once a node reaches {@code 
MAX_INTERMEDIATE_FIELDS}
+ *       (default 1000) to bound memory during inference.
+ * </ul>
+ *
+ * <p>This contract holds within a single batch. Different batches with 
different distributions may
+ * produce different layouts; cross-batch stability requires schema pinning 
(not yet implemented).
+ *
+ * <p>Subclasses implement {@link #extractVariantValues} to convert 
engine-specific row types into
+ * {@link VariantValue} instances.
+ *
+ * @param <T> the engine-specific row type (e.g., Spark InternalRow, Flink 
RowData)
+ * @param <S> the engine-specific schema type (e.g., Spark StructType, Flink 
RowType)
+ */
+public abstract class VariantShreddingAnalyzer<T, S> {
+  private static final String TYPED_VALUE = "typed_value";
+  private static final String VALUE = "value";
+  private static final String ELEMENT = "element";
+  private static final double MIN_FIELD_FREQUENCY = 0.10;
+  private static final int MAX_SHREDDED_FIELDS = 300;
+  private static final int MAX_SHREDDING_DEPTH = 50;
+  private static final int MAX_INTERMEDIATE_FIELDS = 1000;
+
+  protected VariantShreddingAnalyzer() {}
+
+  /**
+   * Analyzes buffered variant values to determine the optimal shredding 
schema.
+   *
+   * @param bufferedRows the buffered rows to analyze
+   * @param variantFieldIndex the index of the variant field in the rows
+   * @return the shredded schema type, or null if no shredding should be 
performed
+   */
+  public Type analyzeAndCreateSchema(List<T> bufferedRows, int 
variantFieldIndex) {
+    List<VariantValue> variantValues = extractVariantValues(bufferedRows, 
variantFieldIndex);
+    if (variantValues.isEmpty()) {
+      return null;
+    }
+
+    PathNode root = buildPathTree(variantValues);
+    PhysicalType rootType = root.info.getMostCommonType();
+    if (rootType == null) {
+      return null;
+    }
+
+    if (rootType == PhysicalType.OBJECT) {
+      pruneInfrequentFields(root, root.info.observationCount);
+    }
+
+    return buildTypedValue(root, rootType);
+  }
+
+  protected abstract List<VariantValue> extractVariantValues(
+      List<T> bufferedRows, int variantFieldIndex);
+
+  /**
+   * Resolves a column name to its index in the engine-specific schema. 
Returns -1 if the column is
+   * not found.
+   */
+  protected abstract int resolveColumnIndex(S engineSchema, String columnName);
+
+  /**
+   * Analyzes all variant columns in the schema, resolving column indices via 
the engine-specific
+   * {@link #resolveColumnIndex} method.
+   *
+   * @param bufferedRows the buffered rows to analyze
+   * @param icebergSchema the Iceberg table schema
+   * @param engineSchema the engine-specific schema used to resolve column 
indices
+   * @return a map from Iceberg field ID to the shredded Parquet type for each 
variant column
+   */
+  public Map<Integer, Type> analyzeVariantColumns(
+      List<T> bufferedRows, Schema icebergSchema, S engineSchema) {
+    Map<Integer, Type> shreddedTypes = Maps.newHashMap();
+    for (org.apache.iceberg.types.Types.NestedField col : 
icebergSchema.columns()) {

Review Comment:
   Please import this class instead of using the fully qualified name.



##########
parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java:
##########
@@ -0,0 +1,523 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.parquet;
+
+import java.math.BigDecimal;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.variants.PhysicalType;
+import org.apache.iceberg.variants.VariantArray;
+import org.apache.iceberg.variants.VariantObject;
+import org.apache.iceberg.variants.VariantPrimitive;
+import org.apache.iceberg.variants.VariantValue;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.schema.Types;
+
+/**
+ * Analyzes variant data across buffered rows to determine an optimal 
shredding schema.
+ *
+ * <p>Determinism contract: for a given set of variant values (regardless of 
row arrival order),
+ * this analyzer produces the same shredded schema.
+ *
+ * <ul>
+ *   <li>Object fields use a TreeMap, so field ordering is alphabetical and 
deterministic.
+ *   <li>Type selection picks the most common type with explicit tie-break 
priority (see
+ *       TIE_BREAK_PRIORITY), not enum ordinal.
+ *   <li>Integer types (INT8/16/32/64) and decimal types (DECIMAL4/8/16) are 
each promoted to the
+ *       widest observed before competing with other types.
+ *   <li>Fields below {@code MIN_FIELD_FREQUENCY} are pruned. Above {@code 
MAX_SHREDDED_FIELDS}, the
+ *       most frequent are kept with alphabetical tie-breaking.
+ *   <li>Recursion into nested objects/arrays stops at {@code 
MAX_SHREDDING_DEPTH} (default 50).
+ *   <li>New struct fields are not tracked once a node reaches {@code 
MAX_INTERMEDIATE_FIELDS}
+ *       (default 1000) to bound memory during inference.
+ * </ul>
+ *
+ * <p>This contract holds within a single batch. Different batches with 
different distributions may
+ * produce different layouts; cross-batch stability requires schema pinning 
(not yet implemented).
+ *
+ * <p>Subclasses implement {@link #extractVariantValues} to convert 
engine-specific row types into
+ * {@link VariantValue} instances.
+ *
+ * @param <T> the engine-specific row type (e.g., Spark InternalRow, Flink 
RowData)
+ * @param <S> the engine-specific schema type (e.g., Spark StructType, Flink 
RowType)
+ */
+public abstract class VariantShreddingAnalyzer<T, S> {
+  private static final String TYPED_VALUE = "typed_value";
+  private static final String VALUE = "value";
+  private static final String ELEMENT = "element";
+  private static final double MIN_FIELD_FREQUENCY = 0.10;
+  private static final int MAX_SHREDDED_FIELDS = 300;
+  private static final int MAX_SHREDDING_DEPTH = 50;
+  private static final int MAX_INTERMEDIATE_FIELDS = 1000;
+
+  protected VariantShreddingAnalyzer() {}
+
+  /**
+   * Analyzes buffered variant values to determine the optimal shredding 
schema.
+   *
+   * @param bufferedRows the buffered rows to analyze
+   * @param variantFieldIndex the index of the variant field in the rows
+   * @return the shredded schema type, or null if no shredding should be 
performed
+   */
+  public Type analyzeAndCreateSchema(List<T> bufferedRows, int 
variantFieldIndex) {
+    List<VariantValue> variantValues = extractVariantValues(bufferedRows, 
variantFieldIndex);
+    if (variantValues.isEmpty()) {
+      return null;
+    }
+
+    PathNode root = buildPathTree(variantValues);
+    PhysicalType rootType = root.info.getMostCommonType();
+    if (rootType == null) {
+      return null;
+    }
+
+    if (rootType == PhysicalType.OBJECT) {

Review Comment:
   What about array and primitive types? I think we should handle them the 
same way.



##########
core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java:
##########
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.io;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.function.Function;
+import java.util.function.UnaryOperator;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+/**
+ * A FileAppender that buffers the first N rows, then creates a delegate 
appender via a factory.
+ *
+ * <p>The factory receives the buffered rows, is responsible for creating the 
real appender and
+ * writing the buffered rows into it before returning. All subsequent {@link 
#add} calls delegate
+ * directly to the real appender.
+ *
+ * <p>If fewer than N rows are written before {@link #close}, the factory is 
called at close time.
+ *
+ * @param <D> the row type
+ */
+public class BufferedFileAppender<D> implements FileAppender<D> {
+  private final int bufferRowCount;
+  private final Function<List<D>, FileAppender<D>> appenderFactory;
+  private final UnaryOperator<D> copyFunc;
+  private List<D> buffer;
+  private FileAppender<D> delegate;
+  private boolean closed = false;
+
+  /**
+   * @param bufferRowCount number of rows to buffer before creating the 
delegate appender
+   * @param appenderFactory given the buffered rows, creates the delegate 
appender and replays them
+   * @param copyFunc copies a row before buffering (needed when row objects 
are reused, e.g. Spark
+   *     InternalRow)
+   */
+  public BufferedFileAppender(
+      int bufferRowCount,
+      Function<List<D>, FileAppender<D>> appenderFactory,
+      UnaryOperator<D> copyFunc) {
+    Preconditions.checkArgument(
+        bufferRowCount > 0, "bufferRowCount must be > 0, got %s", 
bufferRowCount);
+    Preconditions.checkNotNull(appenderFactory, "appenderFactory must not be 
null");
+    Preconditions.checkNotNull(copyFunc, "copyFunc must not be null");
+    this.bufferRowCount = bufferRowCount;
+    this.appenderFactory = appenderFactory;
+    this.copyFunc = copyFunc;
+    this.buffer = Lists.newArrayList();
+  }
+
+  @Override
+  public void add(D datum) {
+    Preconditions.checkState(!closed, "Cannot add to a closed appender");
+    if (delegate != null) {
+      delegate.add(datum);
+    } else {
+      buffer.add(copyFunc.apply(datum));
+      if (buffer.size() >= bufferRowCount) {
+        initialize();
+      }
+    }
+  }
+
+  @Override
+  public Metrics metrics() {
+    Preconditions.checkState(closed, "Cannot return metrics for unclosed 
appender");
+    Preconditions.checkState(delegate != null, "Delegate appender was never 
created");
+    return delegate.metrics();
+  }
+
+  @Override
+  public long length() {
+    if (delegate != null) {
+      return delegate.length();
+    }
+    return 0L;

Review Comment:
   We use this length to decide whether to roll over to the next file, right? 
   Does this cause the file to include more data than expected when it later 
writes the actual data?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Spark: Support writing shredded variant in Iceberg-Spark [iceberg]

Reply via email to