This is an automated email from the ASF dual-hosted git repository.
voonhous pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 162cac2910ee fix(lance): fail fast when write schema contains VARIANT
columns (#18775)
162cac2910ee is described below
commit 162cac2910ee299fb112a8604a54553082481b7c
Author: Rahil C <[email protected]>
AuthorDate: Tue May 19 01:09:20 2026 -0700
fix(lance): fail fast when write schema contains VARIANT columns (#18775)
Lance does not currently support VARIANT in its file format
(https://lance.org/guide/data_types/#arrow-type-system). Without a
guard, writes that include VARIANT-typed columns fail deep inside the
Avro-to-Arrow conversion layer (LanceArrowUtils.toArrowSchema) with a
cryptic error.
Add a recursive HoodieSchema walk in HoodieSparkLanceWriter that throws
HoodieNotSupportedException up front with a user-friendly message
naming the offending column path. Invoke it from
HoodieSparkFileWriterFactory.newLanceFileWriter so every Lance write
path (Spark DataSource, DeltaStreamer, bootstrap, async clustering,
async compaction) is covered before any Arrow allocator is opened.
Mirrors the existing VECTOR element-type guard in the same writer.
---
.../io/storage/HoodieSparkFileWriterFactory.java | 1 +
.../hudi/io/storage/HoodieSparkLanceWriter.java | 44 +++++++++++++
.../io/storage/TestHoodieSparkLanceWriter.java | 72 ++++++++++++++++++++++
3 files changed, 117 insertions(+)
diff --git
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
index e8faef103e7c..7b3571d8df20 100644
---
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
+++
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileWriterFactory.java
@@ -115,6 +115,7 @@ public class HoodieSparkFileWriterFactory extends
HoodieFileWriterFactory {
@Override
protected HoodieFileWriter newLanceFileWriter(String instantTime,
StoragePath path, HoodieConfig config, HoodieSchema schema,
TaskContextSupplier
taskContextSupplier) throws IOException {
+ HoodieSparkLanceWriter.validateNoVariantColumns(schema);
boolean populateMetaFields =
config.getBooleanOrDefault(HoodieTableConfig.POPULATE_META_FIELDS);
StructType structType = HoodieInternalRowUtils.getCachedSchema(schema);
boolean enableBloomFilter = enableBloomFilter(populateMetaFields, config);
diff --git
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
index fe109a1ffe60..3bdf12d9059b 100644
---
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
+++
b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java
@@ -24,6 +24,7 @@ import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.schema.HoodieSchema;
+import org.apache.hudi.common.schema.HoodieSchemaField;
import org.apache.hudi.common.schema.HoodieSchemaType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieNotSupportedException;
@@ -196,6 +197,49 @@ public class HoodieSparkLanceWriter extends
HoodieBaseLanceWriter<InternalRow, U
* <p>Only top-level fields are inspected; Hudi BLOB and VECTOR are
top-level types in the Hudi
* schema model.
*/
+ /**
+ * Fail fast if the write schema contains any VARIANT-typed column. The
Lance file format
+ * does not currently support VARIANT (see
https://lance.org/guide/data_types/#arrow-type-system);
+ * without this guard the write would fail deep in the Avro-to-Arrow
conversion layer with a
+ * cryptic error. Walks the schema recursively so nested VARIANT fields
(inside records, unions,
+ * arrays, maps) are also caught.
+ */
+ static void validateNoVariantColumns(HoodieSchema schema) {
+ checkNoVariant(schema, "");
+ }
+
+ private static void checkNoVariant(HoodieSchema schema, String path) {
+ HoodieSchemaType type = schema.getType();
+ if (type == HoodieSchemaType.VARIANT) {
+ throw new HoodieNotSupportedException(
+ "Lance base-file format does not currently support VARIANT columns "
+ + "(see https://lance.org/guide/data_types/#arrow-type-system). "
+ + "Found VARIANT field at '" + (path.isEmpty() ? "<root>" :
path) + "'. "
+ + "Use Parquet for tables with VARIANT columns.");
+ }
+ switch (type) {
+ case RECORD:
+ for (HoodieSchemaField f : schema.getFields()) {
+ String childPath = path.isEmpty() ? f.name() : path + "." + f.name();
+ checkNoVariant(f.schema(), childPath);
+ }
+ break;
+ case UNION:
+ for (HoodieSchema branch : schema.getTypes()) {
+ checkNoVariant(branch, path);
+ }
+ break;
+ case ARRAY:
+ checkNoVariant(schema.getElementType(), path + "[]");
+ break;
+ case MAP:
+ checkNoVariant(schema.getValueType(), path + ".<value>");
+ break;
+ default:
+ // Primitive or BLOB / VECTOR — nothing to recurse into for VARIANT
detection.
+ }
+ }
+
private static StructType enrichSparkSchemaForLance(StructType sparkSchema) {
Map<Integer, HoodieSchema.Vector> vectorColumns =
VectorConversionUtils.detectVectorColumnsFromMetadata(sparkSchema);
diff --git
a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
index 2f425b2158e4..722833a7756c 100644
---
a/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
+++
b/hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/io/storage/TestHoodieSparkLanceWriter.java
@@ -23,8 +23,12 @@ import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.config.HoodieStorageConfig;
import org.apache.hudi.common.model.HoodieKey;
+import org.apache.hudi.common.schema.HoodieSchema;
+import org.apache.hudi.common.schema.HoodieSchemaField;
+import org.apache.hudi.common.schema.HoodieSchemaType;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.io.memory.HoodieArrowAllocator;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
@@ -57,6 +61,8 @@ import org.lance.file.LanceFileReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
import static org.apache.hudi.common.bloom.BloomFilterTypeCode.SIMPLE;
@@ -567,4 +573,70 @@ public class TestHoodieSparkLanceWriter {
return false;
}
}
+
+ // ----- VARIANT-on-Lance guard tests -----
+
+ @Test
+ public void testValidateNoVariantColumns_noVariant_succeeds() {
+   // A record made only of primitive columns must pass the guard without throwing.
+   HoodieSchemaField idField =
+       HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT));
+   HoodieSchemaField nameField =
+       HoodieSchemaField.of("name", HoodieSchema.create(HoodieSchemaType.STRING));
+   HoodieSchema plainSchema =
+       HoodieSchema.createRecord("R", "ns", null, Arrays.asList(idField, nameField));
+   HoodieSparkLanceWriter.validateNoVariantColumns(plainSchema);
+ }
+
+ @Test
+ public void testValidateNoVariantColumns_topLevelVariant_throws() {
+ HoodieSchema record = HoodieSchema.createRecord("R", "ns", null,
Collections.singletonList(
+ HoodieSchemaField.of("payload", HoodieSchema.createVariant())));
+ HoodieNotSupportedException ex = assertThrows(
+ HoodieNotSupportedException.class,
+ () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+ assertTrue(ex.getMessage().contains("Lance"));
+ assertTrue(ex.getMessage().contains("VARIANT"));
+ assertTrue(ex.getMessage().contains("payload"), "Error should name the
offending field: " + ex.getMessage());
+ }
+
+ @Test
+ public void testValidateNoVariantColumns_variantInNestedRecord_throws() {
+ HoodieSchema nested = HoodieSchema.createRecord("Nested", "ns", null,
Collections.singletonList(
+ HoodieSchemaField.of("v", HoodieSchema.createVariant())));
+ HoodieSchema record = HoodieSchema.createRecord("R", "ns", null,
Collections.singletonList(
+ HoodieSchemaField.of("inner", nested)));
+ HoodieNotSupportedException ex = assertThrows(
+ HoodieNotSupportedException.class,
+ () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+ assertTrue(ex.getMessage().contains("inner.v"), "Error should point at
nested path: " + ex.getMessage());
+ }
+
+ @Test
+ public void testValidateNoVariantColumns_variantInArray_throws() {
+ HoodieSchema record = HoodieSchema.createRecord("R", "ns", null,
Collections.singletonList(
+ HoodieSchemaField.of("items",
HoodieSchema.createArray(HoodieSchema.createVariant()))));
+ HoodieNotSupportedException ex = assertThrows(
+ HoodieNotSupportedException.class,
+ () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+ assertTrue(ex.getMessage().contains("items[]"), "Error should point at
array element path: " + ex.getMessage());
+ }
+
+ @Test
+ public void testValidateNoVariantColumns_variantInMap_throws() {
+ HoodieSchema record = HoodieSchema.createRecord("R", "ns", null,
Collections.singletonList(
+ HoodieSchemaField.of("attrs",
HoodieSchema.createMap(HoodieSchema.createVariant()))));
+ HoodieNotSupportedException ex = assertThrows(
+ HoodieNotSupportedException.class,
+ () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+ assertTrue(ex.getMessage().contains("attrs.<value>"), "Error should point
at map value path: " + ex.getMessage());
+ }
+
+ @Test
+ public void testValidateNoVariantColumns_variantInNullableUnion_throws() {
+ HoodieSchema nullableVariant = HoodieSchema.createUnion(
+ HoodieSchema.NULL_SCHEMA, HoodieSchema.createVariant());
+ HoodieSchema record = HoodieSchema.createRecord("R", "ns", null,
Collections.singletonList(
+ HoodieSchemaField.of("payload", nullableVariant)));
+ HoodieNotSupportedException ex = assertThrows(
+ HoodieNotSupportedException.class,
+ () -> HoodieSparkLanceWriter.validateNoVariantColumns(record));
+ assertTrue(ex.getMessage().contains("payload"), "Error should name the
field: " + ex.getMessage());
+ }
}