This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 1b2cee800994 feat(vector): add VECTOR type to HoodieSchema (#18146)
1b2cee800994 is described below
commit 1b2cee800994a5d878f045bd5e4e95618d6123a2
Author: Rahil C <[email protected]>
AuthorDate: Mon Mar 2 00:47:13 2026 -0800
feat(vector): add VECTOR type to HoodieSchema (#18146)
* add VECTOR type to HoodieSchema
* keep fixed bytes only
* address elementType
* fixes
* add all tests
* move metadata to schema level instead of as fields
* fix ci
* use enum for vector element type
* remove nesting for fixed bytes
* checkstyle fixes
* remove fixed requirement
* address comments
* fix nested structures test
* check style
* minor fix
* address feedback
* address vinoth comments
* address vinoth and tim comments
* address voon comments
---
.../apache/hudi/common/schema/HoodieSchema.java | 352 +++++++++++++++++++++
.../hudi/common/schema/HoodieSchemaType.java | 5 +
.../hudi/common/schema/TestHoodieSchema.java | 317 +++++++++++++++++++
.../hudi/common/schema/TestHoodieSchemaType.java | 96 ++++++
4 files changed, 770 insertions(+)
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java
b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java
index e807576fba99..f35c77c099f5 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchema.java
@@ -123,6 +123,7 @@ public class HoodieSchema implements Serializable {
static {
LogicalTypes.register(VariantLogicalType.VARIANT_LOGICAL_TYPE_NAME, new
VariantLogicalTypeFactory());
LogicalTypes.register(BlobLogicalType.BLOB_LOGICAL_TYPE_NAME, new
BlobLogicalTypeFactory());
+ LogicalTypes.register(VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME, new
VectorLogicalTypeFactory());
}
/**
@@ -173,6 +174,8 @@ public class HoodieSchema implements Serializable {
return new HoodieSchema.Variant(avroSchema);
} else if (logicalType == BlobLogicalType.blob()) {
return new HoodieSchema.Blob(avroSchema);
+ } else if (logicalType instanceof VectorLogicalType) {
+ return new HoodieSchema.Vector(avroSchema);
}
}
return new HoodieSchema(avroSchema);
@@ -645,6 +648,62 @@ public class HoodieSchema implements Serializable {
return new HoodieSchema.Blob(Blob.DEFAULT_NAME);
}
+ /**
+  * Builds a FLOAT vector schema of the given dimension under an auto-generated FIXED type name.
+  *
+  * <p>Delegates to the {@code (dimension, elementType)} overload, which derives the name from
+  * the element type and dimension (e.g., {@code vector_float_128}) so that several vector
+  * columns can coexist in one record without Avro name collisions.</p>
+  *
+  * @param dimension number of elements in the vector (must be > 0)
+  * @return new HoodieSchema.Vector
+  */
+ public static HoodieSchema.Vector createVector(int dimension) {
+   final Vector.VectorElementType defaultElementType = Vector.VectorElementType.FLOAT;
+   return createVector(dimension, defaultElementType);
+ }
+
+ /**
+  * Builds a FLOAT vector schema with a caller-supplied FIXED type name.
+  *
+  * @param name FIXED type name (must not be null or empty)
+  * @param dimension number of elements in the vector (must be > 0)
+  * @return new HoodieSchema.Vector
+  */
+ public static HoodieSchema.Vector createVector(String name, int dimension) {
+   final Vector.VectorElementType defaultElementType = Vector.VectorElementType.FLOAT;
+   return createVector(name, dimension, defaultElementType);
+ }
+
+ /**
+  * Creates Vector schema with custom dimension and element type under a generated name.
+  *
+  * <p>The generated FIXED type name encodes element type and dimension
+  * (e.g., {@code vector_double_256}) to avoid Avro name collisions when multiple vector
+  * columns exist in the same record.</p>
+  *
+  * @param dimension vector dimension (must be > 0)
+  * @param elementType element type ({@link Vector.VectorElementType#FLOAT},
+  *                    {@link Vector.VectorElementType#DOUBLE} or {@link Vector.VectorElementType#INT8});
+  *                    {@code null} falls back to FLOAT, as in the named overload
+  * @return new HoodieSchema.Vector
+  */
+ public static HoodieSchema.Vector createVector(int dimension, Vector.VectorElementType elementType) {
+   // Resolve the default up front so a null element type cannot NPE while building the name.
+   Vector.VectorElementType resolvedElementType =
+       elementType != null ? elementType : Vector.VectorElementType.FLOAT;
+   // Locale.ROOT keeps the generated name identical on every JVM; with a Turkish default
+   // locale, lower-casing the 'I' of INT8 would otherwise yield a different type name.
+   String vectorName = Vector.DEFAULT_NAME + "_"
+       + resolvedElementType.name().toLowerCase(java.util.Locale.ROOT) + "_" + dimension;
+   return createVector(vectorName, dimension, resolvedElementType);
+ }
+
+ /**
+  * Builds a vector schema from an explicit FIXED type name, dimension and element type.
+  *
+  * @param name FIXED type name (must not be null or empty)
+  * @param dimension number of elements in the vector (must be > 0)
+  * @param elementType element type ({@link Vector.VectorElementType#FLOAT},
+  *                    {@link Vector.VectorElementType#DOUBLE} or {@link Vector.VectorElementType#INT8});
+  *                    {@code null} is resolved to FLOAT when the schema is built
+  * @return new HoodieSchema.Vector
+  */
+ public static HoodieSchema.Vector createVector(String name, int dimension, Vector.VectorElementType elementType) {
+   boolean hasName = name != null && !name.isEmpty();
+   ValidationUtils.checkArgument(hasName, () -> "Vector name must not be null or empty");
+   return new HoodieSchema.Vector(Vector.createSchema(name, dimension, elementType));
+ }
+
/**
* Returns the Hudi schema version information.
*
@@ -1551,6 +1610,215 @@ public class HoodieSchema implements Serializable {
}
}
+ /**
+  * Schema wrapper for a vector stored as an Avro FIXED type that carries a
+  * {@code VectorLogicalType}.
+  *
+  * <p>Dimension, element type and storage backing are read from the logical type, and the
+  * FIXED size must equal {@code dimension × elementSize} bytes.</p>
+  */
+ public static class Vector extends HoodieSchema {
+   private static final String DEFAULT_NAME = "vector";
+
+   /**
+    * Enum representing vector element data types, each with its per-element byte width.
+    */
+   public enum VectorElementType {
+     FLOAT(4),
+     DOUBLE(8),
+     INT8(1);
+
+     private final int elementSize;
+
+     VectorElementType(int elementSize) {
+       this.elementSize = elementSize;
+     }
+
+     /**
+      * Returns the byte size of a single element.
+      *
+      * @return number of bytes per element
+      */
+     public int getElementSize() {
+       return elementSize;
+     }
+
+     /**
+      * Converts a string to VectorElementType enum (case-insensitive).
+      *
+      * @param name the element type name (e.g., "FLOAT", "DOUBLE", "INT8")
+      * @return the corresponding enum value
+      * @throws IllegalArgumentException if name is unknown (including null)
+      */
+     public static VectorElementType fromString(String name) {
+       for (VectorElementType type : values()) {
+         if (type.name().equalsIgnoreCase(name)) {
+           return type;
+         }
+       }
+       throw new IllegalArgumentException("Unknown element type: " + name);
+     }
+   }
+
+   /**
+    * Enum representing the physical storage format backing a vector.
+    */
+   public enum StorageBacking {
+     FIXED_BYTES;
+
+     /**
+      * Converts a string to StorageBacking enum (case-insensitive).
+      *
+      * @param name the storage backing name (e.g., "FIXED_BYTES")
+      * @return the corresponding enum value
+      * @throws IllegalArgumentException if name is unknown (including null)
+      */
+     public static StorageBacking fromString(String name) {
+       for (StorageBacking b : values()) {
+         if (b.name().equalsIgnoreCase(name)) {
+           return b;
+         }
+       }
+       throw new IllegalArgumentException("Unknown storage backing: " + name);
+     }
+   }
+
+   private final int dimension;
+   private final VectorElementType elementType;
+   private final StorageBacking storageBacking;
+
+   /**
+    * Creates Vector from pre-built schema (used by factory methods).
+    *
+    * @param avroSchema the Avro schema to wrap, must be a valid Vector schema
+    * @throws IllegalArgumentException if the schema has no VectorLogicalType, names an unknown
+    *                                  element type or storage backing, is not a FIXED schema, or
+    *                                  its FIXED size does not equal dimension × elementSize
+    */
+   Vector(Schema avroSchema) {
+     super(avroSchema);
+
+     // Extract properties from LogicalType
+     LogicalType logicalType = avroSchema.getLogicalType();
+     if (!(logicalType instanceof VectorLogicalType)) {
+       throw new IllegalArgumentException(
+           "Schema must have VectorLogicalType, got: " + logicalType);
+     }
+
+     VectorLogicalType vectorLogicalType = (VectorLogicalType) logicalType;
+     this.dimension = vectorLogicalType.getDimension();
+     this.elementType = VectorElementType.fromString(vectorLogicalType.getElementType());
+     this.storageBacking = StorageBacking.fromString(vectorLogicalType.getStorageBacking());
+
+     // Validate schema structure (needs the fields assigned above)
+     validateVectorSchema(avroSchema);
+   }
+
+   @Override
+   public String getName() {
+     return VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME;
+   }
+
+   @Override
+   public HoodieSchemaType getType() {
+     return HoodieSchemaType.VECTOR;
+   }
+
+   /**
+    * Creates vector schema with specified dimension and element type.
+    *
+    * @param name fixed type name (not null)
+    * @param dimension vector dimension (must be > 0)
+    * @param elementType element type (defaults to FLOAT if null)
+    * @return new Vector schema
+    * @throws IllegalArgumentException if dimension is not positive or the resulting FIXED size
+    *                                  does not fit in an int
+    */
+   private static Schema createSchema(String name, int dimension, VectorElementType elementType) {
+     ValidationUtils.checkArgument(dimension > 0,
+         () -> "Vector dimension must be positive: " + dimension);
+
+     // A null element type falls back to FLOAT
+     VectorElementType resolvedElementType = elementType != null ? elementType : VectorElementType.FLOAT;
+
+     // Fixed size = dimension × element size in bytes; rejected instead of silently wrapping
+     int fixedSize = computeFixedSize(dimension, resolvedElementType);
+
+     // Create fixed Schema
+     Schema vectorSchema = Schema.createFixed(name, null, null, fixedSize);
+
+     // Apply logical type with properties directly to FIXED
+     VectorLogicalType vectorLogicalType = new VectorLogicalType(dimension,
+         resolvedElementType.name(), StorageBacking.FIXED_BYTES.name());
+     vectorLogicalType.addToSchema(vectorSchema);
+
+     return vectorSchema;
+   }
+
+   /**
+    * Computes {@code dimension × elementSize} in 64-bit arithmetic and checks it fits in an int.
+    *
+    * @param dimension vector dimension
+    * @param elementType element type supplying the per-element byte width
+    * @return the FIXED size in bytes
+    * @throws IllegalArgumentException if the product exceeds {@link Integer#MAX_VALUE}
+    */
+   private static int computeFixedSize(int dimension, VectorElementType elementType) {
+     long size = (long) dimension * elementType.getElementSize();
+     ValidationUtils.checkArgument(size <= Integer.MAX_VALUE,
+         () -> "Vector FIXED size exceeds the maximum supported size: dimension=" + dimension
+             + " × elementSize=" + elementType.getElementSize());
+     return (int) size;
+   }
+
+   /**
+    * Validates that the given Avro schema conforms to Vector specification.
+    *
+    * @param avroSchema the schema to validate
+    * @throws IllegalArgumentException if schema is invalid
+    */
+   private void validateVectorSchema(Schema avroSchema) {
+     // getFixedSize() is only defined for FIXED schemas, so check the type first and report a
+     // consistent IllegalArgumentException.
+     ValidationUtils.checkArgument(avroSchema.getType() == Schema.Type.FIXED,
+         () -> "Vector schema must be of FIXED type, got: " + avroSchema.getType());
+
+     // Verify FIXED size matches: dimension × elementSize (computed as long to avoid overflow)
+     long expectedSize = (long) dimension * elementType.getElementSize();
+     int actualSize = avroSchema.getFixedSize();
+     ValidationUtils.checkArgument(actualSize == expectedSize,
+         () -> "Vector FIXED size mismatch: expected " + expectedSize
+             + " bytes (dimension=" + dimension + " × elementSize="
+             + elementType.getElementSize() + "), got " + actualSize);
+   }
+
+   /**
+    * Returns the dimension of this vector.
+    *
+    * @return vector dimension (always > 0)
+    */
+   public int getDimension() {
+     return dimension;
+   }
+
+   /**
+    * Returns the element type of this vector.
+    *
+    * @return element type enum (e.g., {@link VectorElementType#FLOAT},
+    *         {@link VectorElementType#DOUBLE}, {@link VectorElementType#INT8})
+    */
+   public VectorElementType getVectorElementType() {
+     return elementType;
+   }
+
+   /**
+    * Returns the storage backing type.
+    *
+    * @return storage backing enum value
+    */
+   public StorageBacking getStorageBacking() {
+     return storageBacking;
+   }
+
+   /**
+    * Returns the size of the fixed bytes backing this vector.
+    *
+    * @return size in bytes (dimension × elementSize)
+    */
+   public int getFixedSize() {
+     return getAvroSchema().getFixedSize();
+   }
+
+   @Override
+   public boolean equals(Object o) {
+     if (this == o) {
+       return true;
+     }
+     if (o == null || getClass() != o.getClass()) {
+       return false;
+     }
+     if (!super.equals(o)) {
+       return false;
+     }
+     Vector vector = (Vector) o;
+     return dimension == vector.dimension
+         && Objects.equals(elementType, vector.elementType)
+         && Objects.equals(storageBacking, vector.storageBacking);
+   }
+
+   @Override
+   public int hashCode() {
+     return Objects.hash(super.hashCode(), dimension, elementType, storageBacking);
+   }
+ }
+
public static class Timestamp extends HoodieSchema {
private final boolean isUtcAdjusted;
private final TimePrecision precision;
@@ -1719,6 +1987,90 @@ public class HoodieSchema implements Serializable {
}
}
+ /**
+  * Avro logical type named {@code vector}. It records a vector's dimension, element type name
+  * and storage backing name, and writes them as schema-level properties.
+  */
+ static class VectorLogicalType extends LogicalType {
+   private static final String VECTOR_LOGICAL_TYPE_NAME = "vector";
+   private static final String PROP_DIMENSION = "dimension";
+   private static final String PROP_ELEMENT_TYPE = "elementType";
+   private static final String PROP_STORAGE_BACKING = "storageBacking";
+
+   private final int dimension;
+   private final String elementType;
+   private final String storageBacking;
+
+   /**
+    * @param dimension vector dimension (must be > 0)
+    * @param elementType element type name (must not be null or empty)
+    * @param storageBacking storage backing name (must not be null or empty)
+    */
+   public VectorLogicalType(int dimension, String elementType, String storageBacking) {
+     super(VECTOR_LOGICAL_TYPE_NAME);
+     final boolean elementTypeMissing = elementType == null || elementType.isEmpty();
+     final boolean storageBackingMissing = storageBacking == null || storageBacking.isEmpty();
+
+     // Checks run in the same order as the constructor arguments.
+     ValidationUtils.checkArgument(dimension > 0,
+         () -> "Vector dimension must be positive: " + dimension);
+     ValidationUtils.checkArgument(!elementTypeMissing,
+         () -> "Element type cannot be null or empty");
+     ValidationUtils.checkArgument(!storageBackingMissing,
+         () -> "Storage backing cannot be null or empty");
+
+     this.dimension = dimension;
+     this.elementType = elementType;
+     this.storageBacking = storageBacking;
+   }
+
+   /** Returns the vector dimension. */
+   public int getDimension() {
+     return this.dimension;
+   }
+
+   /** Returns the element type name as stored on the schema. */
+   public String getElementType() {
+     return this.elementType;
+   }
+
+   /** Returns the storage backing name as stored on the schema. */
+   public String getStorageBacking() {
+     return this.storageBacking;
+   }
+
+   /**
+    * Attaches this logical type to {@code schema}, then adds the dimension, element type and
+    * storage backing as properties on that same schema.
+    */
+   @Override
+   public Schema addToSchema(Schema schema) {
+     super.addToSchema(schema);
+     schema.addProp(PROP_DIMENSION, dimension);
+     schema.addProp(PROP_ELEMENT_TYPE, elementType);
+     schema.addProp(PROP_STORAGE_BACKING, storageBacking);
+     return schema;
+   }
+ }
+
+ /**
+  * Factory for creating VectorLogicalType instances from the properties stored on a schema.
+  */
+ private static class VectorLogicalTypeFactory implements LogicalTypes.LogicalTypeFactory {
+   /**
+    * Builds a VectorLogicalType from the schema's {@code dimension}, {@code elementType} and
+    * {@code storageBacking} properties. A missing element type defaults to FLOAT and a missing
+    * storage backing defaults to FIXED_BYTES.
+    *
+    * @throws IllegalArgumentException if the dimension is missing, unparseable or not positive
+    */
+   @Override
+   public LogicalType fromSchema(Schema schema) {
+     int dimension = parseDimension(schema.getObjectProp(VectorLogicalType.PROP_DIMENSION));
+     ValidationUtils.checkArgument(dimension > 0,
+         () -> "Missing or invalid 'dimension' property in vector schema");
+
+     String elementType = schema.getProp(VectorLogicalType.PROP_ELEMENT_TYPE);
+     if (elementType == null) {
+       elementType = Vector.VectorElementType.FLOAT.name();
+     }
+
+     String storageBacking = schema.getProp(VectorLogicalType.PROP_STORAGE_BACKING);
+     if (storageBacking == null) {
+       storageBacking = Vector.StorageBacking.FIXED_BYTES.name(); // default
+     }
+
+     return new VectorLogicalType(dimension, elementType, storageBacking);
+   }
+
+   /**
+    * Parses the raw {@code dimension} property, defensively handling string-serialized values.
+    *
+    * @param dimObj raw property value, possibly null
+    * @return the parsed dimension, or 0 when the property is absent
+    * @throws IllegalArgumentException if the value is not a valid int; the original
+    *                                  NumberFormatException is kept as the cause
+    */
+   private static int parseDimension(Object dimObj) {
+     if (dimObj == null) {
+       return 0;
+     }
+     try {
+       return Integer.parseInt(String.valueOf(dimObj));
+     } catch (NumberFormatException e) {
+       throw new IllegalArgumentException("Invalid vector dimension property: " + dimObj, e);
+     }
+   }
+
+   @Override
+   public String getTypeName() {
+     return VectorLogicalType.VECTOR_LOGICAL_TYPE_NAME;
+   }
+ }
+
/**
* Factory for creating VariantLogicalType instances.
*/
diff --git
a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchemaType.java
b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchemaType.java
index 199d144a07d6..2b236d465334 100644
---
a/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchemaType.java
+++
b/hudi-common/src/main/java/org/apache/hudi/common/schema/HoodieSchemaType.java
@@ -124,6 +124,8 @@ public enum HoodieSchemaType {
*/
BLOB(Schema.Type.RECORD),
+ /**
+  * Vector type - represents a vector backed by an Avro FIXED schema
+  */
+ VECTOR(Schema.Type.FIXED),
+
/**
* Null type - represents the absence of a value
*/
@@ -163,6 +165,8 @@ public enum HoodieSchemaType {
return VARIANT;
} else if (logicalType == HoodieSchema.BlobLogicalType.blob()) {
return BLOB;
+ } else if (logicalType instanceof HoodieSchema.VectorLogicalType) {
+ return VECTOR;
}
}
switch (avroSchema.getType()) {
@@ -231,6 +235,7 @@ public enum HoodieSchemaType {
case UNION:
case VARIANT:
case BLOB:
+ case VECTOR:
return true;
default:
return false;
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java
b/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java
index 286e739e52a9..4851d33d1ee6 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchema.java
@@ -19,6 +19,7 @@
package org.apache.hudi.common.schema;
import org.apache.hudi.common.schema.HoodieSchema.VariantLogicalType;
+import org.apache.hudi.common.schema.HoodieSchema.VectorLogicalType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieAvroSchemaException;
@@ -876,6 +877,322 @@ public class TestHoodieSchema {
assertEquals(5, decimalFixedSchema.getFixedSize());
}
+ @Test
+ void testCreateVectorWithDimension() {
+   // The dimension-only factory is expected to fall back to FLOAT elements.
+   final int dimension = 1536;
+   final HoodieSchema.Vector.VectorElementType floatType = HoodieSchema.Vector.VectorElementType.FLOAT;
+   HoodieSchema created = HoodieSchema.createVector(dimension);
+
+   HoodieSchema.Vector vector = assertVector(created, dimension, floatType);
+   assertEquals(HoodieSchemaType.VECTOR, created.getType());
+   assertTrue(created.getAvroSchema().getLogicalType() instanceof VectorLogicalType);
+
+   // dimension / elementType / storageBacking live directly on the Avro schema
+   assertVectorAvroProperties(vector, dimension, floatType);
+
+   // The backing Avro type is FIXED (not RECORD), so it exposes no fields
+   Schema fixedSchema = vector.getAvroSchema();
+   assertEquals(Schema.Type.FIXED, fixedSchema.getType());
+   assertFalse(vector.hasFields());
+
+   // FIXED size = dimension × 4 bytes for FLOAT
+   assertEquals(dimension * 4, fixedSchema.getFixedSize());
+ }
+
+ @Test
+ void testCreateVectorWithNameAndDimension() {
+   // A custom FIXED name is kept and the element type still defaults to FLOAT
+   final String fixedName = "embeddings";
+   HoodieSchema created = HoodieSchema.createVector(fixedName, 768);
+
+   HoodieSchema.Vector vector = assertVector(created, 768, HoodieSchema.Vector.VectorElementType.FLOAT);
+   assertEquals(HoodieSchemaType.VECTOR, created.getType());
+   assertEquals(fixedName, vector.getAvroSchema().getName());
+ }
+
+ @Test
+ void testCreateVectorWithDimensionAndElementType() {
+   // DOUBLE elements
+   HoodieSchema doubleVector = HoodieSchema.createVector(1536, HoodieSchema.Vector.VectorElementType.DOUBLE);
+   assertVector(doubleVector, 1536, HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+   // FLOAT elements
+   HoodieSchema floatVector = HoodieSchema.createVector(512, HoodieSchema.Vector.VectorElementType.FLOAT);
+   assertVector(floatVector, 512, HoodieSchema.Vector.VectorElementType.FLOAT);
+
+   // INT8 elements
+   HoodieSchema.Vector int8Vector = HoodieSchema.createVector(256, HoodieSchema.Vector.VectorElementType.INT8);
+   assertVector(int8Vector, 256, HoodieSchema.Vector.VectorElementType.INT8);
+ }
+
+ @Test
+ void testCreateVectorWithAllParameters() {
+   // Name, dimension and element type are all supplied explicitly
+   final HoodieSchema.Vector.VectorElementType doubleType = HoodieSchema.Vector.VectorElementType.DOUBLE;
+   HoodieSchema created = HoodieSchema.createVector("precise_vectors", 512, doubleType);
+
+   HoodieSchema.Vector vector = assertVector(created, 512, doubleType);
+   assertEquals("precise_vectors", vector.getAvroSchema().getName());
+   assertEquals(HoodieSchemaType.VECTOR, vector.getType());
+ }
+
+ @Test
+ void testVectorInvalidDimension() {
+   // Zero first, then a negative value: both must be rejected with the same message
+   for (int invalidDimension : new int[] {0, -1}) {
+     IllegalArgumentException thrown = assertThrows(
+         IllegalArgumentException.class,
+         () -> HoodieSchema.createVector(invalidDimension));
+     assertTrue(thrown.getMessage().contains("must be positive"));
+   }
+ }
+
+ @Test
+ void testVectorLogicalTypeDetection() {
+   // A factory-built vector carries the vector logical type and reports the VECTOR schema type
+   HoodieSchema created = HoodieSchema.createVector(1536);
+   boolean hasVectorLogicalType = created.getAvroSchema().getLogicalType() instanceof VectorLogicalType;
+   assertTrue(hasVectorLogicalType);
+   assertEquals(HoodieSchemaType.VECTOR, created.getType());
+ }
+
+ @Test
+ void testVectorSchemaValidation() {
+   final int dimension = 768;
+   HoodieSchema.Vector vector = HoodieSchema.createVector(dimension);
+   Schema fixedSchema = vector.getAvroSchema();
+
+   // Backing type is FIXED and therefore has no fields
+   assertEquals(Schema.Type.FIXED, fixedSchema.getType());
+   assertFalse(vector.hasFields());
+
+   // dimension, elementType and storageBacking are schema properties
+   assertVectorAvroProperties(vector, dimension, HoodieSchema.Vector.VectorElementType.FLOAT);
+
+   // FIXED size = dimension × 4 bytes for FLOAT, via both the Avro schema and the wrapper
+   final int expectedBytes = dimension * 4;
+   assertEquals(expectedBytes, fixedSchema.getFixedSize());
+   assertEquals(expectedBytes, vector.getFixedSize());
+ }
+
+ @Test
+ void testVectorFieldAccess() {
+   final HoodieSchema.Vector.VectorElementType floatType = HoodieSchema.Vector.VectorElementType.FLOAT;
+   final HoodieSchema.Vector.VectorElementType doubleType = HoodieSchema.Vector.VectorElementType.DOUBLE;
+
+   // FLOAT vector through the dimension-only factory
+   HoodieSchema.Vector floatVector = HoodieSchema.createVector(1536);
+   assertVector(floatVector, 1536, floatType);
+
+   // DOUBLE vector through the (dimension, elementType) factory
+   HoodieSchema.Vector doubleVector = HoodieSchema.createVector(768, doubleType);
+   assertVector(doubleVector, 768, doubleType);
+
+   // dimension/elementType/storageBacking are readable as Avro properties
+   assertVectorAvroProperties(floatVector, 1536, floatType);
+
+   // FIXED size access
+   assertEquals(1536 * 4, floatVector.getFixedSize()); // FLOAT is 4 bytes
+   assertEquals(768 * 8, doubleVector.getFixedSize()); // DOUBLE is 8 bytes
+ }
+
+ @Test
+ void testVectorEquality() {
+   HoodieSchema.Vector base = HoodieSchema.createVector(1536);
+   HoodieSchema.Vector sameShape = HoodieSchema.createVector(1536);
+   HoodieSchema.Vector otherDimension = HoodieSchema.createVector(768);
+   HoodieSchema.Vector otherElementType =
+       HoodieSchema.createVector(1536, HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+   // Same dimension and element type -> equal, with matching hash codes
+   assertEquals(base, sameShape);
+   assertEquals(base.hashCode(), sameShape.hashCode());
+
+   // Different dimension -> not equal
+   assertNotEquals(base, otherDimension);
+
+   // Different element type -> not equal
+   assertNotEquals(base, otherElementType);
+
+   // Reflexivity
+   assertEquals(base, base);
+
+   // Null check
+   assertNotEquals(base, null);
+
+   // Different class
+   assertNotEquals(base, "string");
+ }
+
+ @Test
+ void testVectorSerialization() throws Exception {
+   // Create vector with DOUBLE element type
+   HoodieSchema.Vector original = HoodieSchema.createVector(768, HoodieSchema.Vector.VectorElementType.DOUBLE);
+
+   // Java serialize; try-with-resources closes the stream even if writeObject throws
+   ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+   try (ObjectOutputStream out = new ObjectOutputStream(byteOut)) {
+     out.writeObject(original);
+   }
+
+   // Java deserialize; the input stream is likewise closed on every path
+   HoodieSchema deserialized;
+   try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(byteOut.toByteArray()))) {
+     deserialized = (HoodieSchema) in.readObject();
+   }
+
+   // Verify the round trip yields an equal Vector with the same dimension and element type
+   assertVector(deserialized, 768, HoodieSchema.Vector.VectorElementType.DOUBLE);
+   assertEquals(original, deserialized);
+ }
+
+ @Test
+ void testVectorInNestedStructures() throws Exception {
+   final int dim = 128;
+   final HoodieSchema.Vector.VectorElementType floatType = HoodieSchema.Vector.VectorElementType.FLOAT;
+   HoodieSchema.Vector vector = HoodieSchema.createVector(dim, floatType);
+
+   // Record: the vector can be used as a field
+   List<HoodieSchemaField> recordFields = Arrays.asList(
+       HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+       HoodieSchemaField.of("embedding", vector));
+   HoodieSchema recordSchema = HoodieSchema.createRecord("TestRecord", null, null, recordFields);
+   assertEquals(HoodieSchemaType.RECORD, recordSchema.getType());
+
+   // The field keeps its vector nature inside the Avro record schema
+   Schema.Field embedding = recordSchema.getAvroSchema().getField("embedding");
+   assertNotNull(embedding);
+   assertVector(HoodieSchema.fromAvroSchema(embedding.schema()), dim, floatType);
+
+   // Record JSON round trip
+   HoodieSchema reparsedRecord = HoodieSchema.parse(recordSchema.toString());
+   assertEquals(recordSchema, reparsedRecord);
+   Schema.Field reparsedEmbedding = reparsedRecord.getAvroSchema().getField("embedding");
+   assertNotNull(reparsedEmbedding);
+   assertVector(HoodieSchema.fromAvroSchema(reparsedEmbedding.schema()), dim, floatType);
+
+   // Array: the vector as element type
+   HoodieSchema arrayOfVectors = HoodieSchema.createArray(vector);
+   assertEquals(HoodieSchemaType.ARRAY, arrayOfVectors.getType());
+   assertVector(arrayOfVectors.getElementType(), dim, floatType);
+
+   // Array JSON round trip
+   HoodieSchema reparsedArray = HoodieSchema.parse(arrayOfVectors.toString());
+   assertEquals(arrayOfVectors, reparsedArray);
+   assertVector(reparsedArray.getElementType(), dim, floatType);
+
+   // Map: the vector as value type
+   HoodieSchema mapOfVectors = HoodieSchema.createMap(vector);
+   assertEquals(HoodieSchemaType.MAP, mapOfVectors.getType());
+   assertVector(mapOfVectors.getValueType(), dim, floatType);
+
+   // Map JSON round trip
+   HoodieSchema reparsedMap = HoodieSchema.parse(mapOfVectors.toString());
+   assertEquals(mapOfVectors, reparsedMap);
+   assertVector(reparsedMap.getValueType(), dim, floatType);
+ }
+
+ @Test
+ void testVectorWithDefaultName() {
+   // Generated names encode element type and dimension to avoid Avro collisions
+   HoodieSchema.Vector floatVector = HoodieSchema.createVector(1536);
+   assertEquals("vector_float_1536", floatVector.getAvroSchema().getName());
+
+   HoodieSchema.Vector doubleVector = HoodieSchema.createVector(768, HoodieSchema.Vector.VectorElementType.DOUBLE);
+   assertEquals("vector_double_768", doubleVector.getAvroSchema().getName());
+
+   // A null name and then an empty name are both rejected
+   final String missingName = null;
+   assertThrows(IllegalArgumentException.class, () -> HoodieSchema.createVector(missingName, 128));
+   assertThrows(IllegalArgumentException.class, () -> HoodieSchema.createVector("", 128));
+ }
+
+ @Test
+ void testMultipleVectorColumnsWithSameDimensionAndType() {
+   // Vectors with identical dimension and element type share one FIXED type name,
+   // which Avro allows because the definitions are identical.
+   HoodieSchema.Vector titleVector = HoodieSchema.createVector(128);
+   HoodieSchema.Vector contentVector = HoodieSchema.createVector(128);
+
+   List<HoodieSchemaField> recordFields = Arrays.asList(
+       HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+       HoodieSchemaField.of("title_embedding", titleVector),
+       HoodieSchemaField.of("content_embedding", contentVector));
+
+   HoodieSchema recordSchema = HoodieSchema.createRecord("TestRecord", null, null, recordFields);
+   assertNotNull(recordSchema);
+
+   // Both fields must survive a JSON round trip
+   Schema reparsed = HoodieSchema.parse(recordSchema.toString()).getAvroSchema();
+   assertNotNull(reparsed.getField("title_embedding"));
+   assertNotNull(reparsed.getField("content_embedding"));
+   assertVector(HoodieSchema.fromAvroSchema(reparsed.getField("title_embedding").schema()),
+       128, HoodieSchema.Vector.VectorElementType.FLOAT);
+   assertVector(HoodieSchema.fromAvroSchema(reparsed.getField("content_embedding").schema()),
+       128, HoodieSchema.Vector.VectorElementType.FLOAT);
+ }
+
+ @Test
+ void testMultipleVectorColumnsWithDifferentDimensions() {
+   // Different dimensions produce different generated names, so there is no Avro collision
+   HoodieSchema.Vector smallVector = HoodieSchema.createVector(128);
+   HoodieSchema.Vector largeVector = HoodieSchema.createVector(256);
+
+   List<HoodieSchemaField> recordFields = Arrays.asList(
+       HoodieSchemaField.of("id", HoodieSchema.create(HoodieSchemaType.INT)),
+       HoodieSchemaField.of("embedding_small", smallVector),
+       HoodieSchemaField.of("embedding_large", largeVector));
+
+   // A table with two vector columns of different dimensions is a valid use case
+   HoodieSchema recordSchema = HoodieSchema.createRecord("TestRecord", null, null, recordFields);
+   assertNotNull(recordSchema);
+
+   // Both fields must survive a JSON round trip (schema serialization/parsing)
+   Schema reparsed = HoodieSchema.parse(recordSchema.toString()).getAvroSchema();
+   assertNotNull(reparsed.getField("embedding_small"));
+   assertNotNull(reparsed.getField("embedding_large"));
+   assertVector(HoodieSchema.fromAvroSchema(reparsed.getField("embedding_small").schema()),
+       128, HoodieSchema.Vector.VectorElementType.FLOAT);
+   assertVector(HoodieSchema.fromAvroSchema(reparsed.getField("embedding_large").schema()),
+       256, HoodieSchema.Vector.VectorElementType.FLOAT);
+ }
+
+ @Test
+ void testVectorFromAvroSchema() {
+   final HoodieSchema.Vector.VectorElementType doubleType = HoodieSchema.Vector.VectorElementType.DOUBLE;
+
+   // Build through the factory, then re-wrap the underlying Avro schema
+   HoodieSchema.Vector original = HoodieSchema.createVector("embeddings", 512, doubleType);
+   HoodieSchema rewrapped = HoodieSchema.fromAvroSchema(original.getAvroSchema());
+
+   // The re-wrapped schema is a Vector with the same dimension and element type
+   assertVector(rewrapped, 512, doubleType);
+   assertEquals(original, rewrapped);
+ }
+
+ /**
+  * Asserts that {@code schema} is a Vector with the expected dimension and element type.
+  *
+  * @return the schema cast to {@link HoodieSchema.Vector} for further assertions
+  */
+ private HoodieSchema.Vector assertVector(HoodieSchema schema, int expectedDimension,
+                                          HoodieSchema.Vector.VectorElementType expectedElementType) {
+   assertTrue(schema instanceof HoodieSchema.Vector);
+   HoodieSchema.Vector asVector = (HoodieSchema.Vector) schema;
+   assertEquals(expectedDimension, asVector.getDimension());
+   assertEquals(expectedElementType, asVector.getVectorElementType());
+   return asVector;
+ }
+
+ /**
+  * Asserts that the vector's Avro schema carries the expected {@code dimension},
+  * {@code elementType} and {@code storageBacking} properties.
+  */
+ private void assertVectorAvroProperties(HoodieSchema.Vector vector, int expectedDimension,
+                                         HoodieSchema.Vector.VectorElementType expectedElementType) {
+   Schema fixedSchema = vector.getAvroSchema();
+
+   Number storedDimension = (Number) fixedSchema.getObjectProp("dimension");
+   assertEquals(expectedDimension, storedDimension.intValue());
+
+   assertEquals(expectedElementType.name(), fixedSchema.getProp("elementType"));
+
+   String expectedBacking = HoodieSchema.Vector.StorageBacking.FIXED_BYTES.name();
+   assertEquals(expectedBacking, fixedSchema.getProp("storageBacking"));
+ }
+
@Test
void testCreateTimestampMillis() {
HoodieSchema timestampSchema = HoodieSchema.createTimestampMillis();
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchemaType.java
b/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchemaType.java
index a14435b2e7d8..012e4bb36b35 100644
---
a/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchemaType.java
+++
b/hudi-common/src/test/java/org/apache/hudi/common/schema/TestHoodieSchemaType.java
@@ -86,6 +86,7 @@ public class TestHoodieSchemaType {
assertTrue(HoodieSchemaType.UNION.isComplex(), "UNION should be complex");
assertTrue(HoodieSchemaType.VARIANT.isComplex(), "VARIANT should be
complex");
assertTrue(HoodieSchemaType.BLOB.isComplex(), "BLOB should be complex");
+ assertTrue(HoodieSchemaType.VECTOR.isComplex(), "VECTOR should be
complex");
assertFalse(HoodieSchemaType.STRING.isComplex(), "STRING should not be
complex");
assertFalse(HoodieSchemaType.INT.isComplex(), "INT should not be complex");
@@ -118,6 +119,7 @@ public class TestHoodieSchemaType {
assertFalse(HoodieSchemaType.UNION.isNumeric(), "UNION should not be
numeric");
assertFalse(HoodieSchemaType.VARIANT.isNumeric(), "VARIANT should not be
numeric");
assertFalse(HoodieSchemaType.BLOB.isNumeric(), "BLOB should not be
numeric");
+ assertFalse(HoodieSchemaType.VECTOR.isNumeric(), "VECTOR should not be
numeric");
}
@Test
@@ -207,6 +209,7 @@ public class TestHoodieSchemaType {
LogicalTypes.uuid().addToSchema(Schema.create(Schema.Type.STRING)));
map.put(HoodieSchemaType.VARIANT, createVariantSchemaForTest());
map.put(HoodieSchemaType.BLOB, HoodieSchema.createBlob().toAvroSchema());
+ map.put(HoodieSchemaType.VECTOR, createVectorSchemaForTest());
return map;
}
@@ -224,4 +227,97 @@ public class TestHoodieSchemaType {
HoodieSchema.VariantLogicalType.variant().addToSchema(variantRecord);
return variantRecord;
}
+
+ @Test
+ void testVectorFromSchemaWithStringProperties() {
+   // Hand-written JSON schema whose 'dimension' is the string "128" instead of an integer
+   String jsonSchema = "{" + String.join(",",
+       "\"type\":\"fixed\"",
+       "\"name\":\"vector_float_128\"",
+       "\"size\":512",
+       "\"logicalType\":\"vector\"",
+       "\"dimension\":\"128\"",
+       "\"elementType\":\"FLOAT\"",
+       "\"storageBacking\":\"FIXED_BYTES\"") + "}";
+
+   Schema parsedAvro = new Schema.Parser().parse(jsonSchema);
+   HoodieSchema wrapped = HoodieSchema.fromAvroSchema(parsedAvro);
+
+   assertTrue(wrapped instanceof HoodieSchema.Vector);
+   HoodieSchema.Vector vector = (HoodieSchema.Vector) wrapped;
+
+   // The string "128" must have been parsed into the integer 128
+   assertEquals(128, vector.getDimension());
+   assertEquals(HoodieSchema.Vector.VectorElementType.FLOAT, vector.getVectorElementType());
+ }
+
+ @Test
+ void testVectorSizeMismatchValidation() {
+   // Dimension 10 with FLOAT (4 bytes) expects a FIXED size of 40; the schema declares 42.
+   // It is built through JSON parsing so the VectorLogicalTypeFactory is invoked.
+   String jsonSchema = "{" + String.join(",",
+       "\"type\":\"fixed\"",
+       "\"name\":\"bad_vector\"",
+       "\"size\":42",
+       "\"logicalType\":\"vector\"",
+       "\"dimension\":10",
+       "\"elementType\":\"FLOAT\"",
+       "\"storageBacking\":\"FIXED_BYTES\"") + "}";
+
+   Schema parsedAvro = new Schema.Parser().parse(jsonSchema);
+
+   IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
+       () -> HoodieSchema.fromAvroSchema(parsedAvro));
+
+   assertTrue(thrown.getMessage().contains("FIXED size mismatch"),
+       "Should throw size mismatch error, got: " + thrown.getMessage());
+ }
+
+ @Test
+ void testVectorUnknownElementType() {
+   // FIXED schema naming an unsupported element type, built through JSON parsing
+   // so the VectorLogicalTypeFactory is invoked.
+   String jsonSchema = "{" + String.join(",",
+       "\"type\":\"fixed\"",
+       "\"name\":\"bad_vector\"",
+       "\"size\":40",
+       "\"logicalType\":\"vector\"",
+       "\"dimension\":10",
+       "\"elementType\":\"VARCHAR\"",
+       "\"storageBacking\":\"FIXED_BYTES\"") + "}";
+
+   Schema parsedAvro = new Schema.Parser().parse(jsonSchema);
+
+   IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
+       () -> HoodieSchema.fromAvroSchema(parsedAvro));
+
+   assertTrue(thrown.getMessage().contains("Unknown element type: VARCHAR"),
+       "Should reject unknown element types");
+ }
+
+ /**
+  * Builds a 128-dimensional FLOAT vector schema by hand with the Avro APIs, without going
+  * through the {@code HoodieSchema.createVector} factories.
+  *
+  * @return a vector FIXED schema with VectorLogicalType metadata
+  */
+ private static Schema createVectorSchemaForTest() {
+   final int dimension = 128;
+   // FLOAT elements: 4 bytes each
+   Schema fixedSchema = Schema.createFixed("vector", null, null, dimension * 4);
+
+   // Attach the logical type, which also writes the metadata properties on the schema
+   new HoodieSchema.VectorLogicalType(
+       dimension,
+       HoodieSchema.Vector.VectorElementType.FLOAT.name(),
+       HoodieSchema.Vector.StorageBacking.FIXED_BYTES.name())
+       .addToSchema(fixedSchema);
+
+   return fixedSchema;
+ }
}