This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new dbc804289d [variant] Port changes of variant from spark (#5895)
dbc804289d is described below
commit dbc804289d4aed6710fc75ee8c45b5f4556cfbe3
Author: Zouxxyy <[email protected]>
AuthorDate: Tue Jul 15 14:25:51 2025 +0800
[variant] Port changes of variant from spark (#5895)
---
.../apache/paimon/data/variant/GenericVariant.java | 31 ++++++++++++++++---
.../paimon/data/variant/GenericVariantBuilder.java | 26 +++++++++++++---
.../paimon/data/variant/GenericVariantUtil.java | 36 +++++++++++++++++++---
3 files changed, 80 insertions(+), 13 deletions(-)
diff --git a/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariant.java b/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariant.java
index d79eb09054..17faf26097 100644
--- a/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariant.java
+++ b/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariant.java
@@ -35,6 +35,7 @@ import java.util.Arrays;
import java.util.Base64;
import java.util.Locale;
import java.util.Objects;
+import java.util.UUID;
import static org.apache.paimon.data.variant.GenericVariantUtil.BINARY_SEARCH_THRESHOLD;
import static org.apache.paimon.data.variant.GenericVariantUtil.SIZE_LIMIT;
@@ -239,6 +240,11 @@ public final class GenericVariant implements Variant {
return GenericVariantUtil.getType(value, pos);
}
+ // Get a UUID value from the variant.
+ public UUID getUuid() {
+ return GenericVariantUtil.getUuid(value, pos);
+ }
+
// Get the number of object fields in the variant.
// It is only legal to call it when `getType()` is `Type.OBJECT`.
public int objectSize() {
@@ -456,8 +462,15 @@ public final class GenericVariant implements Variant {
sb.append(escapeJson(GenericVariantUtil.getString(value, pos)));
break;
case DOUBLE:
- sb.append(GenericVariantUtil.getDouble(value, pos));
- break;
+ {
+ double d = GenericVariantUtil.getDouble(value, pos);
+ if (Double.isFinite(d)) {
+ sb.append(d);
+ } else {
+ appendQuoted(sb, Double.toString(d));
+ }
+ break;
+ }
case DECIMAL:
sb.append(GenericVariantUtil.getDecimal(value, pos).toPlainString());
break;
@@ -482,14 +495,24 @@ public final class GenericVariant implements Variant {
.atZone(ZoneOffset.UTC)));
break;
case FLOAT:
- sb.append(GenericVariantUtil.getFloat(value, pos));
- break;
+ {
+ float f = GenericVariantUtil.getFloat(value, pos);
+ if (Float.isFinite(f)) {
+ sb.append(f);
+ } else {
+ appendQuoted(sb, Float.toString(f));
+ }
+ break;
+ }
case BINARY:
appendQuoted(
sb,
Base64.getEncoder()
.encodeToString(GenericVariantUtil.getBinary(value, pos)));
break;
+ case UUID:
+ appendQuoted(sb, GenericVariantUtil.getUuid(value, pos).toString());
+ break;
}
}
}
diff --git a/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantBuilder.java b/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantBuilder.java
index 187fb9259e..169a2f8c9b 100644
--- a/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantBuilder.java
+++ b/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantBuilder.java
@@ -27,12 +27,15 @@ import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.core.exc.InputCoer
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
+import java.util.UUID;
import static org.apache.paimon.data.variant.GenericVariantUtil.ARRAY;
import static org.apache.paimon.data.variant.GenericVariantUtil.BASIC_TYPE_MASK;
@@ -61,9 +64,9 @@ import static org.apache.paimon.data.variant.GenericVariantUtil.TIMESTAMP_NTZ;
import static org.apache.paimon.data.variant.GenericVariantUtil.TRUE;
import static org.apache.paimon.data.variant.GenericVariantUtil.U16_MAX;
import static org.apache.paimon.data.variant.GenericVariantUtil.U24_MAX;
-import static org.apache.paimon.data.variant.GenericVariantUtil.U24_SIZE;
import static org.apache.paimon.data.variant.GenericVariantUtil.U32_SIZE;
import static org.apache.paimon.data.variant.GenericVariantUtil.U8_MAX;
+import static org.apache.paimon.data.variant.GenericVariantUtil.UUID;
import static org.apache.paimon.data.variant.GenericVariantUtil.VERSION;
import static org.apache.paimon.data.variant.GenericVariantUtil.arrayHeader;
import static org.apache.paimon.data.variant.GenericVariantUtil.checkIndex;
@@ -283,6 +286,18 @@ public class GenericVariantBuilder {
writePos += binary.length;
}
+ public void appendUuid(UUID uuid) {
+ checkCapacity(1 + 16);
+ writeBuffer[writePos++] = primitiveHeader(UUID);
+
+ // UUID is stored big-endian, so don't use writeLong.
+ ByteBuffer buffer = ByteBuffer.wrap(writeBuffer, writePos, 16);
+ buffer.order(ByteOrder.BIG_ENDIAN);
+ buffer.putLong(writePos, uuid.getMostSignificantBits());
+ buffer.putLong(writePos + 8, uuid.getLeastSignificantBits());
+ writePos += 16;
+ }
+
// Add a key to the variant dictionary. If the key already exists, the dictionary is not
// modified.
// In either case, return the id of the key.
@@ -592,16 +607,19 @@ public class GenericVariantBuilder {
}
// Choose the smallest unsigned integer type that can store `value`. It must be within
- // `[0, U24_MAX]`.
+ // `[0, SIZE_LIMIT]`.
private int getIntegerSize(int value) {
- assert value >= 0 && value <= U24_MAX;
+ assert value >= 0 && value <= SIZE_LIMIT;
if (value <= U8_MAX) {
return 1;
}
if (value <= U16_MAX) {
return 2;
}
- return U24_SIZE;
+ if (value <= U24_MAX) {
+ return 3;
+ }
+ return 4;
}
private void parseFloatingPoint(JsonParser parser) throws IOException {
diff --git a/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantUtil.java b/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantUtil.java
index b37cbd7f6f..c99428b9d0 100644
--- a/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantUtil.java
+++ b/paimon-common/src/main/java/org/apache/paimon/data/variant/GenericVariantUtil.java
@@ -20,7 +20,10 @@ package org.apache.paimon.data.variant;
import java.math.BigDecimal;
import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
import java.util.Arrays;
+import java.util.UUID;
/* This file is based on source code from the Spark Project (http://spark.apache.org/), licensed by the Apache
* Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
@@ -120,6 +123,8 @@ public class GenericVariantUtil {
// Long string value. The content is (4-byte little-endian unsigned integer representing the
// string size) + (size bytes of string content).
public static final int LONG_STR = 16;
+ // UUID, 16-byte big-endian.
+ public static final int UUID = 20;
public static final byte VERSION = 1;
// The lower 4 bits of the first metadata byte contain the version.
@@ -131,8 +136,8 @@ public class GenericVariantUtil {
public static final int U24_SIZE = 3;
public static final int U32_SIZE = 4;
- // Both variant value and variant metadata need to be no longer than 16MiB.
- public static final int SIZE_LIMIT = U24_MAX + 1;
+ // Both variant value and variant metadata need to be no longer than 128MiB.
+ public static final int SIZE_LIMIT = 128 * 1024 * 1024;
public static final int MAX_DECIMAL4_PRECISION = 9;
public static final int MAX_DECIMAL8_PRECISION = 18;
@@ -248,7 +253,8 @@ public class GenericVariantUtil {
TIMESTAMP,
TIMESTAMP_NTZ,
FLOAT,
- BINARY
+ BINARY,
+ UUID
}
public static int getTypeInfo(byte[] value, int pos) {
@@ -301,6 +307,8 @@ public class GenericVariantUtil {
return Type.BINARY;
case LONG_STR:
return Type.STRING;
+ case UUID:
+ return Type.UUID;
default:
throw unknownPrimitiveTypeInVariant(typeInfo);
}
@@ -367,6 +375,8 @@ public class GenericVariantUtil {
case BINARY:
case LONG_STR:
return 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE);
+ case UUID:
+ return 17;
default:
throw unknownPrimitiveTypeInVariant(typeInfo);
}
@@ -531,7 +541,23 @@ public class GenericVariantUtil {
throw unexpectedType(Type.STRING);
}
- /** 1. */
+ // Get a UUID value from variant value `value[pos...]`.
+ // Throw `MALFORMED_VARIANT` if the variant is malformed.
+ public static UUID getUuid(byte[] value, int pos) {
+ checkIndex(pos, value.length);
+ int basicType = value[pos] & BASIC_TYPE_MASK;
+ int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+ if (basicType != PRIMITIVE || typeInfo != UUID) {
+ throw unexpectedType(Type.UUID);
+ }
+ int start = pos + 1;
+ checkIndex(start + 15, value.length);
+ // UUID values are big-endian, so we can't use VariantUtil.readLong().
+ ByteBuffer bb = ByteBuffer.wrap(value, start, 16).order(ByteOrder.BIG_ENDIAN);
+ return new UUID(bb.getLong(), bb.getLong());
+ }
+
+ /** ObjectHandler. */
public interface ObjectHandler<T> {
/**
* @param size Number of object fields.
@@ -569,7 +595,7 @@ public class GenericVariantUtil {
return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart);
}
- /** 1. */
+ /** ArrayHandler. */
public interface ArrayHandler<T> {
/**
* @param size Number of array elements.