aihuaxu commented on code in PR #13137:
URL: https://github.com/apache/iceberg/pull/13137#discussion_r2187633611
##########
api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java:
##########
@@ -646,6 +655,88 @@ private static String sanitizeSimpleString(CharSequence
value) {
return String.format(Locale.ROOT, "(hash-%08x)", HASH_FUNC.apply(value));
}
+ private static String sanitizeVariant(Variant value, long now, int today) {
+ return sanitizeVariant(value.value(), now, today);
+ }
+
+ private static String sanitizeVariant(VariantValue value, long now, int
today) {
+ if (value instanceof VariantObject) {
+ return sanitizeVariantObject(value.asObject(), now, today);
+ } else if (value instanceof VariantPrimitive) {
+ return sanitizeVariantValue(value.asPrimitive(), value.type(), now,
today);
+ } else {
+ return sanitizeVariantArray(value.asArray(), now, today);
+ }
+ }
+
+ private static String sanitizeVariantObject(VariantObject value, long now,
int today) {
+ StringBuilder builder = new StringBuilder();
+ builder.append("{");
+ boolean first = true;
+ for (String field : value.fieldNames()) {
+ if (first) {
+ first = false;
+ } else {
+ builder.append(", ");
+ }
+ builder.append(String.format(Locale.ROOT, "(hash-%s)", field)).append(":
");
+ VariantValue fieldValue = value.get(field);
+ PhysicalType fieldType = fieldValue.type();
+ builder.append(sanitizeVariantValue(fieldValue, fieldType, now, today));
+ }
+ builder.append("}");
+ return builder.toString();
+ }
+
+ private static String sanitizeVariantArray(VariantArray value, long now, int
today) {
+ StringBuilder builder = new StringBuilder();
+ builder.append("{");
+ boolean first = true;
+ for (int i = 0; i < value.numElements(); i++) {
+ if (first) {
+ first = false;
+ } else {
+ builder.append(", ");
+ }
+ builder.append(sanitizeVariantValue(value.get(i), value.get(i).type(),
now, today));
+ }
+ builder.append("}");
+ return builder.toString();
+ }
+
+ private static String sanitizeVariantValue(
+ VariantValue fieldValue, PhysicalType fieldType, long now, int today) {
+ StringBuilder builder = new StringBuilder();
+ switch (fieldType) {
+ case INT8:
+ case INT16:
+ case INT32:
+ case INT64:
+ case FLOAT:
+ case DOUBLE:
+ case DECIMAL4:
+ case DECIMAL8:
+ case DECIMAL16:
+ builder.append(sanitizeNumber((Number) fieldValue.asPrimitive().get(),
fieldType.name()));
+ break;
+ case DATE:
+ builder.append(sanitizeDate(((Number)
fieldValue.asPrimitive().get()).intValue(), today));
+ break;
+ case TIMESTAMPTZ:
+ case TIMESTAMPNTZ:
Review Comment:
How about nano timestamp? I think we also need to handle them here as well.
And also time type.
##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java:
##########
@@ -1050,6 +1058,174 @@ public void testSelectsPartitions() {
.isFalse();
}
+ @Test
+ public void testSanitizeVariantArray() {
+ Expression bound =
+ Binder.bind(
+ STRUCT,
+ equal("var", Variant.of(VariantMetadata.empty(),
VariantTestUtil.createMixedArray())));
+ assertEquals(
+ Expressions.equal(
+ "var",
+ "{(date), (2-digit-INT8), (hash-2024a117), {(hash-7b813b77),
(hash-512b3ddf), (hash-36f6eef4)}, (hash-6926fdc4), "
+ + "(hash-06eb5017), (hash-64f79f23), (hash-3682a0b4),
(hash-451a5465), (4-digit-INT16)}"),
+ ExpressionUtil.sanitize(bound));
+ }
+
+ @Test
+ public void testSanitizeVariantPrimitive() {
+ Expression int8Bound =
+ Binder.bind(
+ STRUCT,
+ equal(
+ "var",
+ Variant.of(
+ VariantMetadata.empty(),
+ VariantTestUtil.createSerializedPrimitive(3, new byte[]
{(byte) 32}))));
+ assertEquals(Expressions.equal("var", "(2-digit-INT8)"),
ExpressionUtil.sanitize(int8Bound));
+ Expression int16Bound =
+ Binder.bind(
+ STRUCT,
+ equal(
+ "var",
+ Variant.of(
+ VariantMetadata.empty(),
+ VariantTestUtil.createSerializedPrimitive(4, new byte[]
{(byte) 0xD2, 0x04}))));
+ assertEquals(Expressions.equal("var", "(4-digit-INT16)"),
ExpressionUtil.sanitize(int16Bound));
+ Expression doubleBound =
+ Binder.bind(
+ STRUCT,
+ equal(
+ "var",
+ Variant.of(
+ VariantMetadata.empty(),
+ VariantTestUtil.createSerializedPrimitive(
+ 7,
+ new byte[] {
+ (byte) 0xB1, 0x1C, 0x6C, (byte) 0xB1, (byte) 0xF4,
0x10, 0x22, 0x11
+ }))));
+ assertEquals(
+ Expressions.equal("var", "(-224-digit-DOUBLE)"),
ExpressionUtil.sanitize(doubleBound));
+
+ Expression timestamp =
+ Binder.bind(
+ STRUCT,
+ equal(
+ "var",
+ Variant.of(
+ VariantMetadata.empty(),
+ VariantTestUtil.createSerializedPrimitive(
+ 12,
+ new byte[] {
+ 0x18, (byte) 0xD3, (byte) 0xB1, (byte) 0xD6, 0x07,
0x57, 0x05, 0x00
+ }))));
+ assertEquals(Expressions.equal("var", "(timestamp)"),
ExpressionUtil.sanitize(timestamp));
+ }
+
+ @Test
+ public void testSanitizeVariantObject() {
+ Map<String, VariantValue> data = new HashMap<String, VariantValue>();
+ data.put("event_id_8", VariantTestUtil.createSerializedPrimitive(3, new
byte[] {(byte) 32}));
+ data.put(
+ "event_id_16",
+ VariantTestUtil.createSerializedPrimitive(4, new byte[] {(byte) 0xD2,
0x04}));
+ data.put(
+ "event_id_32",
+ VariantTestUtil.createSerializedPrimitive(
+ 5, new byte[] {(byte) 0xD2, 0x02, (byte) 0x96, 0x49}));
+ data.put(
+ "event_id_64",
+ VariantTestUtil.createSerializedPrimitive(
+ 6, new byte[] {(byte) 0xB1, 0x1C, 0x6C, (byte) 0xB1, (byte) 0xF4,
0x10, 0x22, 0x11}));
+ data.put("event_name", VariantTestUtil.createString("test"));
+ data.put(
+ "event_float",
+ VariantTestUtil.createSerializedPrimitive(
+ 14, new byte[] {(byte) 0xD2, 0x02, (byte) 0x96, 0x49}));
+ data.put(
+ "event_double",
+ VariantTestUtil.createSerializedPrimitive(
+ 7, new byte[] {(byte) 0xB1, 0x1C, 0x6C, (byte) 0xB1, (byte) 0xF4,
0x10, 0x22, 0x11}));
+ byte[] empty = new byte[0];
+ data.put("event_bool_t", VariantTestUtil.createSerializedPrimitive(1,
empty));
+ data.put("event_bool_f", VariantTestUtil.createSerializedPrimitive(2,
empty));
+ data.put(
+ "event_id_dec4",
+ VariantTestUtil.createSerializedPrimitive(
+ 8, new byte[] {0x04, (byte) 0xD2, 0x02, (byte) 0x96, 0x49}));
+ data.put(
+ "event_id_dec8", // scale=9
+ VariantTestUtil.createSerializedPrimitive(
+ 9,
+ new byte[] {
+ 0x09, (byte) 0xB1, 0x1C, 0x6C, (byte) 0xB1, (byte) 0xF4, 0x10,
0x22, 0x11
+ }));
+ data.put(
+ "event_id_dec16", // scale=9
+ VariantTestUtil.createSerializedPrimitive(
+ 10,
+ new byte[] {
+ 0x09,
+ 0x15,
+ 0x71,
+ 0x34,
+ (byte) 0xB0,
+ (byte) 0xB8,
+ (byte) 0x87,
+ 0x10,
+ (byte) 0x89,
+ 0x00,
+ 0x00,
+ 0x00,
+ 0x00,
+ 0x00,
+ 0x00,
+ 0x00,
+ 0x00
+ }));
+ data.put(
+ "event_date",
+ VariantTestUtil.createSerializedPrimitive(11, new byte[] {(byte) 0xF4,
0x43, 0x00, 0x00}));
+ data.put(
+ "event_timestamp_tz",
+ VariantTestUtil.createSerializedPrimitive(
+ 12, new byte[] {0x18, (byte) 0xD3, (byte) 0xB1, (byte) 0xD6, 0x07,
0x57, 0x05, 0x00}));
+ data.put(
+ "event_timestamp_ntz",
+ VariantTestUtil.createSerializedPrimitive(
+ 13, new byte[] {0x18, (byte) 0xD3, (byte) 0xB1, (byte) 0xD6, 0x07,
0x57, 0x05, 0x00}));
+ data.put(
+ "event_binary",
+ VariantTestUtil.createSerializedPrimitive(
+ 15, new byte[] {0x05, 0x00, 0x00, 0x00, 'a', 'b', 'c', 'd', 'e'}));
+ data.put("event_array", VariantTestUtil.createMixedArray());
+ Variant value = VariantTestUtil.variant(data);
+ Expression bound = Binder.bind(STRUCT, equal("var", value));
+ assertEquals(
+ Expressions.equal(
+ "var",
+ "{(hash-event_array): "
+ + "{(date), (2-digit-INT8), (hash-2024a117), {(hash-7b813b77),
(hash-512b3ddf), (hash-36f6eef4)}, (hash-6926fdc4), "
+ + "(hash-06eb5017), (hash-64f79f23), (hash-3682a0b4),
(hash-451a5465), (4-digit-INT16)}, "
+ + "(hash-event_binary): (hash-04584bd6), "
+ + "(hash-event_bool_f): (hash-3682a0b4), "
+ + "(hash-event_bool_t): (hash-451a5465), "
+ + "(hash-event_date): (date), "
+ + "(hash-event_double): (-224-digit-DOUBLE), "
+ + "(hash-event_float): (7-digit-FLOAT), "
+ + "(hash-event_id_16): (4-digit-INT16), "
+ + "(hash-event_id_32): (10-digit-INT32), "
+ + "(hash-event_id_64): (19-digit-INT64), "
+ + "(hash-event_id_8): (2-digit-INT8), "
+ + "(hash-event_id_dec16): (10-digit-DECIMAL16), "
+ + "(hash-event_id_dec4): (6-digit-DECIMAL4), "
+ + "(hash-event_id_dec8): (10-digit-DECIMAL8), "
+ + "(hash-event_name): (hash-79b17dd6), "
+ + "(hash-event_timestamp_ntz): (timestamp), "
+ + "(hash-event_timestamp_tz): (timestamp)}"),
Review Comment:
We are missing nano timestamp, time, and UUID test coverage.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]