This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 78b08e4 convert types per value for array with mixing types (#7234)
78b08e4 is described below
commit 78b08e4af41adc2e5e6d7387960df0524e2a7e90
Author: Xiaobing <[email protected]>
AuthorDate: Fri Jul 30 17:31:26 2021 -0700
convert types per value for array with mixing types (#7234)
1. convert types for individual value in array, handling cases where array
contains mixing types
2. for safety, stop skipping type conversion based on the speculated type
for values in array
3. cast to Number type for numeric PinotDataTypes to make conversion a bit
more general
---
.../apache/pinot/common/utils/PinotDataType.java | 122 ++++++++++++++-------
.../pinot/common/utils/PinotDataTypeTest.java | 59 +++++++++-
.../recordtransformer/DataTypeTransformer.java | 14 ++-
.../recordtransformer/RecordTransformerTest.java | 6 +-
4 files changed, 153 insertions(+), 48 deletions(-)
diff --git
a/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
b/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
index d6df3e4..cb4b5fa 100644
---
a/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
+++
b/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
@@ -119,22 +119,22 @@ public enum PinotDataType {
BYTE {
@Override
public int toInt(Object value) {
- return ((Byte) value).intValue();
+ return ((Number) value).intValue();
}
@Override
public long toLong(Object value) {
- return ((Byte) value).longValue();
+ return ((Number) value).longValue();
}
@Override
public float toFloat(Object value) {
- return ((Byte) value).floatValue();
+ return ((Number) value).floatValue();
}
@Override
public double toDouble(Object value) {
- return ((Byte) value).doubleValue();
+ return ((Number) value).doubleValue();
}
@Override
@@ -203,22 +203,22 @@ public enum PinotDataType {
SHORT {
@Override
public int toInt(Object value) {
- return ((Short) value).intValue();
+ return ((Number) value).intValue();
}
@Override
public long toLong(Object value) {
- return ((Short) value).longValue();
+ return ((Number) value).longValue();
}
@Override
public float toFloat(Object value) {
- return ((Short) value).floatValue();
+ return ((Number) value).floatValue();
}
@Override
public double toDouble(Object value) {
- return ((Short) value).doubleValue();
+ return ((Number) value).doubleValue();
}
@Override
@@ -250,17 +250,17 @@ public enum PinotDataType {
@Override
public long toLong(Object value) {
- return ((Integer) value).longValue();
+ return ((Number) value).longValue();
}
@Override
public float toFloat(Object value) {
- return ((Integer) value).floatValue();
+ return ((Number) value).floatValue();
}
@Override
public double toDouble(Object value) {
- return ((Integer) value).doubleValue();
+ return ((Number) value).doubleValue();
}
@Override
@@ -292,7 +292,7 @@ public enum PinotDataType {
LONG {
@Override
public int toInt(Object value) {
- return ((Long) value).intValue();
+ return ((Number) value).intValue();
}
@Override
@@ -302,12 +302,12 @@ public enum PinotDataType {
@Override
public float toFloat(Object value) {
- return ((Long) value).floatValue();
+ return ((Number) value).floatValue();
}
@Override
public double toDouble(Object value) {
- return ((Long) value).doubleValue();
+ return ((Number) value).doubleValue();
}
@Override
@@ -339,12 +339,12 @@ public enum PinotDataType {
FLOAT {
@Override
public int toInt(Object value) {
- return ((Float) value).intValue();
+ return ((Number) value).intValue();
}
@Override
public long toLong(Object value) {
- return ((Float) value).longValue();
+ return ((Number) value).longValue();
}
@Override
@@ -354,7 +354,7 @@ public enum PinotDataType {
@Override
public double toDouble(Object value) {
- return ((Float) value).doubleValue();
+ return ((Number) value).doubleValue();
}
@Override
@@ -386,17 +386,17 @@ public enum PinotDataType {
DOUBLE {
@Override
public int toInt(Object value) {
- return ((Double) value).intValue();
+ return ((Number) value).intValue();
}
@Override
public long toLong(Object value) {
- return ((Double) value).longValue();
+ return ((Number) value).longValue();
}
@Override
public float toFloat(Object value) {
- return ((Double) value).floatValue();
+ return ((Number) value).floatValue();
}
@Override
@@ -867,7 +867,11 @@ public enum PinotDataType {
int[] intArray = new int[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- intArray[i] = singleValueType.toInt(valueArray[i]);
+ try {
+ intArray[i] = singleValueType.toInt(valueArray[i]);
+ } catch (ClassCastException e) {
+ intArray[i] = anyToInt(valueArray[i]);
+ }
}
return intArray;
}
@@ -885,7 +889,11 @@ public enum PinotDataType {
Integer[] integerArray = new Integer[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- integerArray[i] = singleValueType.toInt(valueArray[i]);
+ try {
+ integerArray[i] = singleValueType.toInt(valueArray[i]);
+ } catch (ClassCastException e) {
+ integerArray[i] = anyToInt(valueArray[i]);
+ }
}
return integerArray;
}
@@ -903,7 +911,11 @@ public enum PinotDataType {
long[] longArray = new long[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- longArray[i] = singleValueType.toLong(valueArray[i]);
+ try {
+ longArray[i] = singleValueType.toLong(valueArray[i]);
+ } catch (ClassCastException e) {
+ longArray[i] = anyToLong(valueArray[i]);
+ }
}
return longArray;
}
@@ -921,7 +933,11 @@ public enum PinotDataType {
Long[] longArray = new Long[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- longArray[i] = singleValueType.toLong(valueArray[i]);
+ try {
+ longArray[i] = singleValueType.toLong(valueArray[i]);
+ } catch (ClassCastException e) {
+ longArray[i] = anyToLong(valueArray[i]);
+ }
}
return longArray;
}
@@ -939,7 +955,11 @@ public enum PinotDataType {
float[] floatArray = new float[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- floatArray[i] = singleValueType.toFloat(valueArray[i]);
+ try {
+ floatArray[i] = singleValueType.toFloat(valueArray[i]);
+ } catch (ClassCastException e) {
+ floatArray[i] = anyToFloat(valueArray[i]);
+ }
}
return floatArray;
}
@@ -957,7 +977,11 @@ public enum PinotDataType {
Float[] floatArray = new Float[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- floatArray[i] = singleValueType.toFloat(valueArray[i]);
+ try {
+ floatArray[i] = singleValueType.toFloat(valueArray[i]);
+ } catch (ClassCastException e) {
+ floatArray[i] = anyToFloat(valueArray[i]);
+ }
}
return floatArray;
}
@@ -975,7 +999,11 @@ public enum PinotDataType {
double[] doubleArray = new double[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+ try {
+ doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+ } catch (ClassCastException e) {
+ doubleArray[i] = anyToDouble(valueArray[i]);
+ }
}
return doubleArray;
}
@@ -993,7 +1021,11 @@ public enum PinotDataType {
Double[] doubleArray = new Double[length];
PinotDataType singleValueType = getSingleValueType();
for (int i = 0; i < length; i++) {
- doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+ try {
+ doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+ } catch (ClassCastException e) {
+ doubleArray[i] = anyToDouble(valueArray[i]);
+ }
}
return doubleArray;
}
@@ -1096,6 +1128,22 @@ public enum PinotDataType {
return (pdt != null) ? pdt : OBJECT_ARRAY;
}
+ private static int anyToInt(Object val) {
+ return getSingleValueType(val.getClass()).toInt(val);
+ }
+
+ private static long anyToLong(Object val) {
+ return getSingleValueType(val.getClass()).toLong(val);
+ }
+
+ private static float anyToFloat(Object val) {
+ return getSingleValueType(val.getClass()).toFloat(val);
+ }
+
+ private static double anyToDouble(Object val) {
+ return getSingleValueType(val.getClass()).toDouble(val);
+ }
+
/**
* Returns the {@link PinotDataType} for the given {@link FieldSpec} for
data ingestion purpose. Returns object array
* type for multi-valued types.
@@ -1105,36 +1153,36 @@ public enum PinotDataType {
DataType dataType = fieldSpec.getDataType();
switch (dataType) {
case INT:
- return fieldSpec.isSingleValueField() ? PinotDataType.INTEGER :
PinotDataType.INTEGER_ARRAY;
+ return fieldSpec.isSingleValueField() ? INTEGER : INTEGER_ARRAY;
case LONG:
- return fieldSpec.isSingleValueField() ? PinotDataType.LONG :
PinotDataType.LONG_ARRAY;
+ return fieldSpec.isSingleValueField() ? LONG : LONG_ARRAY;
case FLOAT:
- return fieldSpec.isSingleValueField() ? PinotDataType.FLOAT :
PinotDataType.FLOAT_ARRAY;
+ return fieldSpec.isSingleValueField() ? FLOAT : FLOAT_ARRAY;
case DOUBLE:
- return fieldSpec.isSingleValueField() ? PinotDataType.DOUBLE :
PinotDataType.DOUBLE_ARRAY;
+ return fieldSpec.isSingleValueField() ? DOUBLE : DOUBLE_ARRAY;
case BOOLEAN:
if (fieldSpec.isSingleValueField()) {
- return PinotDataType.BOOLEAN;
+ return BOOLEAN;
} else {
throw new IllegalStateException("There is no multi-value type for
BOOLEAN");
}
case TIMESTAMP:
if (fieldSpec.isSingleValueField()) {
- return PinotDataType.TIMESTAMP;
+ return TIMESTAMP;
} else {
throw new IllegalStateException("There is no multi-value type for
TIMESTAMP");
}
case JSON:
if (fieldSpec.isSingleValueField()) {
- return PinotDataType.JSON;
+ return JSON;
} else {
throw new IllegalStateException("There is no multi-value type for
JSON");
}
case STRING:
- return fieldSpec.isSingleValueField() ? PinotDataType.STRING :
PinotDataType.STRING_ARRAY;
+ return fieldSpec.isSingleValueField() ? STRING : STRING_ARRAY;
case BYTES:
if (fieldSpec.isSingleValueField()) {
- return PinotDataType.BYTES;
+ return BYTES;
} else {
throw new IllegalStateException("There is no multi-value type for
BYTES");
}
diff --git
a/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
b/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
index 9ca7d4e..e1e3e39 100644
---
a/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
+++
b/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
@@ -46,6 +46,20 @@ public class PinotDataTypeTest {
(byte) 123), Character.toString((char) 123), Short.toString((short)
123), Integer.toString(
123), Long.toString(123L), Float.toString(123f),
Double.toString(123d), " 123"};
+ // Test cases where array for MV column contains values of mixing types.
+ private static final PinotDataType[] SOURCE_ARRAY_TYPES =
+ {SHORT_ARRAY, INTEGER_ARRAY, LONG_ARRAY, FLOAT_ARRAY, DOUBLE_ARRAY};
+ private static final Object[] SOURCE_ARRAY_VALUES = new Object[]{(short)
123, 4, 5L, 6f, 7d, "8"};
+
+ private static final PinotDataType[] DEST_ARRAY_TYPES = {INTEGER_ARRAY,
LONG_ARRAY, FLOAT_ARRAY, DOUBLE_ARRAY};
+ private static final Object[] EXPECTED_DEST_ARRAY_VALUES =
+ {new Object[]{123, 4, 5, 6, 7, 8}, new Object[]{123L, 4L, 5L, 6L, 7L,
8L}, new Object[]{123f, 4f, 5f, 6f, 7f, 8f}, new Object[]{123d, 4d, 5d, 6d, 7d,
8d}};
+
+ private static final PinotDataType[] DEST_PRIMITIVE_ARRAY_TYPES =
+ {PRIMITIVE_INT_ARRAY, PRIMITIVE_LONG_ARRAY, PRIMITIVE_FLOAT_ARRAY,
PRIMITIVE_DOUBLE_ARRAY};
+ private static final Object[] EXPECTED_DEST_PRIMITIVE_ARRAY_VALUES =
+ {new int[]{123, 4, 5, 6, 7, 8}, new long[]{123L, 4L, 5L, 6L, 7L, 8L},
new float[]{123f, 4f, 5f, 6f, 7f, 8f}, new double[]{123d, 4d, 5d, 6d, 7d, 8d}};
+
@Test
public void testNumberConversion() {
int numDestTypes = DEST_TYPES.length;
@@ -61,6 +75,36 @@ public class PinotDataTypeTest {
}
@Test
+ public void testConversionWithMixTypes() {
+ int numDestTypes = DEST_ARRAY_TYPES.length;
+ for (int i = 0; i < numDestTypes; i++) {
+ PinotDataType destType = DEST_ARRAY_TYPES[i];
+ Object expectedDestValue = EXPECTED_DEST_ARRAY_VALUES[i];
+ for (PinotDataType sourceArrayType : SOURCE_ARRAY_TYPES) {
+ Object actualDestValue = destType.convert(SOURCE_ARRAY_VALUES,
sourceArrayType);
+ assertEquals(actualDestValue, expectedDestValue);
+ }
+ }
+
+ numDestTypes = DEST_PRIMITIVE_ARRAY_TYPES.length;
+ for (int i = 0; i < numDestTypes; i++) {
+ PinotDataType destType = DEST_PRIMITIVE_ARRAY_TYPES[i];
+ Object expectedDestValue = EXPECTED_DEST_PRIMITIVE_ARRAY_VALUES[i];
+ for (PinotDataType sourceArrayType : SOURCE_ARRAY_TYPES) {
+ Object actualDestValue = destType.convert(SOURCE_ARRAY_VALUES,
sourceArrayType);
+ assertEquals(actualDestValue, expectedDestValue);
+ }
+ }
+
+ try {
+ INTEGER_ARRAY.convert(new Object[]{"abc"}, LONG_ARRAY);
+ fail();
+ } catch (NumberFormatException e) {
+ // expected to reach here
+ }
+ }
+
+ @Test
public void testToString() {
int numSourceTypes = SOURCE_TYPES.length;
for (int i = 0; i < numSourceTypes; i++) {
@@ -97,7 +141,7 @@ public class PinotDataTypeTest {
assertEquals(STRING.convert(new byte[]{0, 1}, BYTES), "0001");
assertEquals(BYTES.convert("0001", STRING), new byte[]{0, 1});
assertEquals(BYTES.convert(new byte[]{0, 1}, BYTES), new byte[]{0, 1});
- assertEquals(BYTES.convert("AAE=", JSON), new byte[]{0,1});
+ assertEquals(BYTES.convert("AAE=", JSON), new byte[]{0, 1});
assertEquals(BYTES.convert(new Byte[]{0, 1}, BYTE_ARRAY), new byte[]{0,
1});
assertEquals(BYTES.convert(new String[]{"0001"}, STRING_ARRAY), new
byte[]{0, 1});
}
@@ -116,7 +160,10 @@ public class PinotDataTypeTest {
assertEquals(JSON.convert(false, BOOLEAN), "false");
assertEquals(JSON.convert(true, BOOLEAN), "true");
assertEquals(JSON.convert(new byte[]{0, 1}, BYTES), "\"AAE=\""); // Base64
encoding.
-
assertEquals(JSON.convert("{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}",
STRING),
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
+ assertEquals(JSON.convert(
+
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}",
+ STRING),
+
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
assertEquals(JSON.convert(new Timestamp(1620324238610l), TIMESTAMP),
"1620324238610");
}
@@ -129,7 +176,8 @@ public class PinotDataTypeTest {
assertTrue(OBJECT.toBoolean(new NumberObject("123")));
assertEquals(OBJECT.toTimestamp(new NumberObject("123")).getTime(), 123L);
assertEquals(OBJECT.toString(new NumberObject("123")), "123");
- assertEquals(OBJECT.toJson(getGenericTestObject()),
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
+ assertEquals(OBJECT.toJson(getGenericTestObject()),
+
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
assertEquals(OBJECT_ARRAY.getSingleValueType(), OBJECT);
// Non-zero value is treated as true.
assertTrue(OBJECT.toBoolean(1.1d));
@@ -181,13 +229,13 @@ public class PinotDataTypeTest {
private static Object getGenericTestObject() {
Map<String, Object> map1 = new HashMap<>();
- map1.put("array", Arrays.asList(-5.4,4, "2"));
+ map1.put("array", Arrays.asList(-5.4, 4, "2"));
map1.put("key1", "value");
map1.put("key2", null);
Map<String, Object> map2 = new HashMap<>();
map2.put("map", map1);
- map2.put("bytes", new byte[]{0,1});
+ map2.put("bytes", new byte[]{0, 1});
map2.put("timestamp", new Timestamp(1620324238610l));
return map2;
@@ -222,7 +270,6 @@ public class PinotDataTypeTest {
}
assertInvalidConversion("xyz", STRING, JSON, RuntimeException.class);
-
}
private void assertInvalidConversion(Object value, PinotDataType sourceType,
PinotDataType destType,
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
index bbc6d7d..fbf18bf 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
@@ -77,9 +77,17 @@ public class DataTypeTransformer implements
RecordTransformer {
// Single-value column
source = PinotDataType.getSingleValueType(value.getClass());
}
- if (source != dest) {
- value = dest.convert(value, source);
- }
+ // Skipping conversion when srcType!=destType is speculative, and can
be unsafe when
+ // the array for MV column contains values of mixing types. Mixing
types can lead
+ // to ClassCastException during conversion, often aborting data
ingestion jobs.
+ //
+ // So now, calling convert() unconditionally for safety. Perf impact
is negligible:
+ // 1. for SV column, when srcType=destType, the conversion is simply
pass through.
+ // 2. for MV column, when srcType=destType, the conversion is simply
pass through
+ // if the source type is not Object[] (but sth like Integer[],
Double[]). For Object[],
+ // the conversion loops through values in the array like before, but
can catch the
+ // ClassCastException if it happens and continue the conversion now.
+ value = dest.convert(value, source);
value = dest.toInternal(value);
record.putValue(column, value);
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
index 367be7e..e711697 100644
---
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
@@ -150,8 +150,10 @@ public class RecordTransformerTest {
assertEquals(record.getValue("mvDouble"), new Object[]{123d});
assertEquals(record.getValue("svStringWithNullCharacters"),
"1\0002\0003");
assertEquals(record.getValue("svStringWithLengthLimit"), "123");
- // NOTE: We identify the array type by the first element, so data type
conversion only applied to 'mvString2'
- assertEquals(record.getValue("mvString1"), new Object[]{"123", 123,
123L, 123f, 123.0});
+ // NOTE: We used to speculate the array type by the first element, but
has changed
+ // to convert type for values in array based on their real value, making
the logic
+ // safe to handle array of values with mixing types.
+ assertEquals(record.getValue("mvString1"), new Object[]{"123", "123",
"123", "123.0", "123.0"});
assertEquals(record.getValue("mvString2"), new Object[]{"123", "123",
"123.0", "123.0", "123"});
assertNull(record.getValue("$virtual"));
assertTrue(record.getNullValueFields().isEmpty());
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]