This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new ec3d6ed387 add support to differentiate null and emptyLists for
multi-value columns in avro decoder (#13572)
ec3d6ed387 is described below
commit ec3d6ed387332e802634bb87e13c5dff55ff6087
Author: Qiaochu Liu <[email protected]>
AuthorDate: Fri Sep 13 12:25:34 2024 -0700
add support to differentiate null and emptyLists for multi-value columns in
avro decoder (#13572)
---
.../inputformat/avro/AvroRecordExtractor.java | 1 +
.../avro/AvroRecordExtractorConfig.java | 10 ++++
.../avro/AvroRecordToPinotRowGeneratorTest.java | 62 ++++++++++++++++++++++
.../spi/data/readers/BaseRecordExtractor.java | 4 +-
.../spi/data/readers/RecordExtractorConfig.java | 3 +-
5 files changed, 78 insertions(+), 2 deletions(-)
diff --git a/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractor.java b/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractor.java
index d1a4dea0dc..770029b3e3 100644
--- a/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractor.java
+++ b/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractor.java
@@ -48,6 +48,7 @@ public class AvroRecordExtractor extends BaseRecordExtractor<GenericRecord> {
AvroRecordExtractorConfig config = (AvroRecordExtractorConfig)
recordExtractorConfig;
if (config != null) {
_applyLogicalTypes = config.isEnableLogicalTypes();
+ _differentiateNullAndEmptyForMV =
config.isDifferentiateNullAndEmptyForMV();
}
if (fields == null || fields.isEmpty()) {
_extractAll = true;
diff --git a/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractorConfig.java b/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractorConfig.java
index da26b1048a..1882196777 100644
--- a/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractorConfig.java
+++ b/pinot-plugins/pinot-input-format/pinot-avro-base/src/main/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordExtractorConfig.java
@@ -27,10 +27,12 @@ import org.apache.pinot.spi.data.readers.RecordExtractorConfig;
*/
public class AvroRecordExtractorConfig implements RecordExtractorConfig {
private boolean _enableLogicalTypes = false;
+ private boolean _differentiateNullAndEmptyForMV = false;
@Override
public void init(Map<String, String> props) {
_enableLogicalTypes =
Boolean.parseBoolean(props.get("enableLogicalTypes"));
+ _differentiateNullAndEmptyForMV =
Boolean.parseBoolean(props.get("differentiateNullAndEmptyForMV"));
}
public boolean isEnableLogicalTypes() {
@@ -40,4 +42,12 @@ public class AvroRecordExtractorConfig implements RecordExtractorConfig {
public void setEnableLogicalTypes(boolean enableLogicalTypes) {
_enableLogicalTypes = enableLogicalTypes;
}
+
+ public boolean isDifferentiateNullAndEmptyForMV() {
+ return _differentiateNullAndEmptyForMV;
+ }
+
+ public void setDifferentiateNullAndEmptyForMV(boolean
differentiateNullAndEmptyForMV) {
+ _differentiateNullAndEmptyForMV = differentiateNullAndEmptyForMV;
+ }
}
diff --git a/pinot-plugins/pinot-input-format/pinot-avro-base/src/test/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordToPinotRowGeneratorTest.java b/pinot-plugins/pinot-input-format/pinot-avro-base/src/test/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordToPinotRowGeneratorTest.java
index 45cf943411..17402272a2 100644
--- a/pinot-plugins/pinot-input-format/pinot-avro-base/src/test/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordToPinotRowGeneratorTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-avro-base/src/test/java/org/apache/pinot/plugin/inputformat/avro/AvroRecordToPinotRowGeneratorTest.java
@@ -19,12 +19,15 @@
package org.apache.pinot.plugin.inputformat.avro;
import com.google.common.collect.Sets;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
import org.apache.pinot.spi.data.readers.GenericRow;
import org.testng.Assert;
import org.testng.annotations.Test;
@@ -52,4 +55,63 @@ public class AvroRecordToPinotRowGeneratorTest {
genericRow.getFieldToValueMap().keySet().containsAll(Arrays.asList("incomingTime",
"outgoingTime")));
Assert.assertEquals(genericRow.getValue("incomingTime"), 12345L);
}
+
+ @Test
+ public void testNoDifferentiateNullAndEmptyForMultiValueFields() {
+ AvroRecordExtractorConfig config = new AvroRecordExtractorConfig();
+ AvroRecordExtractor avroRecordExtractor = new AvroRecordExtractor();
+ avroRecordExtractor.init(null, config);
+
+ Schema schema =
+
SchemaBuilder.record("GenericRow").fields().name("arrField1").type().array().items().stringType().noDefault()
+
.name("arrField2").type().array().items().stringType().noDefault().name("arrField3").type().array().items()
+ .stringType().noDefault().endRecord();
+
+ GenericRecord genericRecord = new GenericData.Record(schema);
+ List<String> arrayData1 = Arrays.asList("value1", "value2", "value3");
+ List<String> arrayData2 = null;
+ List<String> arrayData3 = new ArrayList<>();
+
+ GenericRow genericRow = new GenericRow();
+ genericRecord.put("arrField1", arrayData1);
+ genericRecord.put("arrField2", arrayData2);
+ genericRecord.put("arrField3", arrayData3);
+
+ avroRecordExtractor.extract(genericRecord, genericRow);
+ Assert.assertTrue(
+
genericRow.getFieldToValueMap().keySet().containsAll(Arrays.asList("arrField1",
"arrField2", "arrField3")));
+ Assert.assertEquals(genericRow.getValue("arrField1"),
arrayData1.toArray());
+ Assert.assertEquals(genericRow.getValue("arrField2"), null);
+ Assert.assertEquals(genericRow.getValue("arrField3"), null);
+ }
+
+ @Test
+ public void testDifferentiateNullAndEmptyForMultiValueFields() {
+ AvroRecordExtractorConfig config = new AvroRecordExtractorConfig();
+ config.setDifferentiateNullAndEmptyForMV(true);
+ AvroRecordExtractor avroRecordExtractor = new AvroRecordExtractor();
+ avroRecordExtractor.init(null, config);
+
+ Schema schema =
+
SchemaBuilder.record("GenericRow").fields().name("arrField1").type().array().items().stringType().noDefault()
+
.name("arrField2").type().array().items().stringType().noDefault().name("arrField3").type().array().items()
+ .stringType().noDefault().endRecord();
+
+ GenericRecord genericRecord = new GenericData.Record(schema);
+ List<String> arrayData1 = Arrays.asList("value1", "value2", "value3");
+ List<String> arrayData2 = null;
+ List<String> arrayData3 = new ArrayList<>();
+
+ GenericRow genericRow = new GenericRow();
+ genericRecord.put("arrField1", arrayData1);
+ genericRecord.put("arrField2", arrayData2);
+ genericRecord.put("arrField3", arrayData3);
+
+ avroRecordExtractor.extract(genericRecord, genericRow);
+ Assert.assertTrue(
+
genericRow.getFieldToValueMap().keySet().containsAll(Arrays.asList("arrField1",
"arrField2", "arrField3")));
+ Assert.assertEquals(genericRow.getValue("arrField1"),
arrayData1.toArray());
+ Assert.assertEquals(genericRow.getValue("arrField2"), null);
+ Assert.assertEquals(genericRow.getValue("arrField3"), new String[0]);
+ }
}
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/BaseRecordExtractor.java b/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/BaseRecordExtractor.java
index f4efdb6d2d..39c6a6da7c 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/BaseRecordExtractor.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/BaseRecordExtractor.java
@@ -33,6 +33,8 @@ import javax.annotation.Nullable;
*/
public abstract class BaseRecordExtractor<T> implements RecordExtractor<T> {
+ protected boolean _differentiateNullAndEmptyForMV = false;
+
/**
* Converts the field value to either a single value (string, number,
byte[]), multi value (Object[]) or a Map.
* Returns {@code null} if the value is an empty array/collection/map.
@@ -107,7 +109,7 @@ public abstract class BaseRecordExtractor<T> implements RecordExtractor<T> {
protected Object convertMultiValue(Object value) {
Collection collection = (Collection) value;
if (collection.isEmpty()) {
- return null;
+ return _differentiateNullAndEmptyForMV ? new Object[0] : null;
}
int numValues = collection.size();
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/RecordExtractorConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/RecordExtractorConfig.java
index f33288de76..b4c265e351 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/RecordExtractorConfig.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/data/readers/RecordExtractorConfig.java
@@ -25,5 +25,6 @@ import java.util.Map;
* Interface for configs of {@link RecordExtractor}
*/
public interface RecordExtractorConfig {
- default void init(Map<String, String> props) { }
+ default void init(Map<String, String> props) {
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]