From Preetham Poluparthi <[email protected]>:
Preetham Poluparthi has uploaded this change for review. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20525?usp=email )
Change subject: WIP: parquet read null fix
......................................................................
WIP: parquet read null fix
Change-Id: Ib4cd841fa40d4fdd5bd330d2127a6437e5bd6565
---
M
asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.05.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.08.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
M
asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
M
asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/hdfs/parquet/converter/nested/ObjectConverter.java
M
asterixdb/asterix-om/src/main/java/org/apache/asterix/om/lazy/NullLazyVisitablePointable.java
7 files changed, 47 insertions(+), 15 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb
refs/changes/25/20525/1
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.05.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.05.adm
index 763b652..84a38f7 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.05.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.05.adm
@@ -1,5 +1,5 @@
-{ "id": 8, "nested": { "List": [ 100, 300 ] }, "obj_array": [ { "first":
"first" }, { }, { "first": "second" } ], "a": { }, "arr": [ [ 1, 2 ] ] }
-{ "id": 10, "name": "Virat", "nested": { "List": [ ] }, "obj_array": [ {
"first": "first" }, { "first": "second" } ], "a": { }, "c": { }, "f": [ ],
"arr": [ [ 1, 2 ], [ ] ] }
-{ "id": 28, "name": "Virat", "nested": { "List": [ ], "A": "a" },
"obj_array": [ { "first": "first" }, { "first": "second" } ], "a": { "b": 1 },
"c": { "d": 1 }, "f": [ 1.0 ], "arr": [ [ 1, 2 ], [ ] ] }
-{ "id": 34, "nested": { "randomK": "randomV" }, "obj_array": [ { "first":
"first" }, { "first": "second" } ], "c": { "e": 1 }, "f": [ 2.0, 3.0 ], "arr":
[ [ ] ] }
-{ "id": 37, "name": "Kohli", "nested": { "List": [ 1, 2, 3 ], "A": "a" },
"obj_array": [ { "first": "first" }, { "first": "second" } ], "a": { "b": 1 },
"c": { "d": 1, "e": 1 }, "f": [ 3.5999999046325684, 4.0 ], "arr": [ [ 1, 2, 3 ]
] }
+{ "id": 8, "nested": { "List": [ 100, 300 ], "A": null, "randomK": null },
"obj_array": [ { "first": "first" }, { "first": null }, { "first": "second" }
], "a": { "b": null }, "arr": [ [ 1, 2 ] ], "c": null, "f": null, "name": null }
+{ "id": 10, "name": "Virat", "nested": { "List": [ ], "A": null, "randomK":
null }, "obj_array": [ { "first": "first" }, { "first": "second" } ], "a": {
"b": null }, "c": { "d": null, "e": null }, "f": [ ], "arr": [ [ 1, 2 ], [ ]
] }
+{ "id": 28, "name": "Virat", "nested": { "List": [ ], "A": "a", "randomK":
null }, "obj_array": [ { "first": "first" }, { "first": "second" } ], "a": {
"b": 1 }, "c": { "d": 1, "e": null }, "f": [ 1.0 ], "arr": [ [ 1, 2 ], [ ] ] }
+{ "id": 34, "nested": { "randomK": "randomV", "A": null, "List": null },
"obj_array": [ { "first": "first" }, { "first": "second" } ], "c": { "e": 1,
"d": null }, "f": [ 2.0, 3.0 ], "arr": [ [ ] ], "a": null, "name": null }
+{ "id": 37, "name": "Kohli", "nested": { "List": [ 1, 2, 3 ], "A": "a",
"randomK": null }, "obj_array": [ { "first": "first" }, { "first": "second" }
], "a": { "b": 1 }, "c": { "d": 1, "e": 1 }, "f": [ 3.5999999046325684, 4.0 ],
"arr": [ [ 1, 2, 3 ] ] }
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.08.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.08.adm
index 628b82f..30e6687 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.08.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null-type/parquet-null-type.08.adm
@@ -1,5 +1,5 @@
-{ "arr": [ [ 1, 2 ] ], "a": { }, "id": 8, "nested": { "List": [ 100, 300 ] },
"obj_array": [ { "first": "first" }, { }, { "first": "second" } ] }
-{ "arr": [ [ 1, 2 ], [ ] ], "a": { }, "c": { }, "f": [ ], "name": "Virat",
"id": 10, "nested": { "List": [ ] }, "obj_array": [ { "first": "first" }, {
"first": "second" } ] }
-{ "arr": [ [ 1, 2 ], [ ] ], "a": { "b": 1 }, "c": { "d": 1 }, "f": [ 1.0 ],
"name": "Virat", "id": 28, "nested": { "A": "a", "List": [ ] }, "obj_array": [
{ "first": "first" }, { "first": "second" } ] }
-{ "arr": [ [ ] ], "c": { "e": 1 }, "f": [ 2.0, 3.0 ], "id": 34, "nested": {
"randomK": "randomV" }, "obj_array": [ { "first": "first" }, { "first":
"second" } ] }
-{ "arr": [ [ 1, 2, 3 ] ], "a": { "b": 1 }, "c": { "d": 1, "e": 1 }, "f": [
3.6, 4.0 ], "name": "Kohli", "id": 37, "nested": { "A": "a", "List": [ 1, 2, 3
] }, "obj_array": [ { "first": "first" }, { "first": "second" } ] }
+{ "arr": [ [ 1, 2 ] ], "a": { "b": null }, "id": 8, "nested": { "List": [ 100,
300 ], "A": null, "randomK": null }, "obj_array": [ { "first": "first" }, {
"first": null }, { "first": "second" } ], "c": null, "f": null, "name": null }
+{ "arr": [ [ 1, 2 ], [ ] ], "a": { "b": null }, "c": { "d": null, "e": null
}, "f": [ ], "name": "Virat", "id": 10, "nested": { "List": [ ], "A": null,
"randomK": null }, "obj_array": [ { "first": "first" }, { "first": "second" } ]
}
+{ "arr": [ [ 1, 2 ], [ ] ], "a": { "b": 1 }, "c": { "d": 1, "e": null }, "f":
[ 1.0 ], "name": "Virat", "id": 28, "nested": { "A": "a", "List": [ ],
"randomK": null }, "obj_array": [ { "first": "first" }, { "first": "second" } ]
}
+{ "arr": [ [ ] ], "c": { "e": 1, "d": null }, "f": [ 2.0, 3.0 ], "id": 34,
"nested": { "randomK": "randomV", "A": null, "List": null }, "obj_array": [ {
"first": "first" }, { "first": "second" } ], "a": null, "name": null }
+{ "arr": [ [ 1, 2, 3 ] ], "a": { "b": 1 }, "c": { "d": 1, "e": 1 }, "f": [
3.6, 4.0 ], "name": "Kohli", "id": 37, "nested": { "A": "a", "List": [ 1, 2, 3
], "randomK": null }, "obj_array": [ { "first": "first" }, { "first": "second"
} ] }
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm
index 29ca9ec..5122755 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-null2/parquet-null2.04.adm
@@ -1,3 +1,3 @@
{ "c": { "col2": { "centuries": [ ], "name": "aqay awil", "id": 1 } } }
{ "c": { "col2": { "centuries": [ ], "id": 2 } } }
-{ "c": { "col2": { "centuries": [ ], "id": 3 } } }
+{ "c": { "col2": { "centuries": [ ], "id": 3, "name": null } } }
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
index bf567b2..3bb97b9 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-simple/parquet-simple.04.adm
@@ -1 +1 @@
-{ "id": "123" }
\ No newline at end of file
+{ "id": "123", "name": null }
\ No newline at end of file
diff --git
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
index 4fd973e..42c906a 100644
---
a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
+++
b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
@@ -1,4 +1,4 @@
-{ "ratings": [ ], "id": 2 }
+{ "ratings": [ ], "id": 2, "rating": null }
{ "ratings": [ ], "rating": 1.0, "id": 5 }
{ "ratings": [ 1 ], "rating": 2.0, "id": 8 }
{ "ratings": [ 1, 2, 3 ], "rating": 3.0, "id": 10 }
diff --git
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/hdfs/parquet/converter/nested/ObjectConverter.java
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/hdfs/parquet/converter/nested/ObjectConverter.java
index 6b63a7b..f86f9f0 100644
---
a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/hdfs/parquet/converter/nested/ObjectConverter.java
+++
b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/hdfs/parquet/converter/nested/ObjectConverter.java
@@ -19,6 +19,8 @@
package
org.apache.asterix.external.input.record.reader.hdfs.parquet.converter.nested;
import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.asterix.builders.IARecordBuilder;
import
org.apache.asterix.external.input.filter.embedder.IExternalFilterValueEmbedder;
@@ -26,6 +28,7 @@
import
org.apache.asterix.external.input.record.reader.hdfs.parquet.converter.IFieldValue;
import
org.apache.asterix.external.input.record.reader.hdfs.parquet.converter.ParquetConverterContext;
import
org.apache.asterix.external.input.record.reader.hdfs.parquet.converter.primitve.PrimitiveConverterProvider;
+import org.apache.asterix.om.lazy.NullLazyVisitablePointable;
import org.apache.asterix.om.pointables.base.DefaultOpenFieldType;
import org.apache.asterix.om.types.ATypeTag;
import org.apache.hyracks.api.exceptions.HyracksDataException;
@@ -42,14 +45,21 @@
*/
private boolean ignore = false;
+ private Set<String> fieldNames;
+ private GroupType parquetType;
+
public ObjectConverter(AbstractComplexConverter parent, int index,
GroupType parquetType,
ParquetConverterContext context) throws IOException {
super(parent, index, parquetType, context);
+ fieldNames = new HashSet<>();
+ this.parquetType = parquetType;
}
public ObjectConverter(AbstractComplexConverter parent, String
stringFieldName, int index, GroupType parquetType,
ParquetConverterContext context) throws IOException {
super(parent, stringFieldName, index, parquetType, context);
+ fieldNames = new HashSet<>();
+ this.parquetType = parquetType;
}
@Override
@@ -63,12 +73,32 @@
} else {
ignore = checkValueEmbedder(valueEmbedder);
}
+
+ for (int i = 0; i < parquetType.getFieldCount(); i++) {
+ fieldNames.add(parquetType.getFieldName(i));
+ }
}
@Override
public void end() {
closeDirectRepeatedChildren();
if (!ignore) {
+ IExternalFilterValueEmbedder valueEmbedder =
context.getValueEmbedder();
+ for (String fieldNameStr : fieldNames) {
+ try {
+ if (valueEmbedder.shouldEmbed(fieldNameStr,
ATypeTag.NULL)) {
+
builder.addField(context.getSerializedFieldName(fieldNameStr),
+ valueEmbedder.getEmbeddedValue());
+ } else {
+
builder.addField(context.getSerializedFieldName(fieldNameStr),
+ NullLazyVisitablePointable.INSTANCE);
+ }
+ } catch (HyracksDataException e) {
+ throw new IllegalStateException(e);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
writeToParent();
context.getValueEmbedder().exitObject();
}
@@ -92,8 +122,10 @@
}
IExternalFilterValueEmbedder valueEmbedder =
context.getValueEmbedder();
IValueReference fieldName = value.getFieldName();
+ String fieldNameStr = value.getStringFieldName();
+ fieldNames.remove(fieldNameStr);
try {
- if (valueEmbedder.shouldEmbed(value.getStringFieldName(),
value.getTypeTag())) {
+ if (valueEmbedder.shouldEmbed(fieldNameStr, value.getTypeTag())) {
builder.addField(fieldName, valueEmbedder.getEmbeddedValue());
} else {
builder.addField(fieldName, getValue());
diff --git
a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/lazy/NullLazyVisitablePointable.java
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/lazy/NullLazyVisitablePointable.java
index 95e0f5b..9aa7afa 100644
---
a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/lazy/NullLazyVisitablePointable.java
+++
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/lazy/NullLazyVisitablePointable.java
@@ -20,7 +20,7 @@
import org.apache.asterix.om.types.ATypeTag;
-class NullLazyVisitablePointable extends FlatLazyVisitablePointable {
+public class NullLazyVisitablePointable extends FlatLazyVisitablePointable {
public static final AbstractLazyVisitablePointable INSTANCE = new
NullLazyVisitablePointable();
public NullLazyVisitablePointable() {
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20525?usp=email
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings?usp=email
Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: phoenix
Gerrit-Change-Id: Ib4cd841fa40d4fdd5bd330d2127a6437e5bd6565
Gerrit-Change-Number: 20525
Gerrit-PatchSet: 1
Gerrit-Owner: Preetham Poluparthi <[email protected]>