tddfan commented on code in PR #1102:
URL: https://github.com/apache/parquet-mr/pull/1102#discussion_r1221862896
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
+ boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS",
false);
+
+ myBuilder = builder;
if (pvc == null) {
throw new IllegalStateException("Missing parent value container");
}
- myBuilder = builder;
+ if(builder == null && ignoreUnknownFields) {
+ IntStream.range(0, parquetSchema.getFieldCount())
+ .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC,
parquetSchema.getType(i), conf, extraMetadata));
- Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+ } else {
- for (Type parquetField : parquetSchema.getFields()) {
- Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+ int parquetFieldIndex = 0;
+ Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
- if (protoField == null) {
- String description = "Scheme mismatch \n\"" + parquetField + "\"" +
- "\n proto descriptor:\n" + protoDescriptor.toProto();
- throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
- }
+ for (Type parquetField : parquetSchema.getFields()) {
+
+ Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+
+ validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(),
parquetField, protoField);
+
+ converters[parquetFieldIndex] = protoField != null ?
+ newMessageConverter(myBuilder, protoField, parquetField) :
+ dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
- converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder,
protoField, parquetField);
+ parquetFieldIndex++;
+ }
+
+ }
+ }
- parquetFieldIndex++;
+ private void validateProtoField(boolean ignoreUnknownFields,
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField,
Descriptors.FieldDescriptor protoField) {
+ if (protoField == null && !ignoreUnknownFields) {
+ String description = "Schema mismatch \n\"" + parquetField + "\"" +
+ "\n proto descriptor:\n" + protoDescriptor;
+ throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
}
}
+ private Converter dummyScalarConverter(ParentValueContainer pvc,
+ Type parquetField, Configuration conf,
+ Map<String, String> extraMetadata) {
+
+ if(parquetField.isPrimitive()) {
+ PrimitiveType primitiveType = parquetField.asPrimitiveType();
+ PrimitiveType.PrimitiveTypeName primitiveTypeName =
primitiveType.getPrimitiveTypeName();
+ switch (primitiveTypeName) {
+ case BINARY: return new ProtoStringConverter(pvc);
Review Comment:
Good catch. Some Parquet primitive types were missing; I have added them to the list now.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]