[
https://issues.apache.org/jira/browse/PARQUET-2305?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17729884#comment-17729884
]
ASF GitHub Bot commented on PARQUET-2305:
-----------------------------------------
wgtmac commented on code in PR #1102:
URL: https://github.com/apache/parquet-mr/pull/1102#discussion_r1220671204
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -31,9 +29,7 @@
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
-import org.apache.parquet.schema.GroupType;
-import org.apache.parquet.schema.IncompatibleSchemaModificationException;
-import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.*;
Review Comment:
ditto
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -18,9 +18,7 @@
*/
package org.apache.parquet.proto;
-import com.google.protobuf.ByteString;
-import com.google.protobuf.Descriptors;
-import com.google.protobuf.Message;
+import com.google.protobuf.*;
Review Comment:
Please do not use import *
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
super(file);
}
+ private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
+ if(ignoreUnknownFields) {
+ this.set("IGNORE_UNKNOWN_FIELDS", "TRUE");
Review Comment:
We may define `IGNORE_UNKNOWN_FIELDS` constant in ProtoConstants.java like
this:
https://github.com/apache/parquet-mr/blob/master/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoConstants.java#L35
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
super(file);
}
+ private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
+ if(ignoreUnknownFields) {
Review Comment:
```suggestion
if (ignoreUnknownFields) {
```
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
+ boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS",
false);
+
+ myBuilder = builder;
if (pvc == null) {
throw new IllegalStateException("Missing parent value container");
}
- myBuilder = builder;
+ if(builder == null && ignoreUnknownFields) {
Review Comment:
```suggestion
if (builder == null && ignoreUnknownFields) {
```
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
+ boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS",
false);
+
+ myBuilder = builder;
if (pvc == null) {
throw new IllegalStateException("Missing parent value container");
}
- myBuilder = builder;
+ if(builder == null && ignoreUnknownFields) {
+ IntStream.range(0, parquetSchema.getFieldCount())
+ .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC,
parquetSchema.getType(i), conf, extraMetadata));
- Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+ } else {
- for (Type parquetField : parquetSchema.getFields()) {
- Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+ int parquetFieldIndex = 0;
+ Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
- if (protoField == null) {
- String description = "Scheme mismatch \n\"" + parquetField + "\"" +
- "\n proto descriptor:\n" + protoDescriptor.toProto();
- throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
- }
+ for (Type parquetField : parquetSchema.getFields()) {
Review Comment:
The indentation in the for loop looks weird to me. Could you please format
it a little bit?
##########
parquet-protobuf/src/test/java/org/apache/parquet/proto/ProtoSchemaEvolutionTest.java:
##########
@@ -65,4 +66,68 @@ public void testEnumSchemaWriteV1ReadV2() throws IOException
{
assertEquals(messagesV2.size(), 1);
assertSame(messagesV2.get(0).getOptionalLabelNumberPair(),
TestProto3SchemaV2.MessageSchema.LabelNumberPair.SECOND);
}
+
+ /**
Review Comment:
Thanks for adding the test!
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -37,11 +37,17 @@
public static <T> ParquetReader.Builder<T> builder(Path file) {
return new ProtoParquetReader.Builder<T>(file);
}
-
+ public static <T> ParquetReader.Builder<T> builder(Path file, boolean
ignoreUnknownFields) {
Review Comment:
Please add one blank line before and after this new method.
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoParquetReader.java:
##########
@@ -71,6 +77,13 @@ protected Builder(InputFile file) {
super(file);
}
+ private Builder setIgnoreUnknownFields(boolean ignoreUnknownFields) {
Review Comment:
What about making it protected?
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
+ boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS",
false);
+
+ myBuilder = builder;
if (pvc == null) {
throw new IllegalStateException("Missing parent value container");
}
- myBuilder = builder;
+ if(builder == null && ignoreUnknownFields) {
+ IntStream.range(0, parquetSchema.getFieldCount())
+ .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC,
parquetSchema.getType(i), conf, extraMetadata));
- Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+ } else {
- for (Type parquetField : parquetSchema.getFields()) {
- Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+ int parquetFieldIndex = 0;
+ Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
- if (protoField == null) {
- String description = "Scheme mismatch \n\"" + parquetField + "\"" +
- "\n proto descriptor:\n" + protoDescriptor.toProto();
- throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
- }
+ for (Type parquetField : parquetSchema.getFields()) {
+
+ Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+
+ validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(),
parquetField, protoField);
+
+ converters[parquetFieldIndex] = protoField != null ?
+ newMessageConverter(myBuilder, protoField, parquetField) :
+ dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
- converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder,
protoField, parquetField);
+ parquetFieldIndex++;
+ }
+
+ }
+ }
- parquetFieldIndex++;
+ private void validateProtoField(boolean ignoreUnknownFields,
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField,
Descriptors.FieldDescriptor protoField) {
Review Comment:
This line is too long and should be split.
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
+ boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS",
false);
+
+ myBuilder = builder;
if (pvc == null) {
throw new IllegalStateException("Missing parent value container");
}
- myBuilder = builder;
+ if(builder == null && ignoreUnknownFields) {
+ IntStream.range(0, parquetSchema.getFieldCount())
+ .forEach(i-> converters[i] = dummyScalarConverter(DUMMY_PVC,
parquetSchema.getType(i), conf, extraMetadata));
- Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
+ } else {
- for (Type parquetField : parquetSchema.getFields()) {
- Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+ int parquetFieldIndex = 0;
+ Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();
- if (protoField == null) {
- String description = "Scheme mismatch \n\"" + parquetField + "\"" +
- "\n proto descriptor:\n" + protoDescriptor.toProto();
- throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
- }
+ for (Type parquetField : parquetSchema.getFields()) {
+
+ Descriptors.FieldDescriptor protoField =
protoDescriptor.findFieldByName(parquetField.getName());
+
+ validateProtoField(ignoreUnknownFields, protoDescriptor.toProto(),
parquetField, protoField);
+
+ converters[parquetFieldIndex] = protoField != null ?
+ newMessageConverter(myBuilder, protoField, parquetField) :
+ dummyScalarConverter(DUMMY_PVC, parquetField, conf, extraMetadata);
- converters[parquetFieldIndex - 1] = newMessageConverter(myBuilder,
protoField, parquetField);
+ parquetFieldIndex++;
+ }
+
+ }
+ }
- parquetFieldIndex++;
+ private void validateProtoField(boolean ignoreUnknownFields,
DescriptorProtos.DescriptorProto protoDescriptor, Type parquetField,
Descriptors.FieldDescriptor protoField) {
+ if (protoField == null && !ignoreUnknownFields) {
+ String description = "Schema mismatch \n\"" + parquetField + "\"" +
+ "\n proto descriptor:\n" + protoDescriptor;
+ throw new IncompatibleSchemaModificationException("Cant find \"" +
parquetField.getName() + "\" " + description);
}
}
+ private Converter dummyScalarConverter(ParentValueContainer pvc,
+ Type parquetField, Configuration conf,
+ Map<String, String> extraMetadata) {
+
+ if(parquetField.isPrimitive()) {
+ PrimitiveType primitiveType = parquetField.asPrimitiveType();
+ PrimitiveType.PrimitiveTypeName primitiveTypeName =
primitiveType.getPrimitiveTypeName();
+ switch (primitiveTypeName) {
+ case BINARY: return new ProtoStringConverter(pvc);
Review Comment:
CMIW, do we miss some cases? It looks shorter than here:
https://github.com/apache/parquet-mr/blob/master/parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java#L176-L184
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -86,32 +89,71 @@ class ProtoMessageConverter extends GroupConverter {
this.conf = conf;
this.parent = pvc;
this.extraMetadata = extraMetadata;
- int parquetFieldIndex = 1;
+ boolean ignoreUnknownFields = conf.getBoolean("IGNORE_UNKNOWN_FIELDS",
false);
Review Comment:
Seems line 92-94 can be moved under line 98?
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -124,13 +166,15 @@ public void start() {
@Override
public void end() {
- parent.add(myBuilder.build());
- myBuilder.clear();
+ if(myBuilder != null) {
+ parent.add(myBuilder.build());
+ myBuilder.clear();
+ }
}
protected Converter newMessageConverter(final Message.Builder parentBuilder,
final Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
- boolean isRepeated = fieldDescriptor.isRepeated();
+ boolean isRepeated = fieldDescriptor==null ? false :
fieldDescriptor.isRepeated();
Review Comment:
```suggestion
boolean isRepeated = fieldDescriptor != null &&
fieldDescriptor.isRepeated();
```
##########
parquet-protobuf/src/main/java/org/apache/parquet/proto/ProtoMessageConverter.java:
##########
@@ -124,13 +166,15 @@ public void start() {
@Override
public void end() {
- parent.add(myBuilder.build());
- myBuilder.clear();
+ if(myBuilder != null) {
Review Comment:
```suggestion
if (myBuilder != null) {
```
##########
parquet-protobuf/src/test/java/org/apache/parquet/proto/TestUtils.java:
##########
@@ -195,6 +196,18 @@ public static <T extends MessageOrBuilder> List<T>
readMessages(Path file, Class
}
}
+ /**
+ * Read messages from given file into the expected proto class.
+ * @param file
+ * @param messageClass
+ * @param <T>
+ * @return List of protobuf messages for the given type.
+ */
+ public static <T extends MessageOrBuilder> List<T> readMessages(Path file,
Class<T> messageClass) throws IOException {
+ return readMessages(file, messageClass, false);
+
Review Comment:
Please remove this blank line.
> Allow Parquet to Proto conversion even though Target Schema has less fields
> ---------------------------------------------------------------------------
>
> Key: PARQUET-2305
> URL: https://issues.apache.org/jira/browse/PARQUET-2305
> Project: Parquet
> Issue Type: Improvement
> Components: parquet-protobuf
> Reporter: Sanjay Sharma
> Priority: Major
>
> If Parquet has any field which has been removed from the schema and Parquet
> to Proto conversion happens, it errors out due to Unknown fields. There could
> be some scenarios that we want to still convert PARQUET into the target proto
> schema object which has lesser fields.
> If specified "ignoreUnknownFields" as an argument, this should allow the
> conversion which ignore fields it can't convert and not error out.
> Similar functionality exist in
> [https://github.com/protocolbuffers/protobuf/blob/main/java/util/src/main/java/com/google/protobuf/util/JsonFormat.java]
> with field "ignoringUnknownFields"
--
This message was sent by Atlassian Jira
(v8.20.10#820010)