Repository: parquet-mr Updated Branches: refs/heads/master 8bfd9b4d8 -> da3e8eb7e
PARQUET-357: Parquet-thrift generates wrong schema for Thrift binary fields Author: Nandor Kollar <nkol...@cloudera.com> Closes #439 from nandorKollar/PARQUET-357 and squashes the following commits: 90cfcfb [Nandor Kollar] Address code review feedback 4bf8089 [Nandor Kollar] PARQUET-357: Parquet-thrift generates wrong schema for Thrift binary fields Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/da3e8eb7 Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/da3e8eb7 Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/da3e8eb7 Branch: refs/heads/master Commit: da3e8eb7e5a8cdc28ab0e36651bd7eceed35c2fe Parents: 8bfd9b4 Author: Nandor Kollar <nkol...@cloudera.com> Authored: Thu Jan 4 17:50:39 2018 +0100 Committer: Zoltan Ivanfi <z...@cloudera.com> Committed: Thu Jan 4 17:50:39 2018 +0100 ---------------------------------------------------------------------- .../thrift/ThriftSchemaConvertVisitor.java | 2 +- .../parquet/thrift/ThriftSchemaConverter.java | 8 +++++- .../parquet/thrift/struct/ThriftType.java | 10 ++++++++ .../parquet/hadoop/thrift/TestBinary.java | 27 ++++++++++++++++++-- 4 files changed, 43 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java index 88effc5..82175a2 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java @@ -331,7 +331,7 @@ class ThriftSchemaConvertVisitor implements ThriftType.StateVisitor<ConvertedFie @Override public ConvertedField visit(StringType stringType, State state) { - return visitPrimitiveType(BINARY, UTF8, state); + return stringType.isBinary() ? visitPrimitiveType(BINARY, state) : visitPrimitiveType(BINARY, UTF8, state); } private static boolean isUnion(StructOrUnionType s) { http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java index b72f605..c3a166a 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java @@ -35,6 +35,7 @@ import org.apache.parquet.thrift.struct.ThriftType; import org.apache.parquet.thrift.struct.ThriftType.*; import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType; import org.apache.parquet.thrift.struct.ThriftTypeID; +import org.apache.thrift.meta_data.FieldMetaData; import java.util.ArrayList; import java.util.Collection; @@ -162,7 +163,12 @@ public class ThriftSchemaConverter { type = new I64Type(); break; case STRING: - type = new StringType(); + StringType stringType = new StringType(); + FieldMetaData fieldMetaData = field.getFieldMetaData(); + if (fieldMetaData != null && fieldMetaData.valueMetaData.isBinary()) { + stringType.setBinary(true); + } + type = stringType; break; case STRUCT: type = toStructType(field.gettStructDescriptor()); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java index 19c7c9f..4c2d662 100644 --- a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java +++ b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java @@ -641,11 +641,21 @@ public abstract class ThriftType { } public static class StringType extends ThriftType { + private boolean binary = false; @JsonCreator public StringType() { super(STRING); } + + public boolean isBinary() { + return binary; + } + + public void setBinary(boolean binary) { + this.binary = binary; + } + @Override public <R, S> R accept(StateVisitor<R, S> visitor, S state) { return visitor.visit(this, state); http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java ---------------------------------------------------------------------- diff --git a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java index c00daf9..b0d4feb 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java @@ -21,9 +21,17 @@ package org.apache.parquet.hadoop.thrift; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; import java.util.UUID; + +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.junit.Assert; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -33,6 +41,9 @@ import org.apache.parquet.thrift.ThriftParquetReader; import org.apache.parquet.thrift.ThriftParquetWriter; import org.apache.parquet.thrift.test.binary.StringAndBinary; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + public class TestBinary { @Rule public TemporaryFolder tempDir = new TemporaryFolder(); @@ -57,10 +68,22 @@ public class TestBinary { build(path) .withThriftClass(StringAndBinary.class) .build(); + + StringAndBinary record = reader.read(); reader.close(); - Assert.assertEquals("Should match after serialization round trip", + assertSchema(ParquetFileReader.readFooter(new Configuration(), path)); + assertEquals("Should match after serialization round trip", expected, record); } + + private void assertSchema(ParquetMetadata parquetMetadata) { + List<Type> fields = parquetMetadata.getFileMetaData().getSchema().getFields(); + assertEquals(2, fields.size()); + assertEquals(Types.required(PrimitiveType.PrimitiveTypeName.BINARY).named("s"), fields.get(0)); + assertEquals(OriginalType.UTF8, fields.get(0).getOriginalType()); + assertEquals(Types.required(PrimitiveType.PrimitiveTypeName.BINARY).named("b"), fields.get(1)); + assertNull(fields.get(1).getOriginalType()); + } }