Repository: parquet-mr
Updated Branches:
  refs/heads/master 8bfd9b4d8 -> da3e8eb7e


PARQUET-357: Parquet-thrift generates wrong schema for Thrift binary fields

Author: Nandor Kollar <nkol...@cloudera.com>

Closes #439 from nandorKollar/PARQUET-357 and squashes the following commits:

90cfcfb [Nandor Kollar] Address code review feedback
4bf8089 [Nandor Kollar] PARQUET-357: Parquet-thrift generates wrong schema for 
Thrift binary fields


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/da3e8eb7
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/da3e8eb7
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/da3e8eb7

Branch: refs/heads/master
Commit: da3e8eb7e5a8cdc28ab0e36651bd7eceed35c2fe
Parents: 8bfd9b4
Author: Nandor Kollar <nkol...@cloudera.com>
Authored: Thu Jan 4 17:50:39 2018 +0100
Committer: Zoltan Ivanfi <z...@cloudera.com>
Committed: Thu Jan 4 17:50:39 2018 +0100

----------------------------------------------------------------------
 .../thrift/ThriftSchemaConvertVisitor.java      |  2 +-
 .../parquet/thrift/ThriftSchemaConverter.java   |  8 +++++-
 .../parquet/thrift/struct/ThriftType.java       | 10 ++++++++
 .../parquet/hadoop/thrift/TestBinary.java       | 27 ++++++++++++++++++--
 4 files changed, 43 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java
----------------------------------------------------------------------
diff --git 
a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java
 
b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java
index 88effc5..82175a2 100644
--- 
a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java
+++ 
b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConvertVisitor.java
@@ -331,7 +331,7 @@ class ThriftSchemaConvertVisitor implements 
ThriftType.StateVisitor<ConvertedFie
 
   @Override
   public ConvertedField visit(StringType stringType, State state) {
-    return visitPrimitiveType(BINARY, UTF8, state);
+    return stringType.isBinary() ? visitPrimitiveType(BINARY, state) : 
visitPrimitiveType(BINARY, UTF8, state);
   }
 
   private static boolean isUnion(StructOrUnionType s) {

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java
----------------------------------------------------------------------
diff --git 
a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java
 
b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java
index b72f605..c3a166a 100644
--- 
a/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java
+++ 
b/parquet-thrift/src/main/java/org/apache/parquet/thrift/ThriftSchemaConverter.java
@@ -35,6 +35,7 @@ import org.apache.parquet.thrift.struct.ThriftType;
 import org.apache.parquet.thrift.struct.ThriftType.*;
 import 
org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType;
 import org.apache.parquet.thrift.struct.ThriftTypeID;
+import org.apache.thrift.meta_data.FieldMetaData;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -162,7 +163,12 @@ public class ThriftSchemaConverter {
         type = new I64Type();
         break;
       case STRING:
-        type = new StringType();
+        StringType stringType = new StringType();
+        FieldMetaData fieldMetaData = field.getFieldMetaData();
+        if (fieldMetaData != null && fieldMetaData.valueMetaData.isBinary()) {
+          stringType.setBinary(true);
+        }
+        type = stringType;
         break;
       case STRUCT:
         type = toStructType(field.gettStructDescriptor());

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java
----------------------------------------------------------------------
diff --git 
a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java 
b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java
index 19c7c9f..4c2d662 100644
--- 
a/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java
+++ 
b/parquet-thrift/src/main/java/org/apache/parquet/thrift/struct/ThriftType.java
@@ -641,11 +641,21 @@ public abstract class ThriftType {
   }
 
   public static class StringType extends ThriftType {
+    private boolean binary = false;
 
     @JsonCreator
     public StringType() {
       super(STRING);
     }
+
+    public boolean isBinary() {
+      return binary;
+    }
+
+    public void setBinary(boolean binary) {
+      this.binary = binary;
+    }
+
     @Override
     public <R, S> R accept(StateVisitor<R, S> visitor, S state) {
       return visitor.visit(this, state);

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/da3e8eb7/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java
----------------------------------------------------------------------
diff --git 
a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java 
b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java
index c00daf9..b0d4feb 100644
--- 
a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java
+++ 
b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestBinary.java
@@ -21,9 +21,17 @@ package org.apache.parquet.hadoop.thrift;
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.List;
 import java.util.UUID;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
-import org.junit.Assert;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.apache.parquet.schema.Types;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -33,6 +41,9 @@ import org.apache.parquet.thrift.ThriftParquetReader;
 import org.apache.parquet.thrift.ThriftParquetWriter;
 import org.apache.parquet.thrift.test.binary.StringAndBinary;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
 public class TestBinary {
   @Rule
   public TemporaryFolder tempDir = new TemporaryFolder();
@@ -57,10 +68,22 @@ public class TestBinary {
         build(path)
         .withThriftClass(StringAndBinary.class)
         .build();
+
+
     StringAndBinary record = reader.read();
     reader.close();
 
-    Assert.assertEquals("Should match after serialization round trip",
+    assertSchema(ParquetFileReader.readFooter(new Configuration(), path));
+    assertEquals("Should match after serialization round trip",
         expected, record);
   }
+
+  private void assertSchema(ParquetMetadata parquetMetadata) {
+    List<Type> fields = 
parquetMetadata.getFileMetaData().getSchema().getFields();
+    assertEquals(2, fields.size());
+    
assertEquals(Types.required(PrimitiveType.PrimitiveTypeName.BINARY).named("s"), 
fields.get(0));
+    assertEquals(OriginalType.UTF8, fields.get(0).getOriginalType());
+    
assertEquals(Types.required(PrimitiveType.PrimitiveTypeName.BINARY).named("b"), 
fields.get(1));
+    assertNull(fields.get(1).getOriginalType());
+  }
 }

Reply via email to