Repository: spark
Updated Branches:
  refs/heads/master 0c88ce541 -> 1c05027a1


[SQL][SPARK-6471]: Metastore schema should only be a subset of parquet schema 
to support dropping of columns using replace columns

Currently in the parquet relation 2 implementation, error is thrown in case 
merged schema is not exactly the same as metastore schema.
But to support cases like deletion of column using replace column command, we 
can relax the restriction so that even if metastore schema is a subset of 
merged parquet schema, the query will work.

Author: Yash Datta <yash.da...@guavus.com>

Closes #5141 from saucam/replace_col and squashes the following commits:

e858d5b [Yash Datta] SPARK-6471: Fix test cases, add a new test case for 
metastore schema to be subset of parquet schema
5f2f467 [Yash Datta] SPARK-6471: Metastore schema should only be a subset of 
parquet schema to support dropping of columns using replace columns


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1c05027a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1c05027a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1c05027a

Branch: refs/heads/master
Commit: 1c05027a143d1b0bf3df192984e6cac752b1e926
Parents: 0c88ce5
Author: Yash Datta <yash.da...@guavus.com>
Authored: Thu Mar 26 21:13:38 2015 +0800
Committer: Cheng Lian <l...@databricks.com>
Committed: Thu Mar 26 21:13:38 2015 +0800

----------------------------------------------------------------------
 .../org/apache/spark/sql/parquet/newParquet.scala |  5 +++--
 .../spark/sql/parquet/ParquetSchemaSuite.scala    | 18 ++++++++++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1c05027a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 410600b..3516cfe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -758,12 +758,13 @@ private[sql] object ParquetRelation2 extends Logging {
          |${parquetSchema.prettyJson}
        """.stripMargin
 
-    assert(metastoreSchema.size == parquetSchema.size, schemaConflictMessage)
+    assert(metastoreSchema.size <= parquetSchema.size, schemaConflictMessage)
 
     val ordinalMap = metastoreSchema.zipWithIndex.map {
       case (field, index) => field.name.toLowerCase -> index
     }.toMap
-    val reorderedParquetSchema = parquetSchema.sortBy(f => 
ordinalMap(f.name.toLowerCase))
+    val reorderedParquetSchema = parquetSchema.sortBy(f => 
+      ordinalMap.getOrElse(f.name.toLowerCase, metastoreSchema.size + 1))
 
     StructType(metastoreSchema.zip(reorderedParquetSchema).map {
       // Uses Parquet field names but retains Metastore data types.

http://git-wip-us.apache.org/repos/asf/spark/blob/1c05027a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
index 321832c..8462f9b 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala
@@ -212,8 +212,11 @@ class ParquetSchemaSuite extends FunSuite with ParquetTest 
{
           StructField("UPPERCase", IntegerType, nullable = true))))
     }
 
-    // Conflicting field count
-    assert(intercept[Throwable] {
+    // MetaStore schema is subset of parquet schema
+    assertResult(
+      StructType(Seq(
+        StructField("UPPERCase", DoubleType, nullable = false)))) {
+
       ParquetRelation2.mergeMetastoreParquetSchema(
         StructType(Seq(
           StructField("uppercase", DoubleType, nullable = false))),
@@ -221,6 +224,17 @@ class ParquetSchemaSuite extends FunSuite with ParquetTest 
{
         StructType(Seq(
           StructField("lowerCase", BinaryType),
           StructField("UPPERCase", IntegerType, nullable = true))))
+    }
+
+    // Conflicting field count
+    assert(intercept[Throwable] {
+      ParquetRelation2.mergeMetastoreParquetSchema(
+        StructType(Seq(
+          StructField("uppercase", DoubleType, nullable = false),
+          StructField("lowerCase", BinaryType))),
+
+        StructType(Seq(
+          StructField("UPPERCase", IntegerType, nullable = true))))
     }.getMessage.contains("detected conflicting schemas"))
 
     // Conflicting field names


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to