This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new c66ec43  [SPARK-27555][SQL] HiveSerDe should fall back to hadoopconf if hive.default.fileformat is not found in SQLConf
c66ec43 is described below

commit c66ec439456c5a160e3849e23c2ce3970d4c6ec7
Author: sandeep katta <sandeep.katta2...@gmail.com>
AuthorDate: Sat May 4 09:02:12 2019 +0900

    [SPARK-27555][SQL] HiveSerDe should fall back to hadoopconf if hive.default.fileformat is not found in SQLConf

    ## What changes were proposed in this pull request?

    SQLConf does not load hive-site.xml, so HiveSerDe should fall back to the Hadoop configuration if hive.default.fileformat is not found in SQLConf.

    ## How was this patch tested?

    Tested manually, and added a UT.

    Closes #24489 from sandeep-katta/spark-27555.

    Authored-by: sandeep katta <sandeep.katta2...@gmail.com>
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
---
 docs/sql-migration-guide-upgrade.md                |  2 ++
 .../org/apache/spark/sql/internal/HiveSerDe.scala  | 12 +++++++++++-
 .../spark/sql/hive/execution/HiveSerDeSuite.scala  | 22 +++++++++++++++++++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
index 54512ae..5fe7c7c 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -126,6 +126,8 @@ license: |
 
   - Since Spark 3.0, parquet logical type `TIMESTAMP_MICROS` is used by default while saving `TIMESTAMP` columns. In Spark version 2.4 and earlier, `TIMESTAMP` columns are saved as `INT96` in parquet files. To set `INT96` to `spark.sql.parquet.outputTimestampType` restores the previous behavior.
 
+  - Since Spark 3.0, if `hive.default.fileformat` is not found in the Spark SQL configuration, it falls back to the `hive-site.xml` present in the Hadoop configuration of `SparkContext`.
+
 ## Upgrading from Spark SQL 2.4 to 2.4.1
 
   - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala
index bd25a64..4921e3c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.internal
 
 import java.util.Locale
 
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.catalog.CatalogStorageFormat
 
 case class HiveSerDe(
@@ -88,7 +89,16 @@ object HiveSerDe {
   }
 
   def getDefaultStorage(conf: SQLConf): CatalogStorageFormat = {
-    val defaultStorageType = conf.getConfString("hive.default.fileformat", "textfile")
+    // To respect hive-site.xml, this peeks at the Hadoop configuration of the
+    // active Spark session as an easy workaround. See SPARK-27555.
+    val defaultFormatKey = "hive.default.fileformat"
+    val defaultValue = {
+      val defaultFormatValue = "textfile"
+      SparkSession.getActiveSession.map { session =>
+        session.sessionState.newHadoopConf().get(defaultFormatKey, defaultFormatValue)
+      }.getOrElse(defaultFormatValue)
+    }
+    val defaultStorageType = conf.getConfString("hive.default.fileformat", defaultValue)
     val defaultHiveSerde = sourceToSerDe(defaultStorageType)
     CatalogStorageFormat.empty.copy(
       inputFormat = defaultHiveSerde.flatMap(_.inputFormat)
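In effect, the lookup order for the default file format is now: `hive.default.fileformat` in SQLConf first, then the same key in the active session's Hadoop configuration (which does load hive-site.xml), and finally the hard-coded "textfile". A minimal sketch of the user-visible behavior, assuming a local session with Hive support on the classpath; the app and table names here are illustrative, not taken from the patch:

    import org.apache.spark.sql.SparkSession

    // Sketch only: requires a Spark build with Hive support.
    val spark = SparkSession.builder()
      .appName("spark-27555-demo")
      .master("local[1]")
      .enableHiveSupport()
      .getOrCreate()

    // Simulate hive-site.xml: the key is present only in the Hadoop
    // configuration, not in SQLConf, so before this patch it was ignored
    // and "textfile" won.
    spark.sparkContext.hadoopConfiguration.set("hive.default.fileformat", "parquetfile")

    // No explicit STORED AS clause, so the default file format applies; with
    // the fallback in place the table should get the Parquet SerDe.
    spark.sql("CREATE TABLE t27555 (id INT)")
    spark.sql("DESC FORMATTED t27555").show(50, truncate = false)

A value set explicitly in SQLConf still wins; the Hadoop configuration is consulted only when the key is absent there, as the test below exercises with the parquetfile/orc pair.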
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
index d7752e9..ed4304b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
@@ -21,13 +21,14 @@ import java.net.URI
 
 import org.scalatest.BeforeAndAfterAll
 
-import org.apache.spark.sql.{AnalysisException, SaveMode}
+import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils}
 import org.apache.spark.sql.execution.datasources.CreateTable
 import org.apache.spark.sql.execution.metric.InputOutputMetricsHelper
 import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.types.StructType
 
 /**
@@ -210,4 +211,23 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte
     val e8 = intercept[IllegalArgumentException](analyzeCreateTable(v8))
     assert(e8.getMessage.contains("invalid fileFormat: 'wrong'"))
   }
+
+  test("SPARK-27555: fall back to hive-site.xml if hive.default.fileformat " +
+    "is not found in SQLConf") {
+    val testSession = SparkSession.getActiveSession.get
+    try {
+      testSession.sparkContext.hadoopConfiguration.set("hive.default.fileformat", "parquetfile")
+      val sqlConf = new SQLConf()
+      var storageFormat = HiveSerDe.getDefaultStorage(sqlConf)
+      assert(storageFormat.serde.
+        contains("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"))
+      // should take orc as it is present in sqlConf
+      sqlConf.setConfString("hive.default.fileformat", "orc")
+      storageFormat = HiveSerDe.getDefaultStorage(sqlConf)
+      assert(storageFormat.serde.contains("org.apache.hadoop.hive.ql.io.orc.OrcSerde"))
+    }
+    finally {
+      testSession.sparkContext.hadoopConfiguration.unset("hive.default.fileformat")
+    }
+  }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org