spark git commit: [SPARK-15887][SQL] Bring back the hive-site.xml support for Spark 2.0

yhuai Mon, 13 Jun 2016 14:58:51 -0700

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 97fe1d8ee -> b148b0364



[SPARK-15887][SQL] Bring back the hive-site.xml support for Spark 2.0

## What changes were proposed in this pull request?

Right now, Spark 2.0 does not load hive-site.xml. Based on users' feedback, it 
seems to make sense to still load this conf file.

This PR adds a `hadoopConf` API in `SharedState`, which is 
`sparkContext.hadoopConfiguration` by default. When users are under hive 
context, `SharedState.hadoopConf` will load hive-site.xml and append its 
configs to `sparkContext.hadoopConfiguration`.

When we need to read hadoop config in spark sql, we should call 
`SessionState.newHadoopConf`, which contains 
`sparkContext.hadoopConfiguration`, hive-site.xml and sql configs.

## How was this patch tested?

New tests in `SQLQuerySuite` and `HiveDataFrameSuite`.

Author: Wenchen Fan <wenc...@databricks.com>

Closes #13611 from cloud-fan/hive-site.

(cherry picked from commit c4b1ad020962c42be804d3a1a55171d9b51b01e7)
Signed-off-by: Yin Huai <yh...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b148b036
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b148b036
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b148b036

Branch: refs/heads/branch-2.0
Commit: b148b0364bcf1d0e31c320381227d13a022e6686
Parents: 97fe1d8
Author: Wenchen Fan <wenc...@databricks.com>
Authored: Mon Jun 13 14:57:35 2016 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Mon Jun 13 14:57:48 2016 -0700

----------------------------------------------------------------------
 .../datasources/parquet/ParquetFileFormat.scala |  2 +-
 .../spark/sql/internal/SessionState.scala       |  2 +-
 .../apache/spark/sql/internal/SharedState.scala | 19 ++++++++++++--
 sql/core/src/test/resources/hive-site.xml       | 26 ++++++++++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala    |  4 +++
 .../apache/spark/sql/hive/HiveSharedState.scala |  5 ++--
 sql/hive/src/test/resources/hive-site.xml       | 26 ++++++++++++++++++++
 .../spark/sql/hive/HiveDataFrameSuite.scala     |  5 ++++
 8 files changed, 82 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
index 2d4bef3..71c1600 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
@@ -405,7 +405,7 @@ private[sql] class ParquetFileFormat
     new ParquetOutputWriterFactory(
       sqlContext.conf,
       dataSchema,
-      sqlContext.sparkContext.hadoopConfiguration,
+      sqlContext.sessionState.newHadoopConf(),
       options)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
index b2db377..b430950 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
@@ -49,7 +49,7 @@ private[sql] class SessionState(sparkSession: SparkSession) {
   lazy val conf: SQLConf = new SQLConf
 
   def newHadoopConf(): Configuration = {
-    val hadoopConf = new 
Configuration(sparkSession.sparkContext.hadoopConfiguration)
+    val hadoopConf = new Configuration(sparkSession.sharedState.hadoopConf)
     conf.getAllConfs.foreach { case (k, v) => if (v ne null) hadoopConf.set(k, 
v) }
     hadoopConf
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
index 0d6f984..c37f7f1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
@@ -17,12 +17,14 @@
 
 package org.apache.spark.sql.internal
 
+import org.apache.hadoop.conf.Configuration
+
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{SparkSession, SQLContext}
 import org.apache.spark.sql.catalyst.catalog.{ExternalCatalog, InMemoryCatalog}
 import org.apache.spark.sql.execution.CacheManager
 import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab}
-import org.apache.spark.util.MutableURLClassLoader
+import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
 
 /**
@@ -41,9 +43,22 @@ private[sql] class SharedState(val sparkContext: 
SparkContext) {
   val listener: SQLListener = createListenerAndUI(sparkContext)
 
   /**
+   * The base hadoop configuration which is shared among all spark sessions. 
It is based on the
+   * default hadoop configuration of Spark, with custom configurations inside 
`hive-site.xml`.
+   */
+  lazy val hadoopConf: Configuration = {
+    val conf = new Configuration(sparkContext.hadoopConfiguration)
+    val configFile = 
Utils.getContextOrSparkClassLoader.getResource("hive-site.xml")
+    if (configFile != null) {
+      conf.addResource(configFile)
+    }
+    conf
+  }
+
+  /**
    * A catalog that interacts with external systems.
    */
-  lazy val externalCatalog: ExternalCatalog = new 
InMemoryCatalog(sparkContext.hadoopConfiguration)
+  lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(hadoopConf)
 
   /**
    * A classloader used to load all user-added jar.

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/test/resources/hive-site.xml
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/hive-site.xml 
b/sql/core/src/test/resources/hive-site.xml
new file mode 100644
index 0000000..17297b3
--- /dev/null
+++ b/sql/core/src/test/resources/hive-site.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<configuration>
+  <property>
+    <name>hive.in.test</name>
+      <value>true</value>
+      <description>Internal marker for test.</description>
+  </property>
+</configuration>

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 90465b6..89f8685 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2843,4 +2843,8 @@ class SQLQuerySuite extends QueryTest with 
SharedSQLContext {
       sql(s"SELECT '$literal' AS DUMMY"),
       Row(s"$expected") :: Nil)
   }
+
+  test("SPARK-15887: hive-site.xml should be loaded") {
+    assert(spark.sessionState.newHadoopConf().get("hive.in.test") == "true")
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala
index a0106ee..78b1ecb 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala
@@ -45,12 +45,11 @@ private[hive] class HiveSharedState(override val 
sparkContext: SparkContext)
    */
   // This needs to be a lazy val at here because TestHiveSharedState is 
overriding it.
   lazy val metadataHive: HiveClient = {
-    HiveUtils.newClientForMetadata(sparkContext.conf, 
sparkContext.hadoopConfiguration)
+    HiveUtils.newClientForMetadata(sparkContext.conf, hadoopConf)
   }
 
   /**
    * A catalog that interacts with the Hive metastore.
    */
-  override lazy val externalCatalog =
-    new HiveExternalCatalog(metadataHive, sparkContext.hadoopConfiguration)
+  override lazy val externalCatalog = new HiveExternalCatalog(metadataHive, 
hadoopConf)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/hive/src/test/resources/hive-site.xml
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/hive-site.xml 
b/sql/hive/src/test/resources/hive-site.xml
new file mode 100644
index 0000000..17297b3
--- /dev/null
+++ b/sql/hive/src/test/resources/hive-site.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<configuration>
+  <property>
+    <name>hive.in.test</name>
+      <value>true</value>
+      <description>Internal marker for test.</description>
+  </property>
+</configuration>

http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
index 1b31caa..2379843 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala
@@ -29,4 +29,9 @@ class HiveDataFrameSuite extends QueryTest with 
TestHiveSingleton {
     spark.sql("drop table usrdb.test")
     spark.sql("drop schema usrdb")
   }
+
+  test("SPARK-15887: hive-site.xml should be loaded") {
+    val hiveClient = 
spark.sharedState.asInstanceOf[HiveSharedState].metadataHive
+    assert(hiveClient.getConf("hive.in.test", "") == "true")
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-15887][SQL] Bring back the hive-site.xml support for Spark 2.0

Reply via email to