Repository: spark Updated Branches: refs/heads/branch-2.0 97fe1d8ee -> b148b0364
[SPARK-15887][SQL] Bring back the hive-site.xml support for Spark 2.0 ## What changes were proposed in this pull request? Right now, Spark 2.0 does not load hive-site.xml. Based on users' feedback, it seems make sense to still load this conf file. This PR adds a `hadoopConf` API in `SharedState`, which is `sparkContext.hadoopConfiguration` by default. When users are under hive context, `SharedState.hadoopConf` will load hive-site.xml and append its configs to `sparkContext.hadoopConfiguration`. When we need to read hadoop config in spark sql, we should call `SessionState.newHadoopConf`, which contains `sparkContext.hadoopConfiguration`, hive-site.xml and sql configs. ## How was this patch tested? new test in `HiveDataFrameSuite` Author: Wenchen Fan <wenc...@databricks.com> Closes #13611 from cloud-fan/hive-site. (cherry picked from commit c4b1ad020962c42be804d3a1a55171d9b51b01e7) Signed-off-by: Yin Huai <yh...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b148b036 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b148b036 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b148b036 Branch: refs/heads/branch-2.0 Commit: b148b0364bcf1d0e31c320381227d13a022e6686 Parents: 97fe1d8 Author: Wenchen Fan <wenc...@databricks.com> Authored: Mon Jun 13 14:57:35 2016 -0700 Committer: Yin Huai <yh...@databricks.com> Committed: Mon Jun 13 14:57:48 2016 -0700 ---------------------------------------------------------------------- .../datasources/parquet/ParquetFileFormat.scala | 2 +- .../spark/sql/internal/SessionState.scala | 2 +- .../apache/spark/sql/internal/SharedState.scala | 19 ++++++++++++-- sql/core/src/test/resources/hive-site.xml | 26 ++++++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 4 +++ .../apache/spark/sql/hive/HiveSharedState.scala | 5 ++-- sql/hive/src/test/resources/hive-site.xml | 26 ++++++++++++++++++++ .../spark/sql/hive/HiveDataFrameSuite.scala | 5 ++++ 8 files changed, 82 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 2d4bef3..71c1600 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -405,7 +405,7 @@ private[sql] class ParquetFileFormat new ParquetOutputWriterFactory( sqlContext.conf, dataSchema, - sqlContext.sparkContext.hadoopConfiguration, + sqlContext.sessionState.newHadoopConf(), options) } } http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index b2db377..b430950 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -49,7 +49,7 @@ private[sql] class SessionState(sparkSession: SparkSession) { lazy val conf: SQLConf = new SQLConf def newHadoopConf(): Configuration = { - val hadoopConf = new Configuration(sparkSession.sparkContext.hadoopConfiguration) + val hadoopConf = new Configuration(sparkSession.sharedState.hadoopConf) conf.getAllConfs.foreach { case (k, v) => if (v ne null) hadoopConf.set(k, v) } hadoopConf } http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index 0d6f984..c37f7f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -17,12 +17,14 @@ package org.apache.spark.sql.internal +import org.apache.hadoop.conf.Configuration + import org.apache.spark.SparkContext import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.{ExternalCatalog, InMemoryCatalog} import org.apache.spark.sql.execution.CacheManager import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab} -import org.apache.spark.util.MutableURLClassLoader +import org.apache.spark.util.{MutableURLClassLoader, Utils} /** @@ -41,9 +43,22 @@ private[sql] class SharedState(val sparkContext: SparkContext) { val listener: SQLListener = createListenerAndUI(sparkContext) /** + * The base hadoop configuration which is shared among all spark sessions. It is based on the + * default hadoop configuration of Spark, with custom configurations inside `hive-site.xml`. + */ + lazy val hadoopConf: Configuration = { + val conf = new Configuration(sparkContext.hadoopConfiguration) + val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") + if (configFile != null) { + conf.addResource(configFile) + } + conf + } + + /** * A catalog that interacts with external systems. */ - lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(sparkContext.hadoopConfiguration) + lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(hadoopConf) /** * A classloader used to load all user-added jar. http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/test/resources/hive-site.xml ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/hive-site.xml b/sql/core/src/test/resources/hive-site.xml new file mode 100644 index 0000000..17297b3 --- /dev/null +++ b/sql/core/src/test/resources/hive-site.xml @@ -0,0 +1,26 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<configuration> + <property> + <name>hive.in.test</name> + <value>true</value> + <description>Internal marker for test.</description> + </property> +</configuration> http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 90465b6..89f8685 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2843,4 +2843,8 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { sql(s"SELECT '$literal' AS DUMMY"), Row(s"$expected") :: Nil) } + + test("SPARK-15887: hive-site.xml should be loaded") { + assert(spark.sessionState.newHadoopConf().get("hive.in.test") == "true") + } } http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala index a0106ee..78b1ecb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSharedState.scala @@ -45,12 +45,11 @@ private[hive] class HiveSharedState(override val sparkContext: SparkContext) */ // This needs to be a lazy val at here because TestHiveSharedState is overriding it. lazy val metadataHive: HiveClient = { - HiveUtils.newClientForMetadata(sparkContext.conf, sparkContext.hadoopConfiguration) + HiveUtils.newClientForMetadata(sparkContext.conf, hadoopConf) } /** * A catalog that interacts with the Hive metastore. */ - override lazy val externalCatalog = - new HiveExternalCatalog(metadataHive, sparkContext.hadoopConfiguration) + override lazy val externalCatalog = new HiveExternalCatalog(metadataHive, hadoopConf) } http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/hive/src/test/resources/hive-site.xml ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/resources/hive-site.xml b/sql/hive/src/test/resources/hive-site.xml new file mode 100644 index 0000000..17297b3 --- /dev/null +++ b/sql/hive/src/test/resources/hive-site.xml @@ -0,0 +1,26 @@ +<?xml version="1.0"?> +<?xml-stylesheet type="text/xsl" href="configuration.xsl"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<configuration> + <property> + <name>hive.in.test</name> + <value>true</value> + <description>Internal marker for test.</description> + </property> +</configuration> http://git-wip-us.apache.org/repos/asf/spark/blob/b148b036/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala index 1b31caa..2379843 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveDataFrameSuite.scala @@ -29,4 +29,9 @@ class HiveDataFrameSuite extends QueryTest with TestHiveSingleton { spark.sql("drop table usrdb.test") spark.sql("drop schema usrdb") } + + test("SPARK-15887: hive-site.xml should be loaded") { + val hiveClient = spark.sharedState.asInstanceOf[HiveSharedState].metadataHive + assert(hiveClient.getConf("hive.in.test", "") == "true") + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org