Repository: spark
Updated Branches:
  refs/heads/master 9db06c442 -> 57626a557
[SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHiveFunctionRegistry

### What changes were proposed in this pull request?

The Hive built-in `hash` function has not been used by Spark since Spark 2.0. The public interface does not allow users to unregister Spark's built-in functions, so users can never reach Hive's built-in `hash` function. The only exception is `TestHiveFunctionRegistry`, which allows built-in functions to be unregistered and therefore lets the test harness load Hive's `hash` function. If we disable it, 10+ test cases fail because their results differ from the Hive golden answer files (Hive's `hash` semantics differ from Spark's native implementation; see the sketches after the diff).

This PR removes `hash` from the list of `hiveFunctions` in `HiveSessionCatalog` and also removes `TestHiveFunctionRegistry`. This removal will make it easier to remove `TestHiveSessionState` in the future.

### How was this patch tested?

N/A

Author: gatorsmile <gatorsm...@gmail.com>

Closes #14498 from gatorsmile/removeHash.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57626a55
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57626a55
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57626a55

Branch: refs/heads/master
Commit: 57626a55703a189e03148398f67c36cd0e557044
Parents: 9db06c4
Author: gatorsmile <gatorsm...@gmail.com>
Authored: Mon Nov 7 01:16:37 2016 -0800
Committer: Reynold Xin <r...@databricks.com>
Committed: Mon Nov 7 01:16:37 2016 -0800

----------------------------------------------------------------------
 .../hive/execution/HiveCompatibilitySuite.scala | 41 ++++++++++----------
 .../spark/sql/hive/HiveSessionCatalog.scala     |  1 -
 .../apache/spark/sql/hive/test/TestHive.scala   | 28 -------------
 3 files changed, 20 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/57626a55/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index f5d10de..5cd4935 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -57,8 +57,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5)
     // Enable in-memory partition pruning for testing purposes
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true)
-    // Use Hive hash expression instead of the native one
-    TestHive.sessionState.functionRegistry.unregisterFunction("hash")
     // Ensures that the plans generation use metastore relation and not OrcRelation
     // Was done because SqlBuilder does not work with plans having logical relation
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, false)
@@ -76,7 +74,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, originalConvertMetastoreOrc)
     TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
-    TestHive.sessionState.functionRegistry.restore()
     // For debugging dump some statistics about how much time was spent in various optimizer rules
     logWarning(RuleExecutor.dumpTimeSpent())
@@ -581,7 +578,26 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "auto_join6",
     "auto_join7",
     "auto_join8",
-    "auto_join9"
+    "auto_join9",
+
+    // These tests are based on the Hive's hash function, which is different from Spark
+    "auto_join19",
+    "auto_join22",
+    "auto_join25",
+    "auto_join26",
+    "auto_join27",
+    "auto_join28",
+    "auto_join30",
+    "auto_join31",
+    "auto_join_nulls",
+    "auto_join_reordering_values",
+    "correlationoptimizer1",
+    "correlationoptimizer2",
+    "correlationoptimizer3",
+    "correlationoptimizer4",
+    "multiMapJoin1",
+    "orc_dictionary_threshold",
+    "udf_hash"
   )
 
 /**
@@ -601,16 +617,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "annotate_stats_part",
     "annotate_stats_table",
     "annotate_stats_union",
-    "auto_join19",
-    "auto_join22",
-    "auto_join25",
-    "auto_join26",
-    "auto_join27",
-    "auto_join28",
-    "auto_join30",
-    "auto_join31",
-    "auto_join_nulls",
-    "auto_join_reordering_values",
     "binary_constant",
     "binarysortable_1",
     "cast1",
@@ -623,15 +629,11 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "compute_stats_long",
     "compute_stats_string",
     "convert_enum_to_string",
-    "correlationoptimizer1",
     "correlationoptimizer10",
     "correlationoptimizer11",
     "correlationoptimizer13",
     "correlationoptimizer14",
     "correlationoptimizer15",
-    "correlationoptimizer2",
-    "correlationoptimizer3",
-    "correlationoptimizer4",
     "correlationoptimizer6",
     "correlationoptimizer7",
     "correlationoptimizer8",
@@ -871,7 +873,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "merge2",
     "merge4",
     "mergejoins",
-    "multiMapJoin1",
     "multiMapJoin2",
     "multi_insert_gby",
     "multi_insert_gby3",
@@ -893,7 +894,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "nullinput2",
     "nullscript",
     "optional_outer",
-    "orc_dictionary_threshold",
     "order",
     "order2",
     "outer_join_ppr",
@@ -1026,7 +1026,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_from_unixtime",
     "udf_greaterthan",
     "udf_greaterthanorequal",
-    "udf_hash",
     "udf_hex",
     "udf_if",
     "udf_index",

http://git-wip-us.apache.org/repos/asf/spark/blob/57626a55/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index 4f2910a..9df20ce 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -233,7 +233,6 @@ private[sql] class HiveSessionCatalog(
   // in_file, index, matchpath, ngrams, noop, noopstreaming, noopwithmap,
   // noopwithmapstreaming, parse_url_tuple, reflect2, windowingtablefunction.
   private val hiveFunctions = Seq(
-    "hash",
     "histogram_numeric",
     "percentile"
   )

http://git-wip-us.apache.org/repos/asf/spark/blob/57626a55/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 9000044..a8dd510 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -492,24 +492,6 @@ private[hive] class TestHiveQueryExecution(
   }
 }
 
-
-private[hive] class TestHiveFunctionRegistry extends SimpleFunctionRegistry {
-
-  private val removedFunctions =
-    collection.mutable.ArrayBuffer.empty[(String, (ExpressionInfo, FunctionBuilder))]
-
-  def unregisterFunction(name: String): Unit = synchronized {
-    functionBuilders.remove(name).foreach(f => removedFunctions += name -> f)
-  }
-
-  def restore(): Unit = synchronized {
-    removedFunctions.foreach {
-      case (name, (info, builder)) => registerFunction(name, info, builder)
-    }
-  }
-}
-
-
 private[hive] class TestHiveSessionState(
     sparkSession: TestHiveSparkSession)
   extends HiveSessionState(sparkSession) { self =>
@@ -525,16 +507,6 @@ private[hive] class TestHiveSessionState(
     }
   }
 
-  override lazy val functionRegistry: TestHiveFunctionRegistry = {
-    // We use TestHiveFunctionRegistry at here to track functions that have been explicitly
-    // unregistered (through TestHiveFunctionRegistry.unregisterFunction method).
-    val fr = new TestHiveFunctionRegistry
-    org.apache.spark.sql.catalyst.analysis.FunctionRegistry.expressions.foreach {
-      case (name, (info, builder)) => fr.registerFunction(name, info, builder)
-    }
-    fr
-  }
-
   override def executePlan(plan: LogicalPlan): TestHiveQueryExecution = {
     new TestHiveQueryExecution(sparkSession, plan)
   }
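----------------------------------------------------------------------

A note on why the golden answers diverge: Spark's native `hash` is the
Murmur3-based expression behind `org.apache.spark.sql.functions.hash`,
while Hive's built-in `hash` returns Java-style hash codes. A minimal
sketch of the difference (not part of this commit; the session setup and
literal values are illustrative only):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.hash

object NativeHashDemo {
  def main(args: Array[String]): Unit = {
    // Local session purely for illustration.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("native-hash-demo")
      .getOrCreate()
    import spark.implicits._

    // Spark's built-in hash is Murmur3-based (seed 42), so its output
    // does not match the values recorded in Hive's golden answer files.
    Seq(("abc", 1)).toDF("s", "i")
      .select(hash($"s"), hash($"i"))
      .show()

    // Hive's hash("abc") would instead be Java's "abc".hashCode = 96354,
    // which is what the excluded auto_join*/udf_hash tests expect.
    spark.stop()
  }
}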
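With `TestHiveFunctionRegistry` gone there is no hook for swapping Hive's
`hash` in behind the built-in name. A test that genuinely needs Hive's
semantics can register the Hive UDF under a separate name instead; a
sketch, assuming Hive support is enabled and Hive's `GenericUDFHash` is on
the classpath (the function name `hive_hash` is made up for this example):

import org.apache.spark.sql.SparkSession

object HiveHashDemo {
  def main(args: Array[String]): Unit = {
    // enableHiveSupport() requires the spark-hive module on the classpath.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("hive-hash-demo")
      .enableHiveSupport()
      .getOrCreate()

    // Register Hive's hash implementation under a new name rather than
    // unregistering Spark's built-in `hash`, which the public API forbids.
    spark.sql(
      """CREATE TEMPORARY FUNCTION hive_hash
        |AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash'""".stripMargin)

    // Both implementations stay available side by side.
    spark.sql("SELECT hash('abc'), hive_hash('abc')").show()
    spark.stop()
  }
}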