Repository: spark
Updated Branches:
  refs/heads/master 5bf8881b3 -> 363bcedee


[SPARK-16248][SQL] Whitelist the list of Hive fallback functions

## What changes were proposed in this pull request?
This patch removes the blind fallback to Hive for function lookups. Instead, it 
introduces a whitelist containing only the small set of functions we intend to 
support in Spark in the long run.
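
As a minimal sketch of the idea (simplified and abridged; the real check lives in 
HiveSessionCatalog.lookupFunction, shown in the diff below):

    // Simplified sketch of the whitelist gate; the whitelist contents here
    // are abridged and the error handling is only illustrative.
    object FallbackWhitelistSketch {
      // Only these names may still fall back to Hive's function registry.
      private val hiveFunctions = Set("elt", "percentile", "xpath", "stack")

      def lookupFallback(name: String): String = {
        val functionName = name.toLowerCase // Hive is case insensitive
        if (!hiveFunctions.contains(functionName)) {
          // Previously Spark would blindly try Hive here; now it fails fast.
          throw new IllegalArgumentException(s"Undefined function: $name")
        }
        s"falling back to Hive for $functionName"
      }
    }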

## How was this patch tested?
Updated tests to reflect the change.

Author: Reynold Xin <r...@databricks.com>

Closes #13939 from rxin/hive-whitelist.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/363bcede
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/363bcede
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/363bcede

Branch: refs/heads/master
Commit: 363bcedeea40fe3f1a92271b96af2acba63e058c
Parents: 5bf8881
Author: Reynold Xin <r...@databricks.com>
Authored: Tue Jun 28 19:36:53 2016 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Tue Jun 28 19:36:53 2016 -0700

----------------------------------------------------------------------
 .../catalyst/analysis/FunctionRegistry.scala    |  1 +
 .../hive/execution/HiveCompatibilitySuite.scala | 22 +++++-----
 .../HiveWindowFunctionQuerySuite.scala          | 25 ------------
 .../spark/sql/hive/HiveSessionCatalog.scala     | 42 +++++++++++++-------
 4 files changed, 40 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/363bcede/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 42a8faa..0bde48c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -248,6 +248,7 @@ object FunctionRegistry {
     expression[Average]("mean"),
     expression[Min]("min"),
     expression[Skewness]("skewness"),
+    expression[StddevSamp]("std"),
     expression[StddevSamp]("stddev"),
     expression[StddevPop]("stddev_pop"),
     expression[StddevSamp]("stddev_samp"),
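
The hunk above registers "std" as a native alias for StddevSamp, so std now 
resolves inside Spark like stddev/stddev_samp rather than through the Hive 
fallback. A hypothetical usage sketch (assuming a SparkSession named spark and a 
temp view t with a numeric column x, neither of which is part of this commit):

    // Both queries should return the same sample standard deviation,
    // resolved natively by Spark's FunctionRegistry rather than via Hive.
    spark.sql("SELECT std(x) FROM t").show()
    spark.sql("SELECT stddev_samp(x) FROM t").show()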

http://git-wip-us.apache.org/repos/asf/spark/blob/363bcede/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 2d5a970..13d18fd 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -517,6 +517,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // This test uses CREATE EXTERNAL TABLE without specifying LOCATION
     "alter2",
 
+    // [SPARK-16248][SQL] Whitelist the list of Hive fallback functions
+    "udf_field",
+    "udf_reflect2",
+    "udf_xpath",
+    "udf_xpath_boolean",
+    "udf_xpath_double",
+    "udf_xpath_float",
+    "udf_xpath_int",
+    "udf_xpath_long",
+    "udf_xpath_short",
+    "udf_xpath_string",
+
     // These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
     "alter_rename_partition1",
     "date_1",
@@ -1004,7 +1016,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_elt",
     "udf_equal",
     "udf_exp",
-    "udf_field",
     "udf_find_in_set",
     "udf_float",
     "udf_floor",
@@ -1049,7 +1060,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_power",
     "udf_radians",
     "udf_rand",
-    "udf_reflect2",
     "udf_regexp",
     "udf_regexp_extract",
     "udf_regexp_replace",
@@ -1090,14 +1100,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_variance",
     "udf_weekofyear",
     "udf_when",
-    "udf_xpath",
-    "udf_xpath_boolean",
-    "udf_xpath_double",
-    "udf_xpath_float",
-    "udf_xpath_int",
-    "udf_xpath_long",
-    "udf_xpath_short",
-    "udf_xpath_string",
     "union10",
     "union11",
     "union13",

http://git-wip-us.apache.org/repos/asf/spark/blob/363bcede/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
index 6c39781..7ba5790 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
@@ -534,31 +534,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte
       |             rows between 2 preceding and 2 following);
     """.stripMargin, reset = false)
 
-  // collect_set() output array in an arbitrary order, hence causes different result
-  // when running this test suite under Java 7 and 8.
-  // We change the original sql query a little bit for making the test suite passed
-  // under different JDK
-  /* Disabled because:
-     - Spark uses a different default stddev.
-     - Tiny numerical differences in stddev results.
-  createQueryTest("windowing.q -- 20. testSTATs",
-    """
-      |select p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
-      |from (
-      |select  p_mfgr,p_name, p_size,
-      |stddev(p_retailprice) over w1 as sdev,
-      |stddev_pop(p_retailprice) over w1 as sdev_pop,
-      |collect_set(p_size) over w1 as uniq_size,
-      |variance(p_retailprice) over w1 as var,
-      |corr(p_size, p_retailprice) over w1 as cor,
-      |covar_pop(p_size, p_retailprice) over w1 as covarp
-      |from part
-      |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
-      |             rows between 2 preceding and 2 following)
-      |) t lateral view explode(uniq_size) d as uniq_data
-      |order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
-    """.stripMargin, reset = false)
-  */
   createQueryTest("windowing.q -- 21. testDISTs",
     """
       |select  p_mfgr,p_name, p_size,

http://git-wip-us.apache.org/repos/asf/spark/blob/363bcede/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index 2f6a220..8a47dcf 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -162,17 +162,6 @@ private[sql] class HiveSessionCatalog(
     }
   }
 
-  // We have a list of Hive built-in functions that we do not support. So, we will check
-  // Hive's function registry and lazily load needed functions into our own function registry.
-  // Those Hive built-in functions are
-  // assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union,
-  // current_user ,elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
-  // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values,
-  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
-  // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2,
-  // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean,
-  // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number,
-  // xpath_short, and xpath_string.
   override def lookupFunction(name: FunctionIdentifier, children: Seq[Expression]): Expression = {
     // TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to
     // if (super.functionExists(name)) {
@@ -196,10 +185,12 @@ private[sql] class HiveSessionCatalog(
           // built-in function.
           // Hive is case insensitive.
           val functionName = funcName.unquotedString.toLowerCase
-          // TODO: This may not really work for current_user because current_user is not evaluated
-          // with session info.
-          // We do not need to use executionHive at here because we only load
-          // Hive's builtin functions, which do not need current db.
+          if (!hiveFunctions.contains(functionName)) {
+            failFunctionLookup(funcName.unquotedString)
+          }
+
+          // TODO: Remove this fallback path once we implement the list of fallback functions
+          // defined below in hiveFunctions.
           val functionInfo = {
             try {
               Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse(
@@ -221,4 +212,25 @@ private[sql] class HiveSessionCatalog(
         }
     }
   }
+
+  /** List of functions we pass over to Hive. Note that over time this list should go to 0. */
+  // We have a list of Hive built-in functions that we do not support. So, we will check
+  // Hive's function registry and lazily load needed functions into our own function registry.
+  // The functions we explicitly do not support are:
+  // compute_stats, context_ngrams, create_union,
+  // current_user, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
+  // in_file, index, java_method,
+  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
+  // parse_url_tuple, posexplode, reflect2,
+  // str_to_map, windowingtablefunction.
+  private val hiveFunctions = Seq(
+    "elt", "hash", "java_method", "histogram_numeric",
+    "map_keys", "map_values",
+    "parse_url", "percentile", "percentile_approx", "reflect", "sentences", "stack", "str_to_map",
+    "xpath", "xpath_boolean", "xpath_double", "xpath_float", "xpath_int", "xpath_long",
+    "xpath_number", "xpath_short", "xpath_string",
+
+    // table generating function
+    "inline", "posexplode"
+  )
 }
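
With this patch, only the names in hiveFunctions still resolve through Hive's 
function registry; any other unresolved name now fails lookup immediately instead 
of being blindly delegated to Hive. A hypothetical illustration (assuming a 
SparkSession built with enableHiveSupport() and a view t with a numeric column x; 
neither is part of this commit):

    // Whitelisted: still served by Hive's built-in implementation.
    spark.sql("SELECT percentile_approx(x, 0.5) FROM t").show()

    // Not whitelisted (e.g. ngrams): lookup now fails fast with an
    // "undefined function" error instead of silently resolving via Hive.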


