Repository: spark
Updated Branches:
  refs/heads/master 2a0bc867a -> 339b53a13


[SPARK-19737][SQL] New analysis rule for reporting unregistered functions 
without relying on relation resolution

## What changes were proposed in this pull request?

This PR adds a new `Once` analysis rule batch consisting of a single analysis 
rule, `LookupFunctions`, which performs a simple existence check on each 
`UnresolvedFunction` without actually resolving it.

The benefit of this rule is that it doesn't require function arguments to be 
resolved first and therefore doesn't rely on relation resolution, which may 
incur a potentially expensive partition/schema discovery cost.
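
As a rough illustration of the user-visible effect (a hedged sketch; the table 
and function names below are placeholders, not from this patch): previously, 
analysis could resolve the relation first, paying its metadata cost before the 
bad function name was ever reported.

```scala
// Illustrative only: `spark` is a SparkSession; names are hypothetical.
spark.sql("SELECT undefined_fn(value) FROM some_partitioned_table")
// => org.apache.spark.sql.AnalysisException: Undefined function: 'undefined_fn' ...
// With this rule the error surfaces before `some_partitioned_table` is
// resolved, so no partition/schema discovery runs just to report it.
```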

Please refer to [SPARK-19737][1] for more details about the motivation.

## How was this patch tested?

New test case added in `AnalysisErrorSuite`.

[1]: https://issues.apache.org/jira/browse/SPARK-19737

Author: Cheng Lian <l...@databricks.com>

Closes #17168 from liancheng/spark-19737-lookup-functions.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/339b53a1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/339b53a1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/339b53a1

Branch: refs/heads/master
Commit: 339b53a1311e08521d84a83c94201fcf3c766fb2
Parents: 2a0bc86
Author: Cheng Lian <l...@databricks.com>
Authored: Mon Mar 6 10:36:50 2017 -0800
Committer: Cheng Lian <l...@databricks.com>
Committed: Mon Mar 6 10:36:50 2017 -0800

----------------------------------------------------------------------
 .../spark/sql/catalyst/analysis/Analyzer.scala  | 21 ++++++++++++++++++
 .../catalyst/catalog/SessionCatalogSuite.scala  | 23 +++++++++++++++++++-
 .../spark/sql/hive/HiveSessionCatalog.scala     |  5 +++++
 3 files changed, 48 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/339b53a1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 6d569b6..2f8489d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -117,6 +117,8 @@ class Analyzer(
     Batch("Hints", fixedPoint,
       new ResolveHints.ResolveBroadcastHints(conf),
       ResolveHints.RemoveAllHints),
+    Batch("Simple Sanity Check", Once,
+      LookupFunctions),
     Batch("Substitution", fixedPoint,
       CTESubstitution,
       WindowsSubstitution,
@@ -1039,6 +1041,25 @@ class Analyzer(
   }
 
   /**
+   * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the
+   * function registry. Note that this rule doesn't try to resolve the [[UnresolvedFunction]]. It
+   * only performs a simple existence check based on the function identifier to quickly identify
+   * undefined functions without triggering relation resolution, which may incur a potentially
+   * expensive partition/schema discovery process in some cases.
+   *
+   * @see [[ResolveFunctions]]
+   * @see https://issues.apache.org/jira/browse/SPARK-19737
+   */
+  object LookupFunctions extends Rule[LogicalPlan] {
+    override def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressions {
+      case f: UnresolvedFunction if !catalog.functionExists(f.name) =>
+        withPosition(f) {
+          throw new NoSuchFunctionException(f.name.database.getOrElse("default"), f.name.funcName)
+        }
+    }
+  }
+
+  /**
    * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s.
    */
   object ResolveFunctions extends Rule[LogicalPlan] {
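
As an aside, here is a self-contained sketch of the fail-fast pattern the rule 
implements, modeled outside Spark (all types below are simplified stand-ins, 
not Catalyst's):

```scala
// A minimal model of LookupFunctions: walk an expression tree and reject
// unknown function names eagerly, before any argument or relation resolution.
object LookupSketch {
  sealed trait Expr
  final case class Column(name: String) extends Expr
  final case class UnresolvedFn(name: String, args: Seq[Expr]) extends Expr

  // In this model the "registry" is just a set of known function names.
  def check(expr: Expr, registry: Set[String]): Unit = expr match {
    case UnresolvedFn(name, args) =>
      if (!registry.contains(name)) {
        throw new NoSuchElementException(s"Undefined function: '$name'")
      }
      args.foreach(check(_, registry))
    case _ => ()
  }

  def main(args: Array[String]): Unit = {
    val registry = Set("upper", "lower")
    check(UnresolvedFn("upper", Seq(Column("a"))), registry) // passes
    check(UnresolvedFn("undefined_fn", Nil), registry)       // throws
  }
}
```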

http://git-wip-us.apache.org/repos/asf/spark/blob/339b53a1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
index a755231..ffc272c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.catalog
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, SimpleCatalystConf, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
@@ -1196,4 +1196,25 @@ class SessionCatalogSuite extends PlanTest {
       catalog.listFunctions("unknown_db", "func*")
     }
   }
+
+  test("SPARK-19737: detect undefined functions without triggering relation 
resolution") {
+    import org.apache.spark.sql.catalyst.dsl.plans._
+
+    Seq(true, false) foreach { caseSensitive =>
+      val conf = SimpleCatalystConf(caseSensitive)
+      val catalog = new SessionCatalog(newBasicCatalog(), new SimpleFunctionRegistry, conf)
+      val analyzer = new Analyzer(catalog, conf)
+
+      // The analyzer should report the undefined function first, rather than the undefined table.
+      val cause = intercept[AnalysisException] {
+        analyzer.execute(
+          UnresolvedRelation(TableIdentifier("undefined_table")).select(
+            UnresolvedFunction("undefined_fn", Nil, isDistinct = false)
+          )
+        )
+      }
+
+      assert(cause.getMessage.contains("Undefined function: 'undefined_fn'"))
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/339b53a1/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index c9be1b9..f1ea868 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -199,6 +199,11 @@ private[sql] class HiveSessionCatalog(
     }
   }
 
+  // TODO: Remove this method after implementing a Spark-native "histogram_numeric".
+  override def functionExists(name: FunctionIdentifier): Boolean = {
+    super.functionExists(name) || hiveFunctions.contains(name.funcName)
+  }
+
   /** List of functions we pass over to Hive. Note that over time this list should go to 0. */
   // We have a list of Hive built-in functions that we do not support. So, we will check
   // Hive's function registry and lazily load needed functions into our own function registry.
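
For context, the override above matters because functions on the Hive 
passthrough list live in `hiveFunctions` rather than in Spark's own function 
registry, so without it the new sanity check would reject them. A hedged 
illustration (assumes a Hive-enabled session; the table name is a placeholder):

```scala
// "histogram_numeric" is delegated to Hive rather than registered natively,
// so functionExists must also consult hiveFunctions for LookupFunctions to pass.
spark.sql("SELECT histogram_numeric(value, 10) FROM some_table")
```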

