Repository: spark Updated Branches: refs/heads/branch-2.1 622d7a8bf -> 3808b4728
[SPARK-20349][SQL][REVERT-BRANCH2.1] ListFunctions returns duplicate functions after using persistent functions Revert the changes of https://github.com/apache/spark/pull/17646 made in Branch 2.1, because it breaks the build. It needs the parser interface, but SessionCatalog in branch 2.1 does not have it. ### What changes were proposed in this pull request? The session catalog caches some persistent functions in the `FunctionRegistry`, so there can be duplicates. Our Catalog API `listFunctions` does not handle it. It would be better if `SessionCatalog` API can de-duplicate the records, instead of doing it by each API caller. In `FunctionRegistry`, our functions are identified by the unquoted string. Thus, this PR tries to parse it using our parser interface and then de-duplicate the names. ### How was this patch tested? Added test cases. Author: Xiao Li <gatorsm...@gmail.com> Closes #17661 from gatorsmile/compilationFix17646. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3808b472 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3808b472 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3808b472 Branch: refs/heads/branch-2.1 Commit: 3808b472813a2cdf560107787f6971e5202044a8 Parents: 622d7a8 Author: Xiao Li <gatorsm...@gmail.com> Authored: Mon Apr 17 17:57:20 2017 -0700 Committer: Reynold Xin <r...@databricks.com> Committed: Mon Apr 17 17:57:20 2017 -0700 ---------------------------------------------------------------------- .../sql/catalyst/catalog/SessionCatalog.scala | 21 +++++--------------- .../spark/sql/execution/command/functions.scala | 4 +++- .../spark/sql/hive/execution/HiveUDFSuite.scala | 17 ---------------- 3 files changed, 8 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/3808b472/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala 
---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 6f302d3..a5cf719 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.catalog import javax.annotation.concurrent.GuardedBy import scala.collection.mutable -import scala.util.{Failure, Success, Try} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -1099,25 +1098,15 @@ class SessionCatalog( def listFunctions(db: String, pattern: String): Seq[(FunctionIdentifier, String)] = { val dbName = formatDatabaseName(db) requireDbExists(dbName) - val dbFunctions = externalCatalog.listFunctions(dbName, pattern).map { f => - FunctionIdentifier(f, Some(dbName)) } - val loadedFunctions = - StringUtils.filterPattern(functionRegistry.listFunction(), pattern).map { f => - // In functionRegistry, function names are stored as an unquoted format. - Try(parser.parseFunctionIdentifier(f)) match { - case Success(e) => e - case Failure(_) => - // The names of some built-in functions are not parsable by our parser, e.g., % - FunctionIdentifier(f) - } - } + val dbFunctions = externalCatalog.listFunctions(dbName, pattern) + .map { f => FunctionIdentifier(f, Some(dbName)) } + val loadedFunctions = StringUtils.filterPattern(functionRegistry.listFunction(), pattern) + .map { f => FunctionIdentifier(f) } val functions = dbFunctions ++ loadedFunctions - // The session catalog caches some persistent functions in the FunctionRegistry - // so there can be duplicates. 
functions.map { case f if FunctionRegistry.functionSet.contains(f.funcName) => (f, "SYSTEM") case f => (f, "USER") - }.distinct + } } http://git-wip-us.apache.org/repos/asf/spark/blob/3808b472/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index 75272d2..ea53987 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -208,6 +208,8 @@ case class ShowFunctionsCommand( case (f, "USER") if showUserFunctions => f.unquotedString case (f, "SYSTEM") if showSystemFunctions => f.unquotedString } - functionNames.sorted.map(Row(_)) + // The session catalog caches some persistent functions in the FunctionRegistry + // so there can be duplicates. 
+ functionNames.distinct.sorted.map(Row(_)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/3808b472/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 9368d0b..78c80da 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -539,23 +539,6 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { checkAnswer(testData.selectExpr("statelessUDF() as s").agg(max($"s")), Row(1)) } } - - test("Show persistent functions") { - val testData = spark.sparkContext.parallelize(StringCaseClass("") :: Nil).toDF() - withTempView("inputTable") { - testData.createOrReplaceTempView("inputTable") - withUserDefinedFunction("testUDFToListInt" -> false) { - val numFunc = spark.catalog.listFunctions().count() - sql(s"CREATE FUNCTION testUDFToListInt AS '${classOf[UDFToListInt].getName}'") - assert(spark.catalog.listFunctions().count() == numFunc + 1) - checkAnswer( - sql("SELECT testUDFToListInt(s) FROM inputTable"), - Seq(Row(Seq(1, 2, 3)))) - assert(sql("show functions").count() == numFunc + 1) - assert(spark.catalog.listFunctions().count() == numFunc + 1) - } - } - } } class TestPair(x: Int, y: Int) extends Writable with Serializable { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org