This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 2496523900a [SPARK-43769][CONNECT] Implement 'levenshtein(str1, str2[, 
threshold])' functions
2496523900a is described below

commit 2496523900a662d0f63430cb758b91e002bd520e
Author: panbingkun <pbk1...@gmail.com>
AuthorDate: Fri May 26 09:40:32 2023 +0800

    [SPARK-43769][CONNECT] Implement 'levenshtein(str1, str2[, threshold])' 
functions
    
    ### What changes were proposed in this pull request?
    This PR implements the 'levenshtein(str1, str2[, threshold])' function 
for the `connect` module.
    
    ### Why are the changes needed?
    After [Add a max distance argument to the levenshtein() 
function](https://issues.apache.org/jira/browse/SPARK-43493), the threshold 
argument is already implemented on the Scala side, so the `connect` module needs to be aligned with it.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, new API for Connect.
    
    ### How was this patch tested?
    - Pass GA.
    - Manual testing
    1. ./build/sbt "connect-client-jvm/testOnly *ClientE2ETestSuite*"
    2. sh dev/connect-jvm-client-mima-check
    
    Closes #41293 from panbingkun/SPARK-43769.
    
    Authored-by: panbingkun <pbk1...@gmail.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 .../scala/org/apache/spark/sql/functions.scala     |  11 +++++++
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   4 +++
 .../CheckConnectJvmClientCompatibility.scala       |   1 -
 .../function_levenshtein_with_threshold.explain    |   2 ++
 .../function_levenshtein_with_threshold.json       |  33 +++++++++++++++++++++
 .../function_levenshtein_with_threshold.proto.bin  | Bin 0 -> 195 bytes
 6 files changed, 50 insertions(+), 1 deletion(-)

diff --git 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index f92216f49bb..526f6904d68 100644
--- 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2896,6 +2896,17 @@ object functions {
    */
   def levenshtein(l: Column, r: Column): Column = Column.fn("levenshtein", l, 
r)
 
+  /**
+   * Computes the Levenshtein distance of the two given string columns if it's 
less than or equal
+   * to a given threshold.
+   * @return
+   *   result distance, or -1
+   * @group string_funcs
+   * @since 3.5.0
+   */
+  def levenshtein(l: Column, r: Column, threshold: Int): Column =
+    Column.fn("levenshtein", l, r, lit(threshold))
+
   /**
    * Locate the position of the first occurrence of substr.
    *
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 7ece54d0439..94b9adda655 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -1361,6 +1361,10 @@ class PlanGenerationTestSuite
     fn.levenshtein(fn.col("g"), lit("bob"))
   }
 
+  functionTest("levenshtein with threshold") {
+    fn.levenshtein(fn.col("g"), lit("bob"), 2)
+  }
+
   functionTest("locate") {
     fn.locate("jar", fn.col("g"))
   }
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala
index ed3660b791a..429e27827e8 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala
@@ -199,7 +199,6 @@ object CheckConnectJvmClientCompatibility {
       
ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.callUDF"),
       
ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.unwrap_udt"),
       ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.udaf"),
-      
ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.levenshtein"),
 
       // KeyValueGroupedDataset
       ProblemFilters.exclude[Problem](
diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein_with_threshold.explain
 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein_with_threshold.explain
new file mode 100644
index 00000000000..5bd1d89ae06
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein_with_threshold.explain
@@ -0,0 +1,2 @@
+Project [levenshtein(g#0, bob, Some(2)) AS levenshtein(g, bob, 2)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json
new file mode 100644
index 00000000000..5cc30772e8e
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json
@@ -0,0 +1,33 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "levenshtein",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "g"
+          }
+        }, {
+          "literal": {
+            "string": "bob"
+          }
+        }, {
+          "literal": {
+            "integer": 2
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin
new file mode 100644
index 00000000000..22e1a332875
Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin
 differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to