This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 2496523900a [SPARK-43769][CONNECT] Implement 'levenshtein(str1, str2[, threshold])' functions 2496523900a is described below commit 2496523900a662d0f63430cb758b91e002bd520e Author: panbingkun <pbk1...@gmail.com> AuthorDate: Fri May 26 09:40:32 2023 +0800 [SPARK-43769][CONNECT] Implement 'levenshtein(str1, str2[, threshold])' functions ### What changes were proposed in this pull request? This PR aims to implement the 'levenshtein(str1, str2[, threshold])' functions for the `connect` module. ### Why are the changes needed? After [Add a max distance argument to the levenshtein() function](https://issues.apache.org/jira/browse/SPARK-43493), we have already implemented it on the Scala side, so the Connect client needs to be aligned with it. ### Does this PR introduce _any_ user-facing change? Yes, new API for Connect. ### How was this patch tested? - Pass GA. - Manual testing 1. ./build/sbt "connect-client-jvm/testOnly *ClientE2ETestSuite*" 2. sh dev/connect-jvm-client-mima-check Closes #41293 from panbingkun/SPARK-43769. 
Authored-by: panbingkun <pbk1...@gmail.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../scala/org/apache/spark/sql/functions.scala | 11 +++++++ .../apache/spark/sql/PlanGenerationTestSuite.scala | 4 +++ .../CheckConnectJvmClientCompatibility.scala | 1 - .../function_levenshtein_with_threshold.explain | 2 ++ .../function_levenshtein_with_threshold.json | 33 +++++++++++++++++++++ .../function_levenshtein_with_threshold.proto.bin | Bin 0 -> 195 bytes 6 files changed, 50 insertions(+), 1 deletion(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index f92216f49bb..526f6904d68 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -2896,6 +2896,17 @@ object functions { */ def levenshtein(l: Column, r: Column): Column = Column.fn("levenshtein", l, r) + /** + * Computes the Levenshtein distance of the two given string columns if it's less than or equal + * to a given threshold. + * @return + * result distance, or -1 + * @group string_funcs + * @since 3.5.0 + */ + def levenshtein(l: Column, r: Column, threshold: Int): Column = + Column.fn("levenshtein", l, r, lit(threshold)) + /** * Locate the position of the first occurrence of substr. 
* diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 7ece54d0439..94b9adda655 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1361,6 +1361,10 @@ class PlanGenerationTestSuite fn.levenshtein(fn.col("g"), lit("bob")) } + functionTest("levenshtein with threshold") { + fn.levenshtein(fn.col("g"), lit("bob"), 2) + } + functionTest("locate") { fn.locate("jar", fn.col("g")) } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala index ed3660b791a..429e27827e8 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala @@ -199,7 +199,6 @@ object CheckConnectJvmClientCompatibility { ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.callUDF"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.unwrap_udt"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.udaf"), - ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.levenshtein"), // KeyValueGroupedDataset ProblemFilters.exclude[Problem]( diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein_with_threshold.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein_with_threshold.explain new file mode 100644 index 00000000000..5bd1d89ae06 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein_with_threshold.explain @@ -0,0 +1,2 @@ +Project [levenshtein(g#0, bob, Some(2)) AS levenshtein(g, bob, 2)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json new file mode 100644 index 00000000000..5cc30772e8e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "levenshtein", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "bob" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin new file mode 100644 index 00000000000..22e1a332875 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein_with_threshold.proto.bin differ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org