This is an automated email from the ASF dual-hosted git repository. hvanhovell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 4f3afdcd756 [SPARK-42514][CONNECT] Scala Client add partition transforms functions 4f3afdcd756 is described below commit 4f3afdcd7561db38f3c0427d31db4f27fa94a83c Author: yangjie01 <yangji...@baidu.com> AuthorDate: Tue Feb 21 20:47:42 2023 -0400 [SPARK-42514][CONNECT] Scala Client add partition transforms functions ### What changes were proposed in this pull request? This PR aims to add the partition transforms functions to the Scala spark connect client. ### Why are the changes needed? Provide the same APIs in the Scala spark connect client as in the original Dataset API. ### Does this PR introduce _any_ user-facing change? Yes, it adds new functions to the Spark Connect Scala client. ### How was this patch tested? - Add new test - Manually checked `connect-client-jvm` and `connect` with Scala-2.13 Closes #40105 from LuciferYang/partition-transforms-functions. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: Herman van Hovell <her...@databricks.com> --- .../scala/org/apache/spark/sql/functions.scala | 58 ++++++++++++++++++++++ .../org/apache/spark/sql/FunctionTestSuite.scala | 1 + .../apache/spark/sql/PlanGenerationTestSuite.scala | 20 ++++++++ .../explain-results/function_bucket.explain | 2 + .../explain-results/function_days.explain | 2 + .../explain-results/function_hours.explain | 2 + .../explain-results/function_months.explain | 2 + .../explain-results/function_years.explain | 2 + .../query-tests/queries/function_bucket.json | 23 +++++++++ .../query-tests/queries/function_bucket.proto.bin | 5 ++ .../query-tests/queries/function_days.json | 19 +++++++ .../query-tests/queries/function_days.proto.bin | 4 ++ .../query-tests/queries/function_hours.json | 19 +++++++ .../query-tests/queries/function_hours.proto.bin | 4 ++ .../query-tests/queries/function_months.json | 19 +++++++ .../query-tests/queries/function_months.proto.bin | 4 ++ .../query-tests/queries/function_years.json | 19 
+++++++ .../query-tests/queries/function_years.proto.bin | 4 ++ 18 files changed, 209 insertions(+) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 6dffa8d3ea1..4996b5033e3 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -3591,6 +3591,64 @@ object functions { */ def timestamp_seconds(e: Column): Column = Column.fn("timestamp_seconds", e) + ////////////////////////////////////////////////////////////////////////////////////////////// + // Partition Transforms functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * A transform for timestamps and dates to partition data into years. + * + * @group partition_transforms + * @since 3.4.0 + */ + def years(e: Column): Column = + Column.fn("years", e) + + /** + * A transform for timestamps and dates to partition data into months. + * + * @group partition_transforms + * @since 3.4.0 + */ + def months(e: Column): Column = + Column.fn("months", e) + + /** + * A transform for timestamps and dates to partition data into days. + * + * @group partition_transforms + * @since 3.4.0 + */ + def days(e: Column): Column = + Column.fn("days", e) + + /** + * A transform for timestamps to partition data into hours. + * + * @group partition_transforms + * @since 3.4.0 + */ + def hours(e: Column): Column = + Column.fn("hours", e) + + /** + * A transform for any type that partitions by a hash of the input column. + * + * @group partition_transforms + * @since 3.4.0 + */ + def bucket(numBuckets: Column, e: Column): Column = + Column.fn("bucket", numBuckets, e) + + /** + * A transform for any type that partitions by a hash of the input column. 
+ * + * @group partition_transforms + * @since 3.4.0 + */ + def bucket(numBuckets: Int, e: Column): Column = + Column.fn("bucket", lit(numBuckets), e) + ////////////////////////////////////////////////////////////////////////////////////////////// // Scala UDF functions ////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala index 1e4960ef9b2..d600ac432a2 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala @@ -170,6 +170,7 @@ class FunctionTestSuite extends ConnectFunSuite { window(a, "10 seconds", "10 seconds"), window(a, "10 seconds")) testEquals("session_window", session_window(a, "1 second"), session_window(a, lit("1 second"))) + testEquals("bucket", bucket(lit(3), a), bucket(3, a)) test("assert_true no message") { val e = assert_true(a).expr diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 830d55b6cbf..b9ae66b2b9e 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1386,6 +1386,26 @@ class PlanGenerationTestSuite extends ConnectFunSuite with BeforeAndAfterAll wit fn.upper(fn.col("g")) } + functionTest("years") { + fn.years(Column("a")) + } + + functionTest("months") { + fn.months(Column("a")) + } + + functionTest("days") { + fn.days(Column("a")) + } + + functionTest("hours") { + fn.hours(Column("a")) + } + + functionTest("bucket") { + fn.bucket(3, Column("a")) + } + private 
def temporalFunctionTest(name: String)(f: => Column): Unit = { test("function " + name) { temporals.select(f) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain new file mode 100644 index 00000000000..8ab0c9493ab --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain @@ -0,0 +1,2 @@ +Project [bucket(3, a#0) AS bucket(a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain new file mode 100644 index 00000000000..16ca2fe415e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain @@ -0,0 +1,2 @@ +Project [days(a#0) AS days(a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain new file mode 100644 index 00000000000..a019836233d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain @@ -0,0 +1,2 @@ +Project [hours(a#0) AS hours(a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain new file mode 100644 index 00000000000..17b991ec1aa --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain @@ -0,0 +1,2 @@ +Project [months(a#0) AS 
months(a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain new file mode 100644 index 00000000000..ee2342c4b02 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain @@ -0,0 +1,2 @@ +Project [years(a#0) AS years(a)#0] ++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json new file mode 100644 index 00000000000..4ec5fb5f27b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json @@ -0,0 +1,23 @@ +{ + "project": { + "input": { + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "bucket", + "arguments": [{ + "literal": { + "integer": 3 + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin new file mode 100644 index 00000000000..4ccecb3d59c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin @@ -0,0 +1,5 @@ +� +�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string> +bucket +0 +a \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_days.json b/connector/connect/common/src/test/resources/query-tests/queries/function_days.json new file mode 100644 index 00000000000..8fb62b4a2e4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_days.json @@ -0,0 +1,19 @@ +{ + "project": { + "input": { + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "days", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin new file mode 100644 index 00000000000..ecfa97f445c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin @@ -0,0 +1,4 @@ +� +�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string> +days +a \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json new file mode 100644 index 00000000000..4cccc0eef47 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json @@ -0,0 +1,19 @@ +{ + "project": { + "input": { + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + 
"unresolvedFunction": { + "functionName": "hours", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin new file mode 100644 index 00000000000..17cb707448a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin @@ -0,0 +1,4 @@ +� +�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string> +hours +a \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months.json b/connector/connect/common/src/test/resources/query-tests/queries/function_months.json new file mode 100644 index 00000000000..189def244d2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months.json @@ -0,0 +1,19 @@ +{ + "project": { + "input": { + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "months", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin new file mode 100644 index 00000000000..9f6ea9641f6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin @@ -0,0 +1,4 @@ +� 
+�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string> +months +a \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_years.json b/connector/connect/common/src/test/resources/query-tests/queries/function_years.json new file mode 100644 index 00000000000..a0b6f4228d0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_years.json @@ -0,0 +1,19 @@ +{ + "project": { + "input": { + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "years", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin new file mode 100644 index 00000000000..f1e2a949fb4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin @@ -0,0 +1,4 @@ +� +�Z��struct<id:bigint,a:int,b:double,d:struct<id:bigint,a:int,b:double>,e:array<int>,f:map<string,struct<id:bigint,a:int,b:double>>,g:string> +years +a \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org