This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 8dc02863b92 [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API 8dc02863b92 is described below commit 8dc02863b926b9e0780b994f9ee6c5c1058d49a0 Author: Jiaan Geng <belie...@163.com> AuthorDate: Tue Jun 20 21:47:05 2023 +0800 [SPARK-43929][SPARK-44073][SQL][PYTHON][CONNECT][FOLLOWUP] Add extract, date_part, datepart to Scala, Python and Connect API ### What changes were proposed in this pull request? This PR follows up https://github.com/apache/spark/pull/41636 and https://github.com/apache/spark/pull/41651 and add extract, date_part, datepart to Scala, Python and Connect API. ### Why are the changes needed? Add extract, date_part, datepart to Scala, Python and Connect API ### Does this PR introduce _any_ user-facing change? 'No'. New feature. ### How was this patch tested? New test cases. Closes #41667 from beliefer/datetime_functions_followup. Authored-by: Jiaan Geng <belie...@163.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../scala/org/apache/spark/sql/functions.scala | 50 ++++++++++ .../apache/spark/sql/PlanGenerationTestSuite.scala | 12 +++ .../explain-results/function_date_part.explain | 2 + .../explain-results/function_datepart.explain | 2 + .../explain-results/function_extract.explain | 2 + .../query-tests/queries/function_date_part.json | 29 ++++++ .../queries/function_date_part.proto.bin | Bin 0 -> 133 bytes .../query-tests/queries/function_datepart.json | 29 ++++++ .../queries/function_datepart.proto.bin | Bin 0 -> 132 bytes .../query-tests/queries/function_extract.json | 29 ++++++ .../query-tests/queries/function_extract.proto.bin | Bin 0 -> 131 bytes .../source/reference/pyspark.sql/functions.rst | 3 + python/pyspark/sql/connect/functions.py | 21 ++++ python/pyspark/sql/functions.py | 110 +++++++++++++++++++++ .../scala/org/apache/spark/sql/functions.scala | 41 ++++++++ .../org/apache/spark/sql/DateFunctionsSuite.scala | 68 +++++++++++++ 16 files changed, 398 insertions(+) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 2ac20bd5911..a3f4a273661 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -4438,6 +4438,56 @@ object functions { */ def hour(e: Column): Column = Column.fn("hour", e) + /** + * Extracts a part of the date/timestamp or interval source. + * + * @param field + * selects which part of the source should be extracted. + * @param source + * a date/timestamp or interval column from where `field` should be extracted. + * @return + * a part of the date/timestamp or interval source + * @group datetime_funcs + * @since 3.5.0 + */ + def extract(field: Column, source: Column): Column = { + Column.fn("extract", field, source) + } + + /** + * Extracts a part of the date/timestamp or interval source. + * + * @param field + * selects which part of the source should be extracted, and supported string values are as + * same as the fields of the equivalent function `extract`. + * @param source + * a date/timestamp or interval column from where `field` should be extracted. + * @return + * a part of the date/timestamp or interval source + * @group datetime_funcs + * @since 3.5.0 + */ + def date_part(field: Column, source: Column): Column = { + Column.fn("date_part", field, source) + } + + /** + * Extracts a part of the date/timestamp or interval source. + * + * @param field + * selects which part of the source should be extracted, and supported string values are as + * same as the fields of the equivalent function `extract`. + * @param source + * a date/timestamp or interval column from where `field` should be extracted. + * @return + * a part of the date/timestamp or interval source + * @group datetime_funcs + * @since 3.5.0 + */ + def datepart(field: Column, source: Column): Column = { + Column.fn("datepart", field, source) + } + /** * Returns the last day of the month which the given date belongs to. For example, input * "2015-07-27" returns "2015-07-31" since July 31 is the last day of the month in July 2015. diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index e212720b84e..2dae9a99146 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -2021,6 +2021,18 @@ class PlanGenerationTestSuite fn.weekofyear(fn.col("d")) } + temporalFunctionTest("extract") { + fn.extract(lit("year"), fn.col("d")) + } + + temporalFunctionTest("date_part") { + fn.date_part(lit("year"), fn.col("d")) + } + + temporalFunctionTest("datepart") { + fn.datepart(lit("year"), fn.col("d")) + } + temporalFunctionTest("from_unixtime") { fn.from_unixtime(lit(1L)) } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain new file mode 100644 index 00000000000..dda95c323c6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_part.explain @@ -0,0 +1,2 @@ +Project [year(d#0) AS date_part(year, d)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain new file mode 100644 index 00000000000..1434b1c6db4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datepart.explain @@ -0,0 +1,2 @@ +Project [year(d#0) AS datepart(year FROM d)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain new file mode 100644 index 00000000000..07f30cadb2e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_extract.explain @@ -0,0 +1,2 @@ +Project [year(d#0) AS extract(year FROM d)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json new file mode 100644 index 00000000000..5e8d075c4e2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "date_part", + "arguments": [{ + "literal": { + "string": "year" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin new file mode 100644 index 00000000000..368ecb676c1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_part.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json new file mode 100644 index 00000000000..cc4dca86742 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "datepart", + "arguments": [{ + "literal": { + "string": "year" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin new file mode 100644 index 00000000000..9d58fd3474d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_datepart.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json new file mode 100644 index 00000000000..6ccdb2f9d75 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "extract", + "arguments": [{ + "literal": { + "string": "year" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin new file mode 100644 index 00000000000..91553c3b94b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_extract.proto.bin differ diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index f41443b8274..8195f06bea8 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -130,9 +130,12 @@ Datetime Functions dateadd datediff day + date_part + datepart dayofmonth dayofweek dayofyear + extract second weekofyear year diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py index 90800106cf7..de43d59773e 100644 --- a/python/pyspark/sql/connect/functions.py +++ b/python/pyspark/sql/connect/functions.py @@ -2740,6 +2740,27 @@ def weekday(col: "ColumnOrName") -> Column: weekday.__doc__ = pysparkfuncs.weekday.__doc__ +def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column: + return _invoke_function_over_columns("extract", field, source) + + +extract.__doc__ = pysparkfuncs.extract.__doc__ + + +def date_part(field: "ColumnOrName", source: "ColumnOrName") -> Column: + return _invoke_function_over_columns("date_part", field, source) + + +extract.__doc__ = pysparkfuncs.extract.__doc__ + + +def datepart(field: "ColumnOrName", source: "ColumnOrName") -> Column: + return _invoke_function_over_columns("datepart", field, source) + + +extract.__doc__ = pysparkfuncs.extract.__doc__ + + def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column: return _invoke_function_over_columns("make_date", year, month, day) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 0cfc19615be..267c7e433bd 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -6029,6 +6029,116 @@ def weekday(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("weekday", col) +@try_remote_functions +def extract(field: "ColumnOrName", source: "ColumnOrName") -> Column: + """ + Extracts a part of the date/timestamp or interval source. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + field : :class:`~pyspark.sql.Column` or str + selects which part of the source should be extracted. + source : :class:`~pyspark.sql.Column` or str + a date/timestamp or interval column from where `field` should be extracted. + + Returns + ------- + :class:`~pyspark.sql.Column` + a part of the date/timestamp or interval source. + + Examples + -------- + >>> import datetime + >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) + >>> df.select( + ... extract(lit('YEAR'), 'ts').alias('year'), + ... extract(lit('month'), 'ts').alias('month'), + ... extract(lit('WEEK'), 'ts').alias('week'), + ... extract(lit('D'), 'ts').alias('day'), + ... extract(lit('M'), 'ts').alias('minute'), + ... extract(lit('S'), 'ts').alias('second') + ... ).collect() + [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))] + """ + return _invoke_function_over_columns("extract", field, source) + + +@try_remote_functions +def date_part(field: "ColumnOrName", source: "ColumnOrName") -> Column: + """ + Extracts a part of the date/timestamp or interval source. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + field : :class:`~pyspark.sql.Column` or str + selects which part of the source should be extracted, and supported string values + are as same as the fields of the equivalent function `extract`. + source : :class:`~pyspark.sql.Column` or str + a date/timestamp or interval column from where `field` should be extracted. + + Returns + ------- + :class:`~pyspark.sql.Column` + a part of the date/timestamp or interval source. + + Examples + -------- + >>> import datetime + >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) + >>> df.select( + ... date_part(lit('YEAR'), 'ts').alias('year'), + ... date_part(lit('month'), 'ts').alias('month'), + ... date_part(lit('WEEK'), 'ts').alias('week'), + ... date_part(lit('D'), 'ts').alias('day'), + ... date_part(lit('M'), 'ts').alias('minute'), + ... date_part(lit('S'), 'ts').alias('second') + ... ).collect() + [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))] + """ + return _invoke_function_over_columns("date_part", field, source) + + +@try_remote_functions +def datepart(field: "ColumnOrName", source: "ColumnOrName") -> Column: + """ + Extracts a part of the date/timestamp or interval source. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + field : :class:`~pyspark.sql.Column` or str + selects which part of the source should be extracted, and supported string values + are as same as the fields of the equivalent function `extract`. + source : :class:`~pyspark.sql.Column` or str + a date/timestamp or interval column from where `field` should be extracted. + + Returns + ------- + :class:`~pyspark.sql.Column` + a part of the date/timestamp or interval source. + + Examples + -------- + >>> import datetime + >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) + >>> df.select( + ... datepart(lit('YEAR'), 'ts').alias('year'), + ... datepart(lit('month'), 'ts').alias('month'), + ... datepart(lit('WEEK'), 'ts').alias('week'), + ... datepart(lit('D'), 'ts').alias('day'), + ... datepart(lit('M'), 'ts').alias('minute'), + ... datepart(lit('S'), 'ts').alias('second') + ... ).collect() + [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))] + """ + return _invoke_function_over_columns("datepart", field, source) + + @try_remote_functions def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column: """ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 7c3f65e2495..6c1b1262c53 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4547,6 +4547,47 @@ object functions { */ def hour(e: Column): Column = withExpr { Hour(e.expr) } + /** + * Extracts a part of the date/timestamp or interval source. + * + * @param field selects which part of the source should be extracted. + * @param source a date/timestamp or interval column from where `field` should be extracted. + * @return a part of the date/timestamp or interval source + * @group datetime_funcs + * @since 3.5.0 + */ + def extract(field: Column, source: Column): Column = withExpr { + UnresolvedFunction("extract", Seq(field.expr, source.expr), isDistinct = false) + } + + /** + * Extracts a part of the date/timestamp or interval source. + * + * @param field selects which part of the source should be extracted, and supported string values + * are as same as the fields of the equivalent function `extract`. + * @param source a date/timestamp or interval column from where `field` should be extracted. + * @return a part of the date/timestamp or interval source + * @group datetime_funcs + * @since 3.5.0 + */ + def date_part(field: Column, source: Column): Column = withExpr { + UnresolvedFunction("date_part", Seq(field.expr, source.expr), isDistinct = false) + } + + /** + * Extracts a part of the date/timestamp or interval source. + * + * @param field selects which part of the source should be extracted, and supported string values + * are as same as the fields of the equivalent function `EXTRACT`. + * @param source a date/timestamp or interval column from where `field` should be extracted. + * @return a part of the date/timestamp or interval source + * @group datetime_funcs + * @since 3.5.0 + */ + def datepart(field: Column, source: Column): Column = withExpr { + UnresolvedFunction("datepart", Seq(field.expr, source.expr), isDistinct = false) + } + /** * Returns the last day of the month which the given date belongs to. * For example, input "2015-07-27" returns "2015-07-31" since July 31 is the last day of the diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 7b13af8bf7d..d5d2fe8a5d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -267,6 +267,74 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { Row(2, 2, 0)) } + test("extract") { + val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") + + checkAnswer( + df.select( + extract(lit("YEAR"), $"a"), + extract(lit("MONTH"), $"a"), + extract(lit("week"), $"b"), + extract(lit("day"), $"b"), + extract(lit("MINUTE"), $"c"), + extract(lit("SECONDS"), $"c")), + Row(2015, 4, 15, 8, 10, 15.000000)) + + checkAnswer( + df.selectExpr( + "extract(YEAR FROM a)", + "extract(MONTH FROM a)", + "extract(week FROM b)", + "extract(day FROM b)", + "extract(MINUTE FROM c)", + "extract(SECONDS FROM c)"), + Row(2015, 4, 15, 8, 10, 15.000000)) + } + + test("date_part & datepart") { + val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") + + checkAnswer( + df.select( + date_part(lit("YEAR"), $"a"), + date_part(lit("MONTH"), $"a"), + date_part(lit("week"), $"b"), + date_part(lit("day"), $"b"), + date_part(lit("MINUTE"), $"c"), + date_part(lit("SECONDS"), $"c")), + Row(2015, 4, 15, 8, 10, 15.000000)) + + checkAnswer( + df.select( + datepart(lit("YEAR"), $"a"), + datepart(lit("MONTH"), $"a"), + datepart(lit("week"), $"b"), + datepart(lit("day"), $"b"), + datepart(lit("MINUTE"), $"c"), + datepart(lit("SECONDS"), $"c")), + Row(2015, 4, 15, 8, 10, 15.000000)) + + checkAnswer( + df.selectExpr( + "date_part('YEAR', a)", + "date_part('MONTH', a)", + "date_part('week', b)", + "date_part('day', b)", + "date_part('MINUTE', c)", + "date_part('SECONDS', c)"), + Row(2015, 4, 15, 8, 10, 15.000000)) + + checkAnswer( + df.selectExpr( + "datepart('YEAR', a)", + "datepart('MONTH', a)", + "datepart('week', b)", + "datepart('day', b)", + "datepart('MINUTE', c)", + "datepart('SECONDS', c)"), + Row(2015, 4, 15, 8, 10, 15.000000)) + } + test("function date_add & dateadd") { val st1 = "2015-06-01 12:34:56" val st2 = "2015-06-02 12:34:56" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org