This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new f41bacb07d04 [SPARK-46326][PYTHON][TESTS] Test missing cases for functions (pyspark.sql.functions)
f41bacb07d04 is described below

commit f41bacb07d04b21d66c0826420a67da41536e445
Author: Hyukjin Kwon <gurwls...@apache.org>
AuthorDate: Fri Dec 8 20:51:25 2023 +0900

    [SPARK-46326][PYTHON][TESTS] Test missing cases for functions (pyspark.sql.functions)

    ### What changes were proposed in this pull request?

    This PR proposes to improve test coverage by adding the tests for full test coverage in `pyspark.sql.functions`. In addition, this PR improves the examples by adding doctests a little bit.

    ### Why are the changes needed?

    For better test coverage, to avoid regressions. They are not being tested:
    https://app.codecov.io/gh/apache/spark/blob/master/python%2Fpyspark%2Fsql%2Fsession.py

    ### Does this PR introduce _any_ user-facing change?

    It contains a bit of docstring improvement. Otherwise, test-only.

    ### How was this patch tested?

    Manually tested the unittests via:

    ```bash
    ./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.tests.connect.test_parity_functions'
    ./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.tests.test_functions'
    ./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.functions.builtin'
    ```

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #44256 from HyukjinKwon/dataframe-test.

    Authored-by: Hyukjin Kwon <gurwls...@apache.org>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/functions/builtin.py    | 78 ++++++++++++++++++++++++++----
 python/pyspark/sql/tests/test_functions.py | 33 ++++++++++++-
 2 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 4f8e6a8e1d14..997b641080cf 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -8279,9 +8279,40 @@ def unix_timestamp(
     Examples
     --------
     >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+
+    Example 1: Returns the current timestamp in UNIX.
+
+    >>> import pyspark.sql.functions as sf
+    >>> spark.range(1).select(sf.unix_timestamp().alias('unix_time')).show()
+    ... # doctest: +SKIP
+    +----------+
+    | unix_time|
+    +----------+
+    |1702018137|
+    +----------+
+
+    Example 2: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string.
+
+    >>> import pyspark.sql.functions as sf
+    >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt'])
+    >>> time_df.select(sf.unix_timestamp('dt').alias('unix_time')).show()
+    +----------+
+    | unix_time|
+    +----------+
+    |1428520332|
+    +----------+
+
+    Example 3: Using user-specified format 'yyyy-MM-dd' parses the timestamp string.
+
+    >>> import pyspark.sql.functions as sf
     >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
-    >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
-    [Row(unix_time=1428476400)]
+    >>> time_df.select(sf.unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).show()
+    +----------+
+    | unix_time|
+    +----------+
+    |1428476400|
+    +----------+
+
     >>> spark.conf.unset("spark.sql.session.timeZone")
     """
     if timestamp is None:
@@ -8569,13 +8600,21 @@ def window(
     Examples
     --------
     >>> import datetime
+    >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame(
     ...     [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],
     ... ).toDF("date", "val")
-    >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
-    >>> w.select(w.window.start.cast("string").alias("start"),
-    ...          w.window.end.cast("string").alias("end"), "sum").collect()
-    [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)]
+    >>> w = df.groupBy(sf.window("date", "5 seconds")).agg(sf.sum("val").alias("sum"))
+    >>> w.select(
+    ...     w.window.start.cast("string").alias("start"),
+    ...     w.window.end.cast("string").alias("end"),
+    ...     "sum"
+    ... ).show()
+    +-------------------+-------------------+---+
+    |              start|                end|sum|
+    +-------------------+-------------------+---+
+    |2016-03-11 09:00:05|2016-03-11 09:00:10|  1|
+    +-------------------+-------------------+---+
     """
 
     def check_string_field(field, fieldName):  # type: ignore[no-untyped-def]
@@ -8737,9 +8776,30 @@ def to_unix_timestamp(
     Examples
     --------
     >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
-    >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
-    >>> df.select(to_unix_timestamp(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
-    [Row(r=1460098800)]
+
+    Example 1: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string.
+
+    >>> import pyspark.sql.functions as sf
+    >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt'])
+    >>> time_df.select(sf.to_unix_timestamp('dt').alias('unix_time')).show()
+    +----------+
+    | unix_time|
+    +----------+
+    |1428520332|
+    +----------+
+
+    Example 2: Using user-specified format 'yyyy-MM-dd' parses the timestamp string.
+
+    >>> import pyspark.sql.functions as sf
+    >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> time_df.select(
+    ...     sf.to_unix_timestamp('dt', sf.lit('yyyy-MM-dd')).alias('unix_time')).show()
+    +----------+
+    | unix_time|
+    +----------+
+    |1428476400|
+    +----------+
+
     >>> spark.conf.unset("spark.sql.session.timeZone")
     """
     if format is not None:
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 8586fac4e86d..b59417d8a310 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -1002,7 +1002,7 @@ class FunctionsTestsMixin:
             [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ["date", "val"]
         )
 
-        w = df.groupBy(F.window("date", "5 seconds")).agg(F.sum("val").alias("sum"))
+        w = df.groupBy(F.window("date", "5 seconds", "5 seconds")).agg(F.sum("val").alias("sum"))
         r = w.select(
             w.window.end.cast("string").alias("end"),
             F.window_time(w.window).cast("string").alias("window_time"),
@@ -1365,6 +1365,37 @@ class FunctionsTestsMixin:
             message_parameters={"arg_name": "numBuckets", "arg_type": "str"},
         )
 
+    def test_to_timestamp_ltz(self):
+        df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+        df = df.select(F.to_timestamp_ltz(df.e, F.lit("yyyy-MM-dd")).alias("r"))
+        self.assertIsInstance(df.first()[0], datetime.datetime)
+
+        df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+        df = df.select(F.to_timestamp_ltz(df.e).alias("r"))
+        self.assertIsInstance(df.first()[0], datetime.datetime)
+
+    def test_to_timestamp_ntz(self):
+        df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+        df = df.select(F.to_timestamp_ntz(df.e).alias("r"))
+        self.assertIsInstance(df.first()[0], datetime.datetime)
+
+    def test_convert_timezone(self):
+        df = self.spark.createDataFrame([("2015-04-08",)], ["dt"])
+        df = df.select(
+            F.convert_timezone(F.lit("America/Los_Angeles"), F.lit("Asia/Hong_Kong"), "dt")
+        )
+        self.assertIsInstance(df.first()[0], datetime.datetime)
+
+    def test_map_concat(self):
+        df = self.spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")
+        self.assertEqual(
+            df.select(F.map_concat(["map1", "map2"]).alias("map3")).first()[0],
+            {1: "a", 2: "b", 3: "c"},
+        )
+
+    def test_version(self):
+        self.assertIsInstance(self.spark.range(1).select(F.version()).first()[0], str)
+
     # SPARK-45216: Fix non-deterministic seeded Dataset APIs
     def test_non_deterministic_with_seed(self):
         df = self.spark.createDataFrame([([*range(0, 10, 1)],)], ["a"])

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
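
For readers who want to try the behaviour that the new `unix_timestamp` doctests pin down without going through `./python/run-tests`, a short standalone sketch follows. It is not part of the commit; it assumes a local PySpark installation, and the application name is an arbitrary placeholder.

```python
# Minimal sketch reproducing the new unix_timestamp doctest behaviour from this commit.
# Assumes PySpark is installed locally; "doctest-check" is an arbitrary app name.
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

spark = SparkSession.builder.appName("doctest-check").getOrCreate()

# The doctests pin the session time zone so the expected epoch values stay stable.
spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")

time_df = spark.createDataFrame([("2015-04-08",)], ["dt"])

# With the user-specified format 'yyyy-MM-dd', midnight 2015-04-08 in
# America/Los_Angeles corresponds to epoch second 1428476400.
time_df.select(sf.unix_timestamp("dt", "yyyy-MM-dd").alias("unix_time")).show()

spark.conf.unset("spark.sql.session.timeZone")
spark.stop()
```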