This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new e67d39f28f9 [SPARK-41868][CONNECT][PYTHON] Fix createDataFrame to support durations
e67d39f28f9 is described below

commit e67d39f28f931885d4204596389d245586d9863f
Author: Takuya UESHIN <ues...@databricks.com>
AuthorDate: Wed Mar 1 09:41:42 2023 +0800

    [SPARK-41868][CONNECT][PYTHON] Fix createDataFrame to support durations

    ### What changes were proposed in this pull request?

    Fixes `createDataFrame` to support durations.

    ### Why are the changes needed?

    Currently the following command:

    ```py
    spark.createDataFrame(pd.DataFrame({"a": [timedelta(microseconds=123)]}))
    ```

    raises an error:

    ```
    [UNSUPPORTED_ARROWTYPE] Unsupported arrow type Duration(NANOSECOND).
    ```

    because the type Arrow uses for `timedelta` objects differs from what Spark expects.

    ### Does this PR introduce _any_ user-facing change?

    `timedelta` objects will be properly converted to `DayTimeIntervalType`.

    ### How was this patch tested?

    Enabled the related test.

    Closes #40226 from ueshin/issues/SPARK-41868/duration.
Authored-by: Takuya UESHIN <ues...@databricks.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> (cherry picked from commit b43b1337066a2d48d6de1df9f3955cddc105f57a) Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/sql/connect/session.py | 4 ++++ python/pyspark/sql/tests/connect/test_parity_dataframe.py | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py index d6ba9a36f96..d82dbcb2db0 100644 --- a/python/pyspark/sql/connect/session.py +++ b/python/pyspark/sql/connect/session.py @@ -43,6 +43,7 @@ import pyarrow as pa from pandas.api.types import ( # type: ignore[attr-defined] is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, ) from pyspark import SparkContext, SparkConf, __version__ @@ -60,6 +61,7 @@ from pyspark.sql.types import ( _merge_type, Row, DataType, + DayTimeIntervalType, StructType, AtomicType, TimestampType, @@ -245,6 +247,8 @@ class SparkSession: arrow_types = [ to_arrow_type(TimestampType()) if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) + else to_arrow_type(DayTimeIntervalType()) + if is_timedelta64_dtype(t) else None for t in data.dtypes ] diff --git a/python/pyspark/sql/tests/connect/test_parity_dataframe.py b/python/pyspark/sql/tests/connect/test_parity_dataframe.py index da1172086cc..c5972ac02ae 100644 --- a/python/pyspark/sql/tests/connect/test_parity_dataframe.py +++ b/python/pyspark/sql/tests/connect/test_parity_dataframe.py @@ -22,11 +22,6 @@ from pyspark.testing.connectutils import ReusedConnectTestCase class DataFrameParityTests(DataFrameTestsMixin, ReusedConnectTestCase): - # TODO(SPARK-41868): Support data type Duration(NANOSECOND) - @unittest.skip("Fails in Spark Connect, should enable.") - def test_create_dataframe_from_pandas_with_day_time_interval(self): - super().test_create_dataframe_from_pandas_with_day_time_interval() - # TODO(SPARK-41834): Implement SparkSession.conf 
@unittest.skip("Fails in Spark Connect, should enable.") def test_create_dataframe_from_pandas_with_dst(self): --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org