This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new fb2bee37c96 [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0 fb2bee37c96 is described below commit fb2bee37c964bf2164fc89a0a55085dd0c840b56 Author: zhyhimont <zhyhim...@gmail.com> AuthorDate: Mon Sep 25 15:22:32 2023 +0900 [SPARK-42617][PS] Support `isocalendar` from the pandas 2.0.0 ### What changes were proposed in this pull request? Support `isocalendar` from the pandas 2.0.0 ### Why are the changes needed? When pandas 2.0.0 is released, we should match the behavior in pandas API on Spark. ### Does this PR introduce _any_ user-facing change? Added new method `DatetimeIndex.isocalendar` and removed two deprecated properties `DatetimeIndex.week` and `DatetimeIndex.weekofyear` ``` dfs = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series()) dfs.dt.isocalendar() year week day 2019-12-29 2019 52 7 2019-12-30 2020 1 1 2019-12-31 2020 1 2 2020-01-01 2020 1 3 dfs.dt.isocalendar().week 2019-12-29 52 2019-12-30 1 2019-12-31 1 2020-01-01 1 ``` ### How was this patch tested? UT was updated Closes #40420 from dzhigimont/SPARK-42617_ZH. 
Lead-authored-by: zhyhimont <zhyhim...@gmail.com> Co-authored-by: Zhyhimont Dmitry <zhyhimon...@profitero.com> Co-authored-by: Dmitry Zhyhimont <dzhigim...@mail.ru> Co-authored-by: Zhyhimont Dmitry <dzhigim...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/reference/pyspark.pandas/indexing.rst | 3 +- .../source/reference/pyspark.pandas/series.rst | 3 +- python/pyspark/pandas/datetimes.py | 70 ++++++++++++++++------ python/pyspark/pandas/indexes/base.py | 4 +- python/pyspark/pandas/indexes/datetimes.py | 49 +++++++++------ python/pyspark/pandas/namespace.py | 3 +- .../pyspark/pandas/tests/indexes/test_datetime.py | 28 ++------- .../pandas/tests/indexes/test_datetime_property.py | 19 +----- .../pyspark/pandas/tests/test_series_datetime.py | 17 +----- 9 files changed, 100 insertions(+), 96 deletions(-) diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index 70d463c052a..d6be57ee9c8 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -338,8 +338,7 @@ Time/date components DatetimeIndex.minute DatetimeIndex.second DatetimeIndex.microsecond - DatetimeIndex.week - DatetimeIndex.weekofyear + DatetimeIndex.isocalendar DatetimeIndex.dayofweek DatetimeIndex.day_of_week DatetimeIndex.weekday diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index 552acec096f..7b658d45d4b 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -313,8 +313,7 @@ Datetime Properties Series.dt.minute Series.dt.second Series.dt.microsecond - Series.dt.week - Series.dt.weekofyear + Series.dt.isocalendar Series.dt.dayofweek Series.dt.weekday Series.dt.dayofyear diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py index 
b0649cf5761..4b6e23fae7a 100644 --- a/python/pyspark/pandas/datetimes.py +++ b/python/pyspark/pandas/datetimes.py @@ -18,7 +18,6 @@ """ Date/Time related functions on pandas-on-Spark Series """ -import warnings from typing import Any, Optional, Union, no_type_check import numpy as np @@ -27,7 +26,9 @@ from pandas.tseries.offsets import DateOffset import pyspark.pandas as ps import pyspark.sql.functions as F -from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType, IntegerType +from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, IntegerType +from pyspark.pandas import DataFrame +from pyspark.pandas.config import option_context class DatetimeMethods: @@ -116,26 +117,59 @@ class DatetimeMethods: def nanosecond(self) -> "ps.Series": raise NotImplementedError() - # TODO(SPARK-42617): Support isocalendar.week and replace it. - # See also https://github.com/pandas-dev/pandas/pull/33595. - @property - def week(self) -> "ps.Series": + def isocalendar(self) -> "ps.DataFrame": """ - The week ordinal of the year. + Calculate year, week, and day according to the ISO 8601 standard. - .. deprecated:: 3.4.0 - """ - warnings.warn( - "weekofyear and week have been deprecated.", - FutureWarning, - ) - return self._data.spark.transform(lambda c: F.weekofyear(c).cast(LongType())) + .. versionadded:: 4.0.0 - @property - def weekofyear(self) -> "ps.Series": - return self.week + Returns + ------- + DataFrame + With columns year, week and day. - weekofyear.__doc__ = week.__doc__ + .. 
note:: Returns have int64 type instead of UInt32 as is in pandas due to UInt32 + is not supported by spark + + Examples + -------- + >>> dfs = ps.from_pandas(pd.date_range(start='2019-12-29', freq='D', periods=4).to_series()) + >>> dfs.dt.isocalendar() + year week day + 2019-12-29 2019 52 7 + 2019-12-30 2020 1 1 + 2019-12-31 2020 1 2 + 2020-01-01 2020 1 3 + + >>> dfs.dt.isocalendar().week + 2019-12-29 52 + 2019-12-30 1 + 2019-12-31 1 + 2020-01-01 1 + Name: week, dtype: int64 + """ + + return_types = [self._data.index.dtype, int, int, int] + + def pandas_isocalendar( # type: ignore[no-untyped-def] + pdf, + ) -> ps.DataFrame[return_types]: # type: ignore[valid-type] + # cast to int64 due to UInt32 is not supported by spark + return pdf[pdf.columns[0]].dt.isocalendar().astype(np.int64).reset_index() + + with option_context("compute.default_index_type", "distributed"): + psdf = self._data.to_frame().pandas_on_spark.apply_batch(pandas_isocalendar) + + return DataFrame( + psdf._internal.copy( + spark_frame=psdf._internal.spark_frame, + index_spark_columns=psdf._internal.data_spark_columns[:1], + index_fields=psdf._internal.data_fields[:1], + data_spark_columns=psdf._internal.data_spark_columns[1:], + data_fields=psdf._internal.data_fields[1:], + column_labels=[("year",), ("week",), ("day",)], + ) + ) @property def dayofweek(self) -> "ps.Series": diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py index 48ce22b6e51..c020e918d37 100644 --- a/python/pyspark/pandas/indexes/base.py +++ b/python/pyspark/pandas/indexes/base.py @@ -2007,7 +2007,7 @@ class Index(IndexOpsMixin): if isinstance(self, MultiIndex) and level is not None: self_names = self.names - self_names[level] = names # type: ignore[index] + self_names[level] = names names = self_names return self.rename(name=names, inplace=inplace) @@ -2077,7 +2077,7 @@ class Index(IndexOpsMixin): [isinstance(item, tuple) for item in other] ) if is_other_list_of_tuples: - other = 
MultiIndex.from_tuples(other) # type: ignore[arg-type] + other = MultiIndex.from_tuples(other) else: raise TypeError("other must be a MultiIndex or a list of tuples") diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py index 2c208974167..5a2a347d1ba 100644 --- a/python/pyspark/pandas/indexes/datetimes.py +++ b/python/pyspark/pandas/indexes/datetimes.py @@ -25,6 +25,7 @@ from pandas.tseries.offsets import DateOffset from pyspark._globals import _NoValue from pyspark import pandas as ps +from pyspark.pandas import DataFrame from pyspark.pandas.indexes.base import Index from pyspark.pandas.missing.indexes import MissingPandasLikeDatetimeIndex from pyspark.pandas.series import Series, first_series @@ -232,28 +233,40 @@ class DatetimeIndex(Index): ) return Index(self.to_series().dt.microsecond) - @property - def week(self) -> Index: + def isocalendar(self) -> DataFrame: """ - The week ordinal of the year. + Calculate year, week, and day according to the ISO 8601 standard. - .. deprecated:: 3.5.0 - """ - warnings.warn( - "`week` is deprecated in 3.5.0 and will be removed in 4.0.0.", - FutureWarning, - ) - return Index(self.to_series().dt.week) + .. versionadded:: 4.0.0 - @property - def weekofyear(self) -> Index: - warnings.warn( - "`weekofyear` is deprecated in 3.5.0 and will be removed in 4.0.0.", - FutureWarning, - ) - return Index(self.to_series().dt.weekofyear) + Returns + ------- + DataFrame + With columns year, week and day. - weekofyear.__doc__ = week.__doc__ + .. note:: Returns have int64 type instead of UInt32 as is in pandas due to UInt32 + is not supported by spark + + Examples + -------- + >>> psidxs = ps.from_pandas( + ... pd.DatetimeIndex(["2019-12-29", "2019-12-30", "2019-12-31", "2020-01-01"]) + ... 
) + >>> psidxs.isocalendar() + year week day + 2019-12-29 2019 52 7 + 2019-12-30 2020 1 1 + 2019-12-31 2020 1 2 + 2020-01-01 2020 1 3 + + >>> psidxs.isocalendar().week + 2019-12-29 52 + 2019-12-30 1 + 2019-12-31 1 + 2020-01-01 1 + Name: week, dtype: int64 + """ + return self.to_series().dt.isocalendar() @property def dayofweek(self) -> Index: diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index f7c07b37c16..a700a243e5d 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -158,7 +158,8 @@ def from_pandas(pobj: Union[pd.DataFrame, pd.Series, pd.Index]) -> Union[Series, raise TypeError("Unknown data type: {}".format(type(pobj).__name__)) -_range = range # built-in range +# built-in range +_range: Type[range] = range # type: ignore[assignment] def range( diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index e93ab76186a..4eaefb514d9 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -19,6 +19,7 @@ import datetime from distutils.version import LooseVersion +import numpy as np import pandas as pd import pyspark.pandas as ps @@ -98,28 +99,6 @@ class DatetimeIndexTestsMixin: self.assert_eq(psidx.day_of_year, pidx.day_of_year) self.assert_eq(psidx.day_of_week, pidx.day_of_week) - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - # TODO(SPARK-42617): Support isocalendar.week and replace it. 
- expected_results = [ - ps.Index([1]), - ps.Index([1, 1, 13]), - ps.Index([52, 52, 1]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 1, 2]), - ps.Index([13, 26, 39]), - ] - for psidx, expected_result in zip(self.psidxs, expected_results): - self.assert_eq(psidx.week, expected_result) - self.assert_eq(psidx.weekofyear, expected_result) - else: - for psidx, pidx in self.idx_pairs: - self.assert_eq(psidx.week, pidx.week) - self.assert_eq(psidx.weekofyear, pidx.weekofyear) - def test_ceil(self): for psidx, pidx in self.idx_pairs: for freq in self.fixed_freqs: @@ -267,6 +246,11 @@ class DatetimeIndexTestsMixin: mapper_pser = pd.Series([1, 2, 3], index=pidx) self.assert_eq(psidx.map(mapper_pser), pidx.map(mapper_pser)) + def test_isocalendar(self): + for psidx, pidx in self.idx_pairs: + self.assert_eq(psidx.isocalendar().astype(int), pidx.isocalendar().astype(int)) + self.assert_eq(psidx.isocalendar().week, pidx.isocalendar().week.astype(np.int64)) + class DatetimeIndexTests(DatetimeIndexTestsMixin, PandasOnSparkTestCase, TestUtils): pass diff --git a/python/pyspark/pandas/tests/indexes/test_datetime_property.py b/python/pyspark/pandas/tests/indexes/test_datetime_property.py index 523b8bdda4a..0ab17664b9f 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime_property.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime_property.py @@ -18,6 +18,7 @@ import datetime import unittest +import numpy as np import pandas as pd import pyspark.pandas as ps @@ -83,23 +84,7 @@ class DatetimeIndexPropertyTestsMixin: self.assert_eq(psidx.is_leap_year, pd.Index(pidx.is_leap_year)) self.assert_eq(psidx.day_of_year, pidx.day_of_year) self.assert_eq(psidx.day_of_week, pidx.day_of_week) - - # TODO(SPARK-42617): Support isocalendar.week and replace it. 
- expected_results = [ - ps.Index([1]), - ps.Index([1, 1, 13]), - ps.Index([52, 52, 1]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 52, 52]), - ps.Index([52, 1, 2]), - ps.Index([13, 26, 39]), - ] - for psidx, expected_result in zip(self.psidxs, expected_results): - self.assert_eq(psidx.week, expected_result) - self.assert_eq(psidx.weekofyear, expected_result) + self.assert_eq(psidx.isocalendar().week, pidx.isocalendar().week.astype(np.int64)) class DatetimeIndexPropertyTests(DatetimeIndexPropertyTestsMixin, PandasOnSparkTestCase, TestUtils): diff --git a/python/pyspark/pandas/tests/test_series_datetime.py b/python/pyspark/pandas/tests/test_series_datetime.py index 7e05364ca5f..c7ffc0675c6 100644 --- a/python/pyspark/pandas/tests/test_series_datetime.py +++ b/python/pyspark/pandas/tests/test_series_datetime.py @@ -197,23 +197,12 @@ class SeriesDateTimeTestsMixin: with self.assertRaises(NotImplementedError): self.check_func(lambda x: x.dt.nanosecond) - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-42617): Support `isocalendar`", - ) - def test_week(self): - self.check_func(lambda x: x.dt.week) - - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-42617): Support `isocalendar`", - ) - def test_weekofyear(self): - self.check_func(lambda x: x.dt.weekofyear) - def test_dayofweek(self): self.check_func(lambda x: x.dt.dayofweek) + def test_isocalendar(self): + self.check_func(lambda x: x.dt.isocalendar().astype(np.int64)) + def test_weekday(self): self.check_func(lambda x: x.dt.weekday) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org