This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 678f47264e0  [SPARK-43562][SPARK-43870][PS] Remove APIs from `DataFrame` and `Series`
678f47264e0 is described below

commit 678f47264e084af766ed339df21513f44d05897f
Author: itholic <haejoon....@databricks.com>
AuthorDate: Fri Aug 4 10:36:04 2023 +0900

    [SPARK-43562][SPARK-43870][PS] Remove APIs from `DataFrame` and `Series`

    ### What changes were proposed in this pull request?

    This PR proposes to remove the DataFrame/Series APIs that were removed in [pandas 2](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html) and above.

    ### Why are the changes needed?

    To match the behavior with pandas.

    ### Does this PR introduce _any_ user-facing change?

    Yes, `(DataFrame|Series).(iteritems|mad|append)` will be removed.

    ### How was this patch tested?

    Enabled the existing tests.

    Closes #42268 from itholic/pandas_remove_df_api.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../source/migration_guide/pyspark_upgrade.rst     |  11 ++
 .../docs/source/reference/pyspark.pandas/frame.rst |   3 -
 .../source/reference/pyspark.pandas/groupby.rst    |   1 -
 .../source/reference/pyspark.pandas/series.rst     |   3 -
 python/pyspark/pandas/frame.py                     | 204 +--------------------
 python/pyspark/pandas/groupby.py                   |  81 --------
 python/pyspark/pandas/namespace.py                 |   1 -
 python/pyspark/pandas/series.py                    | 112 +----------
 .../pandas/tests/computation/test_combine.py       |  71 ++-----
 .../pandas/tests/computation/test_compute.py       |  34 ----
 python/pyspark/pandas/tests/groupby/test_stat.py   |   7 -
 .../pyspark/pandas/tests/indexes/test_indexing.py  |   8 +-
 python/pyspark/pandas/tests/series/test_compute.py |  18 +-
 python/pyspark/pandas/tests/series/test_series.py  |   8 +-
 python/pyspark/pandas/tests/series/test_stat.py    |  35 ----
 15 files changed, 41 insertions(+), 556 deletions(-)

diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 7513d64ef6c..9bd879fb1a1 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -19,6 +19,17 @@ Upgrading PySpark
 ==================
 
+Upgrading from PySpark 3.5 to 4.0
+---------------------------------
+
+* In Spark 4.0, ``DataFrame.iteritems`` has been removed from pandas API on Spark, use ``DataFrame.items`` instead.
+* In Spark 4.0, ``Series.iteritems`` has been removed from pandas API on Spark, use ``Series.items`` instead.
+* In Spark 4.0, ``DataFrame.append`` has been removed from pandas API on Spark, use ``ps.concat`` instead.
+* In Spark 4.0, ``Series.append`` has been removed from pandas API on Spark, use ``ps.concat`` instead.
+* In Spark 4.0, ``DataFrame.mad`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``Series.mad`` has been removed from pandas API on Spark.
+
+
 Upgrading from PySpark 3.3 to 3.4
 ---------------------------------
 
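Taken together, the replacements map onto existing APIs as in this minimal doctest-style sketch (the printed output is illustrative and assumes an active Spark session; the ``mad`` workaround is the one suggested by the deprecation warnings in the removed code below):

>>> import pyspark.pandas as ps
>>> psdf = ps.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> for label, content in psdf.items():  # replaces psdf.iteritems()
...     print(label)
A
B
>>> ps.concat([psdf, psdf], ignore_index=True)  # replaces psdf.append(psdf, ignore_index=True)
   A  B
0  1  3
1  2  4
2  1  3
3  2  4
>>> psser = ps.Series([1, 2, 3, 4])
>>> (psser - psser.mean()).abs().mean()  # replaces psser.mad()
1.0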
diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst
index a8d114187b9..5f839a803d7 100644
--- a/python/docs/source/reference/pyspark.pandas/frame.rst
+++ b/python/docs/source/reference/pyspark.pandas/frame.rst
@@ -79,7 +79,6 @@ Indexing, iteration
    DataFrame.iloc
    DataFrame.insert
    DataFrame.items
-   DataFrame.iteritems
    DataFrame.iterrows
    DataFrame.itertuples
    DataFrame.keys
@@ -155,7 +154,6 @@ Computations / Descriptive Stats
    DataFrame.ewm
    DataFrame.kurt
    DataFrame.kurtosis
-   DataFrame.mad
    DataFrame.max
    DataFrame.mean
    DataFrame.min
@@ -252,7 +250,6 @@ Combining / joining / merging
 .. autosummary::
    :toctree: api/
 
-   DataFrame.append
    DataFrame.assign
    DataFrame.merge
    DataFrame.join
diff --git a/python/docs/source/reference/pyspark.pandas/groupby.rst b/python/docs/source/reference/pyspark.pandas/groupby.rst
index da1579fd723..e71e81c56dd 100644
--- a/python/docs/source/reference/pyspark.pandas/groupby.rst
+++ b/python/docs/source/reference/pyspark.pandas/groupby.rst
@@ -68,7 +68,6 @@ Computations / Descriptive Stats
    GroupBy.filter
    GroupBy.first
    GroupBy.last
-   GroupBy.mad
    GroupBy.max
    GroupBy.mean
    GroupBy.median
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index a0119593f96..552acec096f 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -70,7 +70,6 @@ Indexing, iteration
    Series.keys
    Series.pop
    Series.items
-   Series.iteritems
    Series.item
    Series.xs
    Series.get
@@ -148,7 +147,6 @@ Computations / Descriptive Stats
    Series.ewm
    Series.filter
    Series.kurt
-   Series.mad
    Series.max
    Series.mean
    Series.min
@@ -247,7 +245,6 @@ Combining / joining / merging
 .. autosummary::
    :toctree: api/
 
-   Series.append
    Series.compare
    Series.replace
    Series.update
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index d8a3f812c33..b960b3444e3 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -1880,11 +1880,9 @@ class DataFrame(Frame, Generic[T]):
         polar       bear      22000
         koala  marsupial      80000
 
-        >>> for label, content in df.iteritems():
+        >>> for label, content in df.items():
         ...     print('label:', label)
         ...     print('content:', content.to_string())
-        ...
-        ...  # doctest: +SKIP
 
         label: species
         content: panda         bear
         polar         bear
@@ -2057,20 +2055,6 @@
         ):
             yield tuple(([k] if index else []) + list(v))
 
-    def iteritems(self) -> Iterator[Tuple[Name, "Series"]]:
-        """
-        This is an alias of ``items``.
-
-        .. deprecated:: 3.4.0
-            iteritems is deprecated and will be removed in a future version.
-            Use .items instead.
-        """
-        warnings.warn(
-            "Deprecated in 3.4.0, and will be removed in 4.0.0. Use DataFrame.items instead.",
-            FutureWarning,
-        )
-        return self.items()
-
     def to_clipboard(self, excel: bool = True, sep: Optional[str] = None, **kwargs: Any) -> None:
         """
         Copy object to the system clipboard.
@@ -8837,91 +8821,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         )
         return DataFrame(internal)
 
-    def append(
-        self,
-        other: "DataFrame",
-        ignore_index: bool = False,
-        verify_integrity: bool = False,
-        sort: bool = False,
-    ) -> "DataFrame":
-        """
-        Append rows of other to the end of caller, returning a new object.
-
-        Columns in other that are not in the caller are added as new columns.
-
-        .. deprecated:: 3.4.0
-
-        Parameters
-        ----------
-        other : DataFrame or Series/dict-like object, or list of these
-            The data to append.
-        ignore_index : boolean, default False
-            If True, do not use the index labels.
-        verify_integrity : boolean, default False
-            If True, raise ValueError on creating index with duplicates.
-        sort : boolean, default False
-            Currently not supported.
-
-        Returns
-        -------
-        appended : DataFrame
-
-        Examples
-        --------
-        >>> df = ps.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
-
-        >>> df.append(df)
-           A  B
-        0  1  2
-        1  3  4
-        0  1  2
-        1  3  4
-
-        >>> df.append(df, ignore_index=True)
-           A  B
-        0  1  2
-        1  3  4
-        2  1  2
-        3  3  4
-        """
-        warnings.warn(
-            "The DataFrame.append method is deprecated "
-            "and will be removed in 4.0.0. "
-            "Use pyspark.pandas.concat instead.",
-            FutureWarning,
-        )
-        if isinstance(other, ps.Series):
-            raise TypeError("DataFrames.append() does not support appending Series to DataFrames")
-        if sort:
-            raise NotImplementedError("The 'sort' parameter is currently not supported")
-
-        if not ignore_index:
-            index_scols = self._internal.index_spark_columns
-            if len(index_scols) != other._internal.index_level:
-                raise ValueError("Both DataFrames have to have the same number of index levels")
-
-            if (
-                verify_integrity
-                and len(index_scols) > 0
-                and (
-                    self._internal.spark_frame.select(index_scols)
-                    .intersect(
-                        other._internal.spark_frame.select(other._internal.index_spark_columns)
-                    )
-                    .count()
-                )
-                > 0
-            ):
-                raise ValueError("Indices have overlapping values")
-
-        # Lazy import to avoid circular dependency issues
-        from pyspark.pandas.namespace import concat
-
-        return cast(DataFrame, concat([self, other], ignore_index=ignore_index))
-
     # TODO: add 'filter_func' and 'errors' parameter
     def update(self, other: "DataFrame", join: str = "left", overwrite: bool = True) -> None:
         """
@@ -12719,107 +12618,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         result_df: DataFrame = DataFrame(internal)
         return result_df.reset_index(drop=True) if ignore_index else result_df
 
-    def mad(self, axis: Axis = 0) -> "Series":
-        """
-        Return the mean absolute deviation of values.
-
-        .. deprecated:: 3.4.0
-
-        Parameters
-        ----------
-        axis : {index (0), columns (1)}
-            Axis for the function to be applied on.
-
-        Examples
-        --------
-        >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},
-        ...                   columns=['a', 'b'])
-
-        >>> df.mad()
-        a    0.666667
-        b    0.066667
-        dtype: float64
-
-        >>> df.mad(axis=1)  # doctest: +SKIP
-        0    0.45
-        1    0.90
-        2    1.35
-        3     NaN
-        dtype: float64
-        """
-        warnings.warn(
-            "The 'mad' method is deprecated and will be removed in 4.0.0. "
-            "To compute the same result, you may do `(df - df.mean()).abs().mean()`.",
-            FutureWarning,
-        )
-        from pyspark.pandas.series import first_series
-
-        axis = validate_axis(axis)
-
-        if axis == 0:
-
-            def get_spark_column(psdf: DataFrame, label: Label) -> PySparkColumn:
-                scol = psdf._internal.spark_column_for(label)
-                col_type = psdf._internal.spark_type_for(label)
-
-                if isinstance(col_type, BooleanType):
-                    scol = scol.cast("integer")
-
-                return scol
-
-            new_column_labels: List[Label] = []
-            for label in self._internal.column_labels:
-                # Filtering out only columns of numeric and boolean type column.
-                dtype = self._psser_for(label).spark.data_type
-                if isinstance(dtype, (NumericType, BooleanType)):
-                    new_column_labels.append(label)
-
-            new_columns = [
-                F.avg(get_spark_column(self, label)).alias(name_like_string(label))
-                for label in new_column_labels
-            ]
-
-            mean_data = self._internal.spark_frame.select(*new_columns).first()
-
-            new_columns = [
-                F.avg(
-                    F.abs(get_spark_column(self, label) - mean_data[name_like_string(label)])
-                ).alias(name_like_string(label))
-                for label in new_column_labels
-            ]
-
-            sdf = self._internal.spark_frame.select(
-                *[F.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)], *new_columns
-            )
-
-            # The data is expected to be small so it's fine to transpose/use the default index.
-            with ps.option_context("compute.max_rows", 1):
-                internal = InternalFrame(
-                    spark_frame=sdf,
-                    index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
-                    column_labels=new_column_labels,
-                    column_label_names=self._internal.column_label_names,
-                )
-                return first_series(DataFrame(internal).transpose())
-
-        else:
-
-            @pandas_udf(returnType=DoubleType())  # type: ignore[call-overload]
-            def calculate_columns_axis(*cols: pd.Series) -> pd.Series:
-                return pd.concat(cols, axis=1).mad(axis=1)
-
-            internal = self._internal.copy(
-                column_labels=[None],
-                data_spark_columns=[
-                    calculate_columns_axis(*self._internal.data_spark_columns).alias(
-                        SPARK_DEFAULT_SERIES_NAME
-                    )
-                ],
-                data_fields=[None],
-                column_label_names=None,
-            )
-            return first_series(DataFrame(internal))
-
     def mode(self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True) -> "DataFrame":
         """
         Get the mode(s) of each element along the selected axis.
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 663a635668e..2de32817793 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -991,87 +991,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
             bool_to_numeric=True,
         )
 
-    # TODO: 'axis', 'skipna', 'level' parameter should be implemented.
-    def mad(self) -> FrameLike:
-        """
-        Compute mean absolute deviation of groups, excluding missing values.
-
-        .. versionadded:: 3.4.0
-
-        .. deprecated:: 3.4.0
-
-        Examples
-        --------
-        >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],
-        ...                    "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
-
-        >>> df.groupby("A").mad()
-                  B         C
-        A
-        1  0.444444  0.444444
-        2  0.000000  0.000000
-
-        >>> df.B.groupby(df.A).mad()
-        A
-        1    0.444444
-        2    0.000000
-        Name: B, dtype: float64
-
-        See Also
-        --------
-        pyspark.pandas.Series.groupby
-        pyspark.pandas.DataFrame.groupby
-        """
-        warnings.warn(
-            "The 'mad' method is deprecated and will be removed in a future version. "
-            "To compute the same result, you may do `(group_df - group_df.mean()).abs().mean()`.",
-            FutureWarning,
-        )
-        groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(self._groupkeys))]
-        internal, agg_columns, sdf = self._prepare_reduce(
-            groupkey_names=groupkey_names,
-            accepted_spark_types=(NumericType, BooleanType),
-            bool_to_numeric=False,
-        )
-        psdf: DataFrame = DataFrame(internal)
-
-        if len(psdf._internal.column_labels) > 0:
-            window = Window.partitionBy(groupkey_names).rowsBetween(
-                Window.unboundedPreceding, Window.unboundedFollowing
-            )
-            new_agg_scols = {}
-            new_stat_scols = []
-            for agg_column in agg_columns:
-                # it is not able to directly use 'self._reduce_for_stat_function', due to
-                # 'it is not allowed to use a window function inside an aggregate function'.
-                # so we need to create temporary columns to compute the 'abs(x - avg(x))' here.
-                agg_column_name = agg_column._internal.data_spark_column_names[0]
-                new_agg_column_name = verify_temp_column_name(
-                    psdf._internal.spark_frame, "__tmp_agg_col_{}__".format(agg_column_name)
-                )
-                casted_agg_scol = F.col(agg_column_name).cast("double")
-                new_agg_scols[new_agg_column_name] = F.abs(
-                    casted_agg_scol - F.avg(casted_agg_scol).over(window)
-                )
-                new_stat_scols.append(F.avg(F.col(new_agg_column_name)).alias(agg_column_name))
-
-            sdf = (
-                psdf._internal.spark_frame.withColumns(new_agg_scols)
-                .groupby(groupkey_names)
-                .agg(*new_stat_scols)
-            )
-        else:
-            sdf = sdf.select(*groupkey_names).distinct()
-
-        internal = internal.copy(
-            spark_frame=sdf,
-            index_spark_columns=[scol_for(sdf, col) for col in groupkey_names],
-            data_spark_columns=[scol_for(sdf, col) for col in internal.data_spark_column_names],
-            data_fields=None,
-        )
-
-        return self._prepare_return(DataFrame(internal))
-
     def sem(self, ddof: int = 1) -> FrameLike:
         """
         Compute standard error of the mean of groups, excluding missing values.
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 3563a6d81b4..5ffec6bedb9 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -2365,7 +2365,6 @@ def concat(
 
     See Also
     --------
-    Series.append : Concatenate Series.
     DataFrame.join : Join DataFrames using indexes.
     DataFrame.merge : Merge DataFrames by indexes or columns.
 
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 95ca92e7878..9fbbadd5420 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -3584,71 +3584,6 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         """
         return self.sort_values(ascending=False).head(n)
 
-    def append(
-        self, to_append: "Series", ignore_index: bool = False, verify_integrity: bool = False
-    ) -> "Series":
-        """
-        Concatenate two or more Series.
-
-        .. deprecated:: 3.4.0
-
-        Parameters
-        ----------
-        to_append : Series or list/tuple of Series
-        ignore_index : boolean, default False
-            If True, do not use the index labels.
-        verify_integrity : boolean, default False
-            If True, raise Exception on creating index with duplicates
-
-        Returns
-        -------
-        appended : Series
-
-        Examples
-        --------
-        >>> s1 = ps.Series([1, 2, 3])
-        >>> s2 = ps.Series([4, 5, 6])
-        >>> s3 = ps.Series([4, 5, 6], index=[3,4,5])
-
-        >>> s1.append(s2)  # doctest: +SKIP
-        0    1
-        1    2
-        2    3
-        0    4
-        1    5
-        2    6
-        dtype: int64
-
-        >>> s1.append(s3)  # doctest: +SKIP
-        0    1
-        1    2
-        2    3
-        3    4
-        4    5
-        5    6
-        dtype: int64
-
-        With ignore_index set to True:
-
-        >>> s1.append(s2, ignore_index=True)  # doctest: +SKIP
-        0    1
-        1    2
-        2    3
-        3    4
-        4    5
-        5    6
-        dtype: int64
-        """
-        warnings.warn(
-            "The Series.append method is deprecated "
-            "and will be removed in 4.0.0. "
-            "Use pyspark.pandas.concat instead.",
-            FutureWarning,
-        )
-        return first_series(
-            self.to_frame().append(to_append.to_frame(), ignore_index, verify_integrity)
-        ).rename(self.name)
-
     def sample(
         self,
         n: Optional[int] = None,
@@ -5939,37 +5874,6 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         pdf.columns = pd.Index(where)
         return first_series(DataFrame(pdf.transpose())).rename(self.name)
 
-    def mad(self) -> float:
-        """
-        Return the mean absolute deviation of values.
-
-        .. deprecated:: 3.4.0
-
-        Examples
-        --------
-        >>> s = ps.Series([1, 2, 3, 4])
-        >>> s
-        0    1
-        1    2
-        2    3
-        3    4
-        dtype: int64
-
-        >>> s.mad()
-        1.0
-        """
-        warnings.warn(
-            "The 'mad' method is deprecated and will be removed in 4.0.0. "
-            "To compute the same result, you may do `(series - series.mean()).abs().mean()`.",
-            FutureWarning,
-        )
-        sdf = self._internal.spark_frame
-        spark_column = self.spark.column
-        avg = unpack_scalar(sdf.select(F.avg(spark_column)))
-        mad = unpack_scalar(sdf.select(F.avg(F.abs(spark_column - avg))))
-
-        return mad
-
     def unstack(self, level: int = -1) -> DataFrame:
         """
         Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
@@ -6083,7 +5987,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         This method returns an iterable tuple (index, value). This is
         convenient if you want to create a lazy iterator.
 
-        .. note:: Unlike pandas', the iteritems in pandas-on-Spark returns generator rather
+        .. note:: Unlike pandas', the items in pandas-on-Spark returns generator rather
             zip object
 
         Returns
         -------
@@ -6123,20 +6027,6 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         ):
             yield k, v
 
-    def iteritems(self) -> Iterable[Tuple[Name, Any]]:
-        """
-        This is an alias of ``items``.
-
-        .. deprecated:: 3.4.0
-            iteritems is deprecated and will be removed in a future version.
-            Use .items instead.
-        """
-        warnings.warn(
-            "Deprecated in 3.4, and will be removed in 4.0.0. Use Series.items instead.",
-            FutureWarning,
-        )
-        return self.items()
-
     def droplevel(self, level: Union[int, Name, List[Union[int, Name]]]) -> "Series":
         """
         Return Series with requested index level(s) removed.
diff --git a/python/pyspark/pandas/tests/computation/test_combine.py b/python/pyspark/pandas/tests/computation/test_combine.py
index dd55c0fd686..adba20b5d99 100644
--- a/python/pyspark/pandas/tests/computation/test_combine.py
+++ b/python/pyspark/pandas/tests/computation/test_combine.py
@@ -41,46 +41,26 @@ class FrameCombineMixin:
         psdf = ps.from_pandas(pdf)
         return pdf, psdf
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43562): Enable DataFrameTests.test_append for pandas 2.0.0.",
-    )
-    def test_append(self):
+    def test_concat(self):
         pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"))
         psdf = ps.from_pandas(pdf)
         other_pdf = pd.DataFrame([[3, 4], [5, 6]], columns=list("BC"), index=[2, 3])
         other_psdf = ps.from_pandas(other_pdf)
 
-        self.assert_eq(psdf.append(psdf), pdf.append(pdf))
-        self.assert_eq(psdf.append(psdf, ignore_index=True), pdf.append(pdf, ignore_index=True))
+        self.assert_eq(ps.concat([psdf, psdf]), pd.concat([pdf, pdf]))
+        self.assert_eq(
+            ps.concat([psdf, psdf], ignore_index=True), pd.concat([pdf, pdf], ignore_index=True)
+        )
 
         # Assert DataFrames with non-matching columns
-        self.assert_eq(psdf.append(other_psdf), pdf.append(other_pdf))
-
-        # Assert appending a Series fails
-        msg = "DataFrames.append() does not support appending Series to DataFrames"
-        with self.assertRaises(TypeError, msg=msg):
-            psdf.append(psdf["A"])
-
-        # Assert using the sort parameter raises an exception
-        msg = "The 'sort' parameter is currently not supported"
-        with self.assertRaises(NotImplementedError, msg=msg):
-            psdf.append(psdf, sort=True)
+        self.assert_eq(ps.concat([psdf, other_psdf]), pd.concat([pdf, other_pdf]))
 
-        # Assert using 'verify_integrity' only raises an exception for overlapping indices
-        self.assert_eq(
-            psdf.append(other_psdf, verify_integrity=True),
-            pdf.append(other_pdf, verify_integrity=True),
-        )
-        msg = "Indices have overlapping values"
-        with self.assertRaises(ValueError, msg=msg):
-            psdf.append(psdf, verify_integrity=True)
+        ps.concat([psdf, psdf["A"]])
+        # Assert appending a Series
+        self.assert_eq(ps.concat([psdf, psdf["A"]]), pd.concat([pdf, pdf["A"]]))
 
-        # Skip integrity verification when ignore_index=True
-        self.assert_eq(
-            psdf.append(psdf, ignore_index=True, verify_integrity=True),
-            pdf.append(pdf, ignore_index=True, verify_integrity=True),
-        )
+        # Assert using the sort parameter
+        self.assert_eq(ps.concat([psdf, psdf], sort=True), pd.concat([pdf, pdf], sort=True))
 
         # Assert appending multi-index DataFrames
         multi_index_pdf = pd.DataFrame(
             [[1, 2], [3, 4]], columns=list("AB"), index=[[2, 3], [4, 5]]
         )
@@ -91,45 +71,32 @@ class FrameCombineMixin:
         other_multi_index_psdf = ps.from_pandas(other_multi_index_pdf)
 
         self.assert_eq(
-            multi_index_psdf.append(multi_index_psdf), multi_index_pdf.append(multi_index_pdf)
+            ps.concat([multi_index_psdf, multi_index_psdf]),
+            pd.concat([multi_index_pdf, multi_index_pdf]),
         )
 
         # Assert DataFrames with non-matching columns
         self.assert_eq(
-            multi_index_psdf.append(other_multi_index_psdf),
-            multi_index_pdf.append(other_multi_index_pdf),
-        )
-
-        # Assert using 'verify_integrity' only raises an exception for overlapping indices
-        self.assert_eq(
-            multi_index_psdf.append(other_multi_index_psdf, verify_integrity=True),
-            multi_index_pdf.append(other_multi_index_pdf, verify_integrity=True),
-        )
-        with self.assertRaises(ValueError, msg=msg):
-            multi_index_psdf.append(multi_index_psdf, verify_integrity=True)
-
-        # Skip integrity verification when ignore_index=True
-        self.assert_eq(
-            multi_index_psdf.append(multi_index_psdf, ignore_index=True, verify_integrity=True),
-            multi_index_pdf.append(multi_index_pdf, ignore_index=True, verify_integrity=True),
+            ps.concat([multi_index_psdf, other_multi_index_psdf]),
+            pd.concat([multi_index_pdf, other_multi_index_pdf]),
         )
 
         # Assert trying to append DataFrames with different index levels
         msg = "Both DataFrames have to have the same number of index levels"
         with self.assertRaises(ValueError, msg=msg):
-            psdf.append(multi_index_psdf)
+            ps.concat([psdf, multi_index_psdf])
 
         # Skip index level check when ignore_index=True
         self.assert_eq(
-            psdf.append(multi_index_psdf, ignore_index=True),
-            pdf.append(multi_index_pdf, ignore_index=True),
+            ps.concat([psdf, other_multi_index_psdf], ignore_index=True),
+            pd.concat([pdf, other_multi_index_pdf], ignore_index=True),
        )
 
         columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y")])
         pdf.columns = columns
         psdf.columns = columns
 
-        self.assert_eq(psdf.append(psdf), pdf.append(pdf))
+        self.assert_eq(ps.concat([psdf, psdf]), pd.concat([pdf, pdf]))
 
     def test_merge(self):
         left_pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/computation/test_compute.py b/python/pyspark/pandas/tests/computation/test_compute.py
index 5ce273c1f47..d4b49f2ac8b 100644
--- a/python/pyspark/pandas/tests/computation/test_compute.py
+++ b/python/pyspark/pandas/tests/computation/test_compute.py
@@ -78,40 +78,6 @@ class FrameComputeMixin:
         str_psdf = ps.DataFrame({"A": ["a", "b", "c"]}, index=np.random.rand(3))
         self.assert_eq(str_psdf.clip(1, 3), str_psdf)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43560): Enable DataFrameSlowTests.test_mad for pandas 2.0.0.",
-    )
-    def test_mad(self):
-        pdf = pd.DataFrame(
-            {
-                "A": [1, 2, None, 4, np.nan],
-                "B": [-0.1, 0.2, -0.3, np.nan, 0.5],
-                "C": ["a", "b", "c", "d", "e"],
-            }
-        )
-        psdf = ps.from_pandas(pdf)
-
-        self.assert_eq(psdf.mad(), pdf.mad())
-        self.assert_eq(psdf.mad(axis=1), pdf.mad(axis=1))
-
-        with self.assertRaises(ValueError):
-            psdf.mad(axis=2)
-
-        # MultiIndex columns
-        columns = pd.MultiIndex.from_tuples([("A", "X"), ("A", "Y"), ("A", "Z")])
-        pdf.columns = columns
-        psdf.columns = columns
-
-        self.assert_eq(psdf.mad(), pdf.mad())
-        self.assert_eq(psdf.mad(axis=1), pdf.mad(axis=1))
-
-        pdf = pd.DataFrame({"A": [True, True, False, False], "B": [True, False, False, True]})
-        psdf = ps.from_pandas(pdf)
-
-        self.assert_eq(psdf.mad(), pdf.mad())
-        self.assert_eq(psdf.mad(axis=1), pdf.mad(axis=1))
-
     def test_mode(self):
         pdf = pd.DataFrame(
             {
diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py
index bfdeeecce30..8a5096942e6 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat.py
@@ -206,13 +206,6 @@ class GroupbyStatMixin:
             psdf.groupby("A").sum(min_count=3).sort_index(),
         )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43553): Enable GroupByTests.test_mad for pandas 2.0.0.",
-    )
-    def test_mad(self):
-        self._test_stat_func(lambda groupby_obj: groupby_obj.mad())
-
     def test_first(self):
         self._test_stat_func(lambda groupby_obj: groupby_obj.first())
         self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=None))
diff --git a/python/pyspark/pandas/tests/indexes/test_indexing.py b/python/pyspark/pandas/tests/indexes/test_indexing.py
index 64fc75347ba..111dd09696d 100644
--- a/python/pyspark/pandas/tests/indexes/test_indexing.py
+++ b/python/pyspark/pandas/tests/indexes/test_indexing.py
@@ -53,11 +53,7 @@ class FrameIndexingMixin:
         with option_context("compute.ordered_head", True):
             self.assert_eq(psdf.head(), pdf.head())
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43559): Enable DataFrameSlowTests.test_iteritems for pandas 2.0.0.",
-    )
-    def test_iteritems(self):
+    def test_items(self):
         pdf = pd.DataFrame(
             {"species": ["bear", "bear", "marsupial"], "population": [1864, 22000, 80000]},
             index=["panda", "polar", "koala"],
@@ -65,7 +61,7 @@ class FrameIndexingMixin:
         )
         psdf = ps.from_pandas(pdf)
 
-        for (p_name, p_items), (k_name, k_items) in zip(pdf.iteritems(), psdf.iteritems()):
+        for (p_name, p_items), (k_name, k_items) in zip(pdf.items(), psdf.items()):
             self.assert_eq(p_name, k_name)
             self.assert_eq(p_items, k_items)
 
diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py
index 2fbdaef865e..7d39f0523d4 100644
--- a/python/pyspark/pandas/tests/series/test_compute.py
+++ b/python/pyspark/pandas/tests/series/test_compute.py
@@ -142,11 +142,7 @@ class SeriesComputeMixin:
         expected = ps.DataFrame([[1, 2], [2, 3]], index=["x", "y"], columns=["self", "other"])
         self.assert_eq(expected, psser.compare(psser + 1).sort_index())
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43465): Enable SeriesTests.test_append for pandas 2.0.0.",
-    )
-    def test_append(self):
+    def test_concat(self):
         pser1 = pd.Series([1, 2, 3], name="0")
         pser2 = pd.Series([4, 5, 6], name="0")
         pser3 = pd.Series([4, 5, 6], index=[3, 4, 5], name="0")
         psser1 = ps.from_pandas(pser1)
         psser2 = ps.from_pandas(pser2)
         psser3 = ps.from_pandas(pser3)
 
-        self.assert_eq(psser1.append(psser2), pser1.append(pser2))
-        self.assert_eq(psser1.append(psser3), pser1.append(pser3))
+        self.assert_eq(ps.concat([psser1, psser2]), pd.concat([pser1, pser2]))
+        self.assert_eq(ps.concat([psser1, psser3]), pd.concat([pser1, pser3]))
         self.assert_eq(
-            psser1.append(psser2, ignore_index=True), pser1.append(pser2, ignore_index=True)
+            ps.concat([psser1, psser2], ignore_index=True),
+            pd.concat([pser1, pser2], ignore_index=True),
         )
 
-        psser1.append(psser3, verify_integrity=True)
-        msg = "Indices have overlapping values"
-        with self.assertRaises(ValueError, msg=msg):
-            psser1.append(psser2, verify_integrity=True)
-
     def test_shift(self):
         pser = pd.Series([10, 20, 15, 30, 45], name="x")
         psser = ps.Series(pser)
diff --git a/python/pyspark/pandas/tests/series/test_series.py b/python/pyspark/pandas/tests/series/test_series.py
index 116acb2a5b2..f7f186b6724 100644
--- a/python/pyspark/pandas/tests/series/test_series.py
+++ b/python/pyspark/pandas/tests/series/test_series.py
@@ -670,15 +670,11 @@ class SeriesTestsMixin:
         with self.assertRaisesRegex(ValueError, "The item should not be empty."):
             psser.filter(items=[(), ("three", "z")])
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43480): Enable SeriesTests.test_iteritems for pandas 2.0.0.",
-    )
-    def test_iteritems(self):
+    def test_items(self):
         pser = pd.Series(["A", "B", "C"])
         psser = ps.from_pandas(pser)
 
-        for (p_name, p_items), (k_name, k_items) in zip(pser.iteritems(), psser.iteritems()):
+        for (p_name, p_items), (k_name, k_items) in zip(pser.items(), psser.items()):
             self.assert_eq(p_name, k_name)
             self.assert_eq(p_items, k_items)
 
diff --git a/python/pyspark/pandas/tests/series/test_stat.py b/python/pyspark/pandas/tests/series/test_stat.py
index 0d6e2424921..048a4c94fd9 100644
--- a/python/pyspark/pandas/tests/series/test_stat.py
+++ b/python/pyspark/pandas/tests/series/test_stat.py
@@ -524,41 +524,6 @@ class SeriesStatMixin:
         self.assert_eq(pser // 0, psser // 0)
         self.assert_eq(pser.floordiv(np.nan), psser.floordiv(np.nan))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43468): Enable SeriesTests.test_mad for pandas 2.0.0.",
-    )
-    def test_mad(self):
-        pser = pd.Series([1, 2, 3, 4], name="Koalas")
-        psser = ps.from_pandas(pser)
-
-        self.assert_eq(pser.mad(), psser.mad())
-
-        pser = pd.Series([None, -2, 5, 10, 50, np.nan, -20], name="Koalas")
-        psser = ps.from_pandas(pser)
-
-        self.assert_eq(pser.mad(), psser.mad())
-
-        pmidx = pd.MultiIndex.from_tuples(
-            [("a", "1"), ("a", "2"), ("b", "1"), ("b", "2"), ("c", "1")]
-        )
-        pser = pd.Series([1, 2, 3, 4, 5], name="Koalas")
-        pser.index = pmidx
-        psser = ps.from_pandas(pser)
-
-        self.assert_eq(pser.mad(), psser.mad())
-
-        pmidx = pd.MultiIndex.from_tuples(
-            [("a", "1"), ("a", "2"), ("b", "1"), ("b", "2"), ("c", "1")]
-        )
-        pser = pd.Series([None, -2, 5, 50, np.nan], name="Koalas")
-        pser.index = pmidx
-        psser = ps.from_pandas(pser)
-
-        # Mark almost as True to avoid precision issue like:
-        # "21.555555555555554 != 21.555555555555557"
-        self.assert_eq(pser.mad(), psser.mad(), almost=True)
-
     @unittest.skipIf(
         LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
         "TODO(SPARK-43481): Enable SeriesTests.test_product for pandas 2.0.0.",

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org