This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 5280d492ad6  [SPARK-45550][PS] Remove deprecated APIs from Pandas API on Spark
5280d492ad6 is described below

commit 5280d492ad636782ca910a3c0bf0f0cb5bce2223
Author: Haejoon Lee <haejoon....@databricks.com>
AuthorDate: Tue Oct 17 19:40:12 2023 +0800

    [SPARK-45550][PS] Remove deprecated APIs from Pandas API on Spark

    ### What changes were proposed in this pull request?

    This PR proposes to remove deprecated APIs from Pandas API on Spark:
    - Remove `DataFrame.to_spark_io`; use `DataFrame.spark.to_spark_io` instead.
    - Remove `(Index|Series).is_monotonic`; use `(Index|Series).is_monotonic_increasing` instead.

    ### Why are the changes needed?

    To clean up the API surface.

    ### Does this PR introduce _any_ user-facing change?

    Yes. The removed APIs are no longer available from Spark 4.x.

    ### How was this patch tested?

    The existing CI should pass.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #43384 from itholic/SPARK-45550.

    Authored-by: Haejoon Lee <haejoon....@databricks.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
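For anyone migrating, here is a minimal sketch of the two replacements (illustrative only: the frame contents and the `/tmp/example` path are made up, and a running pandas-on-Spark session is assumed):

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"id": [1, 2, 3]})

# Removed in Spark 4.0: psdf.to_spark_io(...)
# Write through the `spark` accessor instead.
psdf.spark.to_spark_io("/tmp/example", format="parquet", mode="overwrite")

# Removed in Spark 4.0: psdf["id"].is_monotonic
# Use is_monotonic_increasing (or is_monotonic_decreasing).
assert psdf["id"].is_monotonic_increasing
```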
---
 .../source/migration_guide/pyspark_upgrade.rst     |  2 +
 .../docs/source/reference/pyspark.pandas/frame.rst |  1 -
 .../source/reference/pyspark.pandas/indexing.rst   |  1 -
 python/docs/source/reference/pyspark.pandas/io.rst |  2 +-
 .../source/reference/pyspark.pandas/series.rst     |  1 -
 python/pyspark/pandas/base.py                      | 83 ----------------------
 python/pyspark/pandas/frame.py                     | 23 ------
 python/pyspark/pandas/generic.py                   |  1 -
 python/pyspark/pandas/indexing.py                  |  4 +-
 python/pyspark/pandas/namespace.py                 |  7 +-
 python/pyspark/pandas/spark/accessors.py           |  7 +-
 .../pandas/tests/test_dataframe_spark_io.py        |  4 +-
 12 files changed, 13 insertions(+), 123 deletions(-)

diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst
index d081275dc83..933fa936f70 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -51,6 +51,8 @@ Upgrading from PySpark 3.5 to 4.0
 * In Spark 4.0, ``Index.asi8`` has been removed from pandas API on Spark, use ``Index.astype`` instead.
 * In Spark 4.0, ``Index.is_type_compatible`` has been removed from pandas API on Spark, use ``Index.isin`` instead.
 * In Spark 4.0, ``col_space`` parameter from ``DataFrame.to_latex`` and ``Series.to_latex`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``DataFrame.to_spark_io`` has been removed from pandas API on Spark, use ``DataFrame.spark.to_spark_io`` instead.
+* In Spark 4.0, ``Series.is_monotonic`` and ``Index.is_monotonic`` have been removed from pandas API on Spark, use ``Series.is_monotonic_increasing`` or ``Index.is_monotonic_increasing`` instead respectively.

 Upgrading from PySpark 3.3 to 3.4

diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst
index a22078f86e2..911999b56be 100644
--- a/python/docs/source/reference/pyspark.pandas/frame.rst
+++ b/python/docs/source/reference/pyspark.pandas/frame.rst
@@ -276,7 +276,6 @@ Serialization / IO / Conversion
    DataFrame.to_table
    DataFrame.to_delta
    DataFrame.to_parquet
-   DataFrame.to_spark_io
    DataFrame.to_csv
    DataFrame.to_orc
    DataFrame.to_pandas
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index d6be57ee9c8..08f5e224e06 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -36,7 +36,6 @@ Properties
 .. autosummary::
    :toctree: api/

-   Index.is_monotonic
    Index.is_monotonic_increasing
    Index.is_monotonic_decreasing
    Index.is_unique
diff --git a/python/docs/source/reference/pyspark.pandas/io.rst b/python/docs/source/reference/pyspark.pandas/io.rst
index b39a4e8778a..118dd49a4ad 100644
--- a/python/docs/source/reference/pyspark.pandas/io.rst
+++ b/python/docs/source/reference/pyspark.pandas/io.rst
@@ -69,7 +69,7 @@ Generic Spark I/O
    :toctree: api/

    read_spark_io
-   DataFrame.to_spark_io
+   DataFrame.spark.to_spark_io

 Flat File / CSV
 ---------------
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 7b658d45d4b..eb4a499c054 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -170,7 +170,6 @@ Computations / Descriptive Stats
    Series.value_counts
    Series.round
    Series.diff
-   Series.is_monotonic
    Series.is_monotonic_increasing
    Series.is_monotonic_decreasing
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index 771d79dc6e0..6921b7eabaa 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -532,89 +532,6 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         """
         return self.isnull().any()

-    @property
-    def is_monotonic(self) -> bool:
-        """
-        Return boolean if values in the object are monotonically increasing.
-
-        .. note:: the current implementation of is_monotonic requires to shuffle
-            and aggregate multiple times to check the order locally and globally,
-            which is potentially expensive. In case of multi-index, all data is
-            transferred to a single node which can easily cause out-of-memory errors.
-
-        .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`
-            for multi-index if you're using pandas-on-Spark < 1.7.0 with PySpark 3.1.1.
-
-        .. deprecated:: 3.4.0
-
-        Returns
-        -------
-        is_monotonic : bool
-
-        Examples
-        --------
-        >>> ser = ps.Series(['1/1/2018', '3/1/2018', '4/1/2018'])
-        >>> ser.is_monotonic
-        True
-
-        >>> df = ps.DataFrame({'dates': [None, '1/1/2018', '2/1/2018', '3/1/2018']})
-        >>> df.dates.is_monotonic
-        False
-
-        >>> df.index.is_monotonic
-        True
-
-        >>> ser = ps.Series([1])
-        >>> ser.is_monotonic
-        True
-
-        >>> ser = ps.Series([])
-        >>> ser.is_monotonic
-        True
-
-        >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic
-        True
-
-        >>> ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])
-        >>> ser.is_monotonic
-        False
-
-        >>> ser.index.is_monotonic
-        True
-
-        Support for MultiIndex
-
-        >>> midx = ps.MultiIndex.from_tuples(
-        ...     [('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd'), ('z', 'e')])
-        >>> midx  # doctest: +SKIP
-        MultiIndex([('x', 'a'),
-                    ('x', 'b'),
-                    ('y', 'c'),
-                    ('y', 'd'),
-                    ('z', 'e')],
-                   )
-        >>> midx.is_monotonic
-        True
-
-        >>> midx = ps.MultiIndex.from_tuples(
-        ...     [('z', 'a'), ('z', 'b'), ('y', 'c'), ('y', 'd'), ('x', 'e')])
-        >>> midx  # doctest: +SKIP
-        MultiIndex([('z', 'a'),
-                    ('z', 'b'),
-                    ('y', 'c'),
-                    ('y', 'd'),
-                    ('x', 'e')],
-                   )
-        >>> midx.is_monotonic
-        False
-        """
-        warnings.warn(
-            "is_monotonic is deprecated and will be removed in a future version. "
-            "Use is_monotonic_increasing instead.",
-            FutureWarning,
-        )
-        return self._is_monotonic("increasing")
-
     @property
     def is_monotonic_increasing(self) -> bool:
         """
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 8f3555685ff..7d93af0485f 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -5190,7 +5190,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         read_delta
         DataFrame.to_parquet
         DataFrame.to_table
-        DataFrame.to_spark_io

         Examples
         --------
@@ -5280,7 +5279,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         read_parquet
         DataFrame.to_delta
         DataFrame.to_table
-        DataFrame.to_spark_io

         Examples
         --------
@@ -5363,7 +5361,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         DataFrame.to_delta
         DataFrame.to_parquet
         DataFrame.to_table
-        DataFrame.to_spark_io

         Examples
         --------
@@ -5408,26 +5405,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
             **options,
         )

-    def to_spark_io(
-        self,
-        path: Optional[str] = None,
-        format: Optional[str] = None,
-        mode: str = "overwrite",
-        partition_cols: Optional[Union[str, List[str]]] = None,
-        index_col: Optional[Union[str, List[str]]] = None,
-        **options: "OptionalPrimitiveType",
-    ) -> None:
-        """An alias for :func:`DataFrame.spark.to_spark_io`.
-        See :meth:`pyspark.pandas.spark.accessors.SparkFrameMethods.to_spark_io`.
-
-        .. deprecated:: 3.2.0
-            Use :func:`DataFrame.spark.to_spark_io` instead.
- """ - warnings.warn("Deprecated in 3.2, Use DataFrame.spark.to_spark_io instead.", FutureWarning) - return self.spark.to_spark_io(path, format, mode, partition_cols, index_col, **options) - - to_spark_io.__doc__ = SparkFrameMethods.to_spark_io.__doc__ - def to_spark(self, index_col: Optional[Union[str, List[str]]] = None) -> PySparkDataFrame: if index_col is None: log_advice( diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 300fd73e43f..c6f1b9ccbb7 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -728,7 +728,6 @@ class Frame(object, metaclass=ABCMeta): DataFrame.to_delta DataFrame.to_table DataFrame.to_parquet - DataFrame.to_spark_io Examples -------- diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index de5baa3fae1..24b7c53eea9 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -1106,7 +1106,9 @@ class LocIndexer(LocIndexerLike): return None, None, None elif ( depth > self._internal.index_level - or not index.droplevel(list(range(self._internal.index_level)[depth:])).is_monotonic + or not index.droplevel( + list(range(self._internal.index_level)[depth:]) + ).is_monotonic_increasing ): raise KeyError( "Key length ({}) was greater than MultiIndex sort depth".format(depth) diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index e8898ab4893..9b64300e948 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -702,20 +702,19 @@ def read_spark_io( See Also -------- - DataFrame.to_spark_io DataFrame.read_table DataFrame.read_delta DataFrame.read_parquet Examples -------- - >>> ps.range(1).to_spark_io('%s/read_spark_io/data.parquet' % path) + >>> ps.range(1).spark.to_spark_io('%s/read_spark_io/data.parquet' % path) >>> ps.read_spark_io( ... '%s/read_spark_io/data.parquet' % path, format='parquet', schema='id long') id 0 0 - >>> ps.range(10, 15, num_partitions=1).to_spark_io('%s/read_spark_io/data.json' % path, + >>> ps.range(10, 15, num_partitions=1).spark.to_spark_io('%s/read_spark_io/data.json' % path, ... format='json', lineSep='__') >>> ps.read_spark_io( ... '%s/read_spark_io/data.json' % path, format='json', schema='id long', lineSep='__') @@ -728,7 +727,7 @@ def read_spark_io( You can preserve the index in the roundtrip as below. - >>> ps.range(10, 15, num_partitions=1).to_spark_io('%s/read_spark_io/data.orc' % path, + >>> ps.range(10, 15, num_partitions=1).spark.to_spark_io('%s/read_spark_io/data.orc' % path, ... format='orc', index_col="index") >>> ps.read_spark_io( ... path=r'%s/read_spark_io/data.orc' % path, format="orc", index_col="index") diff --git a/python/pyspark/pandas/spark/accessors.py b/python/pyspark/pandas/spark/accessors.py index 7fb14e6ed75..1ac12bb59dd 100644 --- a/python/pyspark/pandas/spark/accessors.py +++ b/python/pyspark/pandas/spark/accessors.py @@ -727,7 +727,6 @@ class SparkFrameMethods: See Also -------- read_table - DataFrame.to_spark_io DataFrame.spark.to_spark_io DataFrame.to_parquet @@ -761,8 +760,7 @@ class SparkFrameMethods: index_col: Optional[Union[str, List[str]]] = None, **options: "OptionalPrimitiveType", ) -> None: - """Write the DataFrame out to a Spark data source. :meth:`DataFrame.spark.to_spark_io` - is an alias of :meth:`DataFrame.to_spark_io`. + """Write the DataFrame out to a Spark data source. 
         Parameters
         ----------
@@ -801,7 +799,6 @@ class SparkFrameMethods:
         DataFrame.to_delta
         DataFrame.to_parquet
         DataFrame.to_table
-        DataFrame.to_spark_io
         DataFrame.spark.to_spark_io

         Examples
         --------
@@ -816,7 +813,7 @@ class SparkFrameMethods:
         1  2012-02-29 12:00:00  US      2
         2  2012-03-31 12:00:00  JP      3

-        >>> df.to_spark_io(path='%s/to_spark_io/foo.json' % path, format='json')
+        >>> df.spark.to_spark_io(path='%s/to_spark_io/foo.json' % path, format='json')
         """
         if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
             options = options.get("options")  # type: ignore[assignment]
diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py
index 41be0eee4b8..628190d4616 100644
--- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py
+++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py
@@ -218,7 +218,7 @@ class DataFrameSparkIOTestsMixin:
             expected = ps.DataFrame(pdf)

             # Write out partitioned by one column
-            expected.to_spark_io(tmp, format="json", mode="overwrite", partition_cols="i32")
+            expected.spark.to_spark_io(tmp, format="json", mode="overwrite", partition_cols="i32")
             # Reset column order, as once the data is written out, Spark rearranges partition
             # columns to appear first.
             actual = ps.read_spark_io(tmp, format="json")
@@ -230,7 +230,7 @@ class DataFrameSparkIOTestsMixin:
             )

             # Write out partitioned by two columns
-            expected.to_spark_io(
+            expected.spark.to_spark_io(
                 tmp, format="json", mode="overwrite", partition_cols=["i32", "bhello"]
             )
             # Reset column order, as once the data is written out, Spark rearranges partition

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org