This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 8d358cdbd57 [SPARK-42619][PS] Add `show_counts` parameter for DataFrame.info 8d358cdbd57 is described below commit 8d358cdbd57e69a16c914f329d3e4173ceb7b1ef Author: zhyhimont <zhyhim...@gmail.com> AuthorDate: Tue Sep 5 08:45:57 2023 +0800 [SPARK-42619][PS] Add `show_counts` parameter for DataFrame.info ### What changes were proposed in this pull request? Added `show_counts` parameter for DataFrame.info ### Why are the changes needed? When pandas 2.0.0 is released, we should match the behavior in pandas API on Spark. ### Does this PR introduce _any_ user-facing change? Changed the name of the parameter `null_counts` to `show_counts` of the method DataFrame.info ### How was this patch tested? UT Closes #40436 from dzhigimont/SPARK-42619_ZH. Lead-authored-by: zhyhimont <zhyhim...@gmail.com> Co-authored-by: Zhyhimont Dmitry <zhyhimon...@profitero.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../source/migration_guide/pyspark_upgrade.rst | 1 + python/pyspark/pandas/frame.py | 7 +++--- python/pyspark/pandas/indexes/base.py | 2 +- python/pyspark/pandas/tests/io/test_io.py | 28 ++++++++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index 9ec38ad2709..8b3058ba547 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -39,6 +39,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, the default value of ``regex`` parameter for ``Series.str.replace`` has been changed from ``True`` to ``False`` from pandas API on Spark. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. 
* In Spark 4.0, the resulting name from ``value_counts`` for all objects sets to ``'count'`` (or ``'proportion'`` if ``normalize=True`` was passed) from pandas API on Spark, and the index will be named after the original object. * In Spark 4.0, ``squeeze`` parameter from ``ps.read_csv`` and ``ps.read_excel`` has been removed from pandas API on Spark. +* In Spark 4.0, ``null_counts`` parameter from ``DataFrame.info`` has been removed from pandas API on Spark, use ``show_counts`` instead. Upgrading from PySpark 3.3 to 3.4 diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 41ab03a5c0b..adbef607256 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -11948,12 +11948,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})] return cast(ps.Series, ps.from_pandas(psdf._to_internal_pandas().idxmin())) - # TODO(SPARK-41619): Add `show_counts` parameter and replace with `null_counts`. def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, + show_counts: Optional[bool] = None, ) -> None: """ Print a concise summary of a DataFrame. @@ -11973,10 +11973,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})] When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. - null_counts : bool, optional + show_counts : bool, optional Whether to show the non-null counts. - .. deprecated:: 3.4.0 + .. 
versionadded:: 4.0.0 Returns ------- None This method prints a summary of a DataFrame and returns None. @@ -12066,6 +12066,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] buf=buf, max_cols=max_cols, memory_usage=False, + show_counts=show_counts, # type: ignore ) finally: del self._data diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py index 4c2ab137435..bfde7e554ba 100644 --- a/python/pyspark/pandas/indexes/base.py +++ b/python/pyspark/pandas/indexes/base.py @@ -289,7 +289,7 @@ class Index(IndexOpsMixin): if name is None: name = type(self).__name__ - return "%s: %s entries%s" % (name, total_count, index_summary) + return "%s: %s entries%s" % (name, int(total_count), index_summary) @property def size(self) -> int: diff --git a/python/pyspark/pandas/tests/io/test_io.py b/python/pyspark/pandas/tests/io/test_io.py index 4eadc6a7eb5..59812ae3d5a 100644 --- a/python/pyspark/pandas/tests/io/test_io.py +++ b/python/pyspark/pandas/tests/io/test_io.py @@ -16,6 +16,7 @@ # from distutils.version import LooseVersion import unittest +from io import StringIO import numpy as np import pandas as pd @@ -120,6 +121,33 @@ class FrameIOMixin: with ps.option_context("compute.max_rows", None): check_style() + def test_info(self): + pdf, psdf = self.df_pair + pdf_io = StringIO() + psdf_io = StringIO() + + psdf.info(buf=psdf_io) + pdf.info(buf=pdf_io, memory_usage=False) + + # Split is used to filter out the first line with the class name + # <class 'pyspark.pandas.frame.DataFrame'> vs <class 'pandas.core.frame.DataFrame'> + self.assert_eq(pdf_io.getvalue().split("\n")[1:], psdf_io.getvalue().split("\n")[1:]) + psdf_io.truncate(0) + pdf_io.truncate(0) + psdf.info(buf=psdf_io, max_cols=1) + pdf.info(buf=pdf_io, max_cols=1, memory_usage=False) + self.assert_eq(pdf_io.getvalue().split("\n")[1:], psdf_io.getvalue().split("\n")[1:]) + psdf_io.truncate(0) + pdf_io.truncate(0) + psdf.info(buf=psdf_io, show_counts=True) + pdf.info(buf=pdf_io, show_counts=True, memory_usage=False) + 
self.assert_eq(pdf_io.getvalue().split("\n")[1:], psdf_io.getvalue().split("\n")[1:]) + psdf_io.truncate(0) + pdf_io.truncate(0) + psdf.info(buf=psdf_io, show_counts=False) + pdf.info(buf=pdf_io, show_counts=False, memory_usage=False) + self.assert_eq(pdf_io.getvalue().split("\n")[1:], psdf_io.getvalue().split("\n")[1:]) + class FrameIOTests(FrameIOMixin, ComparisonTestBase, SQLTestUtils): pass --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org