This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e549c6fd22a [SPARK-38890][PYTHON] Implement `ignore_index` of `DataFrame.sort_index` e549c6fd22a is described below commit e549c6fd22ac0d5a6df0d817212637c532b9a681 Author: Xinrong Meng <xinrong.m...@databricks.com> AuthorDate: Thu Apr 14 09:34:13 2022 +0900 [SPARK-38890][PYTHON] Implement `ignore_index` of `DataFrame.sort_index` ### What changes were proposed in this pull request? Implement `ignore_index` of `DataFrame.sort_index`. ### Why are the changes needed? To reach parity with pandas API. ### Does this PR introduce _any_ user-facing change? Yes. `ignore_index` of `DataFrame.sort_index` is supported as below: ```py >>> df = ps.DataFrame({'A': [2, 1, np.nan]}, index=['b', 'a', np.nan]) >>> df A b 2.0 a 1.0 NaN NaN >>> df.sort_index(ignore_index=True) A 0 1.0 1 2.0 2 NaN ``` ### How was this patch tested? Unit tests. Closes #36184 from xinrong-databricks/frame.sort_index.ignore_index. Authored-by: Xinrong Meng <xinrong.m...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/pandas/frame.py | 22 +++++++++++++++++++++- python/pyspark/pandas/tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 36e992fef93..a78aaa66f08 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -7014,6 +7014,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] inplace: bool = False, kind: str = None, na_position: str = "last", + ignore_index: bool = False, ) -> Optional["DataFrame"]: """ Sort object by labels (along an axis) @@ -7033,6 +7034,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})] na_position : {‘first’, ‘last’}, default ‘last’ first puts NaNs at the beginning, last puts NaNs at the end. Not implemented for MultiIndex. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 3.4.0 Returns ------- @@ -7060,6 +7065,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})] a 1.0 b 2.0 + >>> df.sort_index(ignore_index=True) + A + 0 1.0 + 1 2.0 + 2 NaN + >>> df.sort_index(inplace=True) >>> df A @@ -7091,6 +7102,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})] b 0 1 2 a 1 2 1 b 1 0 3 + + >>> df.sort_index(ignore_index=True) + A B + 0 3 0 + 1 2 1 + 2 1 2 + 3 0 3 """ inplace = validate_bool_kwarg(inplace, "inplace") axis = validate_axis(axis) @@ -7112,10 +7130,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})] psdf = self._sort(by=by, ascending=ascending, na_position=na_position) if inplace: + if ignore_index: + psdf.reset_index(drop=True, inplace=inplace) self._update_internal_frame(psdf._internal) return None else: - return psdf + return psdf.reset_index(drop=True) if ignore_index else psdf def swaplevel( self, i: Union[int, Name] = -2, j: Union[int, Name] = -1, axis: Axis = 0 diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index fa32b38d3c9..b99a9a2e807 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -1678,6 +1678,8 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): # Assert default behavior without parameters self.assert_eq(psdf.sort_index(), pdf.sort_index()) + # Assert ignoring index + self.assert_eq(psdf.sort_index(ignore_index=True), pdf.sort_index(ignore_index=True)) # Assert sorting descending self.assert_eq(psdf.sort_index(ascending=False), pdf.sort_index(ascending=False)) # Assert sorting NA indices first @@ -1694,6 +1696,14 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): self.assertEqual(psdf.sort_index(inplace=True), pdf.sort_index(inplace=True)) self.assert_eq(psdf, pdf) self.assert_eq(psserA, pserA) + pserA = pdf.A + psserA = psdf.A + self.assertEqual( + psdf.sort_index(inplace=True, ascending=False, ignore_index=True), + pdf.sort_index(inplace=True, ascending=False, ignore_index=True), + ) + self.assert_eq(psdf, pdf) + self.assert_eq(psserA, pserA) # Assert multi-indices pdf = pd.DataFrame( @@ -1703,6 +1713,8 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils): self.assert_eq(psdf.sort_index(), pdf.sort_index()) self.assert_eq(psdf.sort_index(level=[1, 0]), pdf.sort_index(level=[1, 0])) self.assert_eq(psdf.reset_index().sort_index(), pdf.reset_index().sort_index()) + # Assert ignoring index + self.assert_eq(psdf.sort_index(ignore_index=True), pdf.sort_index(ignore_index=True)) # Assert with multi-index columns columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")]) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org