This is an automated email from the ASF dual-hosted git repository. ueshin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 55971b7 [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex 55971b7 is described below commit 55971b70fe3c899d4516e4955bc7c9ebd4b4af70 Author: Xinrong Meng <xinrong.m...@databricks.com> AuthorDate: Mon Jul 26 17:12:33 2021 -0700 [SPARK-36260][PYTHON] Add set_categories to CategoricalAccessor and CategoricalIndex ### What changes were proposed in this pull request? Add set_categories to CategoricalAccessor and CategoricalIndex. ### Why are the changes needed? set_categories is supported in pandas CategoricalAccessor and CategoricalIndex. We ought to follow pandas. ### Does this PR introduce _any_ user-facing change? Yes, users will be able to use `set_categories`. ### How was this patch tested? Unit tests. Closes #33506 from xinrong-databricks/set_categories. Authored-by: Xinrong Meng <xinrong.m...@databricks.com> Signed-off-by: Takuya UESHIN <ues...@databricks.com> --- .../source/reference/pyspark.pandas/indexing.rst | 1 + .../source/reference/pyspark.pandas/series.rst | 1 + python/pyspark/pandas/categorical.py | 148 ++++++++++++++++++++- python/pyspark/pandas/indexes/category.py | 81 +++++++++++ python/pyspark/pandas/missing/indexes.py | 1 - .../pyspark/pandas/tests/indexes/test_category.py | 49 +++++++ python/pyspark/pandas/tests/test_categorical.py | 63 +++++++++ 7 files changed, 339 insertions(+), 5 deletions(-) diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index ebf332e..cf898aa 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -181,6 +181,7 @@ Categorical components CategoricalIndex.as_ordered CategoricalIndex.as_unordered CategoricalIndex.rename_categories + CategoricalIndex.set_categories .. 
_api.multiindex: diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index 95e102f..717c762 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -406,6 +406,7 @@ the ``Series.cat`` accessor. Series.cat.as_ordered Series.cat.as_unordered Series.cat.rename_categories + Series.cat.set_categories .. _api.series.plot: diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py index ce9b3ed..cae9ab1 100644 --- a/python/pyspark/pandas/categorical.py +++ b/python/pyspark/pandas/categorical.py @@ -20,6 +20,8 @@ import pandas as pd from pandas.api.types import CategoricalDtype, is_dict_like, is_list_like from pyspark.pandas.internal import InternalField +from pyspark.pandas.spark import functions as SF +from pyspark.sql import functions as F from pyspark.sql.types import StructField if TYPE_CHECKING: @@ -680,12 +682,150 @@ class CategoricalAccessor(object): def set_categories( self, - new_categories: pd.Index, - ordered: bool = None, + new_categories: Union[pd.Index, List], + ordered: Optional[bool] = None, rename: bool = False, inplace: bool = False, - ) -> "ps.Series": - raise NotImplementedError() + ) -> Optional["ps.Series"]: + """ + Set the categories to the specified new_categories. + + `new_categories` can include new categories (which will result in + unused categories) or remove old categories (which results in values + set to NaN). If `rename==True`, the categories will simply be renamed + (less or more items than in old categories will result in values set to + NaN or in unused categories respectively). + + This method can be used to perform more than one action of adding, + removing, and reordering simultaneously and is therefore faster than + performing the individual steps via the more specialised methods. 
+ + On the other hand this method does not do checks (e.g., whether the + old categories are included in the new categories on a reorder), which + can result in surprising changes, for example when using special string + dtypes, which do not consider an S1 string equal to a single char + python string. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as an ordered categorical. + If not given, do not change the ordered information. + rename : bool, default False + Whether or not the new_categories should be considered as a rename + of the old categories or as reordered categories. + inplace : bool, default False + Whether or not to reorder the categories in-place or return a copy + of this categorical with reordered categories. + + Returns + ------- + Series with reordered categories or None if inplace. + + Raises + ------ + ValueError + If new_categories does not validate as categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. 
+ + Examples + -------- + >>> s = ps.Series(list("abbccc"), dtype="category") + >>> s # doctest: +SKIP + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): ['a', 'b', 'c'] + + >>> s.cat.set_categories(['b', 'c']) # doctest: +SKIP + 0 NaN + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (2, object): ['b', 'c'] + + >>> s.cat.set_categories([1, 2, 3], rename=True) # doctest: +SKIP + 0 1 + 1 2 + 2 2 + 3 3 + 4 3 + 5 3 + dtype: category + Categories (3, int64): [1, 2, 3] + + >>> s.cat.set_categories([1, 2, 3], rename=True, ordered=True) # doctest: +SKIP + 0 1 + 1 2 + 2 2 + 3 3 + 4 3 + 5 3 + dtype: category + Categories (3, int64): [1 < 2 < 3] + """ + from pyspark.pandas.frame import DataFrame + + if not is_list_like(new_categories): + raise TypeError( + "Parameter 'new_categories' must be list-like, was '{}'".format(new_categories) + ) + + if ordered is None: + ordered = self.ordered + + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + scol = self._data.spark.column + + if rename: + new_scol = ( + F.when(scol >= len(new_categories), SF.lit(-1).cast(self._data.spark.data_type)) + .otherwise(scol) + .alias(self._data._internal.data_spark_column_names[0]) + ) + + internal = self._data._psdf._internal.with_new_spark_column( + self._data._column_label, + new_scol, + field=self._data._internal.data_fields[0].copy(dtype=new_dtype), + ) + + if inplace: + self._data._psdf._update_internal_frame(internal) + return None + else: + psser = DataFrame(internal)._psser_for(self._data._column_label) + return psser._with_new_scol( + psser.spark.column, field=psser._internal.data_fields[0] + ) + else: + psser = self._data.astype(new_dtype) + if inplace: + internal = self._data._psdf._internal.with_new_spark_column( + self._data._column_label, + psser.spark.column, + field=psser._internal.data_fields[0], + ) + self._data._psdf._update_internal_frame(internal) + return None + else: + return psser def _test() -> None: diff --git 
a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py index 51b14bc..cd95e39 100644 --- a/python/pyspark/pandas/indexes/category.py +++ b/python/pyspark/pandas/indexes/category.py @@ -517,6 +517,87 @@ class CategoricalIndex(Index): self.name ) + def set_categories( + self, + new_categories: Union[pd.Index, List], + ordered: Optional[bool] = None, + rename: bool = False, + inplace: bool = False, + ) -> Optional["CategoricalIndex"]: + """ + Set the categories to the specified new_categories. + + `new_categories` can include new categories (which will result in + unused categories) or remove old categories (which results in values + set to NaN). If `rename==True`, the categories will simply be renamed + (less or more items than in old categories will result in values set to + NaN or in unused categories respectively). + + This method can be used to perform more than one action of adding, + removing, and reordering simultaneously and is therefore faster than + performing the individual steps via the more specialised methods. + + On the other hand this method does not do checks (e.g., whether the + old categories are included in the new categories on a reorder), which + can result in surprising changes, for example when using special string + dtypes, which do not consider an S1 string equal to a single char + python string. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as an ordered categorical. + If not given, do not change the ordered information. + rename : bool, default False + Whether or not the new_categories should be considered as a rename + of the old categories or as reordered categories. + inplace : bool, default False + Whether or not to reorder the categories in-place or return a copy + of this categorical with reordered categories. 
+ + Returns + ------- + CategoricalIndex with reordered categories or None if inplace. + + Raises + ------ + ValueError + If new_categories does not validate as categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + + Examples + -------- + >>> idx = ps.CategoricalIndex(list("abbccc")) + >>> idx # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') + + >>> idx.set_categories(['b', 'c']) # doctest: +NORMALIZE_WHITESPACE + CategoricalIndex([nan, 'b', 'b', 'c', 'c', 'c'], + categories=['b', 'c'], ordered=False, dtype='category') + + >>> idx.set_categories([1, 2, 3], rename=True) + CategoricalIndex([1, 2, 2, 3, 3, 3], categories=[1, 2, 3], ordered=False, dtype='category') + + >>> idx.set_categories([1, 2, 3], rename=True, ordered=True) + CategoricalIndex([1, 2, 2, 3, 3, 3], categories=[1, 2, 3], ordered=True, dtype='category') + """ + if inplace: + raise ValueError("cannot use inplace with CategoricalIndex") + + return CategoricalIndex( + self.to_series().cat.set_categories(new_categories, ordered=ordered, rename=rename) + ).rename(self.name) + def _test() -> None: import os diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py index 0f1c316..938aea2 100644 --- a/python/pyspark/pandas/missing/indexes.py +++ b/python/pyspark/pandas/missing/indexes.py @@ -123,7 +123,6 @@ class MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex): class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex): # Functions - set_categories = _unsupported_function("set_categories", cls="CategoricalIndex") map = _unsupported_function("map", cls="CategoricalIndex") diff --git a/python/pyspark/pandas/tests/indexes/test_category.py 
b/python/pyspark/pandas/tests/indexes/test_category.py index a11f36a..8368839 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -311,6 +311,55 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils): lambda: psidx.rename_categories("x"), ) + def test_set_categories(self): + pidx = pd.CategoricalIndex(["a", "b", "c", "d"]) + psidx = ps.from_pandas(pidx) + + self.assert_eq( + pidx.set_categories(["a", "c", "b", "o"]), + psidx.set_categories(["a", "c", "b", "o"]), + ) + self.assert_eq( + pidx.set_categories(["a", "c", "b"]), + psidx.set_categories(["a", "c", "b"]), + ) + self.assert_eq( + pidx.set_categories(["a", "c", "b", "d", "e"]), + psidx.set_categories(["a", "c", "b", "d", "e"]), + ) + + self.assert_eq( + pidx.set_categories([0, 1, 3, 2], rename=True), + psidx.set_categories([0, 1, 3, 2], rename=True), + ) + self.assert_eq( + pidx.set_categories([0, 1, 3], rename=True), + psidx.set_categories([0, 1, 3], rename=True), + ) + self.assert_eq( + pidx.set_categories([0, 1, 3, 2, 4], rename=True), + psidx.set_categories([0, 1, 3, 2, 4], rename=True), + ) + + self.assert_eq( + pidx.set_categories(["a", "c", "b", "o"], ordered=True), + psidx.set_categories(["a", "c", "b", "o"], ordered=True), + ) + self.assert_eq( + pidx.set_categories(["a", "c", "b"], ordered=True), + psidx.set_categories(["a", "c", "b"], ordered=True), + ) + self.assert_eq( + pidx.set_categories(["a", "c", "b", "d", "e"], ordered=True), + psidx.set_categories(["a", "c", "b", "d", "e"], ordered=True), + ) + + self.assertRaisesRegex( + ValueError, + "cannot use inplace with CategoricalIndex", + lambda: psidx.set_categories(["a", "c", "b", "o"], inplace=True), + ) + if __name__ == "__main__": import unittest diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py index 4122efa..67cdf3c 100644 --- a/python/pyspark/pandas/tests/test_categorical.py +++ 
b/python/pyspark/pandas/tests/test_categorical.py @@ -668,6 +668,69 @@ class CategoricalTest(PandasOnSparkTestCase, TestUtils): lambda: psser.cat.rename_categories("x"), ) + def test_set_categories(self): + pdf, psdf = self.df_pair + + pser = pdf.b + psser = psdf.b + + self.assert_eq( + pser.cat.set_categories(["a", "c", "b", "o"]), + psser.cat.set_categories(["a", "c", "b", "o"]), + ) + self.assert_eq( + pser.cat.set_categories(["a", "c", "b"]), + psser.cat.set_categories(["a", "c", "b"]), + ) + self.assert_eq( + pser.cat.set_categories(["a", "c", "b", "d", "e"]), + psser.cat.set_categories(["a", "c", "b", "d", "e"]), + ) + + self.assert_eq( + pser.cat.set_categories([0, 1, 3, 2], rename=True), + psser.cat.set_categories([0, 1, 3, 2], rename=True), + ) + self.assert_eq( + pser.cat.set_categories([0, 1, 3], rename=True), + psser.cat.set_categories([0, 1, 3], rename=True), + ) + self.assert_eq( + pser.cat.set_categories([0, 1, 3, 2, 4], rename=True), + psser.cat.set_categories([0, 1, 3, 2, 4], rename=True), + ) + + self.assert_eq( + pser.cat.set_categories(["a", "c", "b", "o"], ordered=True), + psser.cat.set_categories(["a", "c", "b", "o"], ordered=True), + ) + self.assert_eq( + pser.cat.set_categories(["a", "c", "b"], ordered=True), + psser.cat.set_categories(["a", "c", "b"], ordered=True), + ) + self.assert_eq( + pser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True), + psser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True), + ) + + self.assert_eq( + pser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True), + psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True), + ) + self.assert_eq(pser, psser) + self.assert_eq(pdf, psdf) + + pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False), + psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False), + self.assert_eq(pser, psser) + self.assert_eq(pdf, psdf) + + self.assertRaisesRegex( + TypeError, + "Parameter 'new_categories' must be list-like, 
was", + lambda: psser.cat.set_categories(None), + ) + if __name__ == "__main__": import unittest --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org