This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 8cbc741320d [SPARK-45267][PS] Change the default value for numeric_only 8cbc741320d is described below commit 8cbc741320dac60ce814ce0a9b3e72239248efb8 Author: Haejoon Lee <haejoon....@databricks.com> AuthorDate: Wed Sep 27 14:04:54 2023 +0800 [SPARK-45267][PS] Change the default value for numeric_only ### What changes were proposed in this pull request? This PR proposes to change the default value for `numeric_only` with related functions. ### Why are the changes needed? Many functions that support the `numeric_only` parameter have changed their default value from `True` to `False` since Pandas 2.0.0, so we should follow their behavior. See https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html for more details. ### Does this PR introduce _any_ user-facing change? Yes, the default value for `numeric_only` is changed to `False`. ### How was this patch tested? Updated the related UTs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43043 from itholic/numeric_only. 
Authored-by: Haejoon Lee <haejoon....@databricks.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/pandas/frame.py | 38 +++++++-------- python/pyspark/pandas/groupby.py | 54 +++++++--------------- python/pyspark/pandas/series.py | 13 ++++-- .../pandas/tests/computation/test_compute.py | 8 +++- 4 files changed, 47 insertions(+), 66 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 08450c0be87..faa595f80e3 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -747,7 +747,7 @@ class DataFrame(Frame, Generic[T]): sfun: Callable[["Series"], PySparkColumn], name: str, axis: Optional[Axis] = None, - numeric_only: bool = True, + numeric_only: bool = False, skipna: bool = True, **kwargs: Any, ) -> "Series": @@ -762,10 +762,8 @@ class DataFrame(Frame, Generic[T]): axis: used only for sanity check because the series only supports index axis. name : original pandas API name. axis : axis to apply. 0 or 1, or 'index' or 'columns. - numeric_only : bool, default True - Include only float, int, boolean columns. False is not supported. This parameter - is mainly for pandas compatibility. Only 'DataFrame.count' uses this parameter - currently. + numeric_only : bool, default False + Include only float, int, boolean columns. skipna : bool, default True Exclude NA/null values when computing the result. """ @@ -11150,7 +11148,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] # TODO: add axis, pct, na_option parameter def rank( - self, method: str = "average", ascending: bool = True, numeric_only: Optional[bool] = None + self, method: str = "average", ascending: bool = True, numeric_only: bool = False ) -> "DataFrame": """ Compute numerical data ranks (1 through n) along axis. 
Equal values are @@ -11171,9 +11169,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})] * dense: like 'min', but rank always increases by 1 between groups ascending : boolean, default True False for ranks by high (1) to low (N) - numeric_only : bool, optional + numeric_only : bool, default False For DataFrame objects, rank only numeric columns if set to True. + .. versionchanged:: 4.0.0 + The default value of ``numeric_only`` is now ``False``. + + Returns ------- ranks : same type as caller @@ -11238,11 +11240,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})] 2 2.5 3 4.0 """ - warnings.warn( - "Default value of `numeric_only` will be changed to `False` " - "instead of `None` in 4.0.0.", - FutureWarning, - ) if numeric_only: numeric_col_names = [] for label in self._internal.column_labels: @@ -12206,7 +12203,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] self, q: Union[float, Iterable[float]] = 0.5, axis: Axis = 0, - numeric_only: bool = True, + numeric_only: bool = False, accuracy: int = 10000, ) -> DataFrameOrSeries: """ @@ -12222,9 +12219,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})] 0 <= q <= 1, the quantile(s) to compute. axis : int or str, default 0 or 'index' Can only be set to 0 now. - numeric_only : bool, default True - If False, the quantile of datetime and time delta data will be computed as well. - Can only be set to True now. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionchanged:: 4.0.0 + The default value of ``numeric_only`` is now ``False``. + accuracy : int, optional Default accuracy of approximation. Larger value means better accuracy. The relative error can be deduced by 1.0 / accuracy. 
@@ -12821,12 +12821,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})] if numeric_only is None and axis == 0: numeric_only = True - warnings.warn( - "Default value of `numeric_only` will be changed to `False` " - "instead of `True` in 4.0.0.", - FutureWarning, - ) - mode_scols: List[PySparkColumn] = [] mode_col_names: List[str] = [] mode_labels: List[Label] = [] diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 7bd64376152..3d51fabd4b2 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -611,18 +611,17 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): min_count=min_count, ) - def mean(self, numeric_only: Optional[bool] = True) -> FrameLike: + def mean(self, numeric_only: Optional[bool] = False) -> FrameLike: """ Compute mean of groups, excluding missing values. Parameters ---------- - numeric_only : bool, default True - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. False is not supported. - This parameter is mainly for pandas compatibility. + numeric_only : bool, default False + Include only float, int, boolean columns. .. versionadded:: 3.4.0 + .. versionchanged:: 4.0.0 Returns ------- @@ -842,7 +841,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): bool_to_numeric=True, ) - def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameLike: + def sum(self, numeric_only: bool = False, min_count: int = 0) -> FrameLike: """ Compute sum of group values @@ -851,11 +850,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): Parameters ---------- numeric_only : bool, default False - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. - It takes no effect since only numeric columns can be support here. + Include only float, int, boolean columns. .. versionadded:: 3.4.0 + .. 
versionchanged:: 4.0.0 min_count : int, default 0 The required number of valid values to perform the operation. If fewer than min_count non-NA values are present the result will be NA. @@ -897,11 +895,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): pyspark.pandas.Series.groupby pyspark.pandas.DataFrame.groupby """ - warnings.warn( - "Default value of `numeric_only` will be changed to `False` " - "instead of `True` in 4.0.0.", - FutureWarning, - ) if numeric_only is not None and not isinstance(numeric_only, bool): raise TypeError("numeric_only must be None or bool") if not isinstance(min_count, int): @@ -927,7 +920,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): ) # TODO: sync the doc. - def var(self, ddof: int = 1, numeric_only: Optional[bool] = True) -> FrameLike: + def var(self, ddof: int = 1, numeric_only: bool = False) -> FrameLike: """ Compute variance of groups, excluding missing values. @@ -942,10 +935,8 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): .. versionchanged:: 3.4.0 Supported including arbitary integers. - numeric_only : bool, default True - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. False is not supported. - This parameter is mainly for pandas compatibility. + numeric_only : bool, default False + Include only float, int, boolean columns. .. versionadded:: 4.0.0 @@ -1179,7 +1170,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): return self._prepare_return(DataFrame(internal), agg_column_names=agg_column_names) - def prod(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameLike: + def prod(self, numeric_only: bool = False, min_count: int = 0) -> FrameLike: """ Compute prod of groups. @@ -1188,8 +1179,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): Parameters ---------- numeric_only : bool, default False - Include only float, int, boolean columns. 
If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. + + .. versionchanged:: 4.0.0 min_count : int, default 0 The required number of valid values to perform the operation. @@ -1235,12 +1227,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): if not isinstance(min_count, int): raise TypeError("min_count must be integer") - warnings.warn( - "Default value of `numeric_only` will be changed to `False` " - "instead of `True` in 4.0.0.", - FutureWarning, - ) - self._validate_agg_columns(numeric_only=numeric_only, function_name="prod") return self._reduce_for_stat_function( @@ -3441,7 +3427,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): return self._handle_output(DataFrame(internal)) - def median(self, numeric_only: Optional[bool] = True, accuracy: int = 10000) -> FrameLike: + def median(self, numeric_only: bool = False, accuracy: int = 10000) -> FrameLike: """ Compute median of groups, excluding missing values. @@ -3454,10 +3440,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): Parameters ---------- numeric_only : bool, default False - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. + Include only float, int, boolean columns. .. versionadded:: 3.4.0 + .. 
versionchanged:: 4.0.0 Returns ------- @@ -3509,12 +3495,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): self._validate_agg_columns(numeric_only=numeric_only, function_name="median") - warnings.warn( - "Default value of `numeric_only` will be changed to `False` " - "instead of `True` in 4.0.0.", - FutureWarning, - ) - def stat_function(col: Column) -> Column: return F.percentile_approx(col, 0.5, accuracy) diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index f1b785e1b41..e96e5c3b3dc 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -4054,7 +4054,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): # TODO: add axis, pct, na_option parameter def rank( - self, method: str = "average", ascending: bool = True, numeric_only: Optional[bool] = None + self, method: str = "average", ascending: bool = True, numeric_only: bool = False ) -> "Series": """ Compute numerical data ranks (1 through n) along axis. Equal values are @@ -4075,9 +4075,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): * dense: like 'min', but rank always increases by 1 between groups ascending : boolean, default True False for ranks by high (1) to low (N) - numeric_only : bool, optional - If set to True, rank numeric Series, or raise TypeError for non-numeric Series. - False is not supported. This parameter is mainly for pandas compatibility. + numeric_only : bool, default False + For DataFrame objects, rank only numeric columns if set to True. + + .. versionchanged:: 4.0.0 + The default value of ``numeric_only`` is now ``False``. 
+ Returns ------- @@ -7033,7 +7036,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): sfun: Callable[["Series"], PySparkColumn], name: str_type, axis: Optional[Axis] = None, - numeric_only: bool = True, + numeric_only: bool = False, skipna: bool = True, **kwargs: Any, ) -> Scalar: diff --git a/python/pyspark/pandas/tests/computation/test_compute.py b/python/pyspark/pandas/tests/computation/test_compute.py index dc145601fca..7f17a3bc6f0 100644 --- a/python/pyspark/pandas/tests/computation/test_compute.py +++ b/python/pyspark/pandas/tests/computation/test_compute.py @@ -352,9 +352,10 @@ class FrameComputeMixin: pdf = pd.DataFrame({"x": ["a", "b", "c"]}) psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5, numeric_only=True)) + self.assert_eq(psdf.quantile(0.5, numeric_only=True), pdf.quantile(0.5, numeric_only=True)) self.assert_eq( - psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75], numeric_only=True) + psdf.quantile([0.25, 0.5, 0.75], numeric_only=True), + pdf.quantile([0.25, 0.5, 0.75], numeric_only=True), ) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): @@ -432,6 +433,9 @@ class FrameComputeMixin: class FrameComputeTests(FrameComputeMixin, ComparisonTestBase, SQLTestUtils): + def test_quantile(self): + super().test_quantile() + pass --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org