This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 27abceb  [SPARK-37327][PYTHON] Silence the to_pandas() advice log for internal usage
27abceb is described below

commit 27abcebf8da69f88c66ef43d651fc3db6bf8c2cb
Author: itholic <haejoon....@databricks.com>
AuthorDate: Mon Nov 15 17:09:07 2021 +0900

    [SPARK-37327][PYTHON] Silence the to_pandas() advice log for internal usage

    ### What changes were proposed in this pull request?

    This PR proposes adding an internal method `_to_pandas()` that performs the same
    conversion as `to_pandas()` but silences the advice log when the call is made for
    internal purposes.

    ### Why are the changes needed?

    `to_pandas()` is used in many places when implementing APIs, but users should not
    see a warning for every internal call. Showing the advice log only when users call
    `to_pandas()` explicitly is enough.

    ### Does this PR introduce _any_ user-facing change?

    Yes, it reduces the advice logs emitted for `to_pandas()`.

    ### How was this patch tested?

    Manually checked the result.

    Closes #34598 from itholic/SPARK-37327.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/pandas/categorical.py   |  2 +-
 python/pyspark/pandas/frame.py         |  8 +++++++-
 python/pyspark/pandas/generic.py       | 12 ++++++++----
 python/pyspark/pandas/indexes/base.py  |  6 ++++++
 python/pyspark/pandas/indexes/multi.py |  6 ++++++
 python/pyspark/pandas/indexing.py      |  2 +-
 python/pyspark/pandas/plot/core.py     |  4 ++--
 python/pyspark/pandas/series.py        |  6 ++++++
 8 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index ad2647b..e59048b 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -528,7 +528,7 @@ class CategoricalAccessor(object):
                 FutureWarning,
             )

-        categories = set(self._data.drop_duplicates().to_pandas())
+        categories = set(self._data.drop_duplicates()._to_pandas())
         removals = [cat for cat in self.categories if cat not in categories]
         return self.remove_categories(removals=removals, inplace=inplace)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index d42d07b..6d89eb5 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -4848,6 +4848,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         )
         return self._internal.to_pandas_frame.copy()

+    def _to_pandas(self) -> pd.DataFrame:
+        """
+        Same as `to_pandas()`, without issuing the advice log for internal usage.
+        """
+        return self._internal.to_pandas_frame.copy()
+
     def assign(self, **kwargs: Any) -> "DataFrame":
         """
         Assign new columns to a DataFrame.
@@ -10871,7 +10877,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
             object.__setattr__(self, "_data", self)
             count_func = self.count
             self.count = (  # type: ignore[assignment]
-                lambda: count_func().to_pandas()  # type: ignore[assignment, misc, union-attr]
+                lambda: count_func()._to_pandas()  # type: ignore[assignment, misc, union-attr]
             )
             return pd.DataFrame.info(
                 self,
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index a4e99d6..bb83ddf 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -130,6 +130,10 @@ class Frame(object, metaclass=ABCMeta):
     def to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
         pass

+    @abstractmethod
+    def _to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
+        pass
+
     @property
     @abstractmethod
     def index(self) -> "Index":
@@ -578,7 +582,7 @@ class Frame(object, metaclass=ABCMeta):
             "`to_numpy` loads all data into the driver's memory. "
             "It should only be used if the resulting NumPy ndarray is expected to be small."
         )
-        return self.to_pandas().values
+        return self._to_pandas().values

     @property
     def values(self) -> np.ndarray:
@@ -796,7 +800,7 @@ class Frame(object, metaclass=ABCMeta):
             self, ps.Series
         ):
             # 0.23 seems not having 'columns' parameter in Series' to_csv.
-            return psdf_or_ser.to_pandas().to_csv(
+            return psdf_or_ser._to_pandas().to_csv(
                 None,
                 sep=sep,
                 na_rep=na_rep,
@@ -805,7 +809,7 @@ class Frame(object, metaclass=ABCMeta):
                 index=False,
             )
         else:
-            return psdf_or_ser.to_pandas().to_csv(
+            return psdf_or_ser._to_pandas().to_csv(
                 None,
                 sep=sep,
                 na_rep=na_rep,
@@ -1001,7 +1005,7 @@ class Frame(object, metaclass=ABCMeta):
         if path is None:
             # If path is none, just collect and use pandas's to_json.
             psdf_or_ser = self
-            pdf = psdf_or_ser.to_pandas()
+            pdf = psdf_or_ser._to_pandas()
             if isinstance(self, ps.Series):
                 pdf = pdf.to_frame()
             # To make the format consistent and readable by `read_json`, convert it to pandas' and
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index ee79c08..2d84bc3 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -495,6 +495,12 @@ class Index(IndexOpsMixin):
         )
         return self._to_internal_pandas().copy()

+    def _to_pandas(self) -> pd.Index:
+        """
+        Same as `to_pandas()`, without issuing the advice log for internal usage.
+        """
+        return self._to_internal_pandas().copy()
+
     def to_numpy(self, dtype: Optional[Union[str, Dtype]] = None, copy: bool = False) -> np.ndarray:
         """
         A NumPy ndarray representing the values in this Index or MultiIndex.
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index 10dbd22..d67646b 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -699,6 +699,12 @@ class MultiIndex(Index):
         # series-like operations. In that case, it creates new Index object instead of MultiIndex.
         return super().to_pandas()

+    def _to_pandas(self) -> pd.MultiIndex:
+        """
+        Same as `to_pandas()`, without issuing the advice log for internal usage.
+ """ + return super()._to_pandas() + def nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> int: raise NotImplementedError("nunique is not defined for MultiIndex") diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index dae5d84..b561e03 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -560,7 +560,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): psdf_or_psser = psdf if remaining_index is not None and remaining_index == 0: - pdf_or_pser = psdf_or_psser.head(2).to_pandas() + pdf_or_pser = psdf_or_psser.head(2)._to_pandas() length = len(pdf_or_pser) if length == 0: raise KeyError(name_like_string(key)) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index 89b8320..8c40a46 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -39,7 +39,7 @@ class TopNPlotBase: # Simply use the first 1k elements and make it into a pandas dataframe # For categorical variables, it is likely called from df.x.value_counts().plot.xxx(). if isinstance(data, (Series, DataFrame)): - data = data.head(max_rows + 1).to_pandas() + data = data.head(max_rows + 1)._to_pandas() else: raise TypeError("Only DataFrame and Series are supported for plotting.") @@ -79,7 +79,7 @@ class SampledPlotBase: if isinstance(data, Series): data = data.to_frame() sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction) - return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas() + return DataFrame(data._internal.with_new_sdf(sampled))._to_pandas() else: raise TypeError("Only DataFrame and Series are supported for plotting.") diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index 00fd3a2..18eaad8 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -1587,6 +1587,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ) return self._to_internal_pandas().copy() + def _to_pandas(self) -> pd.Series: + """ + Same as `to_pandas()`, without issueing the advice log for internal usage. + """ + return self._to_internal_pandas().copy() + def to_list(self) -> List: """ Return a list of the values. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org