This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 27abceb  [SPARK-37327][PYTHON] Silence the to_pandas() advice log for internal usage
27abceb is described below

commit 27abcebf8da69f88c66ef43d651fc3db6bf8c2cb
Author: itholic <haejoon....@databricks.com>
AuthorDate: Mon Nov 15 17:09:07 2021 +0900

    [SPARK-37327][PYTHON] Silence the to_pandas() advice log for internal usage

    ### What changes were proposed in this pull request?

    This PR proposes adding an internal method `_to_pandas()` that performs the same
    conversion as `to_pandas()` but silences the advice log when the call is made for
    internal purposes.

    ### Why are the changes needed?

    `to_pandas()` is used in many places when implementing APIs, but users should not
    see a warning for every internal call. Showing the advice log only when users call
    `to_pandas()` explicitly is enough.

    ### Does this PR introduce _any_ user-facing change?

    Yes, it reduces the advice logs emitted for `to_pandas()`.

    ### How was this patch tested?

    Manually checked the result.

    Closes #34598 from itholic/SPARK-37327.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/pandas/categorical.py   |  2 +-
 python/pyspark/pandas/frame.py         |  8 +++++++-
 python/pyspark/pandas/generic.py       | 12 ++++++++----
 python/pyspark/pandas/indexes/base.py  |  6 ++++++
 python/pyspark/pandas/indexes/multi.py |  6 ++++++
 python/pyspark/pandas/indexing.py      |  2 +-
 python/pyspark/pandas/plot/core.py     |  4 ++--
 python/pyspark/pandas/series.py        |  6 ++++++
 8 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index ad2647b..e59048b 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -528,7 +528,7 @@ class CategoricalAccessor(object):
                 FutureWarning,
             )

-        categories = set(self._data.drop_duplicates().to_pandas())
+        categories = set(self._data.drop_duplicates()._to_pandas())
         removals = [cat for cat in self.categories if cat not in categories]
         return self.remove_categories(removals=removals, inplace=inplace)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index d42d07b..6d89eb5 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -4848,6 +4848,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
         )
         return self._internal.to_pandas_frame.copy()

+    def _to_pandas(self) -> pd.DataFrame:
+        """
+        Same as `to_pandas()`, without issuing the advice log for internal usage.
+        """
+        return self._internal.to_pandas_frame.copy()
+
     def assign(self, **kwargs: Any) -> "DataFrame":
         """
         Assign new columns to a DataFrame.
@@ -10871,7 +10877,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
             object.__setattr__(self, "_data", self)
             count_func = self.count
             self.count = (  # type: ignore[assignment]
-                lambda: count_func().to_pandas()  # type: ignore[assignment, misc, union-attr]
+                lambda: count_func()._to_pandas()  # type: ignore[assignment, misc, union-attr]
             )
             return pd.DataFrame.info(
                 self,
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index a4e99d6..bb83ddf 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -130,6 +130,10 @@ class Frame(object, metaclass=ABCMeta):
     def to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
         pass

+    @abstractmethod
+    def _to_pandas(self) -> Union[pd.DataFrame, pd.Series]:
+        pass
+
     @property
     @abstractmethod
     def index(self) -> "Index":
@@ -578,7 +582,7 @@ class Frame(object, metaclass=ABCMeta):
             "`to_numpy` loads all data into the driver's memory. "
             "It should only be used if the resulting NumPy ndarray is expected to be small."
         )
-        return self.to_pandas().values
+        return self._to_pandas().values

     @property
     def values(self) -> np.ndarray:
@@ -796,7 +800,7 @@ class Frame(object, metaclass=ABCMeta):
             self, ps.Series
         ):
             # 0.23 seems not having 'columns' parameter in Series' to_csv.
-            return psdf_or_ser.to_pandas().to_csv(
+            return psdf_or_ser._to_pandas().to_csv(
                 None,
                 sep=sep,
                 na_rep=na_rep,
@@ -805,7 +809,7 @@ class Frame(object, metaclass=ABCMeta):
                 index=False,
             )
         else:
-            return psdf_or_ser.to_pandas().to_csv(
+            return psdf_or_ser._to_pandas().to_csv(
                 None,
                 sep=sep,
                 na_rep=na_rep,
@@ -1001,7 +1005,7 @@ class Frame(object, metaclass=ABCMeta):
         if path is None:
             # If path is none, just collect and use pandas's to_json.
             psdf_or_ser = self
-            pdf = psdf_or_ser.to_pandas()
+            pdf = psdf_or_ser._to_pandas()
             if isinstance(self, ps.Series):
                 pdf = pdf.to_frame()
             # To make the format consistent and readable by `read_json`, convert it to pandas' and
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index ee79c08..2d84bc3 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -495,6 +495,12 @@ class Index(IndexOpsMixin):
         )
         return self._to_internal_pandas().copy()

+    def _to_pandas(self) -> pd.Index:
+        """
+        Same as `to_pandas()`, without issuing the advice log for internal usage.
+        """
+        return self._to_internal_pandas().copy()
+
     def to_numpy(self, dtype: Optional[Union[str, Dtype]] = None, copy: bool = False) -> np.ndarray:
         """
         A NumPy ndarray representing the values in this Index or MultiIndex.
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index 10dbd22..d67646b 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -699,6 +699,12 @@ class MultiIndex(Index):
         # series-like operations. In that case, it creates new Index object instead of MultiIndex.
         return super().to_pandas()

+    def _to_pandas(self) -> pd.MultiIndex:
+        """
+        Same as `to_pandas()`, without issuing the advice log for internal usage.
+ """ + return super()._to_pandas() + def nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> int: raise NotImplementedError("nunique is not defined for MultiIndex") diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index dae5d84..b561e03 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -560,7 +560,7 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta): psdf_or_psser = psdf if remaining_index is not None and remaining_index == 0: - pdf_or_pser = psdf_or_psser.head(2).to_pandas() + pdf_or_pser = psdf_or_psser.head(2)._to_pandas() length = len(pdf_or_pser) if length == 0: raise KeyError(name_like_string(key)) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index 89b8320..8c40a46 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -39,7 +39,7 @@ class TopNPlotBase: # Simply use the first 1k elements and make it into a pandas dataframe # For categorical variables, it is likely called from df.x.value_counts().plot.xxx(). if isinstance(data, (Series, DataFrame)): - data = data.head(max_rows + 1).to_pandas() + data = data.head(max_rows + 1)._to_pandas() else: raise TypeError("Only DataFrame and Series are supported for plotting.") @@ -79,7 +79,7 @@ class SampledPlotBase: if isinstance(data, Series): data = data.to_frame() sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction) - return DataFrame(data._internal.with_new_sdf(sampled)).to_pandas() + return DataFrame(data._internal.with_new_sdf(sampled))._to_pandas() else: raise TypeError("Only DataFrame and Series are supported for plotting.") diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index 00fd3a2..18eaad8 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -1587,6 +1587,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ) return self._to_internal_pandas().copy() + def _to_pandas(self) -> pd.Series: + """ + Same as `to_pandas()`, without issueing the advice log for internal usage. + """ + return self._to_internal_pandas().copy() + def to_list(self) -> List: """ Return a list of the values. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org