(spark) branch master updated: [SPARK-49391][PS] Box plot select outliers by distance from fences

ruifengz Mon, 26 Aug 2024 03:11:21 -0700

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new b1ddec5757ae [SPARK-49391][PS] Box plot select outliers by distance 
from fences
b1ddec5757ae is described below

commit b1ddec5757aeef69bdd4b08f4f75096b129f5d31
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Mon Aug 26 18:10:36 2024 +0800

    [SPARK-49391][PS] Box plot select outliers by distance from fences
    
    ### What changes were proposed in this pull request?
    Box plot select outliers by distance from fences
    
    ### Why are the changes needed?
    If there are more than 1k outliers, the existing implementation selects the 
values by the distance `|value - min(non_outliers)|`, which is not reasonable 
because it prefers outliers above the upper fence over outliers below the lower fence.
    We should order them by the distance from fences:
    1, if value > upper fence,  value - upper fence;
    2, if value < lower fence,  lower fence - value;
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    CI and manual testing
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #47870 from zhengruifeng/plot_hist_select_outlier.
    
    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/pandas/plot/core.py                 | 42 ++++++++++++++++------
 python/pyspark/pandas/plot/matplotlib.py           |  2 +-
 python/pyspark/pandas/plot/plotly.py               |  4 +--
 .../pyspark/pandas/tests/plot/test_series_plot.py  |  2 +-
 4 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/python/pyspark/pandas/plot/core.py 
b/python/pyspark/pandas/plot/core.py
index 2e188b411df1..fe5beb0e730d 100644
--- a/python/pyspark/pandas/plot/core.py
+++ b/python/pyspark/pandas/plot/core.py
@@ -420,14 +420,24 @@ class BoxPlotBase:
         return minmax.iloc[0][["min", "max"]].values
 
     @staticmethod
-    def get_fliers(colname, outliers, min_val):
+    def get_fliers(colname, outliers, lfence, ufence):
         # Filters only the outliers, should "showfliers" be True
         fliers_df = outliers.filter("`__{}_outlier`".format(colname))
 
         # If it shows fliers, take the top 1k with highest absolute values
-        # Here we normalize the values by subtracting the minimum value from
-        # each, and use absolute values.
-        order_col = F.abs(F.col("`{}`".format(colname)) - min_val.item())
+        # Here we normalize the values by subtracting the fences.
+        formated_colname = "`{}`".format(colname)
+        order_col = (
+            F.when(
+                F.col(formated_colname) > F.lit(ufence),
+                F.col(formated_colname) - F.lit(ufence),
+            )
+            .when(
+                F.col(formated_colname) < F.lit(lfence),
+                F.lit(lfence) - F.col(formated_colname),
+            )
+            .otherwise(F.lit(None))
+        )
         fliers = (
             fliers_df.select(F.col("`{}`".format(colname)))
             .orderBy(order_col)
@@ -439,15 +449,26 @@ class BoxPlotBase:
         return fliers
 
     @staticmethod
-    def get_multicol_fliers(colnames, multicol_outliers, multicol_whiskers):
+    def get_multicol_fliers(colnames, multicol_outliers, multicol_stats):
         scols = []
-        extract_colnames = []
         for i, colname in enumerate(colnames):
             formated_colname = "`{}`".format(colname)
             outlier_colname = "__{}_outlier".format(colname)
-            min_val = multicol_whiskers[colname]["min"]
+            lfence, ufence = multicol_stats[colname]["lfence"], 
multicol_stats[colname]["ufence"]
+            order_col = (
+                F.when(
+                    F.col(formated_colname) > F.lit(ufence),
+                    F.col(formated_colname) - F.lit(ufence),
+                )
+                .when(
+                    F.col(formated_colname) < F.lit(lfence),
+                    F.lit(lfence) - F.col(formated_colname),
+                )
+                .otherwise(F.lit(None))
+            )
+
             pair_col = F.struct(
-                F.abs(F.col(formated_colname) - F.lit(min_val)).alias("ord"),
+                order_col.alias("ord"),
                 F.col(formated_colname).alias("val"),
             )
             scols.append(
@@ -457,11 +478,10 @@ class BoxPlotBase:
                     .alias(f"pair_{i}"),
                     1001,
                     False,
-                ).alias(f"top_{i}")
+                ).alias(f"top_{i}")["val"]
             )
-            extract_colnames.append(f"top_{i}.val")
 
-        results = 
multicol_outliers.select(scols).select(extract_colnames).first()
+        results = multicol_outliers.select(scols).first()
 
         fliers = {}
         for i, colname in enumerate(colnames):
diff --git a/python/pyspark/pandas/plot/matplotlib.py 
b/python/pyspark/pandas/plot/matplotlib.py
index f496f2bc664b..3d045ffc8d6b 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -292,7 +292,7 @@ class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase):
         whiskers = BoxPlotBase.calc_whiskers(spark_column_name, outliers)
 
         if showfliers:
-            fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, 
whiskers[0])
+            fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, 
*col_fences)
         else:
             fliers = []
 
diff --git a/python/pyspark/pandas/plot/plotly.py 
b/python/pyspark/pandas/plot/plotly.py
index 0afcd6d7e869..995060eb9c12 100644
--- a/python/pyspark/pandas/plot/plotly.py
+++ b/python/pyspark/pandas/plot/plotly.py
@@ -162,7 +162,7 @@ def plot_box(data: Union["ps.DataFrame", "ps.Series"], 
**kwargs):
 
         fliers = None
         if boxpoints:
-            fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, 
whiskers[0])
+            fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, 
*col_fences)
             fliers = [fliers] if len(fliers) > 0 else None
 
         fig.add_trace(
@@ -201,7 +201,7 @@ def plot_box(data: Union["ps.DataFrame", "ps.Series"], 
**kwargs):
 
         fliers = None
         if boxpoints:
-            fliers = BoxPlotBase.get_multicol_fliers(numeric_column_names, 
outliers, whiskers)
+            fliers = BoxPlotBase.get_multicol_fliers(numeric_column_names, 
outliers, multicol_stats)
 
         i = 0
         for colname in numeric_column_names:
diff --git a/python/pyspark/pandas/tests/plot/test_series_plot.py 
b/python/pyspark/pandas/tests/plot/test_series_plot.py
index 9daefbc2a23b..9bd335af527e 100644
--- a/python/pyspark/pandas/tests/plot/test_series_plot.py
+++ b/python/pyspark/pandas/tests/plot/test_series_plot.py
@@ -61,7 +61,7 @@ class SeriesPlotTestsMixin:
             stats, fences = BoxPlotBase.compute_stats(psdf["a"], "a", whis=k, 
precision=0.01)
             outliers = BoxPlotBase.outliers(psdf["a"], "a", *fences)
             whiskers = BoxPlotBase.calc_whiskers("a", outliers)
-            fliers = BoxPlotBase.get_fliers("a", outliers, whiskers[0])
+            fliers = BoxPlotBase.get_fliers("a", outliers, *fences)
 
             expected_mean = pdf["a"].mean()
             expected_median = pdf["a"].median()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-49391][PS] Box plot select outliers by distance from fences

Reply via email to