This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new b1ddec5757ae [SPARK-49391][PS] Box plot select outliers by distance from fences b1ddec5757ae is described below commit b1ddec5757aeef69bdd4b08f4f75096b129f5d31 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Mon Aug 26 18:10:36 2024 +0800 [SPARK-49391][PS] Box plot select outliers by distance from fences ### What changes were proposed in this pull request? Box plots now select outliers by their distance from the fences. ### Why are the changes needed? If there are more than 1k outliers, the existing implementation selects the values by the distance `|value - min(non_outliers)|`, which is not reasonable because it prefers outliers above the upper fence over outliers below the lower fence. We should order them by their distance from the fences: 1. if value > upper fence, use value - upper fence; 2. if value < lower fence, use lower fence - value. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI and manual testing. ### Was this patch authored or co-authored using generative AI tooling? No Closes #47870 from zhengruifeng/plot_hist_select_outlier. 
Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/pandas/plot/core.py | 42 ++++++++++++++++------ python/pyspark/pandas/plot/matplotlib.py | 2 +- python/pyspark/pandas/plot/plotly.py | 4 +-- .../pyspark/pandas/tests/plot/test_series_plot.py | 2 +- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index 2e188b411df1..fe5beb0e730d 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -420,14 +420,24 @@ class BoxPlotBase: return minmax.iloc[0][["min", "max"]].values @staticmethod - def get_fliers(colname, outliers, min_val): + def get_fliers(colname, outliers, lfence, ufence): # Filters only the outliers, should "showfliers" be True fliers_df = outliers.filter("`__{}_outlier`".format(colname)) # If it shows fliers, take the top 1k with highest absolute values - # Here we normalize the values by subtracting the minimum value from - # each, and use absolute values. - order_col = F.abs(F.col("`{}`".format(colname)) - min_val.item()) + # Here we normalize the values by subtracting the fences. 
+ formated_colname = "`{}`".format(colname) + order_col = ( + F.when( + F.col(formated_colname) > F.lit(ufence), + F.col(formated_colname) - F.lit(ufence), + ) + .when( + F.col(formated_colname) < F.lit(lfence), + F.lit(lfence) - F.col(formated_colname), + ) + .otherwise(F.lit(None)) + ) fliers = ( fliers_df.select(F.col("`{}`".format(colname))) .orderBy(order_col) @@ -439,15 +449,26 @@ class BoxPlotBase: return fliers @staticmethod - def get_multicol_fliers(colnames, multicol_outliers, multicol_whiskers): + def get_multicol_fliers(colnames, multicol_outliers, multicol_stats): scols = [] - extract_colnames = [] for i, colname in enumerate(colnames): formated_colname = "`{}`".format(colname) outlier_colname = "__{}_outlier".format(colname) - min_val = multicol_whiskers[colname]["min"] + lfence, ufence = multicol_stats[colname]["lfence"], multicol_stats[colname]["ufence"] + order_col = ( + F.when( + F.col(formated_colname) > F.lit(ufence), + F.col(formated_colname) - F.lit(ufence), + ) + .when( + F.col(formated_colname) < F.lit(lfence), + F.lit(lfence) - F.col(formated_colname), + ) + .otherwise(F.lit(None)) + ) + pair_col = F.struct( - F.abs(F.col(formated_colname) - F.lit(min_val)).alias("ord"), + order_col.alias("ord"), F.col(formated_colname).alias("val"), ) scols.append( @@ -457,11 +478,10 @@ class BoxPlotBase: .alias(f"pair_{i}"), 1001, False, - ).alias(f"top_{i}") + ).alias(f"top_{i}")["val"] ) - extract_colnames.append(f"top_{i}.val") - results = multicol_outliers.select(scols).select(extract_colnames).first() + results = multicol_outliers.select(scols).first() fliers = {} for i, colname in enumerate(colnames): diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index f496f2bc664b..3d045ffc8d6b 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -292,7 +292,7 @@ class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase): whiskers = 
BoxPlotBase.calc_whiskers(spark_column_name, outliers) if showfliers: - fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, whiskers[0]) + fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, *col_fences) else: fliers = [] diff --git a/python/pyspark/pandas/plot/plotly.py b/python/pyspark/pandas/plot/plotly.py index 0afcd6d7e869..995060eb9c12 100644 --- a/python/pyspark/pandas/plot/plotly.py +++ b/python/pyspark/pandas/plot/plotly.py @@ -162,7 +162,7 @@ def plot_box(data: Union["ps.DataFrame", "ps.Series"], **kwargs): fliers = None if boxpoints: - fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, whiskers[0]) + fliers = BoxPlotBase.get_fliers(spark_column_name, outliers, *col_fences) fliers = [fliers] if len(fliers) > 0 else None fig.add_trace( @@ -201,7 +201,7 @@ def plot_box(data: Union["ps.DataFrame", "ps.Series"], **kwargs): fliers = None if boxpoints: - fliers = BoxPlotBase.get_multicol_fliers(numeric_column_names, outliers, whiskers) + fliers = BoxPlotBase.get_multicol_fliers(numeric_column_names, outliers, multicol_stats) i = 0 for colname in numeric_column_names: diff --git a/python/pyspark/pandas/tests/plot/test_series_plot.py b/python/pyspark/pandas/tests/plot/test_series_plot.py index 9daefbc2a23b..9bd335af527e 100644 --- a/python/pyspark/pandas/tests/plot/test_series_plot.py +++ b/python/pyspark/pandas/tests/plot/test_series_plot.py @@ -61,7 +61,7 @@ class SeriesPlotTestsMixin: stats, fences = BoxPlotBase.compute_stats(psdf["a"], "a", whis=k, precision=0.01) outliers = BoxPlotBase.outliers(psdf["a"], "a", *fences) whiskers = BoxPlotBase.calc_whiskers("a", outliers) - fliers = BoxPlotBase.get_fliers("a", outliers, whiskers[0]) + fliers = BoxPlotBase.get_fliers("a", outliers, *fences) expected_mean = pdf["a"].mean() expected_median = pdf["a"].median() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-h...@spark.apache.org