This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new e423249 [SPARK-36161][PYTHON] Add type check on dropDuplicates pyspark function e423249 is described below commit e42324982de3b3009062670f915b71962964aa4f Author: Samuel Moseley <smose...@palantir.com> AuthorDate: Thu Jul 29 19:11:48 2021 +0900 [SPARK-36161][PYTHON] Add type check on dropDuplicates pyspark function ### What changes were proposed in this pull request? Improve the error message for wrong type when calling dropDuplicates in pyspark. ### Why are the changes needed? The current error message is cryptic and can be unclear to less experienced users. ### Does this PR introduce _any_ user-facing change? Yes, it adds a type error for when a user gives the wrong type to dropDuplicates ### How was this patch tested? There is currently no testing for error messages in pyspark dataframe functions Closes #33364 from sammyjmoseley/sm/add-type-checking-for-drop-duplicates. Lead-authored-by: Samuel Moseley <smose...@palantir.com> Co-authored-by: Sammy Moseley <moseley.sa...@gmail.com> Co-authored-by: Hyukjin Kwon <gurwls...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit a07df1acc662d8c85c7f6e53a08d011056998072) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/dataframe.py | 5 +++++ python/pyspark/sql/tests/test_dataframe.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 61e7a73..83eaaa4 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -18,6 +18,7 @@ import sys import random import warnings +from collections.abc import Iterable from functools import reduce from html import escape as html_escape @@ -1980,6 +1981,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): |Alice| 5| 80| +-----+---+------+ """ + if subset is not None and ( + not isinstance(subset, Iterable) or 
isinstance(subset, str)): + raise TypeError("Parameter 'subset' must be a list of columns") + if subset is None: jdf = self._jdf.dropDuplicates() else: diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index f64c7c2..8c9f330 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -67,6 +67,24 @@ class DataFrameTests(ReusedSQLTestCase): pydoc.render_doc(df.foo) pydoc.render_doc(df.take(1)) + def test_drop_duplicates(self): + # SPARK-36034 test that drop duplicates throws a type error when an incorrect type is provided + df = self.spark.createDataFrame( + [("Alice", 50), ("Alice", 60)], + ["name", "age"] + ) + + # shouldn't drop a non-null row + self.assertEqual(df.dropDuplicates().count(), 2) + + self.assertEqual(df.dropDuplicates(["name"]).count(), 1) + + self.assertEqual(df.dropDuplicates(["name", "age"]).count(), 2) + + type_error_msg = "Parameter 'subset' must be a list of columns" + with self.assertRaisesRegex(TypeError, type_error_msg): + df.dropDuplicates("name") + def test_dropna(self): schema = StructType([ StructField("name", StringType(), True), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org