This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new f258af5a98b [SPARK-45058][PYTHON][DOCS] Refine docstring of DataFrame.distinct f258af5a98b is described below commit f258af5a98b8f6fc9c338fb0fefb5aff751142a1 Author: allisonwang-db <allison.w...@databricks.com> AuthorDate: Mon Sep 4 09:53:40 2023 +0900 [SPARK-45058][PYTHON][DOCS] Refine docstring of DataFrame.distinct ### What changes were proposed in this pull request? This PR refines the docstring of `DataFrame.distinct` by adding more examples. ### Why are the changes needed? To improve the PySpark documentation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? doctest ### Was this patch authored or co-authored using generative AI tooling? No Closes #42782 from allisonwang-db/spark-45058-refine-distinct. Authored-by: allisonwang-db <allison.w...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/dataframe.py | 77 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 42d85b82e9e..64592311a13 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1934,15 +1934,90 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): :class:`DataFrame` DataFrame with distinct records. + See Also + -------- + DataFrame.dropDuplicates + Examples -------- + Remove duplicate rows from a DataFrame + >>> df = spark.createDataFrame( ... [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"]) + >>> df.distinct().show() + +---+-----+ + |age| name| + +---+-----+ + | 14| Tom| + | 23|Alice| + +---+-----+ - Return the number of distinct rows in the :class:`DataFrame` + Count the number of distinct rows in a DataFrame >>> df.distinct().count() 2 + + Get distinct rows from a DataFrame with multiple columns + + >>> df = spark.createDataFrame( + ... 
[(14, "Tom", "M"), (23, "Alice", "F"), (23, "Alice", "F"), (14, "Tom", "M")], + ... ["age", "name", "gender"]) + >>> df.distinct().show() + +---+-----+------+ + |age| name|gender| + +---+-----+------+ + | 14| Tom| M| + | 23|Alice| F| + +---+-----+------+ + + Get distinct values from a specific column in a DataFrame + + >>> df.select("name").distinct().show() + +-----+ + | name| + +-----+ + | Tom| + |Alice| + +-----+ + + Count the number of distinct values in a specific column + + >>> df.select("name").distinct().count() + 2 + + Get distinct values from multiple columns in DataFrame + + >>> df.select("name", "gender").distinct().show() + +-----+------+ + | name|gender| + +-----+------+ + | Tom| M| + |Alice| F| + +-----+------+ + + Get distinct rows from a DataFrame with null values + + >>> df = spark.createDataFrame( + ... [(14, "Tom", "M"), (23, "Alice", "F"), (23, "Alice", "F"), (14, "Tom", None)], + ... ["age", "name", "gender"]) + >>> df.distinct().show() + +---+-----+------+ + |age| name|gender| + +---+-----+------+ + | 14| Tom| M| + | 23|Alice| F| + | 14| Tom| NULL| + +---+-----+------+ + + Get distinct non-null values from a DataFrame + + >>> df.distinct().filter(df.gender.isNotNull()).show() + +---+-----+------+ + |age| name|gender| + +---+-----+------+ + | 14| Tom| M| + | 23|Alice| F| + +---+-----+------+ """ return DataFrame(self._jdf.distinct(), self.sparkSession) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org