This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new db162369941 [SPARK-45220][PYTHON][DOCS] Refine docstring of DataFrame.join db162369941 is described below commit db162369941dacbbd7eceb98896d498666f7fac8 Author: allisonwang-db <allison.w...@databricks.com> AuthorDate: Thu Oct 19 14:25:30 2023 +0900 [SPARK-45220][PYTHON][DOCS] Refine docstring of DataFrame.join ### What changes were proposed in this pull request? This PR refines the docstring of `DataFrame.join` by adding more examples and explanations. ### Why are the changes needed? To improve PySpark documentation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? doctest ### Was this patch authored or co-authored using generative AI tooling? No Closes #43039 from allisonwang-db/spark-45220-refine-join. Authored-by: allisonwang-db <allison.w...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/dataframe.py | 167 +++++++++++++++++++++++++++++----------- 1 file changed, 124 insertions(+), 43 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 637787ceb66..34df2bafcf8 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -2646,7 +2646,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): on: Optional[Union[str, List[str], Column, List[Column]]] = None, how: Optional[str] = None, ) -> "DataFrame": - """Joins with another :class:`DataFrame`, using the given join expression. + """ + Joins with another :class:`DataFrame`, using the given join expression. .. versionadded:: 1.3.0 @@ -2675,39 +2676,55 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Examples -------- - The following performs a full outer join between ``df1`` and ``df2``. + The following examples demonstrate various join types among ``df1``, ``df2``, and ``df3``. + >>> import pyspark.sql.functions as sf >>> from pyspark.sql import Row - >>> from pyspark.sql.functions import desc - >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name") - >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")]) - >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")]) - >>> df4 = spark.createDataFrame([ - ... Row(age=10, height=80, name="Alice"), - ... Row(age=5, height=None, name="Bob"), - ... Row(age=None, height=None, name="Tom"), - ... Row(age=None, height=None, name=None), + >>> df = spark.createDataFrame([Row(name="Alice", age=2), Row(name="Bob", age=5)]) + >>> df2 = spark.createDataFrame([Row(name="Tom", height=80), Row(name="Bob", height=85)]) + >>> df3 = spark.createDataFrame([ + ... Row(name="Alice", age=10, height=80), + ... Row(name="Bob", age=5, height=None), + ... Row(name="Tom", age=None, height=None), + ... Row(name=None, age=None, height=None), ... ]) Inner join on columns (default) - >>> df.join(df2, 'name').select(df.name, df2.height).show() - +----+------+ - |name|height| - +----+------+ - | Bob| 85| - +----+------+ - >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show() - +----+---+ - |name|age| - +----+---+ - | Bob| 5| - +----+---+ - - Outer join for both DataFrames on the 'name' column. - - >>> df.join(df2, df.name == df2.name, 'outer').select( - ... df.name, df2.height).sort(desc("name")).show() + >>> df.join(df2, "name").show() + +----+---+------+ + |name|age|height| + +----+---+------+ + | Bob| 5| 85| + +----+---+------+ + + >>> df.join(df3, ["name", "age"]).show() + +----+---+------+ + |name|age|height| + +----+---+------+ + | Bob| 5| NULL| + +----+---+------+ + + Outer join on a single column with an explicit join condition. + + When the join condition is explicited stated: `df.name == df2.name`, this will + produce all records where the names match, as well as those that don't (since + it's an outer join). If there are names in `df2` that are not present in `df`, + they will appear with `NULL` in the `name` column of `df`, and vice versa for `df2`. + + >>> joined = df.join(df2, df.name == df2.name, "outer").sort(sf.desc(df.name)) + >>> joined.show() # doctest: +SKIP + +-----+----+----+------+ + | name| age|name|height| + +-----+----+----+------+ + | Bob| 5| Bob| 85| + |Alice| 2|NULL| NULL| + | NULL|NULL| Tom| 80| + +-----+----+----+------+ + + To unambiguously select output columns, specify the dataframe along with the column name: + + >>> joined.select(df.name, df2.height).show() # doctest: +SKIP +-----+------+ | name|height| +-----+------+ @@ -2715,27 +2732,91 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): |Alice| NULL| | NULL| 80| +-----+------+ - >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show() - +-----+------+ - | name|height| - +-----+------+ - | Tom| 80| - | Bob| 85| - |Alice| NULL| - +-----+------+ - Outer join for both DataFrams with multiple columns. + However, in self-joins, direct column references can cause ambiguity: + + >>> df.join(df, df.name == df.name, "outer").select(df.name).show() # doctest: +SKIP + Traceback (most recent call last): + ... + pyspark.errors.exceptions.captured.AnalysisException: Column name#0 are ambiguous... + + A better approach is to assign aliases to the dataframes, and then reference + the ouptut columns from the join operation using these aliases: - >>> df.join( - ... df3, - ... [df.name == df3.name, df.age == df3.age], - ... 'outer' - ... ).select(df.name, df3.age).show() + >>> df.alias("a").join( + ... df.alias("b"), sf.col("a.name") == sf.col("b.name"), "outer" + ... ).sort(sf.desc("a.name")).select("a.name", "b.age").show() +-----+---+ | name|age| +-----+---+ - |Alice| 2| | Bob| 5| + |Alice| 2| + +-----+---+ + + Outer join on a single column with implicit join condition using column name + + When you provide the column name directly as the join condition, Spark will treat + both name columns as one, and will not produce separate columns for `df.name` and + `df2.name`. This avoids having duplicate columns in the output. + + >>> df.join(df2, "name", "outer").sort(sf.desc("name")).show() + +-----+----+------+ + | name| age|height| + +-----+----+------+ + | Tom|NULL| 80| + | Bob| 5| 85| + |Alice| 2| NULL| + +-----+----+------+ + + Outer join on multiple columns + + >>> df.join(df3, ["name", "age"], "outer").show() + +-----+----+------+ + | name| age|height| + +-----+----+------+ + | NULL|NULL| NULL| + |Alice| 2| NULL| + |Alice| 10| 80| + | Bob| 5| NULL| + | Tom|NULL| NULL| + +-----+----+------+ + + Left outer join on columns + + >>> df.join(df2, "name", "left_outer").show() + +-----+---+------+ + | name|age|height| + +-----+---+------+ + |Alice| 2| NULL| + | Bob| 5| 85| + +-----+---+------+ + + Right outer join on columns + + >>> df.join(df2, "name", "right_outer").show() + +----+----+------+ + |name| age|height| + +----+----+------+ + | Tom|NULL| 80| + | Bob| 5| 85| + +----+----+------+ + + Left semi join on columns + + >>> df.join(df2, "name", "left_semi").show() + +----+---+ + |name|age| + +----+---+ + | Bob| 5| + +----+---+ + + Left anti join on columns + + >>> df.join(df2, "name", "left_anti").show() + +-----+---+ + | name|age| + +-----+---+ + |Alice| 2| +-----+---+ """ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org