This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 4bb3aaeb4c3 [SPARK-45028][PYTHON][DOCS] Refine docstring of `DataFrame.drop` 4bb3aaeb4c3 is described below commit 4bb3aaeb4c3f13a723b6da30fe07c007e417b98c Author: panbingkun <pbk1...@gmail.com> AuthorDate: Fri Sep 1 09:18:22 2023 +0800 [SPARK-45028][PYTHON][DOCS] Refine docstring of `DataFrame.drop` ### What changes were proposed in this pull request? This pr aims to refine docstring of `DataFrame.drop`. ### Why are the changes needed? To improve PySpark documentation. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass GA. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #42748 from panbingkun/SPARK-45028. Authored-by: panbingkun <pbk1...@gmail.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/sql/dataframe.py | 45 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8417d445eea..42d85b82e9e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -5513,7 +5513,8 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): ... def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] - """Returns a new :class:`DataFrame` without specified columns. + """ + Returns a new :class:`DataFrame` without specified columns. This is a no-op if the schema doesn't contain the given column name(s). .. versionadded:: 1.4.0 @@ -5524,28 +5525,26 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): Parameters ---------- cols: str or :class:`Column` - a name of the column, or the :class:`Column` to drop + A name of the column, or the :class:`Column` to be dropped. Returns ------- :class:`DataFrame` - DataFrame without given columns. + A new :class:`DataFrame` without the specified columns. Notes ----- - When an input is a column name, it is treated literally without further interpretation. - Otherwise, will try to match the equivalent expression. - So that dropping column by its name `drop(colName)` has different semantic with directly - dropping the column `drop(col(colName))`. + - When an input is a column name, it is treated literally without further interpretation. + Otherwise, it will try to match the equivalent expression. + So dropping a column by its name `drop(colName)` has a different semantic + with directly dropping the column `drop(col(colName))`. Examples -------- - >>> from pyspark.sql import Row - >>> from pyspark.sql.functions import col, lit + Example 1: Drop a column by name. + >>> df = spark.createDataFrame( ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) - >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")]) - >>> df.drop('age').show() +-----+ | name| @@ -5554,6 +5553,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): |Alice| | Bob| +-----+ + + Example 2: Drop a column by :class:`Column` object. + >>> df.drop(df.age).show() +-----+ | name| @@ -5563,9 +5565,10 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | Bob| +-----+ - Drop the column that joined both DataFrames on. + Example 3: Drop the column that joined both DataFrames on. - >>> df.join(df2, df.name == df2.name, 'inner').drop('name').sort('age').show() + >>> df2 = spark.createDataFrame([(80, "Tom"), (85, "Bob")], ["height", "name"]) + >>> df.join(df2, df.name == df2.name).drop('name').sort('age').show() +---+------+ |age|height| +---+------+ @@ -5586,7 +5589,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 16| Bob| 85| Bob| +---+-----+------+----+ - Drop two column by the same name. + Example 4: Drop two column by the same name. >>> df3.drop("name").show() +---+------+ @@ -5600,14 +5603,18 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 16| 85| +---+------+ - Can not drop col('name') due to ambiguous reference. + Example 5: Can not drop col('name') due to ambiguous reference. - >>> df3.drop(col("name")).show() + >>> from pyspark.sql import functions as sf + >>> df3.drop(sf.col("name")).show() Traceback (most recent call last): ... pyspark.errors.exceptions.captured.AnalysisException: [AMBIGUOUS_REFERENCE] Reference... - >>> df4 = df.withColumn("a.b.c", lit(1)) + Example 6: Can not find a column matching the expression "a.b.c". + + >>> from pyspark.sql import functions as sf + >>> df4 = df.withColumn("a.b.c", sf.lit(1)) >>> df4.show() +---+-----+-----+ |age| name|a.b.c| @@ -5626,9 +5633,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): | 16| Bob| +---+-----+ - Can not find a column matching the expression "a.b.c". - - >>> df4.drop(col("a.b.c")).show() + >>> df4.drop(sf.col("a.b.c")).show() +---+-----+-----+ |age| name|a.b.c| +---+-----+-----+ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org