This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new e795dc12833 [SPARK-45931][PYTHON][DOCS] Refine docstring of mapInPandas
e795dc12833 is described below

commit e795dc12833a3e2cf6996fb1b649c27bb0c85a28
Author: allisonwang-db <allison.w...@databricks.com>
AuthorDate: Thu Nov 16 10:42:03 2023 +0900

    [SPARK-45931][PYTHON][DOCS] Refine docstring of mapInPandas

    ### What changes were proposed in this pull request?

    This PR improves the docstring of the dataframe function `mapInPandas`.

    ### Why are the changes needed?

    To improve PySpark documentation.

    ### Does this PR introduce _any_ user-facing change?

    No

    ### How was this patch tested?

    doctest + manual verification

    ### Was this patch authored or co-authored using generative AI tooling?

    No

    Closes #43811 from allisonwang-db/spark-45931-refine-mapinpandas.

    Authored-by: allisonwang-db <allison.w...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/pandas/map_ops.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/pandas/map_ops.py b/python/pyspark/sql/pandas/map_ops.py
index 710fc8a9a37..55aa3249530 100644
--- a/python/pyspark/sql/pandas/map_ops.py
+++ b/python/pyspark/sql/pandas/map_ops.py
@@ -67,8 +67,10 @@ class PandasMapOpsMixin:
 
         Examples
         --------
-        >>> from pyspark.sql.functions import pandas_udf
         >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
+
+        Filter rows with id equal to 1:
+
         >>> def filter_func(iterator):
         ...     for pdf in iterator:
         ...         yield pdf[pdf.id == 1]
@@ -80,6 +82,36 @@ class PandasMapOpsMixin:
         |  1| 21|
         +---+---+
 
+        Compute the mean age for each id:
+
+        >>> def mean_age(iterator):
+        ...     for pdf in iterator:
+        ...         yield pdf.groupby("id").mean().reset_index()
+        ...
+        >>> df.mapInPandas(mean_age, "id: bigint, age: double").show()  # doctest: +SKIP
+        +---+----+
+        | id| age|
+        +---+----+
+        |  1|21.0|
+        |  2|30.0|
+        +---+----+
+
+        Add a new column with the double of the age:
+
+        >>> def double_age(iterator):
+        ...     for pdf in iterator:
+        ...         pdf["double_age"] = pdf["age"] * 2
+        ...         yield pdf
+        ...
+        >>> df.mapInPandas(
+        ...     double_age, "id: bigint, age: bigint, double_age: bigint").show()  # doctest: +SKIP
+        +---+---+----------+
+        | id|age|double_age|
+        +---+---+----------+
+        |  1| 21|        42|
+        |  2| 30|        60|
+        +---+---+----------+
+
         Set ``barrier`` to ``True`` to force the ``mapInPandas`` stage running in the
         barrier mode, it ensures all Python workers in the stage will be launched
         concurrently.

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org