Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/21887#discussion_r205934537 --- Diff: examples/src/main/python/sql/arrow.py --- @@ -113,6 +113,42 @@ def substract_mean(pdf): # $example off:grouped_map_pandas_udf$ +def grouped_agg_pandas_udf_example(spark): + # $example on:grouped_agg_pandas_udf$ + from pyspark.sql.functions import pandas_udf, PandasUDFType + from pyspark.sql import Window + + df = spark.createDataFrame( + [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], + ("id", "v")) + + @pandas_udf("double", PandasUDFType.GROUPED_AGG) + def mean_udf(v): + return v.mean() + df.groupby("id").agg(mean_udf(df['v'])).show() + # +---+-----------+ + # | id|mean_udf(v)| + # +---+-----------+ + # | 1| 1.5| + # | 2| 6.0| + # +---+-----------+ + + w = Window \ + .partitionBy('id') \ + .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) + df.withColumn('mean_v', mean_udf(df['v']).over(w)).show() + # +---+----+------+ + # | id| v|mean_v| + # +---+----+------+ + # | 1| 1.0| 1.5| + # | 1| 2.0| 1.5| + # | 2| 3.0| 6.0| + # | 2| 5.0| 6.0| + # | 2|10.0| 6.0| + # +---+----+------+ + # $example off:grouped_map_pandas_udf$ --- End diff -- Seems a typo. It looks it should be `grouped_agg_pandas_udf`. @icexelloss, `cd docs && SKIP_API=1 jekyll build` will build the doc and should better manually be tested.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org