Github user icexelloss commented on a diff in the pull request: https://github.com/apache/spark/pull/18732#discussion_r142482842 --- Diff: python/pyspark/sql/functions.py --- @@ -2181,31 +2186,69 @@ def udf(f=None, returnType=StringType()): @since(2.3) def pandas_udf(f=None, returnType=StringType()): """ - Creates a :class:`Column` expression representing a user defined function (UDF) that accepts - `Pandas.Series` as input arguments and outputs a `Pandas.Series` of the same length. + Creates a :class:`Column` expression representing a vectorized user defined function (UDF). + + The user-defined function can define one of the following transformations: + 1. One or more `pandas.Series` -> A `pandas.Series` + + This udf is used with `DataFrame.withColumn` and `DataFrame.select`. + The returnType should be a primitive data type, e.g., DoubleType() + + Example: + + >>> from pyspark.sql.types import IntegerType, StringType + >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) + >>> @pandas_udf(returnType=StringType()) + ... def to_upper(s): + ... return s.str.upper() + ... + >>> @pandas_udf(returnType="integer") + ... def add_one(x): + ... return x + 1 + ... + >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) + >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ + ... .show() # doctest: +SKIP + +----------+--------------+------------+ + |slen(name)|to_upper(name)|add_one(age)| + +----------+--------------+------------+ + | 8| JOHN DOE| 22| + +----------+--------------+------------+ + + 2. A `pandas.DataFrame` -> A `pandas.DataFrame` + + This udf is used with `GroupedData.apply` + The returnType should be a StructType describing the schema of the returned + `pandas.DataFrame`. + + Example: + + >>> df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 4.0)], ("id", "v")) + >>> @pandas_udf(returnType=df.schema) + ... def normalize(df): + ... v = df.v + ... 
ret = df.assign(v=(v - v.mean()) / v.std()) + >>> df.groupby('id').apply(normalize).show() # doctest: + SKIP + +---+-------------------+ + | id| v| + +---+-------------------+ + | 1|-0.7071067811865475| + | 1| 0.7071067811865475| + | 2|-0.7071067811865475| + | 2| 0.7071067811865475| + +---+-------------------+ + + + .. note:: The user-defined functions must be deterministic. :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object - >>> from pyspark.sql.types import IntegerType, StringType - >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) - >>> @pandas_udf(returnType=StringType()) - ... def to_upper(s): - ... return s.str.upper() - ... - >>> @pandas_udf(returnType="integer") - ... def add_one(x): - ... return x + 1 - ... - >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) - >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ - ... .show() # doctest: +SKIP - +----------+--------------+------------+ - |slen(name)|to_upper(name)|add_one(age)| - +----------+--------------+------------+ - | 8| JOHN DOE| 22| - +----------+--------------+------------+ """ + import pandas as pd + if isinstance(returnType, pd.Series): --- End diff -- Oh that's neat. Thanks!
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org