GitHub user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/18659#discussion_r139592201 --- Diff: python/pyspark/sql/functions.py --- @@ -2142,18 +2159,26 @@ def udf(f=None, returnType=StringType()): | 8| JOHN DOE| 22| +----------+--------------+------------+ """ - def _udf(f, returnType=StringType()): - udf_obj = UserDefinedFunction(f, returnType) - return udf_obj._wrapped() + return _create_udf(f, returnType=returnType, vectorized=False) - # decorator @udf, @udf() or @udf(dataType()) - if f is None or isinstance(f, (str, DataType)): - # If DataType has been passed as a positional argument - # for decorator use it as a returnType - return_type = f or returnType - return functools.partial(_udf, returnType=return_type) + +@since(2.3) +def pandas_udf(f=None, returnType=StringType()): + """ + Creates a :class:`Column` expression representing a user defined function (UDF) that accepts + `Pandas.Series` as input arguments and outputs a `Pandas.Series` of the same length. + + :param f: python function if used as a standalone function + :param returnType: a :class:`pyspark.sql.types.DataType` object + + # TODO: doctest + """ + import inspect + # If function "f" does not define the optional kwargs, then wrap with a kwargs placeholder + if inspect.getargspec(f).keywords is None: + return _create_udf(lambda *a, **kwargs: f(*a), returnType=returnType, vectorized=True) --- End diff -- How about disallowing it for now? I think that could be an option if a 0-parameter UDF alone should not be supported consistently. `return pd.Series(1).repeat(kwargs['length'])` still looks a little bit weird.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org