GitHub user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/19325#discussion_r140941134 --- Diff: python/pyspark/sql/functions.py --- @@ -2183,14 +2187,28 @@ def pandas_udf(f=None, returnType=StringType()): :param f: python function if used as a standalone function :param returnType: a :class:`pyspark.sql.types.DataType` object - # TODO: doctest + >>> from pyspark.sql.types import IntegerType, StringType + >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) + >>> @pandas_udf(returnType=StringType()) + ... def to_upper(s): + ... return s.str.upper() + ... + >>> @pandas_udf(returnType="integer") + ... def add_one(x): + ... return x + 1 + ... + >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) + >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ + ... .show() # doctest: +SKIP --- End diff -- I just double-checked that it passes with PyPy: ``` ./run-tests --python-executables=pypy --modules pyspark-sql ... Will test against the following Python executables: ['pypy'] Will test the following Python modules: ['pyspark-sql'] Starting test(pypy): pyspark.sql.functions ... Finished test(pypy): pyspark.sql.functions (74s) ... ``` I also checked without ` # doctest: +SKIP`; in that case the doctest fails under PyPy because pyarrow cannot be imported: ```diff diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 63e9a830bbc..3265ecc974b 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2199,7 +2199,7 @@ def pandas_udf(f=None, returnType=StringType()): ... >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ - ... .show() # doctest: +SKIP + ... .show() +----------+--------------+------------+ |slen(name)|to_upper(name)|add_one(age)| +----------+--------------+------------+ ``` ``` ./run-tests --python-executables=pypy --modules pyspark-sql ... 
Will test against the following Python executables: ['pypy'] Will test the following Python modules: ['pyspark-sql'] ... Starting test(pypy): pyspark.sql.functions ... Failed example: df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \ .show() Exception raised: Traceback (most recent call last): File "/usr/local/Cellar/pypy/5.8.0/libexec/lib-python/2.7/doctest.py", line 1315, in __run compileflags, 1) in test.globs File "<doctest pyspark.sql.functions.pandas_udf[5]>", line 1, in <module> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \ File "/.../spark/python/pyspark/sql/dataframe.py", line 347, in show print(self._jdf.showString(n, 20, vertical)) File "/.../spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/.../spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/.../spark/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value format(target_id, ".", name), value) Py4JJavaError: An error occurred while calling o1373.showString. 
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 93.0 failed 1 times, most recent failure: Lost task 0.0 in stage 93.0 (TID 1093, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 190, in main func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type) File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 112, in read_udfs arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type) File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 102, in read_single_udf return arg_offsets, wrap_pandas_udf(row_func, return_type) File "/.../spark/python/lib/pyspark.zip/pyspark/worker.py", line 77, in wrap_pandas_udf arrow_return_type = toArrowType(return_type) File "/.../spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1603, in toArrowType import pyarrow as pa ImportError: No module named pyarrow ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org