Github user icexelloss commented on a diff in the pull request: https://github.com/apache/spark/pull/18732#discussion_r141885324 --- Diff: python/pyspark/sql/tests.py --- @@ -3376,6 +3377,74 @@ def test_vectorized_udf_empty_partition(self): res = df.select(f(col('id'))) self.assertEquals(df.collect(), res.collect()) +@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") +class GroupbyApplyTests(ReusedPySparkTestCase): + @classmethod + def setUpClass(cls): + ReusedPySparkTestCase.setUpClass() + cls.spark = SparkSession(cls.sc) + + @classmethod + def tearDownClass(cls): + ReusedPySparkTestCase.tearDownClass() + cls.spark.stop() + + def assertFramesEqual(self, expected, result): + msg = ("DataFrames are not equal: " + + ("\n\nExpected:\n%s\n%s" % (expected, expected.dtypes)) + + ("\n\nResult:\n%s\n%s" % (result, result.dtypes))) + self.assertTrue(expected.equals(result), msg=msg) + + @property + def data(self): + from pyspark.sql.functions import pandas_udf, array, explode, col, lit + return self.spark.range(10).toDF('id') \ + .withColumn("vs", array([lit(i) for i in range(20, 30)])) \ + .withColumn("v", explode(col('vs'))).drop('vs') + + def test_groupby_apply_simple(self): + from pyspark.sql.functions import pandas_udf + df = self.data + + def foo(df): + ret = df + ret = ret.assign(v1=df.v * df.id * 1.0) + ret = ret.assign(v2=df.v + df.id) + return ret + + foo_udf = pandas_udf( + foo, + StructType( + [StructField('id', LongType()), + StructField('v', IntegerType()), + StructField('v1', DoubleType()), + StructField('v2', LongType())])) --- End diff -- Yes, the column names are specified in the returnType, and the returnType must be a `StructType`. The rationale is that `apply()` is a mapping from a `pd.DataFrame` -> `pd.DataFrame`; therefore the returnType must be a `StructType`. 
This is the best way I can think of to specify the column names and returnType. It makes sense to me because there should be a one-to-one mapping between the return value of the function (a `pd.DataFrame`) and its schema (a `StructType` containing column names and dataType). Also, because `pd.DataFrame` doesn't support nested types, there is no ambiguity as to whether a `StructType` indicates a `pd.DataFrame` or a nested type.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org