Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20211#discussion_r160583899

    --- Diff: python/pyspark/sql/tests.py ---
    @@ -3995,23 +3995,49 @@ def test_coerce(self):
             self.assertFramesEqual(expected, result)
         def test_complex_groupby(self):
    +        import pandas as pd
             from pyspark.sql.functions import pandas_udf, col, PandasUDFType
             df = self.data
    +        pdf = df.toPandas()
             @pandas_udf(
    -            'id long, v int, norm double',
    +            'v int, v2 double',
                 PandasUDFType.GROUP_MAP
             )
    -        def normalize(pdf):
    +        def foo(pdf):
                 v = pdf.v
    -            return pdf.assign(norm=(v - v.mean()) / v.std())
    -
    -        result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas()
    -        pdf = df.toPandas()
    -        expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func)
    -        expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
    -        expected = expected.assign(norm=expected.norm.astype('float64'))
    -        self.assertFramesEqual(expected, result)
    +            return pd.DataFrame({'v': v + 1, 'v2': v - v.mean()})[:]
    --- End diff --

    Why should we copy here by the way?
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org