GitHub user icexelloss commented on a diff in the pull request: https://github.com/apache/spark/pull/20211#discussion_r160862182 --- Diff: python/pyspark/sql/tests.py --- @@ -3995,23 +3995,49 @@ def test_coerce(self): self.assertFramesEqual(expected, result) def test_complex_groupby(self): + import pandas as pd from pyspark.sql.functions import pandas_udf, col, PandasUDFType df = self.data + pdf = df.toPandas() @pandas_udf( - 'id long, v int, norm double', + 'v int, v2 double', PandasUDFType.GROUP_MAP ) - def normalize(pdf): + def foo(pdf): v = pdf.v - return pdf.assign(norm=(v - v.mean()) / v.std()) - - result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas() - pdf = df.toPandas() - expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func) - expected = expected.sort_values(['id', 'v']).reset_index(drop=True) - expected = expected.assign(norm=expected.norm.astype('float64')) - self.assertFramesEqual(expected, result) + return pd.DataFrame({'v': v + 1, 'v2': v - v.mean()})[:] --- End diff -- This is just to simplify the test: pandas has very complicated behavior when it comes to the index of the return value when using `groupby apply`. If interested, take a look at http://nbviewer.jupyter.org/gist/mbirdi/05f8a83d340476e5f03a
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org