This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 906df15  [SPARK-34703][PYSPARK][2.4] Fix pyspark test when using sort_values on Pandas
906df15 is described below

commit 906df15f81c1e1c41a097f4230695da3a919227a
Author: Liang-Chi Hsieh <vii...@gmail.com>
AuthorDate: Wed Mar 10 18:42:11 2021 -0800

[SPARK-34703][PYSPARK][2.4] Fix pyspark test when using sort_values on Pandas

### What changes were proposed in this pull request?

This patch fixes a few PySpark test errors related to Pandas in order to restore the 2.4 Jenkins builds.

### Why are the changes needed?

Pandas changed some API behavior starting with 0.24: if an index level and a column label share the same name, `sort_values` throws an error. Three PySpark tests currently fail in the Jenkins 2.4 build: `test_column_order`, `test_complex_groupby`, and `test_udf_with_key`. (A standalone reproduction sketch follows the template questions below.)

```
======================================================================
ERROR: test_column_order (pyspark.sql.tests.GroupedMapPandasUDFTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/spark/python/pyspark/sql/tests.py", line 5996, in test_column_order
    expected = pd_result.sort_values(['id', 'v']).reset_index(drop=True)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 4711, in sort_values
    for x in by]
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1702, in _get_label_or_level_values
    self._check_label_or_level_ambiguity(key, axis=axis)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1656, in _check_label_or_level_ambiguity
    raise ValueError(msg)
ValueError: 'id' is both an index level and a column label, which is ambiguous.

======================================================================
ERROR: test_complex_groupby (pyspark.sql.tests.GroupedMapPandasUDFTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/spark/python/pyspark/sql/tests.py", line 5765, in test_complex_groupby
    expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 4711, in sort_values
    for x in by]
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1702, in _get_label_or_level_values
    self._check_label_or_level_ambiguity(key, axis=axis)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1656, in _check_label_or_level_ambiguity
    raise ValueError(msg)
ValueError: 'id' is both an index level and a column label, which is ambiguous.

======================================================================
ERROR: test_udf_with_key (pyspark.sql.tests.GroupedMapPandasUDFTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/spark/python/pyspark/sql/tests.py", line 5922, in test_udf_with_key
    .sort_values(['id', 'v']).reset_index(drop=True)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 4711, in sort_values
    for x in by]
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1702, in _get_label_or_level_values
    self._check_label_or_level_ambiguity(key, axis=axis)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 1656, in _check_label_or_level_ambiguity
    raise ValueError(msg)
ValueError: 'id' is both an index level and a column label, which is ambiguous.
```

### Does this PR introduce _any_ user-facing change?

No, dev only.
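To make the Pandas behavior described above concrete, here is a minimal reproduction sketch. It is not part of the original commit: the toy `pdf` data is made up, and `set_index('id', drop=False)` is used only to build a frame of the same shape the failing tests end up with (an `id` index level alongside an `id` column), assuming Pandas >= 0.24.

```python
import pandas as pd

# Toy data standing in for the test DataFrames (illustrative only).
pdf = pd.DataFrame({'id': [1, 1, 2, 2], 'v': [1.0, 2.0, 3.0, 4.0]})

# Build a frame where 'id' is both an index level and a column label,
# which is the shape the failing tests' groupby(...).apply(...) produced.
ambiguous = pdf.set_index('id', drop=False)

try:
    ambiguous.sort_values(['id', 'v'])
except ValueError as err:
    # On Pandas >= 0.24 this prints:
    # 'id' is both an index level and a column label, which is ambiguous.
    print(err)
```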
### How was this patch tested?

Verified by running the tests locally.

Closes #31803 from viirya/SPARK-34703.

Authored-by: Liang-Chi Hsieh <vii...@gmail.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 python/pyspark/sql/tests.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 70f3882..e3b8e19 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -5761,7 +5761,7 @@ class GroupedMapPandasUDFTests(ReusedSQLTestCase):

         result = df.groupby(col('id') % 2 == 0).apply(normalize).sort('id', 'v').toPandas()
         pdf = df.toPandas()
-        expected = pdf.groupby(pdf['id'] % 2 == 0).apply(normalize.func)
+        expected = pdf.groupby(pdf['id'] % 2 == 0, as_index=False).apply(normalize.func)
         expected = expected.sort_values(['id', 'v']).reset_index(drop=True)
         expected = expected.assign(norm=expected.norm.astype('float64'))
         self.assertPandasEqual(expected, result)
@@ -5917,21 +5917,21 @@ class GroupedMapPandasUDFTests(ReusedSQLTestCase):

         # Test groupby column
         result1 = df.groupby('id').apply(udf1).sort('id', 'v').toPandas()
-        expected1 = pdf.groupby('id')\
+        expected1 = pdf.groupby('id', as_index=False)\
             .apply(lambda x: udf1.func((x.id.iloc[0],), x))\
             .sort_values(['id', 'v']).reset_index(drop=True)
         self.assertPandasEqual(expected1, result1)

         # Test groupby expression
         result2 = df.groupby(df.id % 2).apply(udf1).sort('id', 'v').toPandas()
-        expected2 = pdf.groupby(pdf.id % 2)\
+        expected2 = pdf.groupby(pdf.id % 2, as_index=False)\
             .apply(lambda x: udf1.func((x.id.iloc[0] % 2,), x))\
             .sort_values(['id', 'v']).reset_index(drop=True)
         self.assertPandasEqual(expected2, result2)

         # Test complex groupby
         result3 = df.groupby(df.id, df.v % 2).apply(udf2).sort('id', 'v').toPandas()
-        expected3 = pdf.groupby([pdf.id, pdf.v % 2])\
+        expected3 = pdf.groupby([pdf.id, pdf.v % 2], as_index=False)\
             .apply(lambda x: udf2.func((x.id.iloc[0], (x.v % 2).iloc[0],), x))\
             .sort_values(['id', 'v']).reset_index(drop=True)
         self.assertPandasEqual(expected3, result3)
@@ -5953,7 +5953,7 @@ class GroupedMapPandasUDFTests(ReusedSQLTestCase):

         df = self.data
         grouped_df = df.groupby('id')
-        grouped_pdf = df.toPandas().groupby('id')
+        grouped_pdf = df.toPandas().groupby('id', as_index=False)

         # Function returns a pdf with required column names, but order could be arbitrary using dict
         def change_col_order(pdf):

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
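For illustration only, here is a sketch of why adding `as_index=False`, as the diff above does, sidesteps the error. The sample data and the `demean` helper are made up, and the behavior is described for the Pandas 0.24/0.25-era installs the 2.4 Jenkins workers used; newer Pandas may emit deprecation warnings or behave differently.

```python
import pandas as pd

# Toy stand-in for the pdf used in the tests (illustrative only).
pdf = pd.DataFrame({'id': [1, 1, 2, 2], 'v': [1.0, 2.0, 3.0, 4.0]})

def demean(g):
    # Return a frame whose index differs from the group's, so groupby()
    # does not treat the result as a simple transform of the input.
    return g.assign(v=g.v - g.v.mean()).reset_index(drop=True)

# With the default as_index=True, the group key 'id' is prepended to the
# result's index; since the 'id' column is still present, the subsequent
# sort_values(['id', 'v']) raised the ambiguity ValueError on Jenkins.
# With as_index=False the key stays out of the index, so sorting works:
expected = pdf.groupby('id', as_index=False).apply(demean)
print(expected.sort_values(['id', 'v']).reset_index(drop=True))
```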