This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new fb09e31fffc5 [SPARK-46513][PS][TESTS] Move `BasicIndexingTests` to `pyspark.pandas.tests.indexes.*` fb09e31fffc5 is described below commit fb09e31fffc5dc26ee2f33233a806eac6514654a Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Wed Dec 27 08:56:38 2023 +0900 [SPARK-46513][PS][TESTS] Move `BasicIndexingTests` to `pyspark.pandas.tests.indexes.*` ### What changes were proposed in this pull request? Move `BasicIndexingTests` to `pyspark.pandas.tests.indexes.*` ### Why are the changes needed? test code clean up ### Does this PR introduce _any_ user-facing change? no, test-only ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44499 from zhengruifeng/ps_test_index_basic. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- dev/sparktestsupport/modules.py | 3 +- .../test_parity_indexing_basic.py} | 21 +-- .../pandas/tests/indexes/test_indexing_basic.py | 171 +++++++++++++++++++++ python/pyspark/pandas/tests/test_indexing.py | 130 ---------------- 4 files changed, 179 insertions(+), 146 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 66ae11886cd4..042e72863efd 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -816,6 +816,7 @@ pyspark_pandas_slow = Module( "pyspark.pandas.tests.indexes.test_datetime_round", "pyspark.pandas.tests.indexes.test_align", "pyspark.pandas.tests.indexes.test_indexing", + "pyspark.pandas.tests.indexes.test_indexing_basic", "pyspark.pandas.tests.indexes.test_reindex", "pyspark.pandas.tests.indexes.test_rename", "pyspark.pandas.tests.indexes.test_reset_index", @@ -1088,6 +1089,7 @@ pyspark_pandas_connect_part0 = Module( "pyspark.pandas.tests.connect.indexes.test_parity_map", "pyspark.pandas.tests.connect.indexes.test_parity_align", 
"pyspark.pandas.tests.connect.indexes.test_parity_indexing", + "pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic", "pyspark.pandas.tests.connect.indexes.test_parity_reindex", "pyspark.pandas.tests.connect.indexes.test_parity_rename", "pyspark.pandas.tests.connect.indexes.test_parity_reset_index", @@ -1136,7 +1138,6 @@ pyspark_pandas_connect_part1 = Module( "pyspark.pandas.tests.connect.groupby.test_parity_cumulative", "pyspark.pandas.tests.connect.groupby.test_parity_missing_data", "pyspark.pandas.tests.connect.groupby.test_parity_split_apply", - "pyspark.pandas.tests.connect.test_parity_indexing", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_align", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_basic_slow", "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_cov", diff --git a/python/pyspark/pandas/tests/connect/test_parity_indexing.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_basic.py similarity index 70% rename from python/pyspark/pandas/tests/connect/test_parity_indexing.py rename to python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_basic.py index 950bd2d0b2d0..626f17c72113 100644 --- a/python/pyspark/pandas/tests/connect/test_parity_indexing.py +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_basic.py @@ -16,30 +16,21 @@ # import unittest -import pandas as pd - -from pyspark import pandas as ps -from pyspark.pandas.tests.test_indexing import BasicIndexingTestsMixin +from pyspark.pandas.tests.indexes.test_indexing_basic import BasicIndexingTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase from pyspark.testing.pandasutils import PandasOnSparkTestUtils class BasicIndexingParityTests( - BasicIndexingTestsMixin, PandasOnSparkTestUtils, ReusedConnectTestCase + BasicIndexingTestsMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, ): - @property - def pdf(self): - return pd.DataFrame( - {"month": [1, 4, 7, 10], "year": [2012, 2014, 
2013, 2014], "sale": [55, 40, 84, 31]} - ) - - @property - def psdf(self): - return ps.from_pandas(self.pdf) + pass if __name__ == "__main__": - from pyspark.pandas.tests.connect.test_parity_indexing import * # noqa: F401 + from pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic import * # noqa: F401 try: import xmlrunner # type: ignore[import] diff --git a/python/pyspark/pandas/tests/indexes/test_indexing_basic.py b/python/pyspark/pandas/tests/indexes/test_indexing_basic.py new file mode 100644 index 000000000000..365ac0b86d47 --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_indexing_basic.py @@ -0,0 +1,171 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase, compare_both + + +class BasicIndexingTestsMixin: + @property + def pdf(self): + return pd.DataFrame( + {"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]} + ) + + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + @compare_both(almost=False) + def test_indexing(self, df): + df1 = df.set_index("month") + yield df1 + + yield df.set_index("month", drop=False) + yield df.set_index("month", append=True) + yield df.set_index(["year", "month"]) + yield df.set_index(["year", "month"], drop=False) + yield df.set_index(["year", "month"], append=True) + + yield df1.set_index("year", drop=False, append=True) + + df2 = df1.copy() + df2.set_index("year", append=True, inplace=True) + yield df2 + + self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index("unknown")) + self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index(["month", "unknown"])) + + for d in [df, df1, df2]: + yield d.reset_index() + yield d.reset_index(drop=True) + + yield df1.reset_index(level=0) + yield df2.reset_index(level=1) + yield df2.reset_index(level=[1, 0]) + yield df1.reset_index(level="month") + yield df2.reset_index(level="year") + yield df2.reset_index(level=["month", "year"]) + yield df2.reset_index(level="month", drop=True) + yield df2.reset_index(level=["month", "year"], drop=True) + + self.assertRaisesRegex( + IndexError, + "Too many levels: Index has only 1 level, not 3", + lambda: df1.reset_index(level=2), + ) + self.assertRaisesRegex( + IndexError, + "Too many levels: Index has only 1 level, not 4", + lambda: df1.reset_index(level=[3, 2]), + ) + self.assertRaisesRegex(KeyError, "unknown.*month", lambda: df1.reset_index(level="unknown")) + self.assertRaisesRegex( + KeyError, "Level unknown not found", lambda: df2.reset_index(level="unknown") + ) + + df3 = df2.copy() + 
df3.reset_index(inplace=True) + yield df3 + + yield df1.sale.reset_index() + yield df1.sale.reset_index(level=0) + yield df2.sale.reset_index(level=[1, 0]) + yield df1.sale.reset_index(drop=True) + yield df1.sale.reset_index(name="s") + yield df1.sale.reset_index(name="s", drop=True) + + s = df1.sale + self.assertRaisesRegex( + TypeError, + "Cannot reset_index inplace on a Series to create a DataFrame", + lambda: s.reset_index(inplace=True), + ) + s.reset_index(drop=True, inplace=True) + yield s + yield df1 + + # multi-index columns + df4 = df.copy() + df4.columns = pd.MultiIndex.from_tuples( + [("cal", "month"), ("cal", "year"), ("num", "sale")] + ) + df5 = df4.set_index(("cal", "month")) + yield df5 + yield df4.set_index([("cal", "month"), ("num", "sale")]) + + self.assertRaises(KeyError, lambda: df5.reset_index(level=("cal", "month"))) + + yield df5.reset_index(level=[("cal", "month")]) + + # non-string names + df6 = df.copy() + df6.columns = [10.0, 20.0, 30.0] + df7 = df6.set_index(10.0) + yield df7 + yield df6.set_index([10.0, 30.0]) + + yield df7.reset_index(level=10.0) + yield df7.reset_index(level=[10.0]) + + df8 = df.copy() + df8.columns = pd.MultiIndex.from_tuples([(10, "month"), (10, "year"), (20, "sale")]) + df9 = df8.set_index((10, "month")) + yield df9 + yield df8.set_index([(10, "month"), (20, "sale")]) + + yield df9.reset_index(level=[(10, "month")]) + + def test_from_pandas_with_explicit_index(self): + pdf = self.pdf + + df1 = ps.from_pandas(pdf.set_index("month")) + self.assertPandasEqual(df1._to_pandas(), pdf.set_index("month")) + + df2 = ps.from_pandas(pdf.set_index(["year", "month"])) + self.assertPandasEqual(df2._to_pandas(), pdf.set_index(["year", "month"])) + + def test_limitations(self): + df = self.psdf.set_index("month") + + self.assertRaisesRegex( + ValueError, + "Level should be all int or all string.", + lambda: df.reset_index([1, "month"]), + ) + + +class BasicIndexingTests( + BasicIndexingTestsMixin, + PandasOnSparkTestCase, +): + 
pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_indexing_basic import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index eb86c9ffabc4..0240f2c75996 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -26,132 +26,6 @@ from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotIm from pyspark.testing.pandasutils import ComparisonTestBase, compare_both -class BasicIndexingTestsMixin: - @property - def pdf(self): - return pd.DataFrame( - {"month": [1, 4, 7, 10], "year": [2012, 2014, 2013, 2014], "sale": [55, 40, 84, 31]} - ) - - @compare_both(almost=False) - def test_indexing(self, df): - df1 = df.set_index("month") - yield df1 - - yield df.set_index("month", drop=False) - yield df.set_index("month", append=True) - yield df.set_index(["year", "month"]) - yield df.set_index(["year", "month"], drop=False) - yield df.set_index(["year", "month"], append=True) - - yield df1.set_index("year", drop=False, append=True) - - df2 = df1.copy() - df2.set_index("year", append=True, inplace=True) - yield df2 - - self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index("unknown")) - self.assertRaisesRegex(KeyError, "unknown", lambda: df.set_index(["month", "unknown"])) - - for d in [df, df1, df2]: - yield d.reset_index() - yield d.reset_index(drop=True) - - yield df1.reset_index(level=0) - yield df2.reset_index(level=1) - yield df2.reset_index(level=[1, 0]) - yield df1.reset_index(level="month") - yield df2.reset_index(level="year") - yield df2.reset_index(level=["month", "year"]) - yield df2.reset_index(level="month", drop=True) - yield df2.reset_index(level=["month", "year"], drop=True) - - 
self.assertRaisesRegex( - IndexError, - "Too many levels: Index has only 1 level, not 3", - lambda: df1.reset_index(level=2), - ) - self.assertRaisesRegex( - IndexError, - "Too many levels: Index has only 1 level, not 4", - lambda: df1.reset_index(level=[3, 2]), - ) - self.assertRaisesRegex(KeyError, "unknown.*month", lambda: df1.reset_index(level="unknown")) - self.assertRaisesRegex( - KeyError, "Level unknown not found", lambda: df2.reset_index(level="unknown") - ) - - df3 = df2.copy() - df3.reset_index(inplace=True) - yield df3 - - yield df1.sale.reset_index() - yield df1.sale.reset_index(level=0) - yield df2.sale.reset_index(level=[1, 0]) - yield df1.sale.reset_index(drop=True) - yield df1.sale.reset_index(name="s") - yield df1.sale.reset_index(name="s", drop=True) - - s = df1.sale - self.assertRaisesRegex( - TypeError, - "Cannot reset_index inplace on a Series to create a DataFrame", - lambda: s.reset_index(inplace=True), - ) - s.reset_index(drop=True, inplace=True) - yield s - yield df1 - - # multi-index columns - df4 = df.copy() - df4.columns = pd.MultiIndex.from_tuples( - [("cal", "month"), ("cal", "year"), ("num", "sale")] - ) - df5 = df4.set_index(("cal", "month")) - yield df5 - yield df4.set_index([("cal", "month"), ("num", "sale")]) - - self.assertRaises(KeyError, lambda: df5.reset_index(level=("cal", "month"))) - - yield df5.reset_index(level=[("cal", "month")]) - - # non-string names - df6 = df.copy() - df6.columns = [10.0, 20.0, 30.0] - df7 = df6.set_index(10.0) - yield df7 - yield df6.set_index([10.0, 30.0]) - - yield df7.reset_index(level=10.0) - yield df7.reset_index(level=[10.0]) - - df8 = df.copy() - df8.columns = pd.MultiIndex.from_tuples([(10, "month"), (10, "year"), (20, "sale")]) - df9 = df8.set_index((10, "month")) - yield df9 - yield df8.set_index([(10, "month"), (20, "sale")]) - - yield df9.reset_index(level=[(10, "month")]) - - def test_from_pandas_with_explicit_index(self): - pdf = self.pdf - - df1 = 
ps.from_pandas(pdf.set_index("month")) - self.assertPandasEqual(df1._to_pandas(), pdf.set_index("month")) - - df2 = ps.from_pandas(pdf.set_index(["year", "month"])) - self.assertPandasEqual(df2._to_pandas(), pdf.set_index(["year", "month"])) - - def test_limitations(self): - df = self.psdf.set_index("month") - - self.assertRaisesRegex( - ValueError, - "Level should be all int or all string.", - lambda: df.reset_index([1, "month"]), - ) - - class IndexingTest(ComparisonTestBase): @property def pdf(self): @@ -1320,10 +1194,6 @@ class IndexingTest(ComparisonTestBase): psdf.iloc[[1, 1]] -class BasicIndexingTests(BasicIndexingTestsMixin, ComparisonTestBase): - pass - - if __name__ == "__main__": from pyspark.pandas.tests.test_indexing import * # noqa: F401 --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org