This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
 new ff881dad4c50 [SPARK-46517][PS][TESTS][FOLLOWUPS] Reorganize `IndexingTest`: factor out `test_loc2d*`
ff881dad4c50 is described below

commit ff881dad4c50c3ba7dfeb401d70277f5cfd15687
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Thu Dec 28 12:00:59 2023 +0800

 [SPARK-46517][PS][TESTS][FOLLOWUPS] Reorganize `IndexingTest`: factor out `test_loc2d*`

 ### What changes were proposed in this pull request?
 1, factor out `test_loc2d*`;
 2, add the missing parity tests;

 ### Why are the changes needed?
 test parity and testing parallelism

 ### Does this PR introduce _any_ user-facing change?
 no, test-only

 ### How was this patch tested?
 ci

 ### Was this patch authored or co-authored using generative AI tooling?
 no

 Closes #44518 from zhengruifeng/ps_test_indexing_loc2d.

 Authored-by: Ruifeng Zheng <ruife...@apache.org>
 Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 dev/sparktestsupport/modules.py | 2 +
 .../connect/indexes/test_parity_indexing_loc_2d.py | 41 ++++
 .../pandas/tests/indexes/test_indexing_loc_2d.py | 247 +++++++++++++++++++++
 python/pyspark/pandas/tests/test_indexing.py | 178 ---------------
 4 files changed, 290 insertions(+), 178 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 4e24eb7b71f1..e169a42075b7 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -819,6 +819,7 @@ pyspark_pandas_slow = Module(
 "pyspark.pandas.tests.indexes.test_indexing_basic",
 "pyspark.pandas.tests.indexes.test_indexing_iloc",
 "pyspark.pandas.tests.indexes.test_indexing_loc",
+ "pyspark.pandas.tests.indexes.test_indexing_loc_2d",
 "pyspark.pandas.tests.indexes.test_indexing_loc_multi_idx",
 "pyspark.pandas.tests.indexes.test_reindex",
 "pyspark.pandas.tests.indexes.test_rename",
@@ -1095,6 +1096,7 @@ pyspark_pandas_connect_part0 = Module(
 "pyspark.pandas.tests.connect.indexes.test_parity_indexing_basic",
"pyspark.pandas.tests.connect.indexes.test_parity_indexing_iloc", "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc", + "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_2d", "pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_multi_idx", "pyspark.pandas.tests.connect.indexes.test_parity_reindex", "pyspark.pandas.tests.connect.indexes.test_parity_rename", diff --git a/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_loc_2d.py b/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_loc_2d.py new file mode 100644 index 000000000000..18e0f9088223 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/indexes/test_parity_indexing_loc_2d.py @@ -0,0 +1,41 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +from pyspark.pandas.tests.indexes.test_indexing_loc_2d import IndexingLoc2DMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils + + +class IndexingLoc2DParityTests( + IndexingLoc2DMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.indexes.test_parity_indexing_loc_2d import * # noqa + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/indexes/test_indexing_loc_2d.py b/python/pyspark/pandas/tests/indexes/test_indexing_loc_2d.py new file mode 100644 index 000000000000..88f41d1aade3 --- /dev/null +++ b/python/pyspark/pandas/tests/indexes/test_indexing_loc_2d.py @@ -0,0 +1,247 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +import numpy as np +import pandas as pd + +from pyspark import pandas as ps +from pyspark.pandas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError +from pyspark.testing.pandasutils import PandasOnSparkTestCase +from pyspark.testing.sqlutils import SQLTestUtils + + +class IndexingLoc2DMixin: + @property + def pdf(self): + return pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) + + @property + def pdf2(self): + return pd.DataFrame( + {0: [1, 2, 3, 4, 5, 6, 7, 8, 9], 1: [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=[0, 1, 3, 5, 6, 8, 9, 9, 9], + ) + + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + @property + def psdf2(self): + return ps.from_pandas(self.pdf2) + + def test_loc2d_multiindex(self): + psdf = self.psdf + psdf = psdf.set_index("b", append=True) + pdf = self.pdf + pdf = pdf.set_index("b", append=True) + + self.assert_eq(psdf.loc[:, :], pdf.loc[:, :]) + self.assert_eq(psdf.loc[:, "a"], pdf.loc[:, "a"]) + self.assert_eq(psdf.loc[5:5, "a"], pdf.loc[5:5, "a"]) + + self.assert_eq(psdf.loc[:, "a":"a"], pdf.loc[:, "a":"a"]) + self.assert_eq(psdf.loc[:, "a":"c"], pdf.loc[:, "a":"c"]) + self.assert_eq(psdf.loc[:, "b":"c"], pdf.loc[:, "b":"c"]) + + def test_loc2d(self): + psdf = self.psdf + pdf = self.pdf + + # index indexer is always regarded as slice for duplicated values + self.assert_eq(psdf.loc[5:5, "a"], pdf.loc[5:5, "a"]) + self.assert_eq(psdf.loc[[5], "a"], pdf.loc[[5], "a"]) + self.assert_eq(psdf.loc[5:5, ["a"]], pdf.loc[5:5, ["a"]]) + self.assert_eq(psdf.loc[[5], ["a"]], pdf.loc[[5], ["a"]]) + self.assert_eq(psdf.loc[:, :], pdf.loc[:, :]) + + self.assert_eq(psdf.loc[3:8, "a"], pdf.loc[3:8, "a"]) + self.assert_eq(psdf.loc[:8, "a"], pdf.loc[:8, "a"]) + self.assert_eq(psdf.loc[3:, "a"], pdf.loc[3:, "a"]) + self.assert_eq(psdf.loc[[8], "a"], pdf.loc[[8], "a"]) + + self.assert_eq(psdf.loc[3:8, ["a"]], pdf.loc[3:8, ["a"]]) + 
self.assert_eq(psdf.loc[:8, ["a"]], pdf.loc[:8, ["a"]]) + self.assert_eq(psdf.loc[3:, ["a"]], pdf.loc[3:, ["a"]]) + # TODO?: self.assert_eq(psdf.loc[[3, 4, 3], ['a']], pdf.loc[[3, 4, 3], ['a']]) + + self.assertRaises(SparkPandasIndexingError, lambda: psdf.loc[3, 3, 3]) + self.assertRaises(SparkPandasIndexingError, lambda: psdf.a.loc[3, 3]) + self.assertRaises(SparkPandasIndexingError, lambda: psdf.a.loc[3:, 3]) + self.assertRaises(SparkPandasIndexingError, lambda: psdf.a.loc[psdf.a % 2 == 0, 3]) + + self.assert_eq(psdf.loc[5, "a"], pdf.loc[5, "a"]) + self.assert_eq(psdf.loc[9, "a"], pdf.loc[9, "a"]) + self.assert_eq(psdf.loc[5, ["a"]], pdf.loc[5, ["a"]]) + self.assert_eq(psdf.loc[9, ["a"]], pdf.loc[9, ["a"]]) + + self.assert_eq(psdf.loc[:, "a":"a"], pdf.loc[:, "a":"a"]) + self.assert_eq(psdf.loc[:, "a":"d"], pdf.loc[:, "a":"d"]) + self.assert_eq(psdf.loc[:, "c":"d"], pdf.loc[:, "c":"d"]) + + # bool list-like column select + bool_list = [True, False] + self.assert_eq(psdf.loc[:, bool_list], pdf.loc[:, bool_list]) + self.assert_eq(psdf.loc[:, np.array(bool_list)], pdf.loc[:, np.array(bool_list)]) + + pser = pd.Series(bool_list, index=pdf.columns) + self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) + pser = pd.Series(list(reversed(bool_list)), index=list(reversed(pdf.columns))) + self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) + + self.assertRaises(IndexError, lambda: psdf.loc[:, bool_list[:-1]]) + self.assertRaises(IndexError, lambda: psdf.loc[:, np.array(bool_list + [True])]) + self.assertRaises(SparkPandasIndexingError, lambda: psdf.loc[:, pd.Series(bool_list)]) + + # non-string column names + psdf = self.psdf2 + pdf = self.pdf2 + + self.assert_eq(psdf.loc[5:5, 0], pdf.loc[5:5, 0]) + self.assert_eq(psdf.loc[5:5, [0]], pdf.loc[5:5, [0]]) + self.assert_eq(psdf.loc[3:8, 0], pdf.loc[3:8, 0]) + self.assert_eq(psdf.loc[3:8, [0]], pdf.loc[3:8, [0]]) + + self.assert_eq(psdf.loc[:, 0:0], pdf.loc[:, 0:0]) + self.assert_eq(psdf.loc[:, 0:3], pdf.loc[:, 0:3]) + 
self.assert_eq(psdf.loc[:, 2:3], pdf.loc[:, 2:3]) + + def test_loc2d_multiindex_columns(self): + arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] + + pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) + psdf = ps.from_pandas(pdf) + + self.assert_eq(psdf.loc["B":"B", "bar"], pdf.loc["B":"B", "bar"]) + self.assert_eq(psdf.loc["B":"B", ["bar"]], pdf.loc["B":"B", ["bar"]]) + + self.assert_eq(psdf.loc[:, "bar":"bar"], pdf.loc[:, "bar":"bar"]) + self.assert_eq(psdf.loc[:, "bar":("baz", "one")], pdf.loc[:, "bar":("baz", "one")]) + self.assert_eq( + psdf.loc[:, ("bar", "two"):("baz", "one")], pdf.loc[:, ("bar", "two"):("baz", "one")] + ) + self.assert_eq(psdf.loc[:, ("bar", "two"):"bar"], pdf.loc[:, ("bar", "two"):"bar"]) + self.assert_eq(psdf.loc[:, "a":"bax"], pdf.loc[:, "a":"bax"]) + self.assert_eq( + psdf.loc[:, ("bar", "x"):("baz", "a")], + pdf.loc[:, ("bar", "x"):("baz", "a")], + almost=True, + ) + + pdf = pd.DataFrame( + np.random.randn(3, 4), + index=["A", "B", "C"], + columns=pd.MultiIndex.from_tuples( + [("bar", "two"), ("bar", "one"), ("baz", "one"), ("baz", "two")] + ), + ) + psdf = ps.from_pandas(pdf) + + self.assert_eq(psdf.loc[:, "bar":"baz"], pdf.loc[:, "bar":"baz"]) + + self.assertRaises(KeyError, lambda: psdf.loc[:, "bar":("baz", "one")]) + self.assertRaises(KeyError, lambda: psdf.loc[:, ("bar", "two"):"bar"]) + + # bool list-like column select + bool_list = [True, False, True, False] + self.assert_eq(psdf.loc[:, bool_list], pdf.loc[:, bool_list]) + self.assert_eq(psdf.loc[:, np.array(bool_list)], pdf.loc[:, np.array(bool_list)]) + + pser = pd.Series(bool_list, index=pdf.columns) + self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) + + pser = pd.Series(list(reversed(bool_list)), index=list(reversed(pdf.columns))) + self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) + + # non-string column names + arrays = [np.array([0, 0, 1, 1]), np.array([1, 2, 1, 2])] + + pdf = 
pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) + psdf = ps.from_pandas(pdf) + + self.assert_eq(psdf.loc["B":"B", 0], pdf.loc["B":"B", 0]) + self.assert_eq(psdf.loc["B":"B", [0]], pdf.loc["B":"B", [0]]) + self.assert_eq(psdf.loc[:, 0:0], pdf.loc[:, 0:0]) + self.assert_eq(psdf.loc[:, 0:(1, 1)], pdf.loc[:, 0:(1, 1)]) + self.assert_eq(psdf.loc[:, (0, 2):(1, 1)], pdf.loc[:, (0, 2):(1, 1)]) + self.assert_eq(psdf.loc[:, (0, 2):0], pdf.loc[:, (0, 2):0]) + self.assert_eq(psdf.loc[:, -1:2], pdf.loc[:, -1:2]) + + def test_loc2d_with_known_divisions(self): + pdf = pd.DataFrame( + np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("ABCDE") + ) + psdf = ps.from_pandas(pdf) + + self.assert_eq(psdf.loc[["a"], "A"], pdf.loc[["a"], "A"]) + self.assert_eq(psdf.loc[["a"], ["A"]], pdf.loc[["a"], ["A"]]) + self.assert_eq(psdf.loc["a":"o", "A"], pdf.loc["a":"o", "A"]) + self.assert_eq(psdf.loc["a":"o", ["A"]], pdf.loc["a":"o", ["A"]]) + self.assert_eq(psdf.loc[["n"], ["A"]], pdf.loc[["n"], ["A"]]) + self.assert_eq(psdf.loc[["a", "c", "n"], ["A"]], pdf.loc[["a", "c", "n"], ["A"]]) + # TODO?: self.assert_eq(psdf.loc[['t', 'b'], ['A']], pdf.loc[['t', 'b'], ['A']]) + # TODO?: self.assert_eq(psdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']], + # TODO?: pdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']]) + + @unittest.skip("TODO: should handle duplicated columns properly") + def test_loc2d_duplicated_columns(self): + pdf = pd.DataFrame( + np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("AABCD") + ) + psdf = ps.from_pandas(pdf) + + # TODO?: self.assert_eq(psdf.loc[['a'], 'A'], pdf.loc[['a'], 'A']) + # TODO?: self.assert_eq(psdf.loc[['a'], ['A']], pdf.loc[['a'], ['A']]) + self.assert_eq(psdf.loc[["j"], "B"], pdf.loc[["j"], "B"]) + self.assert_eq(psdf.loc[["j"], ["B"]], pdf.loc[["j"], ["B"]]) + + # TODO?: self.assert_eq(psdf.loc['a':'o', 'A'], pdf.loc['a':'o', 'A']) + # TODO?: self.assert_eq(psdf.loc['a':'o', ['A']], pdf.loc['a':'o', 
['A']]) + self.assert_eq(psdf.loc["j":"q", "B"], pdf.loc["j":"q", "B"]) + self.assert_eq(psdf.loc["j":"q", ["B"]], pdf.loc["j":"q", ["B"]]) + + # TODO?: self.assert_eq(psdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D']) + # TODO?: self.assert_eq(psdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D']) + # TODO?: self.assert_eq(psdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A']) + # TODO?: self.assert_eq(psdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A']) + + self.assert_eq(psdf.loc[psdf.B > 0, "B"], pdf.loc[pdf.B > 0, "B"]) + # TODO?: self.assert_eq(psdf.loc[psdf.B > 0, ['A', 'C']], pdf.loc[pdf.B > 0, ['A', 'C']]) + + +class IndexingLoc2DTests( + IndexingLoc2DMixin, + PandasOnSparkTestCase, + SQLTestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.indexes.test_indexing_loc_2d import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index ef496c3b5565..47533793f275 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -196,184 +196,6 @@ class IndexingTest(ComparisonTestBase): with self.assertRaises(KeyError): psdf.iat[99, 0] - def test_loc2d_multiindex(self): - psdf = self.psdf - psdf = psdf.set_index("b", append=True) - pdf = self.pdf - pdf = pdf.set_index("b", append=True) - - self.assert_eq(psdf.loc[:, :], pdf.loc[:, :]) - self.assert_eq(psdf.loc[:, "a"], pdf.loc[:, "a"]) - self.assert_eq(psdf.loc[5:5, "a"], pdf.loc[5:5, "a"]) - - self.assert_eq(psdf.loc[:, "a":"a"], pdf.loc[:, "a":"a"]) - self.assert_eq(psdf.loc[:, "a":"c"], pdf.loc[:, "a":"c"]) - self.assert_eq(psdf.loc[:, "b":"c"], pdf.loc[:, "b":"c"]) - - def test_loc2d(self): - psdf = self.psdf - pdf = self.pdf - - # index indexer is always 
regarded as slice for duplicated values - self.assert_eq(psdf.loc[5:5, "a"], pdf.loc[5:5, "a"]) - self.assert_eq(psdf.loc[[5], "a"], pdf.loc[[5], "a"]) - self.assert_eq(psdf.loc[5:5, ["a"]], pdf.loc[5:5, ["a"]]) - self.assert_eq(psdf.loc[[5], ["a"]], pdf.loc[[5], ["a"]]) - self.assert_eq(psdf.loc[:, :], pdf.loc[:, :]) - - self.assert_eq(psdf.loc[3:8, "a"], pdf.loc[3:8, "a"]) - self.assert_eq(psdf.loc[:8, "a"], pdf.loc[:8, "a"]) - self.assert_eq(psdf.loc[3:, "a"], pdf.loc[3:, "a"]) - self.assert_eq(psdf.loc[[8], "a"], pdf.loc[[8], "a"]) - - self.assert_eq(psdf.loc[3:8, ["a"]], pdf.loc[3:8, ["a"]]) - self.assert_eq(psdf.loc[:8, ["a"]], pdf.loc[:8, ["a"]]) - self.assert_eq(psdf.loc[3:, ["a"]], pdf.loc[3:, ["a"]]) - # TODO?: self.assert_eq(psdf.loc[[3, 4, 3], ['a']], pdf.loc[[3, 4, 3], ['a']]) - - self.assertRaises(SparkPandasIndexingError, lambda: psdf.loc[3, 3, 3]) - self.assertRaises(SparkPandasIndexingError, lambda: psdf.a.loc[3, 3]) - self.assertRaises(SparkPandasIndexingError, lambda: psdf.a.loc[3:, 3]) - self.assertRaises(SparkPandasIndexingError, lambda: psdf.a.loc[psdf.a % 2 == 0, 3]) - - self.assert_eq(psdf.loc[5, "a"], pdf.loc[5, "a"]) - self.assert_eq(psdf.loc[9, "a"], pdf.loc[9, "a"]) - self.assert_eq(psdf.loc[5, ["a"]], pdf.loc[5, ["a"]]) - self.assert_eq(psdf.loc[9, ["a"]], pdf.loc[9, ["a"]]) - - self.assert_eq(psdf.loc[:, "a":"a"], pdf.loc[:, "a":"a"]) - self.assert_eq(psdf.loc[:, "a":"d"], pdf.loc[:, "a":"d"]) - self.assert_eq(psdf.loc[:, "c":"d"], pdf.loc[:, "c":"d"]) - - # bool list-like column select - bool_list = [True, False] - self.assert_eq(psdf.loc[:, bool_list], pdf.loc[:, bool_list]) - self.assert_eq(psdf.loc[:, np.array(bool_list)], pdf.loc[:, np.array(bool_list)]) - - pser = pd.Series(bool_list, index=pdf.columns) - self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) - pser = pd.Series(list(reversed(bool_list)), index=list(reversed(pdf.columns))) - self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) - - self.assertRaises(IndexError, lambda: 
psdf.loc[:, bool_list[:-1]]) - self.assertRaises(IndexError, lambda: psdf.loc[:, np.array(bool_list + [True])]) - self.assertRaises(SparkPandasIndexingError, lambda: psdf.loc[:, pd.Series(bool_list)]) - - # non-string column names - psdf = self.psdf2 - pdf = self.pdf2 - - self.assert_eq(psdf.loc[5:5, 0], pdf.loc[5:5, 0]) - self.assert_eq(psdf.loc[5:5, [0]], pdf.loc[5:5, [0]]) - self.assert_eq(psdf.loc[3:8, 0], pdf.loc[3:8, 0]) - self.assert_eq(psdf.loc[3:8, [0]], pdf.loc[3:8, [0]]) - - self.assert_eq(psdf.loc[:, 0:0], pdf.loc[:, 0:0]) - self.assert_eq(psdf.loc[:, 0:3], pdf.loc[:, 0:3]) - self.assert_eq(psdf.loc[:, 2:3], pdf.loc[:, 2:3]) - - def test_loc2d_multiindex_columns(self): - arrays = [np.array(["bar", "bar", "baz", "baz"]), np.array(["one", "two", "one", "two"])] - - pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) - psdf = ps.from_pandas(pdf) - - self.assert_eq(psdf.loc["B":"B", "bar"], pdf.loc["B":"B", "bar"]) - self.assert_eq(psdf.loc["B":"B", ["bar"]], pdf.loc["B":"B", ["bar"]]) - - self.assert_eq(psdf.loc[:, "bar":"bar"], pdf.loc[:, "bar":"bar"]) - self.assert_eq(psdf.loc[:, "bar":("baz", "one")], pdf.loc[:, "bar":("baz", "one")]) - self.assert_eq( - psdf.loc[:, ("bar", "two"):("baz", "one")], pdf.loc[:, ("bar", "two"):("baz", "one")] - ) - self.assert_eq(psdf.loc[:, ("bar", "two"):"bar"], pdf.loc[:, ("bar", "two"):"bar"]) - self.assert_eq(psdf.loc[:, "a":"bax"], pdf.loc[:, "a":"bax"]) - self.assert_eq( - psdf.loc[:, ("bar", "x"):("baz", "a")], - pdf.loc[:, ("bar", "x"):("baz", "a")], - almost=True, - ) - - pdf = pd.DataFrame( - np.random.randn(3, 4), - index=["A", "B", "C"], - columns=pd.MultiIndex.from_tuples( - [("bar", "two"), ("bar", "one"), ("baz", "one"), ("baz", "two")] - ), - ) - psdf = ps.from_pandas(pdf) - - self.assert_eq(psdf.loc[:, "bar":"baz"], pdf.loc[:, "bar":"baz"]) - - self.assertRaises(KeyError, lambda: psdf.loc[:, "bar":("baz", "one")]) - self.assertRaises(KeyError, lambda: psdf.loc[:, ("bar", 
"two"):"bar"]) - - # bool list-like column select - bool_list = [True, False, True, False] - self.assert_eq(psdf.loc[:, bool_list], pdf.loc[:, bool_list]) - self.assert_eq(psdf.loc[:, np.array(bool_list)], pdf.loc[:, np.array(bool_list)]) - - pser = pd.Series(bool_list, index=pdf.columns) - self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) - - pser = pd.Series(list(reversed(bool_list)), index=list(reversed(pdf.columns))) - self.assert_eq(psdf.loc[:, pser], pdf.loc[:, pser]) - - # non-string column names - arrays = [np.array([0, 0, 1, 1]), np.array([1, 2, 1, 2])] - - pdf = pd.DataFrame(np.random.randn(3, 4), index=["A", "B", "C"], columns=arrays) - psdf = ps.from_pandas(pdf) - - self.assert_eq(psdf.loc["B":"B", 0], pdf.loc["B":"B", 0]) - self.assert_eq(psdf.loc["B":"B", [0]], pdf.loc["B":"B", [0]]) - self.assert_eq(psdf.loc[:, 0:0], pdf.loc[:, 0:0]) - self.assert_eq(psdf.loc[:, 0:(1, 1)], pdf.loc[:, 0:(1, 1)]) - self.assert_eq(psdf.loc[:, (0, 2):(1, 1)], pdf.loc[:, (0, 2):(1, 1)]) - self.assert_eq(psdf.loc[:, (0, 2):0], pdf.loc[:, (0, 2):0]) - self.assert_eq(psdf.loc[:, -1:2], pdf.loc[:, -1:2]) - - def test_loc2d_with_known_divisions(self): - pdf = pd.DataFrame( - np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("ABCDE") - ) - psdf = ps.from_pandas(pdf) - - self.assert_eq(psdf.loc[["a"], "A"], pdf.loc[["a"], "A"]) - self.assert_eq(psdf.loc[["a"], ["A"]], pdf.loc[["a"], ["A"]]) - self.assert_eq(psdf.loc["a":"o", "A"], pdf.loc["a":"o", "A"]) - self.assert_eq(psdf.loc["a":"o", ["A"]], pdf.loc["a":"o", ["A"]]) - self.assert_eq(psdf.loc[["n"], ["A"]], pdf.loc[["n"], ["A"]]) - self.assert_eq(psdf.loc[["a", "c", "n"], ["A"]], pdf.loc[["a", "c", "n"], ["A"]]) - # TODO?: self.assert_eq(psdf.loc[['t', 'b'], ['A']], pdf.loc[['t', 'b'], ['A']]) - # TODO?: self.assert_eq(psdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']], - # TODO?: pdf.loc[['r', 'r', 'c', 'g', 'h'], ['A']]) - - @unittest.skip("TODO: should handle duplicated columns properly") - def 
test_loc2d_duplicated_columns(self): - pdf = pd.DataFrame( - np.random.randn(20, 5), index=list("abcdefghijklmnopqrst"), columns=list("AABCD") - ) - psdf = ps.from_pandas(pdf) - - # TODO?: self.assert_eq(psdf.loc[['a'], 'A'], pdf.loc[['a'], 'A']) - # TODO?: self.assert_eq(psdf.loc[['a'], ['A']], pdf.loc[['a'], ['A']]) - self.assert_eq(psdf.loc[["j"], "B"], pdf.loc[["j"], "B"]) - self.assert_eq(psdf.loc[["j"], ["B"]], pdf.loc[["j"], ["B"]]) - - # TODO?: self.assert_eq(psdf.loc['a':'o', 'A'], pdf.loc['a':'o', 'A']) - # TODO?: self.assert_eq(psdf.loc['a':'o', ['A']], pdf.loc['a':'o', ['A']]) - self.assert_eq(psdf.loc["j":"q", "B"], pdf.loc["j":"q", "B"]) - self.assert_eq(psdf.loc["j":"q", ["B"]], pdf.loc["j":"q", ["B"]]) - - # TODO?: self.assert_eq(psdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D']) - # TODO?: self.assert_eq(psdf.loc['a':'o', 'B':'D'], pdf.loc['a':'o', 'B':'D']) - # TODO?: self.assert_eq(psdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A']) - # TODO?: self.assert_eq(psdf.loc['j':'q', 'B':'A'], pdf.loc['j':'q', 'B':'A']) - - self.assert_eq(psdf.loc[psdf.B > 0, "B"], pdf.loc[pdf.B > 0, "B"]) - # TODO?: self.assert_eq(psdf.loc[psdf.B > 0, ['A', 'C']], pdf.loc[pdf.B > 0, ['A', 'C']]) - def test_getitem(self): pdf = pd.DataFrame( { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org