This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ffe3fc9 [SPARK-37514][PYTHON] Remove workarounds due to older pandas ffe3fc9 is described below commit ffe3fc9d23967e41092cf67539aa7f0d77b9eb75 Author: Takuya UESHIN <ues...@databricks.com> AuthorDate: Thu Dec 2 10:51:05 2021 +0900 [SPARK-37514][PYTHON] Remove workarounds due to older pandas ### What changes were proposed in this pull request? Removes workarounds due to older pandas. ### Why are the changes needed? Now that we have upgraded the minimum version of pandas to `1.0.5`, we can remove the workarounds that allowed pandas API on Spark to run with older pandas. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Modified existing tests to remove workarounds for older pandas. Closes #34772 from ueshin/issues/SPARK-37514/older_pandas. Authored-by: Takuya UESHIN <ues...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/pandas/frame.py | 29 +- python/pyspark/pandas/generic.py | 42 +- python/pyspark/pandas/groupby.py | 26 +- python/pyspark/pandas/indexes/multi.py | 36 +- python/pyspark/pandas/namespace.py | 11 +- python/pyspark/pandas/plot/matplotlib.py | 47 +-- .../pandas/tests/data_type_ops/test_boolean_ops.py | 8 +- .../pandas/tests/data_type_ops/test_num_ops.py | 27 +- python/pyspark/pandas/tests/indexes/test_base.py | 252 +++++------- .../pyspark/pandas/tests/indexes/test_category.py | 2 +- .../tests/plot/test_frame_plot_matplotlib.py | 7 +- .../pandas/tests/plot/test_frame_plot_plotly.py | 5 - .../tests/plot/test_series_plot_matplotlib.py | 7 +- .../pandas/tests/plot/test_series_plot_plotly.py | 5 - python/pyspark/pandas/tests/test_dataframe.py | 383 +++++------------- .../pandas/tests/test_dataframe_conversion.py | 14 +- .../pandas/tests/test_dataframe_spark_io.py | 28 +- python/pyspark/pandas/tests/test_expanding.py | 128 +----- python/pyspark/pandas/tests/test_groupby.py | 130 ++---- python/pyspark/pandas/tests/test_indexing.py 
| 6 - python/pyspark/pandas/tests/test_numpy_compat.py | 31 +- .../pandas/tests/test_ops_on_diff_frames.py | 45 +-- .../tests/test_ops_on_diff_frames_groupby.py | 1 - .../test_ops_on_diff_frames_groupby_expanding.py | 39 +- python/pyspark/pandas/tests/test_reshape.py | 7 +- python/pyspark/pandas/tests/test_series.py | 442 ++++++++------------- .../pyspark/pandas/tests/test_series_conversion.py | 5 - python/pyspark/pandas/tests/test_stats.py | 27 +- 28 files changed, 508 insertions(+), 1282 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index edfb62e..de36531 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -20,7 +20,6 @@ A wrapper class for Spark DataFrame to behave similar to pandas DataFrame. """ from collections import OrderedDict, defaultdict, namedtuple from collections.abc import Mapping -from distutils.version import LooseVersion import re import warnings import inspect @@ -58,10 +57,7 @@ from pandas.tseries.frequencies import DateOffset, to_offset if TYPE_CHECKING: from pandas.io.formats.style import Styler -if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - from pandas.core.dtypes.common import infer_dtype_from_object -else: - from pandas.core.dtypes.common import _get_dtype_from_object as infer_dtype_from_object +from pandas.core.dtypes.common import infer_dtype_from_object from pandas.core.accessor import CachedAccessor from pandas.core.dtypes.inference import is_sequence from pyspark import StorageLevel @@ -3128,17 +3124,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})] psdf.index.name = verify_temp_column_name(psdf, "__index_name__") return_types = [psdf.index.dtype] + list(psdf.dtypes) - if LooseVersion(pd.__version__) < LooseVersion("0.24"): - - @no_type_check - def pandas_at_time(pdf) -> ps.DataFrame[return_types]: - return pdf.at_time(time, asof).reset_index() - - else: - - @no_type_check - def pandas_at_time(pdf) -> ps.DataFrame[return_types]: - return 
pdf.at_time(time, asof, axis).reset_index() + @no_type_check + def pandas_at_time(pdf) -> ps.DataFrame[return_types]: + return pdf.at_time(time, asof, axis).reset_index() # apply_batch will remove the index of the pandas-on-Spark DataFrame and attach # a default index, which will never be used. So use "distributed" index as a dummy @@ -12103,17 +12091,14 @@ defaultdict(<class 'list'>, {'col..., 'col...})] def _repr_html_(self) -> str: max_display_count = get_option("display.max_rows") - # pandas 0.25.1 has a regression about HTML representation so 'bold_rows' - # has to be set as False explicitly. See https://github.com/pandas-dev/pandas/issues/28204 - bold_rows = not (LooseVersion("0.25.1") == LooseVersion(pd.__version__)) if max_display_count is None: - return self._to_internal_pandas().to_html(notebook=True, bold_rows=bold_rows) + return self._to_internal_pandas().to_html(notebook=True) pdf = self._get_or_create_repr_pandas_cache(max_display_count) pdf_length = len(pdf) pdf = pdf.iloc[:max_display_count] if pdf_length > max_display_count: - repr_html = pdf.to_html(show_dimensions=True, notebook=True, bold_rows=bold_rows) + repr_html = pdf.to_html(show_dimensions=True, notebook=True) match = REPR_HTML_PATTERN.search(repr_html) if match is not None: nrows = match.group("rows") @@ -12124,7 +12109,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})] "{by} {cols} columns</p>\n</div>".format(rows=nrows, by=by, cols=ncols) ) return REPR_HTML_PATTERN.sub(footer, repr_html) - return pdf.to_html(notebook=True, bold_rows=bold_rows) + return pdf.to_html(notebook=True) def __getitem__(self, key: Any) -> Any: from pyspark.pandas.series import Series diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index bb83ddf..6597d05 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -20,7 +20,6 @@ A base class of DataFrame/Column to behave similar to pandas DataFrame/Series. 
""" from abc import ABCMeta, abstractmethod from collections import Counter -from distutils.version import LooseVersion from functools import reduce from typing import ( Any, @@ -795,31 +794,17 @@ class Frame(object, metaclass=ABCMeta): if path is None: # If path is none, just collect and use pandas's to_csv. - psdf_or_ser = self - if (LooseVersion("0.24") > LooseVersion(pd.__version__)) and isinstance( - self, ps.Series - ): - # 0.23 seems not having 'columns' parameter in Series' to_csv. - return psdf_or_ser._to_pandas().to_csv( - None, - sep=sep, - na_rep=na_rep, - header=header, - date_format=date_format, - index=False, - ) - else: - return psdf_or_ser._to_pandas().to_csv( - None, - sep=sep, - na_rep=na_rep, - columns=columns, - header=header, - quotechar=quotechar, - date_format=date_format, - escapechar=escapechar, - index=False, - ) + return self._to_pandas().to_csv( + None, + sep=sep, + na_rep=na_rep, + columns=columns, + header=header, + quotechar=quotechar, + date_format=date_format, + escapechar=escapechar, + index=False, + ) if isinstance(self, ps.DataFrame): psdf = self @@ -3005,11 +2990,6 @@ class Frame(object, metaclass=ABCMeta): | 0 | elk | dog | | 1 | pig | quetzal | """ - # `to_markdown` is supported in pandas >= 1.0.0 since it's newly added in pandas 1.0.0. - if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - raise NotImplementedError( - "`to_markdown()` only supported in pandas-on-Spark with pandas >= 1.0.0" - ) log_advice( "`to_markdown` loads all data into the driver's memory. " "It should only be used if the resulting pandas object is expected to be small." 
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py index 36b5ece..7cdfb86 100644 --- a/python/pyspark/pandas/groupby.py +++ b/python/pyspark/pandas/groupby.py @@ -1280,32 +1280,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta): def pandas_groupby_apply(pdf: pd.DataFrame) -> pd.DataFrame: - if not is_series_groupby and LooseVersion(pd.__version__) < LooseVersion("0.25"): - # `groupby.apply` in pandas<0.25 runs the functions twice for the first group. - # https://github.com/pandas-dev/pandas/pull/24748 - - should_skip_first_call = True - - def wrapped_func( - df: Union[pd.DataFrame, pd.Series], *a: Any, **k: Any - ) -> Union[pd.DataFrame, pd.Series]: - nonlocal should_skip_first_call - if should_skip_first_call: - should_skip_first_call = False - if should_return_series: - return pd.Series() - else: - return pd.DataFrame() - else: - return pandas_apply(df, *a, **k) - - else: - wrapped_func = pandas_apply - if is_series_groupby: - pdf_or_ser = pdf.groupby(groupkey_names)[name].apply(wrapped_func, *args, **kwargs) + pdf_or_ser = pdf.groupby(groupkey_names)[name].apply(pandas_apply, *args, **kwargs) else: - pdf_or_ser = pdf.groupby(groupkey_names).apply(wrapped_func, *args, **kwargs) + pdf_or_ser = pdf.groupby(groupkey_names).apply(pandas_apply, *args, **kwargs) if should_return_series and isinstance(pdf_or_ser, pd.DataFrame): pdf_or_ser = pdf_or_ser.stack() diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py index a8bce01..774c677c 100644 --- a/python/pyspark/pandas/indexes/multi.py +++ b/python/pyspark/pandas/indexes/multi.py @@ -15,7 +15,6 @@ # limitations under the License. 
# -from distutils.version import LooseVersion from functools import partial, reduce from typing import Any, Callable, Iterator, List, Optional, Tuple, Union, cast, no_type_check @@ -106,31 +105,16 @@ class MultiIndex(Index): name=None, verify_integrity: bool = True, ) -> "MultiIndex": - if LooseVersion(pd.__version__) < LooseVersion("0.24"): - if levels is None or codes is None: - raise TypeError("Must pass both levels and codes") - - pidx = pd.MultiIndex( - levels=levels, - labels=codes, - sortorder=sortorder, - names=names, - dtype=dtype, - copy=copy, - name=name, - verify_integrity=verify_integrity, - ) - else: - pidx = pd.MultiIndex( - levels=levels, - codes=codes, - sortorder=sortorder, - names=names, - dtype=dtype, - copy=copy, - name=name, - verify_integrity=verify_integrity, - ) + pidx = pd.MultiIndex( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + dtype=dtype, + copy=copy, + name=name, + verify_integrity=verify_integrity, + ) return ps.from_pandas(pidx) @property diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index e06fa2f..aa99f18 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -35,7 +35,6 @@ from typing import ( from collections import OrderedDict from collections.abc import Iterable from datetime import tzinfo -from distutils.version import LooseVersion from functools import reduce from io import BytesIO import json @@ -2440,13 +2439,9 @@ def concat( assert len(merged_columns) > 0 - if LooseVersion(pd.__version__) < LooseVersion("0.24"): - # Always sort when multi-index columns, and if there are Series, never sort. - sort = len(merged_columns[0]) > 1 or (num_series == 0 and sort) - else: - # Always sort when multi-index columns or there are more than two Series, - # and if there is only one Series, never sort. 
- sort = len(merged_columns[0]) > 1 or num_series > 1 or (num_series != 1 and sort) + # Always sort when multi-index columns or there are more than two Series, + # and if there is only one Series, never sort. + sort = len(merged_columns[0]) > 1 or num_series > 1 or (num_series != 1 and sort) if sort: # FIXME: better ordering diff --git a/python/pyspark/pandas/plot/matplotlib.py b/python/pyspark/pandas/plot/matplotlib.py index 4d8db21..3232b5e 100644 --- a/python/pyspark/pandas/plot/matplotlib.py +++ b/python/pyspark/pandas/plot/matplotlib.py @@ -19,11 +19,24 @@ from distutils.version import LooseVersion import matplotlib as mat import numpy as np -import pandas as pd from matplotlib.axes._base import _process_plot_format from pandas.core.dtypes.inference import is_list_like from pandas.io.formats.printing import pprint_thing +from pandas.plotting._matplotlib import ( + BarPlot as PandasBarPlot, + BoxPlot as PandasBoxPlot, + HistPlot as PandasHistPlot, + PiePlot as PandasPiePlot, + AreaPlot as PandasAreaPlot, + LinePlot as PandasLinePlot, + BarhPlot as PandasBarhPlot, + ScatterPlot as PandasScatterPlot, + KdePlot as PandasKdePlot, +) +from pandas.plotting._core import PlotAccessor +from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot + from pyspark.pandas.plot import ( TopNPlotBase, SampledPlotBase, @@ -33,37 +46,7 @@ from pyspark.pandas.plot import ( KdePlotBase, ) - -if LooseVersion(pd.__version__) < LooseVersion("0.25"): - from pandas.plotting._core import ( - _all_kinds, - BarPlot as PandasBarPlot, - BoxPlot as PandasBoxPlot, - HistPlot as PandasHistPlot, - MPLPlot as PandasMPLPlot, - PiePlot as PandasPiePlot, - AreaPlot as PandasAreaPlot, - LinePlot as PandasLinePlot, - BarhPlot as PandasBarhPlot, - ScatterPlot as PandasScatterPlot, - KdePlot as PandasKdePlot, - ) -else: - from pandas.plotting._matplotlib import ( - BarPlot as PandasBarPlot, - BoxPlot as PandasBoxPlot, - HistPlot as PandasHistPlot, - PiePlot as PandasPiePlot, - AreaPlot as 
PandasAreaPlot, - LinePlot as PandasLinePlot, - BarhPlot as PandasBarhPlot, - ScatterPlot as PandasScatterPlot, - KdePlot as PandasKdePlot, - ) - from pandas.plotting._core import PlotAccessor - from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot - - _all_kinds = PlotAccessor._all_kinds +_all_kinds = PlotAccessor._all_kinds class PandasOnSparkBarPlot(PandasBarPlot, TopNPlotBase): diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py index eac3bb6..bd391f7 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py @@ -215,12 +215,8 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils): pdf, psdf = self.pdf, self.psdf b_pser, b_psser = pdf["bool"], psdf["bool"] - if LooseVersion(pd.__version__) >= LooseVersion("0.25.3"): - self.assert_eq(1 // b_pser, 1 // b_psser) - self.assert_eq(0.1 // b_pser, 0.1 // b_psser) - else: - self.assert_eq(1 // b_psser, pd.Series([1.0, 1.0, np.inf], name="bool")) - self.assert_eq(0.1 // b_psser, pd.Series([0.0, 0.0, np.inf], name="bool")) + self.assert_eq(1 // b_pser, 1 // b_psser) + self.assert_eq(0.1 // b_pser, 0.1 // b_psser) self.assertRaises(TypeError, lambda: "x" // b_psser) self.assertRaises(TypeError, lambda: True // b_psser) self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) // b_psser) diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py index 77fc93c..785eb25 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py @@ -131,16 +131,7 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): for n_col in self.non_numeric_df_cols: if n_col == "bool": - if LooseVersion(pd.__version__) >= LooseVersion("0.25.3"): - self.assert_eq( - pdf["float"] // pdf["bool"], - 
psdf["float"] // psdf["bool"], - ) - else: - self.assert_eq( - pd.Series([1.0, 2.0, np.inf]), - psdf["float"] // psdf["bool"], - ) + self.assert_eq(pdf["float"] // pdf["bool"], psdf["float"] // psdf["bool"]) else: for col in self.numeric_df_cols: psser = psdf[col] @@ -378,13 +369,15 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): with ps.option_context("compute.eager_check", False): psser.astype(int) - psser = self.psdf["decimal_nan"] - with ps.option_context("compute.eager_check", True), self.assertRaisesRegex( - ValueError, "Cannot convert" - ): - psser.astype(int) - with ps.option_context("compute.eager_check", False): - psser.astype(int) + # Skip decimal_nan test before v1.3.0, it not supported by pandas on spark yet. + if LooseVersion(pd.__version__) >= LooseVersion("1.3"): + psser = self.psdf["decimal_nan"] + with ps.option_context("compute.eager_check", True), self.assertRaisesRegex( + ValueError, "Cannot convert" + ): + psser.astype(int) + with ps.option_context("compute.eager_check", False): + psser.astype(int) def test_neg(self): pdf, psdf = self.pdf, self.psdf diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index 173a2bf..88c826e 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -170,19 +170,14 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(psidx.to_frame(), pidx.to_frame()) self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) - if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - # The `name` argument is added in pandas 0.24. 
- self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x")) - self.assert_eq( - psidx.to_frame(index=False, name="x"), - pidx.to_frame(index=False, name="x"), - ) + self.assert_eq(psidx.to_frame(name="x"), pidx.to_frame(name="x")) + self.assert_eq(psidx.to_frame(index=False, name="x"), pidx.to_frame(index=False, name="x")) - self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"])) + self.assertRaises(TypeError, lambda: psidx.to_frame(name=["x"])) - # non-string name - self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10)) - self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + # non-string name + self.assert_eq(psidx.to_frame(name=10), pidx.to_frame(name=10)) + self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) pidx = self.pdf.set_index("b", append=True).index psidx = self.psdf.set_index("b", append=True).index @@ -190,25 +185,22 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(psidx.to_frame(), pidx.to_frame()) self.assert_eq(psidx.to_frame(index=False), pidx.to_frame(index=False)) - if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - # The `name` argument is added in pandas 0.24. 
- self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) - self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) - self.assert_eq( - psidx.to_frame(index=False, name=["x", "y"]), - pidx.to_frame(index=False, name=["x", "y"]), - ) + self.assert_eq(psidx.to_frame(name=["x", "y"]), pidx.to_frame(name=["x", "y"])) + self.assert_eq(psidx.to_frame(name=("x", "y")), pidx.to_frame(name=("x", "y"))) + self.assert_eq( + psidx.to_frame(index=False, name=["x", "y"]), + pidx.to_frame(index=False, name=["x", "y"]), + ) - self.assertRaises(TypeError, lambda: psidx.to_frame(name="x")) - self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"])) + self.assertRaises(TypeError, lambda: psidx.to_frame(name="x")) + self.assertRaises(ValueError, lambda: psidx.to_frame(name=["x"])) - # non-string names - self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) - self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) - self.assert_eq( - psidx.to_frame(name=[("x", 10), ("y", 20)]), - pidx.to_frame(name=[("x", 10), ("y", 20)]), - ) + # non-string names + self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) + self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) + self.assert_eq( + psidx.to_frame(name=[("x", 10), ("y", 20)]), pidx.to_frame(name=[("x", 10), ("y", 20)]) + ) def test_index_names(self): psdf = self.psdf @@ -1491,68 +1483,64 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): ) # Testing if the result is correct after sort=False. - # The `sort` argument is added in pandas 0.24. 
- if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - self.assert_eq( - psidx1.union(psidx2, sort=False).sort_values(), - pidx1.union(pidx2, sort=False).sort_values(), - ) - self.assert_eq( - psidx2.union(psidx1, sort=False).sort_values(), - pidx2.union(pidx1, sort=False).sort_values(), - ) - self.assert_eq( - psidx1.union([3, 4, 5, 6], sort=False).sort_values(), - pidx1.union([3, 4, 5, 6], sort=False).sort_values(), - almost=True, - ) - self.assert_eq( - psidx2.union([1, 2, 3, 4], sort=False).sort_values(), - pidx2.union([1, 2, 3, 4], sort=False).sort_values(), - almost=True, - ) - self.assert_eq( - psidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(), - pidx1.union(pd.Series([3, 4, 5, 6]), sort=False).sort_values(), - almost=True, - ) - self.assert_eq( - psidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(), - pidx2.union(pd.Series([1, 2, 3, 4]), sort=False).sort_values(), - almost=True, - ) + self.assert_eq( + psidx1.union(psidx2, sort=False).sort_values(), + pidx1.union(pidx2, sort=False).sort_values(), + ) + self.assert_eq( + psidx2.union(psidx1, sort=False).sort_values(), + pidx2.union(pidx1, sort=False).sort_values(), + ) + self.assert_eq( + psidx1.union([3, 4, 5, 6], sort=False).sort_values(), + pidx1.union([3, 4, 5, 6], sort=False).sort_values(), + almost=True, + ) + self.assert_eq( + psidx2.union([1, 2, 3, 4], sort=False).sort_values(), + pidx2.union([1, 2, 3, 4], sort=False).sort_values(), + almost=True, + ) + self.assert_eq( + psidx1.union(ps.Series([3, 4, 5, 6]), sort=False).sort_values(), + pidx1.union(pd.Series([3, 4, 5, 6]), sort=False).sort_values(), + almost=True, + ) + self.assert_eq( + psidx2.union(ps.Series([1, 2, 3, 4]), sort=False).sort_values(), + pidx2.union(pd.Series([1, 2, 3, 4]), sort=False).sort_values(), + almost=True, + ) - # Duplicated values for Index is supported in pandas >= 1.0.0 - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - pidx1 = pd.Index([1, 2, 3, 4, 3, 4, 3, 4]) - pidx2 = 
pd.Index([3, 4, 3, 4, 5, 6]) - psidx1 = ps.from_pandas(pidx1) - psidx2 = ps.from_pandas(pidx2) + pidx1 = pd.Index([1, 2, 3, 4, 3, 4, 3, 4]) + pidx2 = pd.Index([3, 4, 3, 4, 5, 6]) + psidx1 = ps.from_pandas(pidx1) + psidx2 = ps.from_pandas(pidx2) - self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) - self.assert_eq( - psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True - ) - self.assert_eq( - psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), - pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])), - almost=True, - ) + self.assert_eq(psidx1.union(psidx2), pidx1.union(pidx2)) + self.assert_eq( + psidx1.union([3, 4, 3, 3, 5, 6]), pidx1.union([3, 4, 3, 4, 5, 6]), almost=True + ) + self.assert_eq( + psidx1.union(ps.Series([3, 4, 3, 3, 5, 6])), + pidx1.union(pd.Series([3, 4, 3, 4, 5, 6])), + almost=True, + ) - # Manually create the expected result here since there is a bug in Index.union - # dropping duplicated values in pandas < 1.3. - expected = pd.Index([1, 2, 3, 3, 3, 4, 4, 4, 5, 6]) - self.assert_eq(psidx2.union(psidx1), expected) - self.assert_eq( - psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), - expected, - almost=True, - ) - self.assert_eq( - psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), - expected, - almost=True, - ) + # Manually create the expected result here since there is a bug in Index.union + # dropping duplicated values in pandas < 1.3. + expected = pd.Index([1, 2, 3, 3, 3, 4, 4, 4, 5, 6]) + self.assert_eq(psidx2.union(psidx1), expected) + self.assert_eq( + psidx2.union([1, 2, 3, 4, 3, 4, 3, 4]), + expected, + almost=True, + ) + self.assert_eq( + psidx2.union(ps.Series([1, 2, 3, 4, 3, 4, 3, 4])), + expected, + almost=True, + ) # MultiIndex pmidx1 = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")]) @@ -1595,55 +1583,40 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): ) # Testing if the result is correct after sort=False. - # The `sort` argument is added in pandas 0.24. 
- if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - # Manually create the expected result here since there is a bug in MultiIndex.union - # dropping duplicated values in pandas < 1.3. - expected = pd.MultiIndex.from_tuples( - [("x", "a"), ("x", "a"), ("x", "b"), ("x", "b"), ("x", "c"), ("x", "d")] - ) - self.assert_eq( - psmidx1.union(psmidx2, sort=False).sort_values(), - expected, - ) - self.assert_eq( - psmidx2.union(psmidx1, sort=False).sort_values(), - expected, - ) - self.assert_eq( - psmidx1.union( - [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False - ).sort_values(), - expected, - ) - self.assert_eq( - psmidx2.union( - [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False - ).sort_values(), - expected, - ) + # Manually create the expected result here since there is a bug in MultiIndex.union + # dropping duplicated values in pandas < 1.3. + expected = pd.MultiIndex.from_tuples( + [("x", "a"), ("x", "a"), ("x", "b"), ("x", "b"), ("x", "c"), ("x", "d")] + ) + self.assert_eq(psmidx1.union(psmidx2, sort=False).sort_values(), expected) + self.assert_eq(psmidx2.union(psmidx1, sort=False).sort_values(), expected) + self.assert_eq( + psmidx1.union( + [("x", "a"), ("x", "b"), ("x", "c"), ("x", "d")], sort=False + ).sort_values(), + expected, + ) + self.assert_eq( + psmidx2.union( + [("x", "a"), ("x", "b"), ("x", "a"), ("x", "b")], sort=False + ).sort_values(), + expected, + ) - expected = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (1, 3), (1, 3), (1, 4), (1, 4), (1, 5), (1, 6)] - ) - self.assert_eq( - psmidx3.union(psmidx4, sort=False).sort_values(), - expected, - ) - self.assert_eq( - psmidx4.union(psmidx3, sort=False).sort_values(), - expected, - ) - self.assert_eq( - psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), - expected, - ) - self.assert_eq( - psmidx4.union( - [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False - ).sort_values(), - expected, - ) + expected = pd.MultiIndex.from_tuples( + [(1, 
1), (1, 2), (1, 3), (1, 3), (1, 4), (1, 4), (1, 5), (1, 6)] + ) + self.assert_eq(psmidx3.union(psmidx4, sort=False).sort_values(), expected) + self.assert_eq(psmidx4.union(psmidx3, sort=False).sort_values(), expected) + self.assert_eq( + psmidx3.union([(1, 3), (1, 4), (1, 5), (1, 6)], sort=False).sort_values(), expected + ) + self.assert_eq( + psmidx4.union( + [(1, 1), (1, 2), (1, 3), (1, 4), (1, 3), (1, 4)], sort=False + ).sort_values(), + expected, + ) self.assertRaises(NotImplementedError, lambda: psidx1.union(psmidx1)) self.assertRaises(TypeError, lambda: psmidx1.union(psidx1)) @@ -2025,10 +1998,6 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): self.assertTrue(isinstance(psmidx, ps.MultiIndex)) self.assert_eq(pmidx, psmidx) - @unittest.skipIf( - LooseVersion(pd.__version__) < LooseVersion("0.24"), - "MultiIndex.from_frame is new in pandas 0.24", - ) def test_multiindex_from_frame(self): pdf = pd.DataFrame( [["HI", "Temp"], ["HI", "Precip"], ["NJ", "Temp"], ["NJ", "Precip"]], columns=["a", "b"] @@ -2346,10 +2315,7 @@ class IndexesTest(PandasOnSparkTestCase, TestUtils): psidx1 = psdf.index.get_level_values(0) psidx2 = psdf.index.get_level_values(1) - if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(psidx1 * 10 + psidx2, pidx1 * 10 + pidx2) - else: - self.assert_eq(psidx1 * 10 + psidx2, (pidx1 * 10 + pidx2).rename(None)) + self.assert_eq(psidx1 * 10 + psidx2, pidx1 * 10 + pidx2) def test_factorize(self): pidx = pd.Index(["a", "b", "a", "b"]) diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index d2a79a7..14c39aa 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -82,7 +82,7 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils): pidx.categories = ["z", "y", "x"] psidx.categories = ["z", "y", "x"] - if LooseVersion(pd.__version__) >= LooseVersion("1.0.5"): + if 
LooseVersion(pd.__version__) >= LooseVersion("1.1"): self.assert_eq(pidx, psidx) self.assert_eq(pdf, psdf) else: diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py index ba939ae..bb40099 100644 --- a/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py +++ b/python/pyspark/pandas/tests/plot/test_frame_plot_matplotlib.py @@ -16,7 +16,6 @@ # import base64 -from distutils.version import LooseVersion from io import BytesIO import unittest @@ -46,16 +45,14 @@ class DataFramePlotMatplotlibTest(PandasOnSparkTestCase, TestUtils): @classmethod def setUpClass(cls): super().setUpClass() - if LooseVersion(pd.__version__) >= LooseVersion("0.25"): - pd.set_option("plotting.backend", "matplotlib") + pd.set_option("plotting.backend", "matplotlib") set_option("plotting.backend", "matplotlib") set_option("plotting.max_rows", 2000) set_option("plotting.sample_ratio", None) @classmethod def tearDownClass(cls): - if LooseVersion(pd.__version__) >= LooseVersion("0.25"): - pd.reset_option("plotting.backend") + pd.reset_option("plotting.backend") reset_option("plotting.backend") reset_option("plotting.max_rows") reset_option("plotting.sample_ratio") diff --git a/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py b/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py index eb47448..7be00d5 100644 --- a/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py +++ b/python/pyspark/pandas/tests/plot/test_frame_plot_plotly.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import pprint import pandas as pd @@ -38,10 +37,6 @@ if have_plotly: @unittest.skipIf(not have_plotly, plotly_requirement_message) -@unittest.skipIf( - LooseVersion(pd.__version__) < "1.0.0", - "pandas<1.0; pandas<1.0 does not support latest plotly and/or 'plotting.backend' option.", -) class DataFramePlotPlotlyTest(PandasOnSparkTestCase, TestUtils): @classmethod def 
setUpClass(cls): diff --git a/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py b/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py index 7b18a35..6124c3f 100644 --- a/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py +++ b/python/pyspark/pandas/tests/plot/test_series_plot_matplotlib.py @@ -16,7 +16,6 @@ # import base64 -from distutils.version import LooseVersion from io import BytesIO import unittest @@ -44,15 +43,13 @@ class SeriesPlotMatplotlibTest(PandasOnSparkTestCase, TestUtils): @classmethod def setUpClass(cls): super().setUpClass() - if LooseVersion(pd.__version__) >= LooseVersion("0.25"): - pd.set_option("plotting.backend", "matplotlib") + pd.set_option("plotting.backend", "matplotlib") set_option("plotting.backend", "matplotlib") set_option("plotting.max_rows", 1000) @classmethod def tearDownClass(cls): - if LooseVersion(pd.__version__) >= LooseVersion("0.25"): - pd.reset_option("plotting.backend") + pd.reset_option("plotting.backend") reset_option("plotting.backend") reset_option("plotting.max_rows") super().tearDownClass() diff --git a/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py b/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py index d100466..8cb529d 100644 --- a/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py +++ b/python/pyspark/pandas/tests/plot/test_series_plot_plotly.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import pprint import pandas as pd @@ -38,10 +37,6 @@ if have_plotly: @unittest.skipIf(not have_plotly, plotly_requirement_message) -@unittest.skipIf( - LooseVersion(pd.__version__) < "1.0.0", - "pandas<1.0; pandas<1.0 does not support latest plotly and/or 'plotting.backend' option.", -) class SeriesPlotPlotlyTest(PandasOnSparkTestCase, TestUtils): @classmethod def setUpClass(cls): diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index ae8fcae..d12a084 100644 --- 
a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -501,18 +501,13 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): check(psdf.index.to_frame(), pdf.index.to_frame()) check(psdf.index.to_frame(index=False), pdf.index.to_frame(index=False)) - if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - # The `name` argument is added in pandas 0.24. - check(psdf.index.to_frame(name="a"), pdf.index.to_frame(name="a")) - check( - psdf.index.to_frame(index=False, name="a"), - pdf.index.to_frame(index=False, name="a"), - ) - check(psdf.index.to_frame(name=("x", "a")), pdf.index.to_frame(name=("x", "a"))) - check( - psdf.index.to_frame(index=False, name=("x", "a")), - pdf.index.to_frame(index=False, name=("x", "a")), - ) + check(psdf.index.to_frame(name="a"), pdf.index.to_frame(name="a")) + check(psdf.index.to_frame(index=False, name="a"), pdf.index.to_frame(index=False, name="a")) + check(psdf.index.to_frame(name=("x", "a")), pdf.index.to_frame(name=("x", "a"))) + check( + psdf.index.to_frame(index=False, name=("x", "a")), + pdf.index.to_frame(index=False, name=("x", "a")), + ) def test_multiindex_column_access(self): columns = pd.MultiIndex.from_tuples( @@ -896,46 +891,22 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(ValueError, lambda: psdf.rename_axis(["cols2", "cols3"], axis=1)) self.assertRaises(TypeError, lambda: psdf.rename_axis(mapper=["index2"], index=["index3"])) - # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq( - pdf.rename_axis(index={"index": "index2"}, columns={"cols": "cols2"}).sort_index(), - psdf.rename_axis(index={"index": "index2"}, columns={"cols": "cols2"}).sort_index(), - ) - - self.assert_eq( - pdf.rename_axis( - index={"missing": "index2"}, columns={"missing": "cols2"} - ).sort_index(), - psdf.rename_axis( - index={"missing": "index2"}, 
columns={"missing": "cols2"} - ).sort_index(), - ) + self.assert_eq( + pdf.rename_axis(index={"index": "index2"}, columns={"cols": "cols2"}).sort_index(), + psdf.rename_axis(index={"index": "index2"}, columns={"cols": "cols2"}).sort_index(), + ) - self.assert_eq( - pdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), - psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), - ) - else: - expected = pdf - expected.index.name = "index2" - expected.columns.name = "cols2" - result = psdf.rename_axis( - index={"index": "index2"}, columns={"cols": "cols2"} - ).sort_index() - self.assert_eq(expected, result) - - expected.index.name = "index" - expected.columns.name = "cols" - result = psdf.rename_axis( + self.assert_eq( + pdf.rename_axis(index={"missing": "index2"}, columns={"missing": "cols2"}).sort_index(), + psdf.rename_axis( index={"missing": "index2"}, columns={"missing": "cols2"} - ).sort_index() - self.assert_eq(expected, result) + ).sort_index(), + ) - expected.index.name = "INDEX" - expected.columns.name = "COLS" - result = psdf.rename_axis(index=str.upper, columns=str.upper).sort_index() - self.assert_eq(expected, result) + self.assert_eq( + pdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), + psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), + ) index = pd.MultiIndex.from_tuples( [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"] @@ -963,69 +934,33 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) self.assertRaises(ValueError, lambda: psdf.rename_axis(["cols3", "cols4", "cols5"], axis=1)) - # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq( - pdf.rename_axis( - index={"index1": "index3"}, columns={"cols1": "cols3"} - ).sort_index(), - psdf.rename_axis( - index={"index1": "index3"}, columns={"cols1": "cols3"} - ).sort_index(), - ) - - self.assert_eq( - 
pdf.rename_axis( - index={"missing": "index3"}, columns={"missing": "cols3"} - ).sort_index(), - psdf.rename_axis( - index={"missing": "index3"}, columns={"missing": "cols3"} - ).sort_index(), - ) - - self.assert_eq( - pdf.rename_axis( - index={"index1": "index3", "index2": "index4"}, - columns={"cols1": "cols3", "cols2": "cols4"}, - ).sort_index(), - psdf.rename_axis( - index={"index1": "index3", "index2": "index4"}, - columns={"cols1": "cols3", "cols2": "cols4"}, - ).sort_index(), - ) + self.assert_eq( + pdf.rename_axis(index={"index1": "index3"}, columns={"cols1": "cols3"}).sort_index(), + psdf.rename_axis(index={"index1": "index3"}, columns={"cols1": "cols3"}).sort_index(), + ) - self.assert_eq( - pdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), - psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), - ) - else: - expected = pdf - expected.index.names = ["index3", "index2"] - expected.columns.names = ["cols3", "cols2"] - result = psdf.rename_axis( - index={"index1": "index3"}, columns={"cols1": "cols3"} - ).sort_index() - self.assert_eq(expected, result) - - expected.index.names = ["index1", "index2"] - expected.columns.names = ["cols1", "cols2"] - result = psdf.rename_axis( - index={"missing": "index2"}, columns={"missing": "cols2"} - ).sort_index() - self.assert_eq(expected, result) + self.assert_eq( + pdf.rename_axis(index={"missing": "index3"}, columns={"missing": "cols3"}).sort_index(), + psdf.rename_axis( + index={"missing": "index3"}, columns={"missing": "cols3"} + ).sort_index(), + ) - expected.index.names = ["index3", "index4"] - expected.columns.names = ["cols3", "cols4"] - result = psdf.rename_axis( + self.assert_eq( + pdf.rename_axis( index={"index1": "index3", "index2": "index4"}, columns={"cols1": "cols3", "cols2": "cols4"}, - ).sort_index() - self.assert_eq(expected, result) + ).sort_index(), + psdf.rename_axis( + index={"index1": "index3", "index2": "index4"}, + columns={"cols1": "cols3", "cols2": "cols4"}, + 
).sort_index(), + ) - expected.index.names = ["INDEX1", "INDEX2"] - expected.columns.names = ["COLS1", "COLS2"] - result = psdf.rename_axis(index=str.upper, columns=str.upper).sort_index() - self.assert_eq(expected, result) + self.assert_eq( + pdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), + psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(), + ) def test_dot(self): psdf = self.psdf @@ -1143,47 +1078,17 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(KeyError, lambda: psdf.droplevel({"level_1"}, axis=1)) self.assertRaises(KeyError, lambda: psdf.droplevel({"level_1": 1}, axis=1)) - # droplevel is new in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq(pdf.droplevel("a"), psdf.droplevel("a")) - self.assert_eq(pdf.droplevel(["a"]), psdf.droplevel(["a"])) - self.assert_eq(pdf.droplevel(("a",)), psdf.droplevel(("a",))) - self.assert_eq(pdf.droplevel(0), psdf.droplevel(0)) - self.assert_eq(pdf.droplevel(-1), psdf.droplevel(-1)) - - self.assert_eq(pdf.droplevel("level_1", axis=1), psdf.droplevel("level_1", axis=1)) - self.assert_eq(pdf.droplevel(["level_1"], axis=1), psdf.droplevel(["level_1"], axis=1)) - self.assert_eq( - pdf.droplevel(("level_1",), axis=1), psdf.droplevel(("level_1",), axis=1) - ) - self.assert_eq(pdf.droplevel(0, axis=1), psdf.droplevel(0, axis=1)) - self.assert_eq(pdf.droplevel(-1, axis=1), psdf.droplevel(-1, axis=1)) - else: - expected = pdf.copy() - expected.index = expected.index.droplevel("a") - - self.assert_eq(expected, psdf.droplevel("a")) - self.assert_eq(expected, psdf.droplevel(["a"])) - self.assert_eq(expected, psdf.droplevel(("a",))) - self.assert_eq(expected, psdf.droplevel(0)) - - expected = pdf.copy() - expected.index = expected.index.droplevel(-1) - - self.assert_eq(expected, psdf.droplevel(-1)) - - expected = pdf.copy() - expected.columns = expected.columns.droplevel("level_1") + self.assert_eq(pdf.droplevel("a"), 
psdf.droplevel("a")) + self.assert_eq(pdf.droplevel(["a"]), psdf.droplevel(["a"])) + self.assert_eq(pdf.droplevel(("a",)), psdf.droplevel(("a",))) + self.assert_eq(pdf.droplevel(0), psdf.droplevel(0)) + self.assert_eq(pdf.droplevel(-1), psdf.droplevel(-1)) - self.assert_eq(expected, psdf.droplevel("level_1", axis=1)) - self.assert_eq(expected, psdf.droplevel(["level_1"], axis=1)) - self.assert_eq(expected, psdf.droplevel(("level_1",), axis=1)) - self.assert_eq(expected, psdf.droplevel(0, axis=1)) - - expected = pdf.copy() - expected.columns = expected.columns.droplevel(-1) - - self.assert_eq(expected, psdf.droplevel(-1, axis=1)) + self.assert_eq(pdf.droplevel("level_1", axis=1), psdf.droplevel("level_1", axis=1)) + self.assert_eq(pdf.droplevel(["level_1"], axis=1), psdf.droplevel(["level_1"], axis=1)) + self.assert_eq(pdf.droplevel(("level_1",), axis=1), psdf.droplevel(("level_1",), axis=1)) + self.assert_eq(pdf.droplevel(0, axis=1), psdf.droplevel(0, axis=1)) + self.assert_eq(pdf.droplevel(-1, axis=1), psdf.droplevel(-1, axis=1)) # Tupled names pdf.columns.names = [("level", 1), ("level", 2)] @@ -1193,22 +1098,10 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(KeyError, lambda: psdf.droplevel("a")) self.assertRaises(KeyError, lambda: psdf.droplevel(("a", 10))) - # droplevel is new in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq(pdf.droplevel([("a", 10)]), psdf.droplevel([("a", 10)])) - self.assert_eq( - pdf.droplevel([("level", 1)], axis=1), psdf.droplevel([("level", 1)], axis=1) - ) - else: - expected = pdf.copy() - expected.index = expected.index.droplevel([("a", 10)]) - - self.assert_eq(expected, psdf.droplevel([("a", 10)])) - - expected = pdf.copy() - expected.columns = expected.columns.droplevel([("level", 1)]) - - self.assert_eq(expected, psdf.droplevel([("level", 1)], axis=1)) + self.assert_eq(pdf.droplevel([("a", 10)]), psdf.droplevel([("a", 10)])) + self.assert_eq( + 
pdf.droplevel([("level", 1)], axis=1), psdf.droplevel([("level", 1)], axis=1) + ) # non-string names pdf = ( @@ -1219,33 +1112,13 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): pdf.columns = pd.MultiIndex.from_tuples([("c", "e"), ("d", "f")], names=[100.0, 200.0]) psdf = ps.from_pandas(pdf) - # droplevel is new in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq(pdf.droplevel(10.0), psdf.droplevel(10.0)) - self.assert_eq(pdf.droplevel([10.0]), psdf.droplevel([10.0])) - self.assert_eq(pdf.droplevel((10.0,)), psdf.droplevel((10.0,))) - self.assert_eq(pdf.droplevel(0), psdf.droplevel(0)) - self.assert_eq(pdf.droplevel(-1), psdf.droplevel(-1)) - self.assert_eq(pdf.droplevel(100.0, axis=1), psdf.droplevel(100.0, axis=1)) - self.assert_eq(pdf.droplevel(0, axis=1), psdf.droplevel(0, axis=1)) - else: - expected = pdf.copy() - expected.index = expected.index.droplevel(10.0) - - self.assert_eq(expected, psdf.droplevel(10.0)) - self.assert_eq(expected, psdf.droplevel([10.0])) - self.assert_eq(expected, psdf.droplevel((10.0,))) - self.assert_eq(expected, psdf.droplevel(0)) - - expected = pdf.copy() - expected.index = expected.index.droplevel(-1) - self.assert_eq(expected, psdf.droplevel(-1)) - - expected = pdf.copy() - expected.columns = expected.columns.droplevel(100.0) - - self.assert_eq(expected, psdf.droplevel(100.0, axis=1)) - self.assert_eq(expected, psdf.droplevel(0, axis=1)) + self.assert_eq(pdf.droplevel(10.0), psdf.droplevel(10.0)) + self.assert_eq(pdf.droplevel([10.0]), psdf.droplevel([10.0])) + self.assert_eq(pdf.droplevel((10.0,)), psdf.droplevel((10.0,))) + self.assert_eq(pdf.droplevel(0), psdf.droplevel(0)) + self.assert_eq(pdf.droplevel(-1), psdf.droplevel(-1)) + self.assert_eq(pdf.droplevel(100.0, axis=1), psdf.droplevel(100.0, axis=1)) + self.assert_eq(pdf.droplevel(0, axis=1), psdf.droplevel(0, axis=1)) def test_drop(self): pdf = pd.DataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}, 
index=np.random.rand(2)) @@ -2159,42 +2032,38 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) # Test Series on the right - # pd.DataFrame.merge with Series is implemented since version 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - check(lambda left, right: left.merge(right), right_psser, right_ps) - check( - lambda left, right: left.merge(right, left_on="x", right_on="x"), - right_psser, - right_ps, - ) - check( - lambda left, right: left.set_index("x").merge(right, left_index=True, right_on="x"), - right_psser, - right_ps, - ) - - # Test join types with Series - for how in ["inner", "left", "right", "outer"]: - check(lambda left, right: left.merge(right, how=how), right_psser, right_ps) - check( - lambda left, right: left.merge(right, left_on="x", right_on="x", how=how), - right_psser, - right_ps, - ) + check(lambda left, right: left.merge(right), right_psser, right_ps) + check( + lambda left, right: left.merge(right, left_on="x", right_on="x"), right_psser, right_ps + ) + check( + lambda left, right: left.set_index("x").merge(right, left_index=True, right_on="x"), + right_psser, + right_ps, + ) - # suffix with Series + # Test join types with Series + for how in ["inner", "left", "right", "outer"]: + check(lambda left, right: left.merge(right, how=how), right_psser, right_ps) check( - lambda left, right: left.merge( - right, - suffixes=["_left", "_right"], - how="outer", - left_index=True, - right_index=True, - ), + lambda left, right: left.merge(right, left_on="x", right_on="x", how=how), right_psser, right_ps, ) + # suffix with Series + check( + lambda left, right: left.merge( + right, + suffixes=["_left", "_right"], + how="outer", + left_index=True, + right_index=True, + ), + right_psser, + right_ps, + ) + # multi-index columns left_columns = pd.MultiIndex.from_tuples([(10, "lkey"), (10, "value"), (20, "x")]) left_pdf.columns = left_columns @@ -3850,14 +3719,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): 
self.assertRaises(KeyError, lambda: psdf.melt(value_vars="Z")) # multi-index columns - if LooseVersion("0.24") <= LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - # pandas >=0.24,<1.0 doesn't support mixed int/str columns in melt. - # see: https://github.com/pandas-dev/pandas/pull/29792 - TEN = "10" - TWELVE = "20" - else: - TEN = 10.0 - TWELVE = 20.0 + TEN = 10.0 + TWELVE = 20.0 columns = pd.MultiIndex.from_tuples([(TEN, "A"), (TEN, "B"), (TWELVE, "C")]) pdf.columns = columns @@ -4904,12 +4767,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): pdf = pd.DataFrame({"x": ["a", "b", "c"]}) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5)) - self.assert_eq(psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75])) - else: - self.assert_eq(psdf.quantile(0.5), pd.Series(name=0.5)) - self.assert_eq(psdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75])) + self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5)) + self.assert_eq(psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75])) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): psdf.quantile(0.5, numeric_only=False) @@ -5151,11 +5010,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): pdf = pd.DataFrame(data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}) psdf = ps.from_pandas(pdf) - # `to_markdown()` is supported in pandas >= 1.0.0 since it's newly added in pandas 1.0.0. 
- if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assertRaises(NotImplementedError, lambda: psdf.to_markdown()) - else: - self.assert_eq(pdf.to_markdown(), psdf.to_markdown()) + self.assert_eq(pdf.to_markdown(), psdf.to_markdown()) def test_cache(self): pdf = pd.DataFrame( @@ -5234,17 +5089,7 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): ) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) < LooseVersion("1.0.0") and LooseVersion( - pd.__version__ - ) >= LooseVersion("0.24.0"): - expected_result = pd.DataFrame( - {"angles": [np.inf, 3.0, 2.0], "degrees": [0.0, 0.0, 0.0]}, - index=["circle", "triangle", "rectangle"], - columns=["angles", "degrees"], - ) - else: - expected_result = pdf.rfloordiv(10) - + expected_result = pdf.rfloordiv(10) self.assert_eq(psdf.rfloordiv(10), expected_result) def test_truncate(self): @@ -5360,17 +5205,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): pdf.columns.name = "columns" psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) >= LooseVersion("0.25.0"): - expected_result1 = pdf.explode("A") - expected_result2 = pdf.explode("B") - else: - expected_result1 = pd.DataFrame( - {"A": [-1, np.nan, 0, np.inf, 1, -np.inf], "B": [1, 1, 1, 1, 1, 1]}, - index=pd.Index([0, 0, 1, 1, 2, 2]), - ) - expected_result1.index.name = "index" - expected_result1.columns.name = "columns" - expected_result2 = pdf + expected_result1 = pdf.explode("A") + expected_result2 = pdf.explode("B") self.assert_eq(psdf.explode("A"), expected_result1, almost=True) self.assert_eq(psdf.explode("B"), expected_result2) @@ -5386,16 +5222,8 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): pdf.index = midx psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) >= LooseVersion("0.25.0"): - expected_result1 = pdf.explode("A") - expected_result2 = pdf.explode("B") - else: - midx = pd.MultiIndex.from_tuples( - [("x", "a"), ("x", "a"), ("x", "b"), ("x", "b"), ("y", "c"), ("y", "c")], - names=["index1", 
"index2"], - ) - expected_result1.index = midx - expected_result2 = pdf + expected_result1 = pdf.explode("A") + expected_result2 = pdf.explode("B") self.assert_eq(psdf.explode("A"), expected_result1, almost=True) self.assert_eq(psdf.explode("B"), expected_result2) @@ -5409,16 +5237,9 @@ class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): pdf.columns = columns psdf.columns = columns - if LooseVersion(pd.__version__) >= LooseVersion("0.25.0"): - expected_result1 = pdf.explode(("A", "Z")) - expected_result2 = pdf.explode(("B", "X")) - expected_result3 = pdf.A.explode("Z") - else: - expected_result1.columns = columns - expected_result2 = pdf - expected_result3 = pd.DataFrame({"Z": [-1, np.nan, 0, np.inf, 1, -np.inf]}, index=midx) - expected_result3.index.name = "index" - expected_result3.columns.name = "column2" + expected_result1 = pdf.explode(("A", "Z")) + expected_result2 = pdf.explode(("B", "X")) + expected_result3 = pdf.A.explode("Z") self.assert_eq(psdf.explode(("A", "Z")), expected_result1, almost=True) self.assert_eq(psdf.explode(("B", "X")), expected_result2) diff --git a/python/pyspark/pandas/tests/test_dataframe_conversion.py b/python/pyspark/pandas/tests/test_dataframe_conversion.py index 2318c01..e216104 100644 --- a/python/pyspark/pandas/tests/test_dataframe_conversion.py +++ b/python/pyspark/pandas/tests/test_dataframe_conversion.py @@ -24,7 +24,6 @@ import unittest import numpy as np import pandas as pd -from distutils.version import LooseVersion from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils from pyspark.testing.sqlutils import SQLTestUtils @@ -218,18 +217,15 @@ class DataFrameConversionTest(PandasOnSparkTestCase, SQLTestUtils, TestUtils): self.assert_eq(psdf.to_latex(index_names=False), pdf.to_latex(index_names=False)) self.assert_eq(psdf.to_latex(bold_rows=True), pdf.to_latex(bold_rows=True)) self.assert_eq(psdf.to_latex(decimal=","), pdf.to_latex(decimal=",")) - if 
LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assert_eq(psdf.to_latex(encoding="ascii"), pdf.to_latex(encoding="ascii")) def test_to_records(self): - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - pdf = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"]) + pdf = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"]) - psdf = ps.from_pandas(pdf) + psdf = ps.from_pandas(pdf) - self.assert_eq(psdf.to_records(), pdf.to_records()) - self.assert_eq(psdf.to_records(index=False), pdf.to_records(index=False)) - self.assert_eq(psdf.to_records(index_dtypes="<S2"), pdf.to_records(index_dtypes="<S2")) + self.assert_eq(psdf.to_records(), pdf.to_records()) + self.assert_eq(psdf.to_records(index=False), pdf.to_records(index=False)) + self.assert_eq(psdf.to_records(index_dtypes="<S2"), pdf.to_records(index_dtypes="<S2")) def test_from_records(self): # Assert using a dict as input diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index f31cd5c..f7bb5b9 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -15,14 +15,12 @@ # limitations under the License. 
# -from distutils.version import LooseVersion import unittest import glob import os import numpy as np import pandas as pd -import pyarrow as pa from pyspark import pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils @@ -56,31 +54,17 @@ class DataFrameSparkIOTest(PandasOnSparkTestCase, TestUtils): 1 ).write.parquet(tmp, mode="overwrite") - def check(columns, expected): - if LooseVersion("0.21.1") <= LooseVersion(pd.__version__): - expected = pd.read_parquet(tmp, columns=columns) + def check(columns): + expected = pd.read_parquet(tmp, columns=columns) actual = ps.read_parquet(tmp, columns=columns) self.assertPandasEqual(expected, actual.to_pandas()) - check(None, data) - check(["i32", "i64"], data[["i32", "i64"]]) - check(["i64", "i32"], data[["i64", "i32"]]) - - if LooseVersion(pa.__version__) < LooseVersion("1.0.0"): - # TODO: `pd.read_parquet()` changed the behavior due to PyArrow 1.0.0. - # We might want to adjust the behavior. Let's see how pandas handles it. - check(("i32", "i64"), data[["i32", "i64"]]) - check(["a", "b", "i32", "i64"], data[["i32", "i64"]]) - check([], pd.DataFrame([])) - check(["a"], pd.DataFrame([])) - check("i32", pd.DataFrame([])) - check("float", data[["f"]]) + check(None) + check(["i32", "i64"]) + check(["i64", "i32"]) # check with pyspark patch. 
- if LooseVersion("0.21.1") <= LooseVersion(pd.__version__): - expected = pd.read_parquet(tmp) - else: - expected = data + expected = pd.read_parquet(tmp) actual = ps.read_parquet(tmp) self.assertPandasEqual(expected, actual.to_pandas()) diff --git a/python/pyspark/pandas/tests/test_expanding.py b/python/pyspark/pandas/tests/test_expanding.py index d52ccba..ef0b120 100644 --- a/python/pyspark/pandas/tests/test_expanding.py +++ b/python/pyspark/pandas/tests/test_expanding.py @@ -67,39 +67,7 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils): self.assertEqual(repr(ps.range(10).expanding(5)), "Expanding [min_periods=5]") def test_expanding_count(self): - # The behaviour of Expanding.count are different between pandas>=1.0.0 and lower, - # and we're following the behaviour of latest version of pandas. - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self._test_expanding_func("count") - else: - # Series - idx = np.random.rand(3) - psser = ps.Series([1, 2, 3], index=idx, name="a") - expected_result = pd.Series([None, 2.0, 3.0], index=idx, name="a") - self.assert_eq(psser.expanding(2).count().sort_index(), expected_result.sort_index()) - self.assert_eq(psser.expanding(2).count().sum(), expected_result.sum()) - - # MultiIndex - midx = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) - psser = ps.Series([1, 2, 3], index=midx, name="a") - expected_result = pd.Series([None, 2.0, 3.0], index=midx, name="a") - self.assert_eq(psser.expanding(2).count().sort_index(), expected_result.sort_index()) - - # DataFrame - psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) - expected_result = pd.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]}) - self.assert_eq(psdf.expanding(2).count().sort_index(), expected_result.sort_index()) - self.assert_eq(psdf.expanding(2).count().sum(), expected_result.sum()) - - # MultiIndex columns - idx = np.random.rand(4) - psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, 
index=idx) - psdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - expected_result = pd.DataFrame( - {("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]}, - index=idx, - ) - self.assert_eq(psdf.expanding(2).count().sort_index(), expected_result.sort_index()) + self._test_expanding_func("count") def test_expanding_min(self): self._test_expanding_func("min") @@ -219,99 +187,7 @@ class ExpandingTest(PandasOnSparkTestCase, TestUtils): ) def test_groupby_expanding_count(self): - # The behaviour of ExpandingGroupby.count are different between pandas>=1.0.0 and lower, - # and we're following the behaviour of latest version of pandas. - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self._test_groupby_expanding_func("count") - else: - # Series - psser = ps.Series([1, 2, 3, 2], index=np.random.rand(4)) - midx = pd.MultiIndex.from_tuples( - list(zip(psser.to_pandas().values, psser.index.to_pandas().values)) - ) - expected_result = pd.Series([np.nan, np.nan, np.nan, 2], index=midx) - self.assert_eq( - psser.groupby(psser).expanding(2).count().sort_index(), expected_result.sort_index() - ) - self.assert_eq(psser.groupby(psser).expanding(2).count().sum(), expected_result.sum()) - - # MultiIndex - psser = ps.Series( - [1, 2, 3, 2], - index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z"), ("a", "y")]), - ) - midx = pd.MultiIndex.from_tuples( - [(1, "a", "x"), (2, "a", "y"), (3, "b", "z"), (2, "a", "y")] - ) - expected_result = pd.Series([np.nan, np.nan, np.nan, 2], index=midx) - self.assert_eq( - psser.groupby(psser).expanding(2).count().sort_index(), expected_result.sort_index() - ) - - # DataFrame - psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) - midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)], names=["a", None]) - expected_result = pd.DataFrame( - {"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx - ) - self.assert_eq( - 
psdf.groupby(psdf.a).expanding(2).count().sort_index(), expected_result.sort_index() - ) - self.assert_eq(psdf.groupby(psdf.a).expanding(2).count().sum(), expected_result.sum()) - expected_result = pd.DataFrame( - {"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, - index=pd.MultiIndex.from_tuples( - [(2, 0), (3, 1), (3, 3), (4, 2)], names=["a", None] - ), - ) - self.assert_eq( - psdf.groupby(psdf.a + 1).expanding(2).count().sort_index(), - expected_result.sort_index(), - ) - expected_result = pd.Series([None, None, 2.0, None], index=midx, name="b") - self.assert_eq( - psdf.b.groupby(psdf.a).expanding(2).count().sort_index(), - expected_result.sort_index(), - ) - self.assert_eq( - psdf.groupby(psdf.a)["b"].expanding(2).count().sort_index(), - expected_result.sort_index(), - ) - expected_result = pd.DataFrame({"b": [None, None, 2.0, None]}, index=midx) - self.assert_eq( - psdf.groupby(psdf.a)[["b"]].expanding(2).count().sort_index(), - expected_result.sort_index(), - ) - - # MultiIndex column - psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) - psdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - midx = pd.MultiIndex.from_tuples( - [(1, 0), (2, 1), (2, 3), (3, 2)], names=[("a", "x"), None] - ) - expected_result = pd.DataFrame( - {"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx - ) - expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - self.assert_eq( - psdf.groupby(("a", "x")).expanding(2).count().sort_index(), - expected_result.sort_index(), - ) - midx = pd.MultiIndex.from_tuples( - [(1, 4.0, 0), (2, 1.0, 3), (2, 2.0, 1), (3, 3.0, 2)], - names=[("a", "x"), ("a", "y"), None], - ) - expected_result = pd.DataFrame( - { - ("a", "x"): [np.nan, np.nan, np.nan, np.nan], - ("a", "y"): [np.nan, np.nan, np.nan, np.nan], - }, - index=midx, - ) - self.assert_eq( - psdf.groupby([("a", "x"), ("a", "y")]).expanding(2).count().sort_index(), - expected_result.sort_index(), - ) + 
self._test_groupby_expanding_func("count") def test_groupby_expanding_min(self): self._test_groupby_expanding_func("min") diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 4e1c0d0..b966e79 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -212,11 +212,9 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(psdf.groupby((10, "a"))[(20, "c")].sum().sort_index(), expected) - if ( - LooseVersion(pd.__version__) >= LooseVersion("1.0.4") - and LooseVersion(pd.__version__) != LooseVersion("1.1.3") - and LooseVersion(pd.__version__) != LooseVersion("1.1.4") - ): + if LooseVersion(pd.__version__) != LooseVersion("1.1.3") and LooseVersion( + pd.__version__ + ) != LooseVersion("1.1.4"): self.assert_eq( psdf[(20, "c")].groupby(psdf[(10, "a")]).sum().sort_index(), pdf[(20, "c")].groupby(pdf[(10, "a")]).sum().sort_index(), @@ -512,7 +510,6 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): sorted_agg_pdf = pdf.groupby(("X", "A")).agg(aggfunc).sort_index() self.assert_eq(sorted_agg_psdf, sorted_agg_pdf) - @unittest.skipIf(pd.__version__ < "0.25.0", "not supported before pandas 0.25.0") def test_aggregate_relabel(self): # this is to test named aggregation in groupby pdf = pd.DataFrame({"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}) @@ -809,26 +806,16 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): pdf.columns = pd.MultiIndex.from_tuples([("y", "A"), ("y", "B"), ("x", "group")]) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - agg_pdf = pd.DataFrame( - {"a_max": [1, 3]}, index=pd.Index(["a", "b"], name=("x", "group")) - ) - elif LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - agg_pdf = pdf.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")).sort_index() + agg_pdf = pdf.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")).sort_index() agg_psdf = 
psdf.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")).sort_index() self.assert_eq(agg_pdf, agg_psdf) # same column, different methods - if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - agg_pdf = pd.DataFrame( - {"a_max": [1, 3], "a_min": [0, 2]}, index=pd.Index(["a", "b"], name=("x", "group")) - ) - elif LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - agg_pdf = ( - pdf.groupby(("x", "group")) - .agg(a_max=(("y", "A"), "max"), a_min=(("y", "A"), "min")) - .sort_index() - ) + agg_pdf = ( + pdf.groupby(("x", "group")) + .agg(a_max=(("y", "A"), "max"), a_min=(("y", "A"), "min")) + .sort_index() + ) agg_psdf = ( psdf.groupby(("x", "group")) .agg(a_max=(("y", "A"), "max"), a_min=(("y", "A"), "min")) @@ -837,16 +824,11 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): self.assert_eq(agg_pdf, agg_psdf) # different column, different methods - if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - agg_pdf = pd.DataFrame( - {"a_max": [6, 8], "a_min": [0, 2]}, index=pd.Index(["a", "b"], name=("x", "group")) - ) - elif LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - agg_pdf = ( - pdf.groupby(("x", "group")) - .agg(a_max=(("y", "B"), "max"), a_min=(("y", "A"), "min")) - .sort_index() - ) + agg_pdf = ( + pdf.groupby(("x", "group")) + .agg(a_max=(("y", "B"), "max"), a_min=(("y", "A"), "min")) + .sort_index() + ) agg_psdf = ( psdf.groupby(("x", "group")) .agg(a_max=(("y", "B"), "max"), a_min=(("y", "A"), "min")) @@ -1721,23 +1703,13 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): ) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq( - psdf.groupby("A").ffill().sort_index(), - pdf.groupby("A").ffill().sort_index().drop("A", 1), - ) - self.assert_eq( - psdf.groupby("A")[["B"]].ffill().sort_index(), - pdf.groupby("A")[["B"]].ffill().sort_index().drop("A", 1), - ) - else: - self.assert_eq( - psdf.groupby("A").ffill().sort_index(), pdf.groupby("A").ffill().sort_index() - ) - 
self.assert_eq( - psdf.groupby("A")[["B"]].ffill().sort_index(), - pdf.groupby("A")[["B"]].ffill().sort_index(), - ) + self.assert_eq( + psdf.groupby("A").ffill().sort_index(), pdf.groupby("A").ffill().sort_index() + ) + self.assert_eq( + psdf.groupby("A")[["B"]].ffill().sort_index(), + pdf.groupby("A")[["B"]].ffill().sort_index(), + ) self.assert_eq( psdf.groupby("A")["B"].ffill().sort_index(), pdf.groupby("A")["B"].ffill().sort_index() ) @@ -1750,16 +1722,10 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): pdf.columns = columns psdf.columns = columns - if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq( - psdf.groupby(("X", "A")).ffill().sort_index(), - pdf.groupby(("X", "A")).ffill().sort_index().drop(("X", "A"), 1), - ) - else: - self.assert_eq( - psdf.groupby(("X", "A")).ffill().sort_index(), - pdf.groupby(("X", "A")).ffill().sort_index(), - ) + self.assert_eq( + psdf.groupby(("X", "A")).ffill().sort_index(), + pdf.groupby(("X", "A")).ffill().sort_index(), + ) def test_bfill(self): idx = np.random.rand(4 * 3) @@ -1774,23 +1740,13 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): ) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq( - psdf.groupby("A").bfill().sort_index(), - pdf.groupby("A").bfill().sort_index().drop("A", 1), - ) - self.assert_eq( - psdf.groupby("A")[["B"]].bfill().sort_index(), - pdf.groupby("A")[["B"]].bfill().sort_index().drop("A", 1), - ) - else: - self.assert_eq( - psdf.groupby("A").bfill().sort_index(), pdf.groupby("A").bfill().sort_index() - ) - self.assert_eq( - psdf.groupby("A")[["B"]].bfill().sort_index(), - pdf.groupby("A")[["B"]].bfill().sort_index(), - ) + self.assert_eq( + psdf.groupby("A").bfill().sort_index(), pdf.groupby("A").bfill().sort_index() + ) + self.assert_eq( + psdf.groupby("A")[["B"]].bfill().sort_index(), + pdf.groupby("A")[["B"]].bfill().sort_index(), + ) self.assert_eq( psdf.groupby("A")["B"].bfill().sort_index(), 
pdf.groupby("A")["B"].bfill().sort_index(), @@ -1804,18 +1760,11 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): pdf.columns = columns psdf.columns = columns - if LooseVersion(pd.__version__) <= LooseVersion("0.24.2"): - self.assert_eq( - psdf.groupby(("X", "A")).bfill().sort_index(), - pdf.groupby(("X", "A")).bfill().sort_index().drop(("X", "A"), 1), - ) - else: - self.assert_eq( - psdf.groupby(("X", "A")).bfill().sort_index(), - pdf.groupby(("X", "A")).bfill().sort_index(), - ) + self.assert_eq( + psdf.groupby(("X", "A")).bfill().sort_index(), + pdf.groupby(("X", "A")).bfill().sort_index(), + ) - @unittest.skipIf(pd.__version__ < "0.24.0", "not supported before pandas 0.24.0") def test_shift(self): pdf = pd.DataFrame( { @@ -1849,13 +1798,6 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): psdf.groupby(psdf.b // 5)["a"].shift().sort_index(), pdf.groupby(pdf.b // 5)["a"].shift().sort_index(), ) - # TODO: known pandas' bug when fill_value is not None pandas>=1.0.0 - # https://github.com/pandas-dev/pandas/issues/31971#issue-565171762 - if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assert_eq( - psdf.groupby(["b"])[["a", "c"]].shift(periods=-1, fill_value=0).sort_index(), - pdf.groupby(["b"])[["a", "c"]].shift(periods=-1, fill_value=0).sort_index(), - ) self.assert_eq( psdf.a.rename().groupby(psdf.b).shift().sort_index(), pdf.a.rename().groupby(pdf.b).shift().sort_index(), diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index 2b00b3f..0b76e9e 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -1169,12 +1169,6 @@ class IndexingTest(PandasOnSparkTestCase): psser.loc["y"] = psser * 10 self.assert_eq(psser, pser) - if LooseVersion(pd.__version__) < LooseVersion("1.0"): - # TODO: seems like a pandas' bug in pandas>=1.0.0? 
- pser.loc[("x", "viper"):"y"] = pser * 20 - psser.loc[("x", "viper"):"y"] = psser * 20 - self.assert_eq(psser, pser) - def test_series_iloc_setitem(self): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) psdf = ps.from_pandas(pdf) diff --git a/python/pyspark/pandas/tests/test_numpy_compat.py b/python/pyspark/pandas/tests/test_numpy_compat.py index cefaf45..0d6a8fb 100644 --- a/python/pyspark/pandas/tests/test_numpy_compat.py +++ b/python/pyspark/pandas/tests/test_numpy_compat.py @@ -15,8 +15,6 @@ # limitations under the License. # -from distutils.version import LooseVersion - import numpy as np import pandas as pd @@ -65,10 +63,7 @@ class NumPyCompatTest(PandasOnSparkTestCase, SQLTestUtils): psdf = self.psdf pdf = self.pdf - if LooseVersion(pd.__version__) < LooseVersion("0.25"): - self.assert_eq(np.add(psdf.a, psdf.b), np.add(pdf.a, pdf.b).rename()) - else: - self.assert_eq(np.add(psdf.a, psdf.b), np.add(pdf.a, pdf.b)) + self.assert_eq(np.add(psdf.a, psdf.b), np.add(pdf.a, pdf.b)) psdf = self.psdf pdf = self.pdf @@ -114,12 +109,7 @@ class NumPyCompatTest(PandasOnSparkTestCase, SQLTestUtils): if np_name not in self.blacklist: try: # binary ufunc - if LooseVersion(pd.__version__) < LooseVersion("0.25"): - self.assert_eq( - np_func(pdf.a, pdf.b).rename(), np_func(psdf.a, psdf.b), almost=True - ) - else: - self.assert_eq(np_func(pdf.a, pdf.b), np_func(psdf.a, psdf.b), almost=True) + self.assert_eq(np_func(pdf.a, pdf.b), np_func(psdf.a, psdf.b), almost=True) self.assert_eq(np_func(pdf.a, 1), np_func(psdf.a, 1), almost=True) except Exception as e: raise AssertionError("Test in '%s' function was failed." 
% np_name) from e @@ -132,18 +122,11 @@ class NumPyCompatTest(PandasOnSparkTestCase, SQLTestUtils): if np_name not in self.blacklist: try: # binary ufunc - if LooseVersion(pd.__version__) < LooseVersion("0.25"): - self.assert_eq( - np_func(pdf.a, pdf2.b).sort_index().rename(), - np_func(psdf.a, psdf2.b).sort_index(), - almost=True, - ) - else: - self.assert_eq( - np_func(pdf.a, pdf2.b).sort_index(), - np_func(psdf.a, psdf2.b).sort_index(), - almost=True, - ) + self.assert_eq( + np_func(pdf.a, pdf2.b).sort_index(), + np_func(psdf.a, psdf2.b).sort_index(), + almost=True, + ) except Exception as e: raise AssertionError("Test in '%s' function was failed." % np_name) from e finally: diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 52cf5f5..0ae5cd0 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -359,9 +359,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): ) # DataFrame - if check_extension and ( - LooseVersion("1.0") <= LooseVersion(pd.__version__) < LooseVersion("1.1") - ): + if check_extension and LooseVersion(pd.__version__) < LooseVersion("1.1"): self.assert_eq( (psdf1 + psdf2 - psdf3).sort_index(), (pdf1 + pdf2 - pdf3).sort_index(), almost=True ) @@ -392,9 +390,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): ) # DataFrame - if check_extension and ( - LooseVersion("1.0") <= LooseVersion(pd.__version__) < LooseVersion("1.1") - ): + if check_extension and LooseVersion(pd.__version__) < LooseVersion("1.1"): self.assert_eq( (psdf1 + psdf2 - psdf3).sort_index(), (pdf1 + pdf2 - pdf3).sort_index(), almost=True ) @@ -420,26 +416,9 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): assert_eq((psser1 * psser2 * psser3).sort_index(), (pser1 * pser2 * pser3).sort_index()) if check_extension and not extension_float_dtypes_available: - if 
LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq( - (psser1 - psser2 / psser3).sort_index(), (pser1 - pser2 / pser3).sort_index() - ) - else: - expected = pd.Series( - [249.0, np.nan, 0.0, 0.88, np.nan, np.nan, np.nan, np.nan, np.nan, -np.inf] - + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - index=pd.MultiIndex( - [ - ["cow", "falcon", "koala", "koalas", "lama"], - ["length", "power", "speed", "weight"], - ], - [ - [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4], - [0, 1, 2, 2, 3, 0, 0, 1, 2, 3, 0, 0, 3, 3, 0, 2, 3], - ], - ), - ) - self.assert_eq((psser1 - psser2 / psser3).sort_index(), expected) + self.assert_eq( + (psser1 - psser2 / psser3).sort_index(), (pser1 - pser2 / pser3).sort_index() + ) else: assert_eq((psser1 - psser2 / psser3).sort_index(), (pser1 - pser2 / pser3).sort_index()) @@ -1665,13 +1644,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psidx1 * 10 + psidx2, pidx1 * 10 + pidx2) self.assert_eq(psidx1.rename(None) * 10 + psidx2, pidx1.rename(None) * 10 + pidx2) - - if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(psidx1 * 10 + psidx2.rename(None), pidx1 * 10 + pidx2.rename(None)) - else: - self.assert_eq( - psidx1 * 10 + psidx2.rename(None), (pidx1 * 10 + pidx2.rename(None)).rename(None) - ) + self.assert_eq(psidx1 * 10 + psidx2.rename(None), pidx1 * 10 + pidx2.rename(None)) pidx3 = pd.Index([11, 12, 13]) psidx3 = ps.from_pandas(pidx3) @@ -1689,11 +1662,7 @@ class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): psidx3 = ps.from_pandas(pidx3) self.assert_eq(psidx1 * 10 + psidx2, pidx1 * 10 + pidx2) - - if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(psidx1 * 10 + psidx3, pidx1 * 10 + pidx3) - else: - self.assert_eq(psidx1 * 10 + psidx3, (pidx1 * 10 + pidx3).rename(None)) + self.assert_eq(psidx1 * 10 + psidx3, pidx1 * 10 + pidx3) def test_align(self): pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", 
"c"]}, index=[10, 20, 30]) diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index 70c3089..3e8bcff 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -531,7 +531,6 @@ class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psdf.groupby(kkey).rank().sum(), pdf.groupby(pkey).rank().sum()) self.assert_eq(psdf.groupby(kkey)["a"].rank().sum(), pdf.groupby(pkey)["a"].rank().sum()) - @unittest.skipIf(pd.__version__ < "0.24.0", "not supported before pandas 0.24.0") def test_shift(self): pdf = pd.DataFrame( { diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py index 634cbd7..08f1774 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby_expanding.py @@ -17,7 +17,6 @@ from distutils.version import LooseVersion -import numpy as np import pandas as pd from pyspark import pandas as ps @@ -74,43 +73,7 @@ class OpsOnDiffFramesGroupByExpandingTest(PandasOnSparkTestCase, TestUtils): ) def test_groupby_expanding_count(self): - # The behaviour of ExpandingGroupby.count are different between pandas>=1.0.0 and lower, - # and we're following the behaviour of latest version of pandas. 
- if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self._test_groupby_expanding_func("count") - else: - # Series - psser = ps.Series([1, 2, 3]) - kkey = ps.Series([1, 2, 3], name="a") - midx = pd.MultiIndex.from_tuples( - list(zip(kkey.to_pandas().values, psser.index.to_pandas().values)), - names=["a", None], - ) - expected_result = pd.Series([np.nan, np.nan, np.nan], index=midx) - self.assert_eq( - psser.groupby(kkey).expanding(2).count().sort_index(), expected_result.sort_index() - ) - - # DataFrame - psdf = ps.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}) - kkey = ps.Series([1, 2, 3, 2], name="a") - midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)], names=["a", None]) - expected_result = pd.DataFrame( - {"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]}, index=midx - ) - self.assert_eq( - psdf.groupby(kkey).expanding(2).count().sort_index(), expected_result.sort_index() - ) - expected_result = pd.Series([None, None, 2.0, None], index=midx, name="b") - self.assert_eq( - psdf.groupby(kkey)["b"].expanding(2).count().sort_index(), - expected_result.sort_index(), - ) - expected_result = pd.DataFrame({"b": [None, None, 2.0, None]}, index=midx) - self.assert_eq( - psdf.groupby(kkey)[["b"]].expanding(2).count().sort_index(), - expected_result.sort_index(), - ) + self._test_groupby_expanding_func("count") def test_groupby_expanding_min(self): self._test_groupby_expanding_func("min") diff --git a/python/pyspark/pandas/tests/test_reshape.py b/python/pyspark/pandas/tests/test_reshape.py index f2a0cb1..5e5003a 100644 --- a/python/pyspark/pandas/tests/test_reshape.py +++ b/python/pyspark/pandas/tests/test_reshape.py @@ -213,11 +213,8 @@ class ReshapeTest(PandasOnSparkTestCase): ) psdf = ps.from_pandas(pdf) - if LooseVersion("0.23.0") <= LooseVersion(pd.__version__): - exp = pd.get_dummies(pdf, dtype="float64") - else: - exp = pd.get_dummies(pdf) - exp = exp.astype({"A_a": "float64", "A_b": "float64"}) + exp = 
pd.get_dummies(pdf) + exp = exp.astype({"A_a": "float64", "A_b": "float64"}) res = ps.get_dummies(psdf, dtype="float64") self.assert_eq(res, exp) diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 51c26ad..cec6a47 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -236,27 +236,15 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(ValueError, lambda: psser.rename_axis(["index2", "index3"])) self.assertRaises(TypeError, lambda: psser.rename_axis(mapper=["index2"], index=["index3"])) - # index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq( - pser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), - psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), - ) - - self.assert_eq( - pser.rename_axis(index=str.upper).sort_index(), - psser.rename_axis(index=str.upper).sort_index(), - ) - else: - expected = psser - expected.index.name = "index2" - result = psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index() - self.assert_eq(expected, result) + self.assert_eq( + pser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), + psser.rename_axis(index={"index": "index2", "missing": "index4"}).sort_index(), + ) - expected = psser - expected.index.name = "INDEX" - result = psser.rename_axis(index=str.upper).sort_index() - self.assert_eq(expected, result) + self.assert_eq( + pser.rename_axis(index=str.upper).sort_index(), + psser.rename_axis(index=str.upper).sort_index(), + ) index = pd.MultiIndex.from_tuples( [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"] @@ -271,32 +259,19 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assertRaises(ValueError, lambda: psser.rename_axis(["index3", "index4", "index5"])) - # 
index/columns parameters and dict_like/functions mappers introduced in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - self.assert_eq( - pser.rename_axis( - index={"index1": "index3", "index2": "index4", "missing": "index5"} - ).sort_index(), - psser.rename_axis( - index={"index1": "index3", "index2": "index4", "missing": "index5"} - ).sort_index(), - ) - - self.assert_eq( - pser.rename_axis(index=str.upper).sort_index(), - psser.rename_axis(index=str.upper).sort_index(), - ) - else: - expected = psser - expected.index.names = ["index3", "index4"] - result = psser.rename_axis( + self.assert_eq( + pser.rename_axis( + index={"index1": "index3", "index2": "index4", "missing": "index5"} + ).sort_index(), + psser.rename_axis( index={"index1": "index3", "index2": "index4", "missing": "index5"} - ).sort_index() - self.assert_eq(expected, result) + ).sort_index(), + ) - expected.index.names = ["INDEX1", "INDEX2"] - result = psser.rename_axis(index=str.upper).sort_index() - self.assert_eq(expected, result) + self.assert_eq( + pser.rename_axis(index=str.upper).sort_index(), + psser.rename_axis(index=str.upper).sort_index(), + ) def test_or(self): pdf = pd.DataFrame( @@ -792,45 +767,43 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): ) # Series with MultiIndex some of index is NaN. - # This test only available for pandas >= 0.24. 
- if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - pser.index = pd.MultiIndex.from_tuples( - [("x", "a"), None, ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] - ) - psser = ps.from_pandas(pser) + pser.index = pd.MultiIndex.from_tuples( + [("x", "a"), None, ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")] + ) + psser = ps.from_pandas(pser) - self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) - self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) - self.assert_eq( - psser.value_counts(normalize=True, dropna=False), - pser.value_counts(normalize=True, dropna=False), - ) - self.assert_eq( - psser.value_counts(ascending=True, dropna=False), - pser.value_counts(ascending=True, dropna=False), - ) + self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True)) + self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True)) + self.assert_eq( + psser.value_counts(normalize=True, dropna=False), + pser.value_counts(normalize=True, dropna=False), + ) + self.assert_eq( + psser.value_counts(ascending=True, dropna=False), + pser.value_counts(ascending=True, dropna=False), + ) - # FIXME: MultiIndex.value_counts returns wrong indices. - self.assert_eq( - psser.index.value_counts(normalize=True), - pser.index.value_counts(normalize=True), - almost=True, - ) - self.assert_eq( - psser.index.value_counts(ascending=True), - pser.index.value_counts(ascending=True), - almost=True, - ) - self.assert_eq( - psser.index.value_counts(normalize=True, dropna=False), - pser.index.value_counts(normalize=True, dropna=False), - almost=True, - ) - self.assert_eq( - psser.index.value_counts(ascending=True, dropna=False), - pser.index.value_counts(ascending=True, dropna=False), - almost=True, - ) + # FIXME: MultiIndex.value_counts returns wrong indices. 
+ self.assert_eq( + psser.index.value_counts(normalize=True), + pser.index.value_counts(normalize=True), + almost=True, + ) + self.assert_eq( + psser.index.value_counts(ascending=True), + pser.index.value_counts(ascending=True), + almost=True, + ) + self.assert_eq( + psser.index.value_counts(normalize=True, dropna=False), + pser.index.value_counts(normalize=True, dropna=False), + almost=True, + ) + self.assert_eq( + psser.index.value_counts(ascending=True, dropna=False), + pser.index.value_counts(ascending=True, dropna=False), + almost=True, + ) def test_nsmallest(self): sample_lst = [1, 2, 3, 4, np.nan, 6] @@ -1438,12 +1411,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psser.shift().shift(-1), pser.shift().shift(-1)) self.assert_eq(psser.shift().sum(), pser.shift().sum()) - if LooseVersion(pd.__version__) < LooseVersion("0.24.2"): - self.assert_eq(psser.shift(periods=2), pser.shift(periods=2)) - else: - self.assert_eq( - psser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0) - ) + self.assert_eq(psser.shift(periods=2, fill_value=0), pser.shift(periods=2, fill_value=0)) + with self.assertRaisesRegex(TypeError, "periods should be an int; however"): psser.shift(periods=1.5) @@ -1992,51 +1961,29 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): pser = pd.Series([100, None, 300, None, 500], name="Koalas") psser = ps.from_pandas(pser) - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - kdiv, kmod = psser.divmod(-100) - pdiv, pmod = pser.divmod(-100) - self.assert_eq(kdiv, pdiv) - self.assert_eq(kmod, pmod) + kdiv, kmod = psser.divmod(-100) + pdiv, pmod = pser.divmod(-100) + self.assert_eq(kdiv, pdiv) + self.assert_eq(kmod, pmod) - kdiv, kmod = psser.divmod(100) - pdiv, pmod = pser.divmod(100) - self.assert_eq(kdiv, pdiv) - self.assert_eq(kmod, pmod) - elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - kdiv, kmod = psser.divmod(-100) - pdiv, pmod = pser.floordiv(-100), pser.mod(-100) - 
self.assert_eq(kdiv, pdiv) - self.assert_eq(kmod, pmod) - - kdiv, kmod = psser.divmod(100) - pdiv, pmod = pser.floordiv(100), pser.mod(100) - self.assert_eq(kdiv, pdiv) - self.assert_eq(kmod, pmod) + kdiv, kmod = psser.divmod(100) + pdiv, pmod = pser.divmod(100) + self.assert_eq(kdiv, pdiv) + self.assert_eq(kmod, pmod) def test_rdivmod(self): pser = pd.Series([100, None, 300, None, 500]) psser = ps.from_pandas(pser) - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - krdiv, krmod = psser.rdivmod(-100) - prdiv, prmod = pser.rdivmod(-100) - self.assert_eq(krdiv, prdiv) - self.assert_eq(krmod, prmod) - - krdiv, krmod = psser.rdivmod(100) - prdiv, prmod = pser.rdivmod(100) - self.assert_eq(krdiv, prdiv) - self.assert_eq(krmod, prmod) - elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - krdiv, krmod = psser.rdivmod(-100) - prdiv, prmod = pser.rfloordiv(-100), pser.rmod(-100) - self.assert_eq(krdiv, prdiv) - self.assert_eq(krmod, prmod) + krdiv, krmod = psser.rdivmod(-100) + prdiv, prmod = pser.rdivmod(-100) + self.assert_eq(krdiv, prdiv) + self.assert_eq(krmod, prmod) - krdiv, krmod = psser.rdivmod(100) - prdiv, prmod = pser.rfloordiv(100), pser.rmod(100) - self.assert_eq(krdiv, prdiv) - self.assert_eq(krmod, prmod) + krdiv, krmod = psser.rdivmod(100) + prdiv, prmod = pser.rdivmod(100) + self.assert_eq(krdiv, prdiv) + self.assert_eq(krmod, prmod) def test_mod(self): pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas") @@ -2054,22 +2001,18 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): pser = pd.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan]) psser = ps.from_pandas(pser) self.assert_eq(psser.mode(), pser.mode()) - if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - # The `dropna` argument is added in pandas 0.24. 
- self.assert_eq( - psser.mode(dropna=False).sort_values().reset_index(drop=True), - pser.mode(dropna=False).sort_values().reset_index(drop=True), - ) + self.assert_eq( + psser.mode(dropna=False).sort_values().reset_index(drop=True), + pser.mode(dropna=False).sort_values().reset_index(drop=True), + ) pser.name = "x" psser = ps.from_pandas(pser) self.assert_eq(psser.mode(), pser.mode()) - if LooseVersion(pd.__version__) >= LooseVersion("0.24"): - # The `dropna` argument is added in pandas 0.24. - self.assert_eq( - psser.mode(dropna=False).sort_values().reset_index(drop=True), - pser.mode(dropna=False).sort_values().reset_index(drop=True), - ) + self.assert_eq( + psser.mode(dropna=False).sort_values().reset_index(drop=True), + pser.mode(dropna=False).sort_values().reset_index(drop=True), + ) def test_rmod(self): pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas") @@ -2206,16 +2149,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(pser.truediv(np.nan), psser.truediv(np.nan)) self.assert_eq(pser / np.nan, psser / np.nan) - # floordiv has different behavior in pandas > 1.0.0 when divide by 0 - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self.assert_eq(pser.floordiv(0), psser.floordiv(0)) - self.assert_eq(pser // 0, psser // 0) - else: - result = pd.Series( - [np.inf, np.nan, -np.inf, np.nan, np.inf, -np.inf, np.inf, -np.inf], name="Koalas" - ) - self.assert_eq(psser.floordiv(0), result) - self.assert_eq(psser // 0, result) + self.assert_eq(pser.floordiv(0), psser.floordiv(0)) + self.assert_eq(pser // 0, psser // 0) self.assert_eq(pser.floordiv(np.nan), psser.floordiv(np.nan)) def test_mad(self): @@ -2280,11 +2215,7 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): pser = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") psser = ps.from_pandas(pser) - # `to_markdown()` is supported in pandas >= 1.0.0 since it's newly added in pandas 1.0.0. 
- if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assertRaises(NotImplementedError, lambda: psser.to_markdown()) - else: - self.assert_eq(pser.to_markdown(), psser.to_markdown()) + self.assert_eq(pser.to_markdown(), psser.to_markdown()) def test_unstack(self): pser = pd.Series( @@ -2404,66 +2335,62 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(p_items, k_items) def test_droplevel(self): - # droplevel is new in pandas 0.24.0 - if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"): - pser = pd.Series( - [1, 2, 3], - index=pd.MultiIndex.from_tuples( - [("x", "a", "q"), ("x", "b", "w"), ("y", "c", "e")], - names=["level_1", "level_2", "level_3"], - ), - ) - psser = ps.from_pandas(pser) + pser = pd.Series( + [1, 2, 3], + index=pd.MultiIndex.from_tuples( + [("x", "a", "q"), ("x", "b", "w"), ("y", "c", "e")], + names=["level_1", "level_2", "level_3"], + ), + ) + psser = ps.from_pandas(pser) - self.assert_eq(pser.droplevel(0), psser.droplevel(0)) - self.assert_eq(pser.droplevel("level_1"), psser.droplevel("level_1")) - self.assert_eq(pser.droplevel(-1), psser.droplevel(-1)) - self.assert_eq(pser.droplevel([0]), psser.droplevel([0])) - self.assert_eq(pser.droplevel(["level_1"]), psser.droplevel(["level_1"])) - self.assert_eq(pser.droplevel((0,)), psser.droplevel((0,))) - self.assert_eq(pser.droplevel(("level_1",)), psser.droplevel(("level_1",))) - self.assert_eq(pser.droplevel([0, 2]), psser.droplevel([0, 2])) - self.assert_eq( - pser.droplevel(["level_1", "level_3"]), psser.droplevel(["level_1", "level_3"]) - ) - self.assert_eq(pser.droplevel((1, 2)), psser.droplevel((1, 2))) - self.assert_eq( - pser.droplevel(("level_2", "level_3")), psser.droplevel(("level_2", "level_3")) - ) + self.assert_eq(pser.droplevel(0), psser.droplevel(0)) + self.assert_eq(pser.droplevel("level_1"), psser.droplevel("level_1")) + self.assert_eq(pser.droplevel(-1), psser.droplevel(-1)) + self.assert_eq(pser.droplevel([0]), psser.droplevel([0])) + 
self.assert_eq(pser.droplevel(["level_1"]), psser.droplevel(["level_1"])) + self.assert_eq(pser.droplevel((0,)), psser.droplevel((0,))) + self.assert_eq(pser.droplevel(("level_1",)), psser.droplevel(("level_1",))) + self.assert_eq(pser.droplevel([0, 2]), psser.droplevel([0, 2])) + self.assert_eq( + pser.droplevel(["level_1", "level_3"]), psser.droplevel(["level_1", "level_3"]) + ) + self.assert_eq(pser.droplevel((1, 2)), psser.droplevel((1, 2))) + self.assert_eq( + pser.droplevel(("level_2", "level_3")), psser.droplevel(("level_2", "level_3")) + ) - with self.assertRaisesRegex(KeyError, "Level {0, 1, 2} not found"): - psser.droplevel({0, 1, 2}) - with self.assertRaisesRegex(KeyError, "Level level_100 not found"): - psser.droplevel(["level_1", "level_100"]) - with self.assertRaisesRegex( - IndexError, "Too many levels: Index has only 3 levels, not 11" - ): - psser.droplevel(10) - with self.assertRaisesRegex( - IndexError, - "Too many levels: Index has only 3 levels, -10 is not a valid level number", - ): - psser.droplevel(-10) - with self.assertRaisesRegex( - ValueError, - "Cannot remove 3 levels from an index with 3 levels: " - "at least one level must be left.", - ): - psser.droplevel([0, 1, 2]) - with self.assertRaisesRegex( - ValueError, - "Cannot remove 5 levels from an index with 3 levels: " - "at least one level must be left.", - ): - psser.droplevel([1, 1, 1, 1, 1]) + with self.assertRaisesRegex(KeyError, "Level {0, 1, 2} not found"): + psser.droplevel({0, 1, 2}) + with self.assertRaisesRegex(KeyError, "Level level_100 not found"): + psser.droplevel(["level_1", "level_100"]) + with self.assertRaisesRegex(IndexError, "Too many levels: Index has only 3 levels, not 11"): + psser.droplevel(10) + with self.assertRaisesRegex( + IndexError, + "Too many levels: Index has only 3 levels, -10 is not a valid level number", + ): + psser.droplevel(-10) + with self.assertRaisesRegex( + ValueError, + "Cannot remove 3 levels from an index with 3 levels: " + "at least one 
level must be left.", + ): + psser.droplevel([0, 1, 2]) + with self.assertRaisesRegex( + ValueError, + "Cannot remove 5 levels from an index with 3 levels: " + "at least one level must be left.", + ): + psser.droplevel([1, 1, 1, 1, 1]) - # Tupled names - pser.index.names = [("a", "1"), ("b", "2"), ("c", "3")] - psser = ps.from_pandas(pser) + # Tupled names + pser.index.names = [("a", "1"), ("b", "2"), ("c", "3")] + psser = ps.from_pandas(pser) - self.assert_eq( - pser.droplevel([("a", "1"), ("c", "3")]), psser.droplevel([("a", "1"), ("c", "3")]) - ) + self.assert_eq( + pser.droplevel([("a", "1"), ("c", "3")]), psser.droplevel([("a", "1"), ("c", "3")]) + ) def test_dot(self): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -2684,22 +2611,20 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): # Deals with na_sentinel # # pandas >= 1.1.2 support na_sentinel=None - # pandas >= 0.24 support na_sentinel not to be -1 # pd_below_1_1_2 = LooseVersion(pd.__version__) < LooseVersion("1.1.2") - pd_below_0_24 = LooseVersion(pd.__version__) < LooseVersion("0.24") pser = pd.Series(["a", "b", "a", np.nan, None]) psser = ps.from_pandas(pser) pcodes, puniques = pser.factorize(sort=True, na_sentinel=-2) kcodes, kuniques = psser.factorize(na_sentinel=-2) - self.assert_eq([0, 1, 0, -2, -2] if pd_below_0_24 else pcodes.tolist(), kcodes.to_list()) + self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) pcodes, puniques = pser.factorize(sort=True, na_sentinel=2) kcodes, kuniques = psser.factorize(na_sentinel=2) - self.assert_eq([0, 1, 0, 2, 2] if pd_below_0_24 else pcodes.tolist(), kcodes.to_list()) + self.assert_eq(pcodes.tolist(), kcodes.to_list()) self.assert_eq(puniques, kuniques) if not pd_below_1_1_2: @@ -2736,50 +2661,19 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(expected, psser) def test_explode(self): - if LooseVersion(pd.__version__) >= LooseVersion("0.25"): - pser = pd.Series([[1, 2, 3], [], None, [3, 
4]]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), psser.explode(), almost=True) - - # MultiIndex - pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), psser.explode(), almost=True) - - # non-array type Series - pser = pd.Series([1, 2, 3, 4]) - psser = ps.from_pandas(pser) - self.assert_eq(pser.explode(), psser.explode()) - else: - pser = pd.Series([[1, 2, 3], [], None, [3, 4]]) - psser = ps.from_pandas(pser) - expected = pd.Series([1.0, 2.0, 3.0, None, None, 3.0, 4.0], index=[0, 0, 0, 1, 2, 3, 3]) - self.assert_eq(psser.explode(), expected) + pser = pd.Series([[1, 2, 3], [], None, [3, 4]]) + psser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), psser.explode(), almost=True) - # MultiIndex - pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) - psser = ps.from_pandas(pser) - expected = pd.Series( - [1.0, 2.0, 3.0, None, None, 3.0, 4.0], - index=pd.MultiIndex.from_tuples( - [ - ("a", "w"), - ("a", "w"), - ("a", "w"), - ("b", "x"), - ("c", "y"), - ("d", "z"), - ("d", "z"), - ] - ), - ) - self.assert_eq(psser.explode(), expected) + # MultiIndex + pser.index = pd.MultiIndex.from_tuples([("a", "w"), ("b", "x"), ("c", "y"), ("d", "z")]) + psser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), psser.explode(), almost=True) - # non-array type Series - pser = pd.Series([1, 2, 3, 4]) - psser = ps.from_pandas(pser) - expected = pser - self.assert_eq(psser.explode(), expected) + # non-array type Series + pser = pd.Series([1, 2, 3, 4]) + psser = ps.from_pandas(pser) + self.assert_eq(pser.explode(), psser.explode()) def test_argsort(self): # Without null values @@ -2888,36 +2782,20 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): ) psser = ps.from_pandas(pser) - if LooseVersion(pd.__version__) >= LooseVersion("1.0"): - self.assert_eq(pser.argmin(), psser.argmin()) - self.assert_eq(pser.argmax(), 
psser.argmax()) + self.assert_eq(pser.argmin(), psser.argmin()) + self.assert_eq(pser.argmax(), psser.argmax()) - # MultiIndex - pser.index = pd.MultiIndex.from_tuples( - [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] - ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.argmin(), psser.argmin()) - self.assert_eq(pser.argmax(), psser.argmax()) - - # Null Series - self.assert_eq(pd.Series([np.nan]).argmin(), ps.Series([np.nan]).argmin()) - self.assert_eq(pd.Series([np.nan]).argmax(), ps.Series([np.nan]).argmax()) - else: - self.assert_eq(pser.values.argmin(), psser.argmin()) - self.assert_eq(pser.values.argmax(), psser.argmax()) - - # MultiIndex - pser.index = pd.MultiIndex.from_tuples( - [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] - ) - psser = ps.from_pandas(pser) - self.assert_eq(pser.values.argmin(), psser.argmin()) - self.assert_eq(pser.values.argmax(), psser.argmax()) + # MultiIndex + pser.index = pd.MultiIndex.from_tuples( + [("a", "t"), ("b", "u"), ("c", "v"), ("d", "w"), ("e", "x"), ("f", "u")] + ) + psser = ps.from_pandas(pser) + self.assert_eq(pser.argmin(), psser.argmin()) + self.assert_eq(pser.argmax(), psser.argmax()) - # Null Series - self.assert_eq(-1, ps.Series([np.nan]).argmin()) - self.assert_eq(-1, ps.Series([np.nan]).argmax()) + # Null Series + self.assert_eq(pd.Series([np.nan]).argmin(), ps.Series([np.nan]).argmin()) + self.assert_eq(pd.Series([np.nan]).argmax(), ps.Series([np.nan]).argmax()) with self.assertRaisesRegex(ValueError, "attempt to get argmin of an empty sequence"): ps.Series([]).argmin() diff --git a/python/pyspark/pandas/tests/test_series_conversion.py b/python/pyspark/pandas/tests/test_series_conversion.py index 6d676dd..1e5c5cf 100644 --- a/python/pyspark/pandas/tests/test_series_conversion.py +++ b/python/pyspark/pandas/tests/test_series_conversion.py @@ -16,7 +16,6 @@ # import unittest -from distutils.version import LooseVersion import pandas as pd @@ -58,10 +57,6 @@ 
class SeriesConversionTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psser.to_latex(sparsify=False), pser.to_latex(sparsify=False)) self.assert_eq(psser.to_latex(index_names=False), pser.to_latex(index_names=False)) self.assert_eq(psser.to_latex(bold_rows=True), pser.to_latex(bold_rows=True)) - # Can't specifying `encoding` without specifying `buf` as filename in pandas >= 1.0.0 - # https://github.com/pandas-dev/pandas/blob/master/pandas/io/formats/format.py#L492-L495 - if LooseVersion(pd.__version__) < LooseVersion("1.0.0"): - self.assert_eq(psser.to_latex(encoding="ascii"), pser.to_latex(encoding="ascii")) self.assert_eq(psser.to_latex(decimal=","), pser.to_latex(decimal=",")) diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 2136695..eef1616 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -15,8 +15,6 @@ # limitations under the License. # -from distutils.version import LooseVersion - import numpy as np import pandas as pd @@ -341,14 +339,8 @@ class StatsTest(PandasOnSparkTestCase, SQLTestUtils): ) self.assert_eq(psdf.count(numeric_only=True), pdf.count(numeric_only=True)) - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self.assert_eq(psdf.sum(numeric_only=True), pdf.sum(numeric_only=True)) - self.assert_eq(psdf.product(numeric_only=True), pdf.product(numeric_only=True)) - else: - self.assert_eq(psdf.sum(numeric_only=True), pdf.sum(numeric_only=True).astype(int)) - self.assert_eq( - psdf.product(numeric_only=True), pdf.product(numeric_only=True).astype(int) - ) + self.assert_eq(psdf.sum(numeric_only=True), pdf.sum(numeric_only=True)) + self.assert_eq(psdf.product(numeric_only=True), pdf.product(numeric_only=True)) self.assert_eq(psdf.mean(numeric_only=True), pdf.mean(numeric_only=True)) @@ -395,17 +387,10 @@ class StatsTest(PandasOnSparkTestCase, SQLTestUtils): pdf = pd.DataFrame({"i": [0, 1, 2], "b": [False, False, True], "s": 
["x", "y", "z"]}) psdf = ps.from_pandas(pdf) - if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"): - self.assert_eq(psdf.sum(numeric_only=True), pdf.sum(numeric_only=True)) - self.assert_eq( - psdf[["i", "b"]].sum(numeric_only=False), pdf[["i", "b"]].sum(numeric_only=False) - ) - else: - self.assert_eq(psdf.sum(numeric_only=True), pdf.sum(numeric_only=True).astype(int)) - self.assert_eq( - psdf[["i", "b"]].sum(numeric_only=False), - pdf[["i", "b"]].sum(numeric_only=False).astype(int), - ) + self.assert_eq(psdf.sum(numeric_only=True), pdf.sum(numeric_only=True)) + self.assert_eq( + psdf[["i", "b"]].sum(numeric_only=False), pdf[["i", "b"]].sum(numeric_only=False) + ) with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"): psdf.sum(numeric_only=False) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org