This is an automated email from the ASF dual-hosted git repository.

ibzib pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push: new fc7df4b [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0" new aa1c8e5 Merge pull request #15385 from ibzib/rollback-dataframes fc7df4b is described below commit fc7df4b97c571ff15dd5c388051fca1bf613665d Author: Kyle Weaver <kcwea...@google.com> AuthorDate: Tue Aug 24 16:13:39 2021 -0700 [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0" This reverts commit faac725e98f8422b172a77e8c898af03c15b74c6. --- sdks/python/apache_beam/dataframe/frames.py | 21 +++----- sdks/python/apache_beam/dataframe/frames_test.py | 46 ++--------------- .../apache_beam/dataframe/pandas_doctests_test.py | 58 ++++------------------ sdks/python/setup.py | 2 +- 4 files changed, 22 insertions(+), 105 deletions(-) diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py index 45ae8c6..b834d9c 100644 --- a/sdks/python/apache_beam/dataframe/frames.py +++ b/sdks/python/apache_beam/dataframe/frames.py @@ -55,9 +55,6 @@ __all__ = [ 'DeferredDataFrame', ] -# Get major, minor version -PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2])) - def populate_not_implemented(pd_type): def wrapper(deferred_type): @@ -1935,7 +1932,7 @@ class DeferredSeries(DeferredDataFrameOrSeries): else: column = self - result = column.groupby(column, dropna=dropna).size() + result = column.groupby(column).size() # groupby.size() names the index, which we don't need result.index.name = None @@ -2395,8 +2392,8 @@ class DeferredDataFrame(DeferredDataFrameOrSeries): if func in ('quantile',): return getattr(self, func)(*args, axis=axis, **kwargs) - # In pandas<1.3.0, maps to a property, args are ignored - if func in ('size',) and PD_VERSION < (1, 3): + # Maps to a property, args are ignored + if func in ('size',): return getattr(self, func) # We also have specialized distributed implementations for 
these. They only @@ -3395,7 +3392,7 @@ class DeferredDataFrame(DeferredDataFrameOrSeries): @frame_base.with_docs_from(pd.DataFrame) def value_counts(self, subset=None, sort=False, normalize=False, - ascending=False, dropna=True): + ascending=False): """``sort`` is ``False`` by default, and ``sort=True`` is not supported because it imposes an ordering on the dataset which likely will not be preserved.""" @@ -3406,16 +3403,10 @@ class DeferredDataFrame(DeferredDataFrameOrSeries): "ordering on the dataset which likely will not be preserved.", reason="order-sensitive") columns = subset or list(self.columns) - - if dropna: - dropped = self.dropna() - else: - dropped = self - - result = dropped.groupby(columns, dropna=dropna).size() + result = self.groupby(columns).size() if normalize: - return result/dropped.length() + return result/self.dropna().length() else: return result diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index a2703d8..c3972ad 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -25,8 +25,7 @@ from apache_beam.dataframe import expressions from apache_beam.dataframe import frame_base from apache_beam.dataframe import frames -# Get major, minor version -PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2])) +PD_VERSION = tuple(map(int, pd.__version__.split('.'))) GROUPBY_DF = pd.DataFrame({ 'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)], @@ -236,17 +235,6 @@ class DeferredFrameTest(_AbstractFrameTest): self._run_test( lambda df, df2: df.subtract(2).multiply(df2).divide(df), df, df2) - @unittest.skipIf(PD_VERSION < (1, 3), "dropna=False is new in pandas 1.3") - def test_value_counts_dropna_false(self): - df = pd.DataFrame({ - 'first_name': ['John', 'Anne', 'John', 'Beth'], - 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise'] - }) - # TODO(BEAM-12495): Remove the assertRaises this when the 
underlying bug in - # https://github.com/pandas-dev/pandas/issues/36470 is fixed. - with self.assertRaises(NotImplementedError): - self._run_test(lambda df: df.value_counts(dropna=False), df) - def test_get_column(self): df = pd.DataFrame({ 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], @@ -381,15 +369,10 @@ class DeferredFrameTest(_AbstractFrameTest): nonparallel=True) def test_combine_Series(self): - s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) - s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) - self._run_test( - lambda s1, - s2: s1.combine(s2, max), - s1, - s2, - nonparallel=True, - check_proxy=False) + with expressions.allow_non_parallel_operations(): + s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0}) + s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0}) + self._run_test(lambda s1, s2: s1.combine(s2, max), s1, s2) def test_combine_first_dataframe(self): df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) @@ -604,27 +587,8 @@ class DeferredFrameTest(_AbstractFrameTest): self._run_test(lambda df: df.value_counts(), df) self._run_test(lambda df: df.value_counts(normalize=True), df) - if PD_VERSION >= (1, 3): - # dropna=False is new in pandas 1.3 - # TODO(BEAM-12495): Remove the assertRaises this when the underlying bug - # in https://github.com/pandas-dev/pandas/issues/36470 is fixed. - with self.assertRaises(NotImplementedError): - self._run_test(lambda df: df.value_counts(dropna=False), df) - - # Test the defaults. self._run_test(lambda df: df.num_wings.value_counts(), df) self._run_test(lambda df: df.num_wings.value_counts(normalize=True), df) - self._run_test(lambda df: df.num_wings.value_counts(dropna=False), df) - - # Test the combination interactions. 
- for normalize in (True, False): - for dropna in (True, False): - self._run_test( - lambda df, - dropna=dropna, - normalize=normalize: df.num_wings.value_counts( - dropna=dropna, normalize=normalize), - df) def test_value_counts_does_not_support_sort(self): df = pd.DataFrame({ diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py index 755e4e5..edc42f1 100644 --- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py +++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py @@ -20,7 +20,6 @@ import unittest import pandas as pd from apache_beam.dataframe import doctests -from apache_beam.dataframe.frames import PD_VERSION from apache_beam.dataframe.pandas_top_level_functions import _is_top_level_function @@ -69,8 +68,7 @@ class DoctestTest(unittest.TestCase): "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})" ], 'pandas.core.generic.NDFrame.fillna': [ - 'df.fillna(method=\'ffill\')', - 'df.fillna(method="ffill")', + "df.fillna(method='ffill')", 'df.fillna(value=values, limit=1)', ], 'pandas.core.generic.NDFrame.sort_values': ['*'], @@ -166,8 +164,7 @@ class DoctestTest(unittest.TestCase): 'pandas.core.frame.DataFrame.cumprod': ['*'], 'pandas.core.frame.DataFrame.diff': ['*'], 'pandas.core.frame.DataFrame.fillna': [ - 'df.fillna(method=\'ffill\')', - 'df.fillna(method="ffill")', + "df.fillna(method='ffill')", 'df.fillna(value=values, limit=1)', ], 'pandas.core.frame.DataFrame.items': ['*'], @@ -240,8 +237,6 @@ class DoctestTest(unittest.TestCase): # reindex not supported 's2 = s.reindex([1, 0, 2, 3])', ], - 'pandas.core.frame.DataFrame.resample': ['*'], - 'pandas.core.frame.DataFrame.values': ['*'], }, not_implemented_ok={ 'pandas.core.frame.DataFrame.transform': [ @@ -249,8 +244,6 @@ class DoctestTest(unittest.TestCase): # frames_test.py::DeferredFrameTest::test_groupby_transform_sum "df.groupby('Date')['Data'].transform('sum')", ], - 'pandas.core.frame.DataFrame.swaplevel': 
['*'], - 'pandas.core.frame.DataFrame.melt': ['*'], 'pandas.core.frame.DataFrame.reindex_axis': ['*'], 'pandas.core.frame.DataFrame.round': [ 'df.round(decimals)', @@ -274,11 +267,6 @@ class DoctestTest(unittest.TestCase): 'pandas.core.frame.DataFrame.set_index': [ "df.set_index([s, s**2])", ], - - # TODO(BEAM-12495) - 'pandas.core.frame.DataFrame.value_counts': [ - 'df.value_counts(dropna=False)' - ], }, skip={ # s2 created with reindex @@ -286,8 +274,6 @@ class DoctestTest(unittest.TestCase): 'df.dot(s2)', ], - 'pandas.core.frame.DataFrame.resample': ['df'], - 'pandas.core.frame.DataFrame.asfreq': ['*'], # Throws NotImplementedError when modifying df 'pandas.core.frame.DataFrame.axes': [ # Returns deferred index. @@ -316,14 +302,6 @@ class DoctestTest(unittest.TestCase): 'pandas.core.frame.DataFrame.to_markdown': ['*'], 'pandas.core.frame.DataFrame.to_parquet': ['*'], - # Raises right exception, but testing framework has matching issues. - # Tested in `frames_test.py`. - 'pandas.core.frame.DataFrame.insert': [ - 'df', - 'df.insert(1, "newcol", [99, 99])', - 'df.insert(0, "col1", [100, 100], allow_duplicates=True)' - ], - 'pandas.core.frame.DataFrame.to_records': [ 'df.index = df.index.rename("I")', 'index_dtypes = f"<S{df.index.str.len().max()}"', # 1.x @@ -407,8 +385,7 @@ class DoctestTest(unittest.TestCase): 's.dot(arr)', # non-deferred result ], 'pandas.core.series.Series.fillna': [ - 'df.fillna(method=\'ffill\')', - 'df.fillna(method="ffill")', + "df.fillna(method='ffill')", 'df.fillna(value=values, limit=1)', ], 'pandas.core.series.Series.items': ['*'], @@ -457,11 +434,11 @@ class DoctestTest(unittest.TestCase): 's.drop_duplicates()', "s.drop_duplicates(keep='last')", ], + 'pandas.core.series.Series.repeat': [ + 's.repeat([1, 2, 3])' + ], 'pandas.core.series.Series.reindex': ['*'], 'pandas.core.series.Series.autocorr': ['*'], - 'pandas.core.series.Series.repeat': ['s.repeat([1, 2, 3])'], - 'pandas.core.series.Series.resample': ['*'], - 
'pandas.core.series.Series': ['ser.iloc[0] = 999'], }, not_implemented_ok={ 'pandas.core.series.Series.transform': [ @@ -474,11 +451,8 @@ class DoctestTest(unittest.TestCase): 'ser.groupby(["a", "b", "a", np.nan]).mean()', 'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()', ], - 'pandas.core.series.Series.swaplevel' :['*'] }, skip={ - # Relies on setting values with iloc - 'pandas.core.series.Series': ['ser', 'r'], 'pandas.core.series.Series.groupby': [ # TODO(BEAM-11393): This example requires aligning two series # with non-unique indexes. It only works in pandas because @@ -486,7 +460,6 @@ class DoctestTest(unittest.TestCase): # alignment. 'ser.groupby(ser > 100).mean()', ], - 'pandas.core.series.Series.asfreq': ['*'], # error formatting 'pandas.core.series.Series.append': [ 's1.append(s2, verify_integrity=True)', @@ -518,12 +491,12 @@ class DoctestTest(unittest.TestCase): # Inspection after modification. 's' ], - 'pandas.core.series.Series.resample': ['df'], }) self.assertEqual(result.failed, 0) def test_string_tests(self): - if PD_VERSION < (1, 2): + PD_VERSION = tuple(int(v) for v in pd.__version__.split('.')) + if PD_VERSION < (1, 2, 0): module = pd.core.strings else: # Definitions were moved to accessor in pandas 1.2.0 @@ -695,13 +668,11 @@ class DoctestTest(unittest.TestCase): 'pandas.core.groupby.generic.SeriesGroupBy.diff': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.hist': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.fillna': [ - 'df.fillna(method=\'ffill\')', - 'df.fillna(method="ffill")', + "df.fillna(method='ffill')", 'df.fillna(value=values, limit=1)', ], 'pandas.core.groupby.generic.SeriesGroupBy.fillna': [ - 'df.fillna(method=\'ffill\')', - 'df.fillna(method="ffill")', + "df.fillna(method='ffill')", 'df.fillna(value=values, limit=1)', ], }, @@ -711,7 +682,6 @@ class DoctestTest(unittest.TestCase): 'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'], 'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'], 
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'], - 'pandas.core.groupby.generic.SeriesGroupBy.apply': ['*'], }, skip={ 'pandas.core.groupby.generic.SeriesGroupBy.cov': [ @@ -728,14 +698,6 @@ class DoctestTest(unittest.TestCase): # These examples rely on grouping by a list 'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'], 'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'], - 'pandas.core.groupby.generic.SeriesGroupBy.transform': [ - # Dropping invalid columns during a transform is unsupported. - 'grouped.transform(lambda x: (x - x.mean()) / x.std())' - ], - 'pandas.core.groupby.generic.DataFrameGroupBy.transform': [ - # Dropping invalid columns during a transform is unsupported. - 'grouped.transform(lambda x: (x - x.mean()) / x.std())' - ], }) self.assertEqual(result.failed, 0) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 338251d..f4e02b8 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -166,7 +166,7 @@ if sys.platform == 'win32' and sys.maxsize <= 2**32: REQUIRED_TEST_PACKAGES = [ 'freezegun>=0.3.12', 'mock>=1.0.1,<3.0.0', - 'pandas>=1.0,<1.4.0', + 'pandas>=1.0,<1.3.0', 'parameterized>=0.7.1,<0.8.0', 'pyhamcrest>=1.9,!=1.10.0,<2.0.0', 'pyyaml>=3.12,<6.0.0',