This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 53cbaeb20293 [SPARK-46936][PS] Implement `Frame.to_feather`
53cbaeb20293 is described below

commit 53cbaeb202931a918864ca4aecd9826144ad8307
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Fri Feb 2 09:04:26 2024 +0800

    [SPARK-46936][PS] Implement `Frame.to_feather`

    ### What changes were proposed in this pull request?
    Implement `Frame.to_feather`

    ### Why are the changes needed?
    for pandas parity

    ### Does this PR introduce _any_ user-facing change?
    yes

    ```
    In [3]: pdf = pd.DataFrame(
       ...:     [[1, 1.0, "a"]],
       ...:     columns=["x", "y", "z"],
       ...: )

    In [4]: pdf
    Out[4]:
       x    y  z
    0  1  1.0  a

    In [5]: psdf = ps.from_pandas(pdf)

    In [6]: psdf
       x    y  z
    0  1  1.0  a

    In [7]: pdf.to_feather("/tmp/file1.feather")

    In [8]: psdf.to_feather("/tmp/file2.feather")

    In [9]: f1 = pd.read_feather("/tmp/file1.feather")

    In [10]: f1
    Out[10]:
       x    y  z
    0  1  1.0  a

    In [11]: f2 = pd.read_feather("/tmp/file2.feather")

    In [12]: f2
    Out[12]:
       x    y  z
    0  1  1.0  a
    ```

    ### How was this patch tested?
    added ut

    ### Was this patch authored or co-authored using generative AI tooling?
    no

    Closes #44972 from zhengruifeng/ps_to_feather.

Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- dev/sparktestsupport/modules.py | 2 + .../docs/source/reference/pyspark.pandas/frame.rst | 1 + python/pyspark/pandas/frame.py | 35 +++++++++++ python/pyspark/pandas/missing/frame.py | 1 - .../pandas/tests/connect/io/test_parity_feather.py | 42 +++++++++++++ python/pyspark/pandas/tests/io/test_feather.py | 68 ++++++++++++++++++++++ 6 files changed, 148 insertions(+), 1 deletion(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 508cf56b9c87..233dcf4e54b6 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -816,6 +816,7 @@ pyspark_pandas = Module( "pyspark.pandas.tests.series.test_stat", "pyspark.pandas.tests.io.test_io", "pyspark.pandas.tests.io.test_csv", + "pyspark.pandas.tests.io.test_feather", "pyspark.pandas.tests.io.test_dataframe_conversion", "pyspark.pandas.tests.io.test_dataframe_spark_io", "pyspark.pandas.tests.io.test_series_conversion", @@ -1297,6 +1298,7 @@ pyspark_pandas_connect_part3 = Module( # pandas-on-Spark unittests "pyspark.pandas.tests.connect.io.test_parity_io", "pyspark.pandas.tests.connect.io.test_parity_csv", + "pyspark.pandas.tests.connect.io.test_parity_feather", "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion", "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io", "pyspark.pandas.tests.connect.io.test_parity_series_conversion", diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst index 77b60468b8fb..564ddb607a19 100644 --- a/python/docs/source/reference/pyspark.pandas/frame.rst +++ b/python/docs/source/reference/pyspark.pandas/frame.rst @@ -283,6 +283,7 @@ Serialization / IO / Conversion DataFrame.to_numpy DataFrame.to_spark DataFrame.to_string + DataFrame.to_feather DataFrame.to_json DataFrame.to_dict DataFrame.to_excel diff --git a/python/pyspark/pandas/frame.py 
b/python/pyspark/pandas/frame.py index 7222b877bba1..3b3565f7ea9f 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -2648,6 +2648,41 @@ defaultdict(<class 'list'>, {'col..., 'col...})] psdf._to_internal_pandas(), self.to_latex, pd.DataFrame.to_latex, args ) + def to_feather( + self, + path: Union[str, IO[str]], + **kwargs: Any, + ) -> None: + """ + Write a DataFrame to the binary Feather format. + + .. note:: This method should only be used if the resulting DataFrame is expected + to be small, as all the data is loaded into the driver's memory. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + path : str, path object, file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + This includes the `compression`, `compression_level`, `chunksize` + and `version` keywords. + + Examples + -------- + >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]]) + >>> df.to_feather("file.feather") # doctest: +SKIP + """ + # Make sure locals() call is at the top of the function so we don't capture local variables. + args = locals() + + return validate_arguments_and_invoke_function( + self._to_internal_pandas(), self.to_feather, pd.DataFrame.to_feather, args + ) + def transpose(self) -> "DataFrame": """ Transpose index and columns. 
diff --git a/python/pyspark/pandas/missing/frame.py b/python/pyspark/pandas/missing/frame.py index 25a3a2afa3df..fdb6cec7c0f9 100644 --- a/python/pyspark/pandas/missing/frame.py +++ b/python/pyspark/pandas/missing/frame.py @@ -42,7 +42,6 @@ class MissingPandasLikeDataFrame: infer_objects = _unsupported_function("infer_objects") reorder_levels = _unsupported_function("reorder_levels") set_axis = _unsupported_function("set_axis") - to_feather = _unsupported_function("to_feather") to_period = _unsupported_function("to_period") to_sql = _unsupported_function("to_sql") to_stata = _unsupported_function("to_stata") diff --git a/python/pyspark/pandas/tests/connect/io/test_parity_feather.py b/python/pyspark/pandas/tests/connect/io/test_parity_feather.py new file mode 100644 index 000000000000..5d48fed547f5 --- /dev/null +++ b/python/pyspark/pandas/tests/connect/io/test_parity_feather.py @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +from pyspark.pandas.tests.io.test_feather import FeatherMixin +from pyspark.testing.connectutils import ReusedConnectTestCase +from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils + + +class FeatherParityTests( + FeatherMixin, + PandasOnSparkTestUtils, + ReusedConnectTestCase, + TestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.connect.io.test_parity_feather import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/pandas/tests/io/test_feather.py b/python/pyspark/pandas/tests/io/test_feather.py new file mode 100644 index 000000000000..74fa6bc7d7b6 --- /dev/null +++ b/python/pyspark/pandas/tests/io/test_feather.py @@ -0,0 +1,68 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest + +import pandas as pd + +from pyspark import pandas as ps +from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils + + +class FeatherMixin: + @property + def pdf(self): + return pd.DataFrame( + [[1, 1.0, "a"]], + columns=["x", "y", "z"], + ) + + @property + def psdf(self): + return ps.from_pandas(self.pdf) + + def test_to_feather(self): + with self.temp_dir() as dirpath: + path1 = f"{dirpath}/file1.feather" + path2 = f"{dirpath}/file2.feather" + + self.pdf.to_feather(path1) + self.psdf.to_feather(path2) + + self.assert_eq( + pd.read_feather(path1), + pd.read_feather(path2), + ) + + +class FeatherTests( + FeatherMixin, + PandasOnSparkTestCase, + TestUtils, +): + pass + + +if __name__ == "__main__": + from pyspark.pandas.tests.io.test_feather import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org