This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 88f121c47778 [SPARK-46931][PS] Implement `{Frame, Series}.to_hdf` 88f121c47778 is described below commit 88f121c47778f0755862046d09484a83932cb30b Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Wed Jan 31 08:41:21 2024 -0800 [SPARK-46931][PS] Implement `{Frame, Series}.to_hdf` ### What changes were proposed in this pull request? Implement `{Frame, Series}.to_hdf` ### Why are the changes needed? pandas parity ### Does this PR introduce _any_ user-facing change? yes ``` In [3]: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['a', 'b', 'c']) In [4]: df.to_hdf('/tmp/data.h5', key='df', mode='w') In [5]: psdf = ps.from_pandas(df) In [6]: psdf.to_hdf('/tmp/data2.h5', key='df', mode='w') /Users/ruifeng.zheng/Dev/spark/python/pyspark/pandas/utils.py:1015: PandasAPIOnSparkAdviceWarning: `to_hdf` loads all data into the driver's memory. It should only be used if the resulting DataFrame is expected to be small. warnings.warn(message, PandasAPIOnSparkAdviceWarning) In [7]: !ls /tmp/*h5 /tmp/data.h5 /tmp/data2.h5 In [8]: !ls -lh /tmp/*h5 -rw-r--r-- 1 ruifeng.zheng wheel 6.9K Jan 31 12:21 /tmp/data.h5 -rw-r--r-- 1 ruifeng.zheng wheel 6.9K Jan 31 12:21 /tmp/data2.h5 ``` ### How was this patch tested? manually tested; `hdf` requires the additional library `pytables`, which in turn needs [many prerequisites](https://www.pytables.org/usersguide/installation.html#prerequisites). Since `pytables` is just an optional dependency of `Pandas`, I think we can avoid adding it to CI first. ### Was this patch authored or co-authored using generative AI tooling? no Closes #44966 from zhengruifeng/ps_to_hdf. 
Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../docs/source/reference/pyspark.pandas/frame.rst | 1 + .../source/reference/pyspark.pandas/series.rst | 1 + python/pyspark/pandas/generic.py | 120 +++++++++++++++++++++ python/pyspark/pandas/missing/frame.py | 1 - python/pyspark/pandas/missing/series.py | 1 - 5 files changed, 122 insertions(+), 2 deletions(-) diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst index 12cf6e7db12f..77b60468b8fb 100644 --- a/python/docs/source/reference/pyspark.pandas/frame.rst +++ b/python/docs/source/reference/pyspark.pandas/frame.rst @@ -286,6 +286,7 @@ Serialization / IO / Conversion DataFrame.to_json DataFrame.to_dict DataFrame.to_excel + DataFrame.to_hdf DataFrame.to_clipboard DataFrame.to_markdown DataFrame.to_records diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index 88d1861c6ccf..5606fa93a5f3 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -486,6 +486,7 @@ Serialization / IO / Conversion Series.to_json Series.to_csv Series.to_excel + Series.to_hdf Series.to_frame Pandas-on-Spark specific diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index 77cefb53fe5d..ed2aeb8ea6af 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -1103,6 +1103,126 @@ class Frame(object, metaclass=ABCMeta): psdf._to_internal_pandas(), self.to_excel, f, args ) + def to_hdf( + self, + path_or_buf: Union[str, pd.HDFStore], + key: str, + mode: str = "a", + complevel: Optional[int] = None, + complib: Optional[str] = None, + append: bool = False, + format: Optional[str] = None, + index: bool = True, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep: Optional[Any] = None, + dropna: 
Optional[bool] = None, + data_columns: Optional[Union[bool, List[str]]] = None, + errors: str = "strict", + encoding: str = "UTF-8", + ) -> None: + """ + Write the contained data to an HDF5 file using HDFStore. + + .. note:: This method should only be used if the resulting DataFrame is expected + to be small, as all the data is loaded into the driver's memory. + + .. versionadded:: 4.0.0 + + Parameters + ---------- + path_or_buf : str or pandas.HDFStore + File path or HDFStore object. + key : str + Identifier for the group in the store. + mode : {'a', 'w', 'r+'}, default 'a' + Mode to open file: + + - 'w': write, a new file is created (an existing file with + the same name would be deleted). + - 'a': append, an existing file is opened for reading and + writing, and if the file does not exist it is created. + - 'r+': similar to 'a', but the file must already exist. + + complevel : {0-9}, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' + Specifies the compression library to be used. + These additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. + append : bool, default False + For Table formats, append the input data to the existing. + format : {'fixed', 'table', None}, default 'fixed' + Possible values: + + - 'fixed': Fixed format. Fast writing/reading. Not-appendable, + nor searchable. + - 'table': Table format. Write as a PyTables Table structure + which may perform worse but allow more flexible operations + like searching / selecting subsets of the data. + - If None, pd.get_option('io.hdf.default_format') is checked, + followed by fallback to "fixed". + + index : bool, default True + Write DataFrame index as a column. 
+ min_itemsize : dict or int, optional + Map column names to minimum string sizes for columns. + nan_rep : Any, optional + How to represent null values as str. + Not allowed with append=True. + dropna : bool, default False, optional + Remove missing values. + data_columns : list of columns or True, optional + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. Applicable only to format='table'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + encoding : str, default "UTF-8" + + See Also + -------- + DataFrame.to_orc : Write a DataFrame to the binary orc format. + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + DataFrame.to_csv : Write out to a csv file. + + Examples + -------- + >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + ... index=['a', 'b', 'c']) # doctest: +SKIP + >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP + + We can add another object to the same file: + + >>> s = ps.Series([1, 2, 3, 4]) # doctest: +SKIP + >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP + """ + log_advice( + "`to_hdf` loads all data into the driver's memory. " + "It should only be used if the resulting DataFrame is expected to be small." + ) + # Make sure locals() call is at the top of the function so we don't capture local variables. 
+ args = locals() + psdf = self + + if isinstance(self, ps.DataFrame): + f = pd.DataFrame.to_hdf + elif isinstance(self, ps.Series): + f = pd.Series.to_hdf + else: + raise TypeError( + "Constructor expects DataFrame or Series; however, " "got [%s]" % (self,) + ) + return validate_arguments_and_invoke_function( + psdf._to_internal_pandas(), self.to_hdf, f, args + ) + def mean( self, axis: Optional[Axis] = None, skipna: bool = True, numeric_only: bool = None ) -> Union[Scalar, "Series"]: diff --git a/python/pyspark/pandas/missing/frame.py b/python/pyspark/pandas/missing/frame.py index 7a4d09d4ea81..25a3a2afa3df 100644 --- a/python/pyspark/pandas/missing/frame.py +++ b/python/pyspark/pandas/missing/frame.py @@ -43,7 +43,6 @@ class MissingPandasLikeDataFrame: reorder_levels = _unsupported_function("reorder_levels") set_axis = _unsupported_function("set_axis") to_feather = _unsupported_function("to_feather") - to_hdf = _unsupported_function("to_hdf") to_period = _unsupported_function("to_period") to_sql = _unsupported_function("to_sql") to_stata = _unsupported_function("to_stata") diff --git a/python/pyspark/pandas/missing/series.py b/python/pyspark/pandas/missing/series.py index 4ee860d6654f..08f21f46b2cc 100644 --- a/python/pyspark/pandas/missing/series.py +++ b/python/pyspark/pandas/missing/series.py @@ -40,7 +40,6 @@ class MissingPandasLikeSeries: infer_objects = _unsupported_function("infer_objects") reorder_levels = _unsupported_function("reorder_levels") set_axis = _unsupported_function("set_axis") - to_hdf = _unsupported_function("to_hdf") to_period = _unsupported_function("to_period") to_sql = _unsupported_function("to_sql") to_timestamp = _unsupported_function("to_timestamp") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org