[ https://issues.apache.org/jira/browse/ARROW-2046?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16380945#comment-16380945 ]
ASF GitHub Bot commented on ARROW-2046: --------------------------------------- wesm closed pull request #1675: ARROW-2046: [Python] Support path-like objects URL: https://github.com/apache/arrow/pull/1675 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 9061ed53d..e513e1d92 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -31,7 +31,7 @@ from pyarrow.lib cimport (Array, Schema, NativeFile, get_reader, get_writer) from pyarrow.compat import tobytes, frombytes -from pyarrow.lib import ArrowException, NativeFile +from pyarrow.lib import ArrowException, NativeFile, _stringify_path import six @@ -825,15 +825,17 @@ cdef class ParquetWriter: c_string c_where CMemoryPool* pool - if isinstance(where, six.string_types): + try: + where = _stringify_path(where) + except TypeError: + get_writer(where, &self.sink) + self.own_sink = False + else: c_where = tobytes(where) with nogil: check_status(FileOutputStream.Open(c_where, &self.sink)) self.own_sink = True - else: - get_writer(where, &self.sink) - self.own_sink = False self.use_dictionary = use_dictionary self.compression = compression diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 8b364dc71..ad6d2d66a 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -38,6 +38,18 @@ cdef extern from "Python.h": char *v, Py_ssize_t len) except NULL +def _stringify_path(path): + """ + Convert *path* to a string or unicode path if possible. 
+ """ + if isinstance(path, six.string_types): + return path + try: + return path.__fspath__() + except AttributeError: + raise TypeError("not a path-like object") + + cdef class NativeFile: def __cinit__(self): self.closed = True @@ -822,13 +834,16 @@ def frombuffer(object obj): cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader): cdef NativeFile nf - if isinstance(source, six.string_types): - source = memory_map(source, mode='r') - elif isinstance(source, Buffer): - source = BufferReader(source) - elif not isinstance(source, NativeFile) and hasattr(source, 'read'): - # Optimistically hope this is file-like - source = PythonFile(source, mode='r') + try: + source_path = _stringify_path(source) + except TypeError: + if isinstance(source, Buffer): + source = BufferReader(source) + elif not isinstance(source, NativeFile) and hasattr(source, 'read'): + # Optimistically hope this is file-like + source = PythonFile(source, mode='r') + else: + source = memory_map(source_path, mode='r') if isinstance(source, NativeFile): nf = source @@ -846,11 +861,14 @@ cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader): cdef get_writer(object source, shared_ptr[OutputStream]* writer): cdef NativeFile nf - if isinstance(source, six.string_types): - source = OSFile(source, mode='w') - elif not isinstance(source, NativeFile) and hasattr(source, 'write'): - # Optimistically hope this is file-like - source = PythonFile(source, mode='w') + try: + source_path = _stringify_path(source) + except TypeError: + if not isinstance(source, NativeFile) and hasattr(source, 'write'): + # Optimistically hope this is file-like + source = PythonFile(source, mode='w') + else: + source = OSFile(source_path, mode='w') if isinstance(source, NativeFile): nf = source diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index bd76feb2e..187971fb8 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ 
-22,6 +22,7 @@ import io import json import os +import sys import pytest @@ -264,6 +265,19 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): tm.assert_frame_equal(df, df_read) +@parquet +@pytest.mark.skipif(sys.version_info < (3, 6), reason="need Python 3.6") +def test_path_objects(tmpdir): + # Test compatibility with PEP 519 path-like objects + import pathlib + p = pathlib.Path(tmpdir) / 'zzz.parquet' + df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) + _write_table(df, p) + table_read = _read_table(p) + df_read = table_read.to_pandas() + tm.assert_frame_equal(df, df_read) + + @parquet def test_pandas_column_selection(tmpdir): size = 10000 diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py index feccebbde..3ee02cb8c 100644 --- a/python/pyarrow/tests/test_serialization.py +++ b/python/pyarrow/tests/test_serialization.py @@ -584,15 +584,11 @@ def _get_modified_env_with_pythonpath(): # Prepend pyarrow root directory to PYTHONPATH env = os.environ.copy() existing_pythonpath = env.get('PYTHONPATH', '') - if sys.platform == 'win32': - sep = ';' - else: - sep = ':' module_path = os.path.abspath( os.path.dirname(os.path.dirname(pa.__file__))) - env['PYTHONPATH'] = sep.join((module_path, existing_pythonpath)) + env['PYTHONPATH'] = os.pathsep.join((module_path, existing_pythonpath)) return env @@ -650,3 +646,14 @@ def loads2(serialized_obj): serialized = pa.serialize(test_object, context=context).to_buffer() deserialized = pa.deserialize(serialized.to_pybytes(), context=context) assert deserialized == b'custom serialization 2' + + +@pytest.mark.skipif(sys.version_info < (3, 6), reason="need Python 3.6") +def test_path_objects(tmpdir): + # Test compatibility with PEP 519 path-like objects + import pathlib + p = pathlib.Path(tmpdir) / 'zzz.bin' + obj = 1234 + pa.serialize_to(obj, p) + res = pa.deserialize_from(p, None) + assert res == obj ---------------------------------------------------------------- This is an 
automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Add support for PEP519 - pathlib and similar objects > ------------------------------------------------------------- > > Key: ARROW-2046 > URL: https://issues.apache.org/jira/browse/ARROW-2046 > Project: Apache Arrow > Issue Type: Improvement > Components: Python > Reporter: Victor Uriarte > Assignee: Antoine Pitrou > Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Currently `pyarrow` doesn't seem to support reading from `pathlib.Path` or > similar objects. [PEP519|https://www.python.org/dev/peps/pep-0519/] > introduced `__fspath__`, which could be used to transform any path-like > object to a string. > [Pandas|https://github.com/pandas-dev/pandas/blob/a9d8e04ab68f688f899b4164bfa1ac868c9c1c64/pandas/io/common.py#L120-L160] > has a sample implementation, though I think a simpler implementation of it > could be used. > > {code:python} > import pathlib > import pandas as pd > import pyarrow as pa > import pyarrow.parquet as pq > df = pd.DataFrame({ > 'Foo': ['A', 'A', 'B', 'B', 'C'], > 'Bar': ['A1', 'A2', 'B2', 'D3', ''], > }) > test_dir = pathlib.Path(__file__).parent / 'test' > test_dir.mkdir(parents=True, exist_ok=True) > table = pa.Table.from_pandas(df) > path = test_dir / 'file1.parquet' > # Doesn't work > pq.write_table(table, path) > # Works > pq.write_table(table, str(path)) > {code} > > [https://github.com/apache/arrow/issues/1522] > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)