[jira] [Commented] (ARROW-2046) [Python] Add support for PEP519 - pathlib and similar objects

ASF GitHub Bot (JIRA) Wed, 28 Feb 2018 11:50:19 -0800

    [ 
https://issues.apache.org/jira/browse/ARROW-2046?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16380945#comment-16380945
 ]


ASF GitHub Bot commented on ARROW-2046:
---------------------------------------

wesm closed pull request #1675: ARROW-2046: [Python] Support path-like objects
URL: https://github.com/apache/arrow/pull/1675
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 9061ed53d..e513e1d92 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -31,7 +31,7 @@ from pyarrow.lib cimport (Array, Schema,
                           NativeFile, get_reader, get_writer)
 
 from pyarrow.compat import tobytes, frombytes
-from pyarrow.lib import ArrowException, NativeFile
+from pyarrow.lib import ArrowException, NativeFile, _stringify_path
 
 import six
 
@@ -825,15 +825,17 @@ cdef class ParquetWriter:
             c_string c_where
             CMemoryPool* pool
 
-        if isinstance(where, six.string_types):
+        try:
+            where = _stringify_path(where)
+        except TypeError:
+            get_writer(where, &self.sink)
+            self.own_sink = False
+        else:
             c_where = tobytes(where)
             with nogil:
                 check_status(FileOutputStream.Open(c_where,
                                                    &self.sink))
             self.own_sink = True
-        else:
-            get_writer(where, &self.sink)
-            self.own_sink = False
 
         self.use_dictionary = use_dictionary
         self.compression = compression
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 8b364dc71..ad6d2d66a 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -38,6 +38,18 @@ cdef extern from "Python.h":
         char *v, Py_ssize_t len) except NULL
 
 
+def _stringify_path(path):
+    """
+    Convert *path* to a string or unicode path if possible.
+    """
+    if isinstance(path, six.string_types):
+        return path
+    try:
+        return path.__fspath__()
+    except AttributeError:
+        raise TypeError("not a path-like object")
+
+
 cdef class NativeFile:
     def __cinit__(self):
         self.closed = True
@@ -822,13 +834,16 @@ def frombuffer(object obj):
 cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader):
     cdef NativeFile nf
 
-    if isinstance(source, six.string_types):
-        source = memory_map(source, mode='r')
-    elif isinstance(source, Buffer):
-        source = BufferReader(source)
-    elif not isinstance(source, NativeFile) and hasattr(source, 'read'):
-        # Optimistically hope this is file-like
-        source = PythonFile(source, mode='r')
+    try:
+        source_path = _stringify_path(source)
+    except TypeError:
+        if isinstance(source, Buffer):
+            source = BufferReader(source)
+        elif not isinstance(source, NativeFile) and hasattr(source, 'read'):
+            # Optimistically hope this is file-like
+            source = PythonFile(source, mode='r')
+    else:
+        source = memory_map(source_path, mode='r')
 
     if isinstance(source, NativeFile):
         nf = source
@@ -846,11 +861,14 @@ cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader):
 cdef get_writer(object source, shared_ptr[OutputStream]* writer):
     cdef NativeFile nf
 
-    if isinstance(source, six.string_types):
-        source = OSFile(source, mode='w')
-    elif not isinstance(source, NativeFile) and hasattr(source, 'write'):
-        # Optimistically hope this is file-like
-        source = PythonFile(source, mode='w')
+    try:
+        source_path = _stringify_path(source)
+    except TypeError:
+        if not isinstance(source, NativeFile) and hasattr(source, 'write'):
+            # Optimistically hope this is file-like
+            source = PythonFile(source, mode='w')
+    else:
+        source = OSFile(source_path, mode='w')
 
     if isinstance(source, NativeFile):
         nf = source
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index bd76feb2e..187971fb8 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -22,6 +22,7 @@
 import io
 import json
 import os
+import sys
 
 import pytest
 
@@ -264,6 +265,19 @@ def test_pandas_parquet_1_0_rountrip(tmpdir):
     tm.assert_frame_equal(df, df_read)
 
 
+@parquet
+@pytest.mark.skipif(sys.version_info < (3, 6), reason="need Python 3.6")
+def test_path_objects(tmpdir):
+    # Test compatibility with PEP 519 path-like objects
+    import pathlib
+    p = pathlib.Path(tmpdir) / 'zzz.parquet'
+    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
+    _write_table(df, p)
+    table_read = _read_table(p)
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
 @parquet
 def test_pandas_column_selection(tmpdir):
     size = 10000
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index feccebbde..3ee02cb8c 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -584,15 +584,11 @@ def _get_modified_env_with_pythonpath():
     # Prepend pyarrow root directory to PYTHONPATH
     env = os.environ.copy()
     existing_pythonpath = env.get('PYTHONPATH', '')
-    if sys.platform == 'win32':
-        sep = ';'
-    else:
-        sep = ':'
 
     module_path = os.path.abspath(
         os.path.dirname(os.path.dirname(pa.__file__)))
 
-    env['PYTHONPATH'] = sep.join((module_path, existing_pythonpath))
+    env['PYTHONPATH'] = os.pathsep.join((module_path, existing_pythonpath))
     return env
 
 
@@ -650,3 +646,14 @@ def loads2(serialized_obj):
     serialized = pa.serialize(test_object, context=context).to_buffer()
     deserialized = pa.deserialize(serialized.to_pybytes(), context=context)
     assert deserialized == b'custom serialization 2'
+
+
+@pytest.mark.skipif(sys.version_info < (3, 6), reason="need Python 3.6")
+def test_path_objects(tmpdir):
+    # Test compatibility with PEP 519 path-like objects
+    import pathlib
+    p = pathlib.Path(tmpdir) / 'zzz.bin'
+    obj = 1234
+    pa.serialize_to(obj, p)
+    res = pa.deserialize_from(p, None)
+    assert res == obj


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] Add support for PEP519 - pathlib and similar objects
> -------------------------------------------------------------
>
>                 Key: ARROW-2046
>                 URL: https://issues.apache.org/jira/browse/ARROW-2046
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>            Reporter: Victor Uriarte
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Currently `pyarrow` doesn't seem to support reading from `pathlib.Path` or 
> similar objects. [PEP 519|https://www.python.org/dev/peps/pep-0519/] 
> introduced `__fspath__`, which could be used to convert any path-like 
> object to a string.
> [Pandas|https://github.com/pandas-dev/pandas/blob/a9d8e04ab68f688f899b4164bfa1ac868c9c1c64/pandas/io/common.py#L120-L160]
>  has a sample implementation, though I think a simpler implementation of it 
> could be used.
>  
> {code:python}
> import pathlib
> import pandas as pd
> import pyarrow as pa
> import pyarrow.parquet as pq
> df = pd.DataFrame({
>  'Foo': ['A', 'A', 'B', 'B', 'C'],
>  'Bar': ['A1', 'A2', 'B2', 'D3', ''],
> })
> test_dir = pathlib.Path(__file__).parent / 'test'
> test_dir.mkdir(parents=True, exist_ok=True)
> table = pa.Table.from_pandas(df)
> path = test_dir / 'file1.parquet'
> # Doesn't work
> pq.write_table(table, path)
> # Works
> pq.write_table(table, str(path))
> {code}
>  
> [https://github.com/apache/arrow/issues/1522]
>  
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (ARROW-2046) [Python] Add support for PEP519 - pathlib and similar objects

Reply via email to