[ https://issues.apache.org/jira/browse/ARROW-2198?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16381496#comment-16381496 ]
ASF GitHub Bot commented on ARROW-2198: --------------------------------------- wesm closed pull request #1654: ARROW-2198: [Python] correct docstring for parquet.read_table URL: https://github.com/apache/arrow/pull/1654 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 485459e0b..f46ce9481 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -42,9 +42,9 @@ class ParquetFile(object): Parameters ---------- - source : str or pyarrow.io.NativeFile - Readable source. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + source : str, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarorw.BufferReader metadata : ParquetFileMetadata, default None Use existing metadata object, rather than reading from file. common_metadata : ParquetFileMetadata, default None @@ -862,35 +862,34 @@ def _make_manifest(path_or_paths, fs, pathsep='/'): return pieces, partitions, metadata_path -def read_table(source, columns=None, nthreads=1, metadata=None, - use_pandas_metadata=False): - """ - Read a Table from Parquet format +_read_table_docstring = """ +{0} - Parameters - ---------- - source: str or pyarrow.io.NativeFile - Location of Parquet dataset. If a string passed, can be a single file - name or directory name. For passing Python file objects or byte - buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - columns: list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 
'a' will select 'a.b', - 'a.c', and 'a.d.e' - nthreads : int, default 1 - Number of columns to read in parallel. Requires that the underlying - file source is threadsafe - metadata : FileMetaData - If separately computed - use_pandas_metadata : boolean, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded +Parameters +---------- +source: str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object +columns: list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e' +nthreads : int, default 1 + Number of columns to read in parallel. Requires that the underlying + file source is threadsafe +metadata : FileMetaData + If separately computed +{1} + +Returns +------- +{2} +""" - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns) - """ + +def read_table(source, columns=None, nthreads=1, metadata=None, + use_pandas_metadata=False): if is_string(source): fs = LocalFileSystem.get_instance() if fs.isdir(source): @@ -902,37 +901,29 @@ def read_table(source, columns=None, nthreads=1, metadata=None, use_pandas_metadata=use_pandas_metadata) -def read_pandas(source, columns=None, nthreads=1, metadata=None): - """ - Read a Table from Parquet format, also reading DataFrame index values if - known in the file metadata +read_table.__doc__ = _read_table_docstring.format( + 'Read a Table from Parquet format', + """use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded""", + """pyarrow.Table + Content of the file as a table (of columns)""") - Parameters - ---------- - source: str or pyarrow.io.NativeFile - Location of Parquet 
dataset. If a string passed, can be a single file - name. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - columns: list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e' - nthreads : int, default 1 - Number of columns to read in parallel. Requires that the underlying - file source is threadsafe - metadata : FileMetaData - If separately computed - Returns - ------- - pyarrow.Table - Content of the file as a Table of Columns, including DataFrame indexes - as Columns. - """ +def read_pandas(source, columns=None, nthreads=1, metadata=None): return read_table(source, columns=columns, nthreads=nthreads, metadata=metadata, use_pandas_metadata=True) +read_pandas.__doc__ = _read_table_docstring.format( + 'Read a Table from Parquet format, also reading DataFrame\n' + 'index values if known in the file metadata', + '', + """pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns""") + + def write_table(table, where, row_group_size=None, version='1.0', use_dictionary=True, compression='snappy', use_deprecated_int96_timestamps=None, @@ -966,7 +957,7 @@ def write_table(table, where, row_group_size=None, version='1.0', Parameters ---------- table : pyarrow.Table -where: string or pyarrow.io.NativeFile +where: string or pyarrow.NativeFile {0} """.format(_parquet_writer_arg_docs) @@ -1064,7 +1055,7 @@ def write_metadata(schema, where, version='1.0', Parameters ---------- schema : pyarrow.Schema - where: string or pyarrow.io.NativeFile + where: string or pyarrow.NativeFile version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 use_deprecated_int96_timestamps : boolean, default False ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Docstring for parquet.read_table is misleading or incorrect > -------------------------------------------------------------------- > > Key: ARROW-2198 > URL: https://issues.apache.org/jira/browse/ARROW-2198 > Project: Apache Arrow > Issue Type: Improvement > Components: Python > Reporter: Wes McKinney > Assignee: Wes McKinney > Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > See https://github.com/apache/arrow/blob/master/python/pyarrow/parquet.py#L872 > One should be able to pass a Python file object directly. The docstring > suggests otherwise -- This message was sent by Atlassian JIRA (v7.6.3#76005)