[ https://issues.apache.org/jira/browse/ARROW-2198?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16381496#comment-16381496 ]
ASF GitHub Bot commented on ARROW-2198: --------------------------------------- wesm closed pull request #1654: ARROW-2198: [Python] correct docstring for parquet.read_table URL: https://github.com/apache/arrow/pull/1654 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 485459e0b..f46ce9481 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -42,9 +42,9 @@ class ParquetFile(object): Parameters ---------- - source : str or pyarrow.io.NativeFile - Readable source. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + source : str, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarorw.BufferReader metadata : ParquetFileMetadata, default None Use existing metadata object, rather than reading from file. common_metadata : ParquetFileMetadata, default None @@ -862,35 +862,34 @@ def _make_manifest(path_or_paths, fs, pathsep='/'): return pieces, partitions, metadata_path -def read_table(source, columns=None, nthreads=1, metadata=None, - use_pandas_metadata=False): - """ - Read a Table from Parquet format +_read_table_docstring = """ +{0} - Parameters - ---------- - source: str or pyarrow.io.NativeFile - Location of Parquet dataset. If a string passed, can be a single file - name or directory name. For passing Python file objects or byte - buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - columns: list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 
'a' will select 'a.b', - 'a.c', and 'a.d.e' - nthreads : int, default 1 - Number of columns to read in parallel. Requires that the underlying - file source is threadsafe - metadata : FileMetaData - If separately computed - use_pandas_metadata : boolean, default False - If True and file has custom pandas schema metadata, ensure that - index columns are also loaded +Parameters +---------- +source: str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object +columns: list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e' +nthreads : int, default 1 + Number of columns to read in parallel. Requires that the underlying + file source is threadsafe +metadata : FileMetaData + If separately computed +{1} + +Returns +------- +{2} +""" - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns) - """ + +def read_table(source, columns=None, nthreads=1, metadata=None, + use_pandas_metadata=False): if is_string(source): fs = LocalFileSystem.get_instance() if fs.isdir(source): @@ -902,37 +901,29 @@ def read_table(source, columns=None, nthreads=1, metadata=None, use_pandas_metadata=use_pandas_metadata) -def read_pandas(source, columns=None, nthreads=1, metadata=None): - """ - Read a Table from Parquet format, also reading DataFrame index values if - known in the file metadata +read_table.__doc__ = _read_table_docstring.format( + 'Read a Table from Parquet format', + """use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded""", + """pyarrow.Table + Content of the file as a table (of columns)""") - Parameters - ---------- - source: str or pyarrow.io.NativeFile - Location of Parquet 
dataset. If a string passed, can be a single file - name. For passing Python file objects or byte buffers, - see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. - columns: list - If not None, only these columns will be read from the file. A column - name may be a prefix of a nested field, e.g. 'a' will select 'a.b', - 'a.c', and 'a.d.e' - nthreads : int, default 1 - Number of columns to read in parallel. Requires that the underlying - file source is threadsafe - metadata : FileMetaData - If separately computed - Returns - ------- - pyarrow.Table - Content of the file as a Table of Columns, including DataFrame indexes - as Columns. - """ +def read_pandas(source, columns=None, nthreads=1, metadata=None): return read_table(source, columns=columns, nthreads=nthreads, metadata=metadata, use_pandas_metadata=True) +read_pandas.__doc__ = _read_table_docstring.format( + 'Read a Table from Parquet format, also reading DataFrame\n' + 'index values if known in the file metadata', + '', + """pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns""") + + def write_table(table, where, row_group_size=None, version='1.0', use_dictionary=True, compression='snappy', use_deprecated_int96_timestamps=None, @@ -966,7 +957,7 @@ def write_table(table, where, row_group_size=None, version='1.0', Parameters ---------- table : pyarrow.Table -where: string or pyarrow.io.NativeFile +where: string or pyarrow.NativeFile {0} """.format(_parquet_writer_arg_docs) @@ -1064,7 +1055,7 @@ def write_metadata(schema, where, version='1.0', Parameters ---------- schema : pyarrow.Schema - where: string or pyarrow.io.NativeFile + where: string or pyarrow.NativeFile version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 use_deprecated_int96_timestamps : boolean, default False ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Docstring for parquet.read_table is misleading or incorrect > -------------------------------------------------------------------- > > Key: ARROW-2198 > URL: https://issues.apache.org/jira/browse/ARROW-2198 > Project: Apache Arrow > Issue Type: Improvement > Components: Python > Reporter: Wes McKinney > Assignee: Wes McKinney > Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > See https://github.com/apache/arrow/blob/master/python/pyarrow/parquet.py#L872 > One should be able to pass a Python file object directly. The docstring > suggests otherwise -- This message was sent by Atlassian JIRA (v7.6.3#76005)