jorisvandenbossche commented on a change in pull request #12153: URL: https://github.com/apache/arrow/pull/12153#discussion_r785757447
########## File path: python/pyarrow/orc.py ##########
@@ -175,3 +176,33 @@ def write_table(table, where): writer = ORCWriter(where) writer.write(table) writer.close() + + +def read_table(source, columns=None, filesystem=None): + """ + Read a table from ORC format

Review comment:
```suggestion
    Read a Table from an ORC file.
```

########## File path: python/pyarrow/orc.py ##########
@@ -175,3 +176,33 @@ def write_table(table, where): writer = ORCWriter(where) writer.write(table) writer.close() + + +def read_table(source, columns=None, filesystem=None): + """ + Read a table from ORC format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns.

Review comment: This last sentence doesn't actually seem to be true:

```
In [1]: import pyarrow as pa
In [2]: from pyarrow import orc
In [3]: orc.write_table(pa.table({'a': [1, 2, 3]}), "test.orc")
In [4]: result = orc.ORCFile("test.orc").read(columns=[])
In [5]: result.num_rows
Out[5]: 0
In [6]: result.num_columns
Out[6]: 0
```

########## File path: python/pyarrow/tests/test_orc.py ##########
@@ -169,6 +171,36 @@ def test_orcfile_empty(datadir): assert table.schema == expected_schema +def test_readwrite(tmpdir): + from pyarrow import orc + a = pa.array([1, None, 3, None]) + b = pa.array([None, "Arrow", None, "ORC"]) + table = pa.table({"int64": a, "utf8": b}) + file = tmpdir.join("test.orc") + orc.write_table(table, file) + output_table = orc.read_table(file) + assert table.equals(output_table) +

Review comment: Can you add one more read_table call here with e.g. `columns=["int64"]`, just to make sure that this keyword is properly passed through?

########## File path: python/pyarrow/orc.py ##########
@@ -175,3 +176,33 @@ def write_table(table, where): writer = ORCWriter(where) writer.write(table) writer.close() + + +def read_table(source, columns=None, filesystem=None): + """ + Read a table from ORC format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + filesystem : FileSystem, default None + If nothing passed, paths assumed to be found in the local on-disk + filesystem.

Review comment: I know this is copied from the parquet read_table docstring, but I am not sure this is fully correct? I suppose passing a URI for e.g. a remote file also works?

########## File path: python/pyarrow/orc.py ##########
@@ -175,3 +176,33 @@ def write_table(table, where): writer = ORCWriter(where) writer.write(table) writer.close() + + +def read_table(source, columns=None, filesystem=None): + """ + Read a table from ORC format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For

Review comment: For ORC, this implementation of `read_table` does not support reading directories. So the above should only be a single file name.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org
For queries about this service, please contact Infrastructure at: us...@infra.apache.org