Repository: arrow Updated Branches: refs/heads/master 4df2a0bfa -> 44855bb16
ARROW-1273: [Python] Add Parquet read_metadata, read_schema convenience functions cc @xhochy @cpcloud for feedback on API Author: Wes McKinney <[email protected]> Closes #904 from wesm/ARROW-1273 and squashes the following commits: 13725654 [Wes McKinney] Add Parquet read_metadata, read_schema convenience functions Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/44855bb1 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/44855bb1 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/44855bb1 Branch: refs/heads/master Commit: 44855bb16312031a6d4285632d0071c676ef38aa Parents: 4df2a0b Author: Wes McKinney <[email protected]> Authored: Fri Jul 28 11:14:35 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Fri Jul 28 11:14:35 2017 -0400 ---------------------------------------------------------------------- python/doc/source/api.rst | 3 +++ python/pyarrow/parquet.py | 30 ++++++++++++++++++++++++++++++ python/pyarrow/tests/test_parquet.py | 31 ++++++++++++++++++++++++------- 3 files changed, 57 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/44855bb1/python/doc/source/api.rst ---------------------------------------------------------------------- diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index 6554465..b84163b 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -239,5 +239,8 @@ Apache Parquet ParquetDataset ParquetFile read_table + read_metadata + read_pandas + read_schema write_metadata write_table http://git-wip-us.apache.org/repos/asf/arrow/blob/44855bb1/python/pyarrow/parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 34c1d12..a3af9ae 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -805,3 +805,33 @@ def write_metadata(schema, where, version='1.0', ) writer = ParquetWriter(where, schema, **options) writer.close() + + +def read_metadata(where): + """ + Read FileMetadata from footer of a single Parquet file + + Parameters + ---------- + where : string (filepath) or file-like object + + Returns + ------- + metadata : FileMetadata + """ + return ParquetFile(where).metadata + + +def read_schema(where): + """ + Read effective Arrow schema from Parquet file metadata + + Parameters + ---------- + where : string (filepath) or file-like object + + Returns + ------- + schema : pyarrow.Schema + """ + return ParquetFile(where).schema.to_arrow_schema() http://git-wip-us.apache.org/repos/asf/arrow/blob/44855bb1/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 7443df8..f840673 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -124,9 +124,8 @@ def test_pandas_parquet_custom_metadata(tmpdir): assert b'pandas' in arrow_table.schema.metadata _write_table(arrow_table, filename.strpath, version="2.0") - pf = pq.ParquetFile(filename.strpath) - md = pf.metadata.metadata + md = pq.read_metadata(filename.strpath).metadata assert b'pandas' in md js = json.loads(md[b'pandas'].decode('utf8')) @@ -592,7 +591,7 @@ def test_pass_separate_metadata(): _write_table(a_table, buf, compression='snappy', version='2.0') buf.seek(0) - metadata = pq.ParquetFile(buf).metadata + metadata = pq.read_metadata(buf) buf.seek(0) @@ -788,14 +787,32 @@ def test_read_common_metadata_files(tmpdir): dataset = pq.ParquetDataset(base_path) assert dataset.metadata_path == metadata_path - pf = pq.ParquetFile(data_path) - assert dataset.schema.equals(pf.schema) + common_schema = pq.read_metadata(data_path).schema + assert dataset.schema.equals(common_schema) # handle list of one directory dataset2 = pq.ParquetDataset([base_path]) assert dataset2.schema.equals(dataset.schema) +@parquet +def test_read_schema(tmpdir): + import pyarrow.parquet as pq + + N = 100 + df = pd.DataFrame({ + 'index': np.arange(N), + 'values': np.random.randn(N) + }, columns=['index', 'values']) + + data_path = pjoin(str(tmpdir), 'test.parquet') + + table = pa.Table.from_pandas(df) + _write_table(table, data_path) + + assert table.schema.equals(pq.read_schema(data_path)) + + def _filter_partition(df, part_keys): predicate = np.ones(len(df), dtype=bool) @@ -847,7 +864,7 @@ def test_read_multiple_files(tmpdir): assert result.equals(expected) # Read with provided metadata - metadata = pq.ParquetFile(paths[0]).metadata + metadata = pq.read_metadata(paths[0]) result2 = read_multiple_files(paths, metadata=metadata) assert result2.equals(expected) @@ -873,7 +890,7 @@ def test_read_multiple_files(tmpdir): t = pa.Table.from_pandas(bad_apple) _write_table(t, bad_apple_path) - bad_meta = pq.ParquetFile(bad_apple_path).metadata + bad_meta = pq.read_metadata(bad_apple_path) with pytest.raises(ValueError): read_multiple_files(paths + [bad_apple_path])
