This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 42ed37e3fc ARROW-16719: [Python] Add path/URI + filesystem handling to
parquet.read_metadata (#13629)
42ed37e3fc is described below
commit 42ed37e3fc84465f365531e611f1bf632b599e7b
Author: Kshiteej K <[email protected]>
AuthorDate: Wed Aug 17 16:51:40 2022 +0530
ARROW-16719: [Python] Add path/URI + filesystem handling to
parquet.read_metadata (#13629)
Add `filesystem` support to `pq.read_metadata` and `pq.read_schema`.
Lead-authored-by: kshitij12345 <[email protected]>
Co-authored-by: Kshiteej K <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/parquet/core.py | 39 ++++++++++++++++++++++-----
python/pyarrow/tests/parquet/test_metadata.py | 37 +++++++++++++++++++++++++
2 files changed, 69 insertions(+), 7 deletions(-)
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 4e2c273953..d3a1451dcb 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -18,6 +18,7 @@
from collections import defaultdict
from concurrent import futures
+from contextlib import nullcontext
from functools import partial, reduce
import sys
@@ -3389,7 +3390,8 @@ def write_metadata(schema, where,
metadata_collector=None, **kwargs):
metadata.write_metadata_file(where)
-def read_metadata(where, memory_map=False, decryption_properties=None):
+def read_metadata(where, memory_map=False, decryption_properties=None,
+ filesystem=None):
"""
Read FileMetaData from footer of a single Parquet file.
@@ -3400,6 +3402,10 @@ def read_metadata(where, memory_map=False,
decryption_properties=None):
Create memory map when the source is a file path.
decryption_properties : FileDecryptionProperties, default None
Decryption properties for reading encrypted Parquet files.
+ filesystem : FileSystem, default None
+ If nothing passed, will be inferred based on path.
+ Path will try to be found in the local on-disk filesystem, otherwise
+ it will be parsed as a URI to determine the filesystem.
Returns
-------
@@ -3422,11 +3428,19 @@ def read_metadata(where, memory_map=False,
decryption_properties=None):
format_version: 2.6
serialized_size: ...
"""
- return ParquetFile(where, memory_map=memory_map,
- decryption_properties=decryption_properties).metadata
+ filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+ file_ctx = nullcontext()
+ if filesystem is not None:
+ file_ctx = where = filesystem.open_input_file(where)
+
+ with file_ctx:
+ file = ParquetFile(where, memory_map=memory_map,
+ decryption_properties=decryption_properties)
+ return file.metadata
-def read_schema(where, memory_map=False, decryption_properties=None):
+def read_schema(where, memory_map=False, decryption_properties=None,
+ filesystem=None):
"""
Read effective Arrow schema from Parquet file metadata.
@@ -3437,6 +3451,10 @@ def read_schema(where, memory_map=False,
decryption_properties=None):
Create memory map when the source is a file path.
decryption_properties : FileDecryptionProperties, default None
Decryption properties for reading encrypted Parquet files.
+ filesystem : FileSystem, default None
+ If nothing passed, will be inferred based on path.
+ Path will try to be found in the local on-disk filesystem, otherwise
+ it will be parsed as a URI to determine the filesystem.
Returns
-------
@@ -3454,9 +3472,16 @@ def read_schema(where, memory_map=False,
decryption_properties=None):
n_legs: int64
animal: string
"""
- return ParquetFile(
- where, memory_map=memory_map,
- decryption_properties=decryption_properties).schema.to_arrow_schema()
+ filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+ file_ctx = nullcontext()
+ if filesystem is not None:
+ file_ctx = where = filesystem.open_input_file(where)
+
+ with file_ctx:
+ file = ParquetFile(
+ where, memory_map=memory_map,
+ decryption_properties=decryption_properties)
+ return file.schema.to_arrow_schema()
# re-export everything
diff --git a/python/pyarrow/tests/parquet/test_metadata.py
b/python/pyarrow/tests/parquet/test_metadata.py
index b36ea60658..e4c9a757fc 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -24,6 +24,8 @@ import pytest
import pyarrow as pa
from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
+from pyarrow.fs import LocalFileSystem
+from pyarrow.tests import util
try:
import pyarrow.parquet as pq
@@ -533,6 +535,41 @@ def test_metadata_exceeds_message_size():
metadata = pq.read_metadata(pa.BufferReader(buf))
+def test_metadata_schema_filesystem(tempdir):
+ table = pa.table({"a": [1, 2, 3]})
+
+ # URI writing to local file.
+ fname = "data.parquet"
+ file_path = str(tempdir / fname)
+ file_uri = 'file:///' + file_path
+
+ pq.write_table(table, file_path)
+
+ # Get expected `metadata` from path.
+ metadata = pq.read_metadata(tempdir / fname)
+ schema = table.schema
+
+ assert pq.read_metadata(file_uri).equals(metadata)
+ assert pq.read_metadata(
+ file_path, filesystem=LocalFileSystem()).equals(metadata)
+ assert pq.read_metadata(
+ fname, filesystem=f'file:///{tempdir}').equals(metadata)
+
+ assert pq.read_schema(file_uri).equals(schema)
+ assert pq.read_schema(
+ file_path, filesystem=LocalFileSystem()).equals(schema)
+ assert pq.read_schema(
+ fname, filesystem=f'file:///{tempdir}').equals(schema)
+
+ with util.change_cwd(tempdir):
+ # Pass `filesystem` arg
+ assert pq.read_metadata(
+ fname, filesystem=LocalFileSystem()).equals(metadata)
+
+ assert pq.read_schema(
+ fname, filesystem=LocalFileSystem()).equals(schema)
+
+
def test_metadata_equals():
table = pa.table({"a": [1, 2, 3]})
with pa.BufferOutputStream() as out: