This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new ed799b47a5 Python: Inline the Filesystem imports (#8548)
ed799b47a5 is described below
commit ed799b47a5b5899c9e40a46fc85d7882d0065c37
Author: Fokko Driesprong <[email protected]>
AuthorDate: Tue Sep 12 10:30:01 2023 +0200
Python: Inline the Filesystem imports (#8548)
* Python: Inline the Filesystem imports
It can be that certain build flags are turned off:
```
-DARROW_GCS=ON: Build Arrow with GCS support (requires the GCloud SDK for
C++)
-DARROW_HDFS=ON: Arrow integration with libhdfs for accessing the Hadoop
Filesystem
```
From:
https://arrow.apache.org/docs/dev/developers/cpp/building.html#optional-components
This will cause an ImportError when importing `pyarrow.py`,
even when you don't intend to use the missing filesystem.
Therefore it is better to inline the imports
* Move imports to the top
---
python/pyiceberg/io/pyarrow.py | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index 3453b18e44..e43541f03e 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -58,11 +58,6 @@ from pyarrow.fs import (
FileSystem,
FileType,
FSSpecHandler,
- GcsFileSystem,
- HadoopFileSystem,
- LocalFileSystem,
- PyFileSystem,
- S3FileSystem,
)
from sortedcontainers import SortedList
@@ -306,6 +301,8 @@ class PyArrowFileIO(FileIO):
def _get_fs(self, scheme: str) -> FileSystem:
if scheme in {"s3", "s3a", "s3n"}:
+ from pyarrow.fs import S3FileSystem
+
client_kwargs = {
"endpoint_override": self.properties.get(S3_ENDPOINT),
"access_key": self.properties.get(S3_ACCESS_KEY_ID),
@@ -319,6 +316,8 @@ class PyArrowFileIO(FileIO):
return S3FileSystem(**client_kwargs)
elif scheme == "hdfs":
+ from pyarrow.fs import HadoopFileSystem
+
hdfs_kwargs: Dict[str, Any] = {}
if host := self.properties.get(HDFS_HOST):
hdfs_kwargs["host"] = host
@@ -329,8 +328,11 @@ class PyArrowFileIO(FileIO):
hdfs_kwargs["user"] = user
if kerb_ticket := self.properties.get(HDFS_KERB_TICKET):
hdfs_kwargs["kerb_ticket"] = kerb_ticket
+
return HadoopFileSystem(**hdfs_kwargs)
elif scheme in {"gs", "gcs"}:
+ from pyarrow.fs import GcsFileSystem
+
gcs_kwargs: Dict[str, Any] = {}
if access_token := self.properties.get(GCS_TOKEN):
gcs_kwargs["access_token"] = access_token
@@ -342,8 +344,11 @@ class PyArrowFileIO(FileIO):
url_parts = urlparse(endpoint)
gcs_kwargs["scheme"] = url_parts.scheme
gcs_kwargs["endpoint_override"] = url_parts.netloc
+
return GcsFileSystem(**gcs_kwargs)
elif scheme == "file":
+ from pyarrow.fs import LocalFileSystem
+
return LocalFileSystem()
else:
raise ValueError(f"Unrecognized filesystem type in URI: {scheme}")
@@ -899,6 +904,8 @@ def project_table(
from pyiceberg.io.fsspec import FsspecFileIO
if isinstance(table.io, FsspecFileIO):
+ from pyarrow.fs import PyFileSystem
+
fs = PyFileSystem(FSSpecHandler(table.io.get_fs(scheme)))
else:
raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {table.io}")