This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new cd1ed18fd1 GH-36352: [Python] Add project_id to GcsFileSystem options (#36376)
cd1ed18fd1 is described below
commit cd1ed18fd1e08912ea47b64edf55be9c046375c4
Author: Raúl Cumplido <[email protected]>
AuthorDate: Tue Jul 4 10:55:46 2023 +0200
GH-36352: [Python] Add project_id to GcsFileSystem options (#36376)
### Rationale for this change
Some of our Python CI tests for GCS are failing due to the new project_id
option added for GcsFileSystem here: https://github.com/apache/arrow/pull/36228
### What changes are included in this PR?
Added option
### Are these changes tested?
Will be tested on CI.
### Are there any user-facing changes?
Yes, there is a new project_id option when defining a GcsFileSystem.
* Closes: #36352
Lead-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
---
python/pyarrow/_gcsfs.pyx | 24 ++++++++++++++++++++++--
python/pyarrow/includes/libarrow_fs.pxd | 1 +
python/pyarrow/tests/test_fs.py | 9 ++++++---
3 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx
index 38bda399f2..1fda08232c 100644
--- a/python/pyarrow/_gcsfs.pyx
+++ b/python/pyarrow/_gcsfs.pyx
@@ -75,6 +75,11 @@ cdef class GcsFileSystem(FileSystem):
retry_time_limit : timedelta, default None
Set the maximum amount of time the GCS client will attempt to retry
transient errors. Subsecond granularity is ignored.
+ project_id : str, default None
+ The GCP project identifier to use for creating buckets.
+ If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
+ variable. Most I/O operations do not need a project id, only applications
+ that create new buckets need a project id.
"""
cdef:
@@ -86,7 +91,8 @@ cdef class GcsFileSystem(FileSystem):
scheme=None,
endpoint_override=None,
default_metadata=None,
- retry_time_limit=None):
+ retry_time_limit=None,
+ project_id=None):
cdef:
CGcsOptions options
shared_ptr[CGcsFileSystem] wrapped
@@ -136,6 +142,8 @@ cdef class GcsFileSystem(FileSystem):
if retry_time_limit is not None:
time_limit_seconds = retry_time_limit.total_seconds()
options.retry_limit_seconds = time_limit_seconds
+ if project_id is not None:
+ options.project_id = <c_string>tobytes(project_id)
with nogil:
wrapped = GetResultValue(CGcsFileSystem.Make(options))
@@ -165,6 +173,9 @@ cdef class GcsFileSystem(FileSystem):
if opts.retry_limit_seconds.has_value():
retry_time_limit = timedelta(
seconds=opts.retry_limit_seconds.value())
+ project_id = None
+ if opts.project_id.has_value():
+ project_id = frombytes(opts.project_id.value())
return (
GcsFileSystem._reconstruct, (dict(
access_token=frombytes(opts.credentials.access_token()),
@@ -176,7 +187,8 @@ cdef class GcsFileSystem(FileSystem):
default_bucket_location=frombytes(
opts.default_bucket_location),
default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
- retry_time_limit=retry_time_limit
+ retry_time_limit=retry_time_limit,
+ project_id=project_id
),))
@property
@@ -185,3 +197,11 @@ cdef class GcsFileSystem(FileSystem):
The GCP location this filesystem will write to.
"""
return frombytes(self.gcsfs.options().default_bucket_location)
+
+ @property
+ def project_id(self):
+ """
+ The GCP project id this filesystem will use.
+ """
+ if self.gcsfs.options().project_id.has_value():
+ return frombytes(self.gcsfs.options().project_id.value())
diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd
index 0b683e613d..2727fc2011 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -225,6 +225,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
c_string endpoint_override
c_string scheme
c_string default_bucket_location
+ optional[c_string] project_id
optional[double] retry_limit_seconds
shared_ptr[const CKeyValueMetadata] default_metadata
c_bool Equals(const CS3Options& other)
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 59bbb5a392..a629db73e2 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -213,7 +213,8 @@ def gcsfs(request, gcs_server):
scheme='http',
# Mock endpoint doesn't check credentials.
anonymous=True,
- retry_time_limit=timedelta(seconds=45)
+ retry_time_limit=timedelta(seconds=45),
+ project_id='test-project-id'
)
try:
fs.create_dir(bucket)
@@ -1064,9 +1065,11 @@ def test_gcs_options():
target_service_account='service_account@apache',
credential_token_expiration=dt,
default_bucket_location='us-west2',
- scheme='https', endpoint_override='localhost:8999')
+ scheme='https', endpoint_override='localhost:8999',
+ project_id='test-project-id')
assert isinstance(fs, GcsFileSystem)
assert fs.default_bucket_location == 'us-west2'
+ assert fs.project_id == 'test-project-id'
assert pickle.loads(pickle.dumps(fs)) == fs
fs = GcsFileSystem()
@@ -1476,7 +1479,7 @@ def test_filesystem_from_uri_gcs(gcs_server):
uri = ("gs://anonymous@" +
f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}&" +
- "retry_limit_seconds=5")
+ "retry_limit_seconds=5&project_id=test-project-id")
fs, path = FileSystem.from_uri(uri)
assert isinstance(fs, GcsFileSystem)