This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new cd1ed18fd1 GH-36352: [Python] Add project_id to GcsFileSystem options (#36376)
cd1ed18fd1 is described below

commit cd1ed18fd1e08912ea47b64edf55be9c046375c4
Author: Raúl Cumplido <[email protected]>
AuthorDate: Tue Jul 4 10:55:46 2023 +0200

    GH-36352: [Python] Add project_id to GcsFileSystem options (#36376)
    
    ### Rationale for this change
    Some of our Python CI tests for GCS are failing due to the new project_id option added for GcsFileSystem here: https://github.com/apache/arrow/pull/36228
    
    ### What changes are included in this PR?
    
    Added option
    
    ### Are these changes tested?
    
    Will be tested on CI.
    
    ### Are there any user-facing changes?
    
    Yes, there is a new project_id option when defining a GcsFileSystem.
    * Closes: #36352
    
    Lead-authored-by: Raúl Cumplido <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/pyarrow/_gcsfs.pyx               | 24 ++++++++++++++++++++++--
 python/pyarrow/includes/libarrow_fs.pxd |  1 +
 python/pyarrow/tests/test_fs.py         |  9 ++++++---
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx
index 38bda399f2..1fda08232c 100644
--- a/python/pyarrow/_gcsfs.pyx
+++ b/python/pyarrow/_gcsfs.pyx
@@ -75,6 +75,11 @@ cdef class GcsFileSystem(FileSystem):
     retry_time_limit : timedelta, default None
         Set the maximum amount of time the GCS client will attempt to retry
         transient errors. Subsecond granularity is ignored.
+    project_id : str, default None
+        The GCP project identifier to use for creating buckets.
+        If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
+        variable. Most I/O operations do not need a project id, only applications
+        that create new buckets need a project id.
     """
 
     cdef:
@@ -86,7 +91,8 @@ cdef class GcsFileSystem(FileSystem):
                  scheme=None,
                  endpoint_override=None,
                  default_metadata=None,
-                 retry_time_limit=None):
+                 retry_time_limit=None,
+                 project_id=None):
         cdef:
             CGcsOptions options
             shared_ptr[CGcsFileSystem] wrapped
@@ -136,6 +142,8 @@ cdef class GcsFileSystem(FileSystem):
         if retry_time_limit is not None:
             time_limit_seconds = retry_time_limit.total_seconds()
             options.retry_limit_seconds = time_limit_seconds
+        if project_id is not None:
+            options.project_id = <c_string>tobytes(project_id)
 
         with nogil:
             wrapped = GetResultValue(CGcsFileSystem.Make(options))
@@ -165,6 +173,9 @@ cdef class GcsFileSystem(FileSystem):
         if opts.retry_limit_seconds.has_value():
             retry_time_limit = timedelta(
                 seconds=opts.retry_limit_seconds.value())
+        project_id = None
+        if opts.project_id.has_value():
+            project_id = frombytes(opts.project_id.value())
         return (
             GcsFileSystem._reconstruct, (dict(
                 access_token=frombytes(opts.credentials.access_token()),
@@ -176,7 +187,8 @@ cdef class GcsFileSystem(FileSystem):
                 default_bucket_location=frombytes(
                     opts.default_bucket_location),
                 default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
-                retry_time_limit=retry_time_limit
+                retry_time_limit=retry_time_limit,
+                project_id=project_id
             ),))
 
     @property
@@ -185,3 +197,11 @@ cdef class GcsFileSystem(FileSystem):
         The GCP location this filesystem will write to.
         """
         return frombytes(self.gcsfs.options().default_bucket_location)
+
+    @property
+    def project_id(self):
+        """
+        The GCP project id this filesystem will use.
+        """
+        if self.gcsfs.options().project_id.has_value():
+            return frombytes(self.gcsfs.options().project_id.value())
diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd
index 0b683e613d..2727fc2011 100644
--- a/python/pyarrow/includes/libarrow_fs.pxd
+++ b/python/pyarrow/includes/libarrow_fs.pxd
@@ -225,6 +225,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
         c_string endpoint_override
         c_string scheme
         c_string default_bucket_location
+        optional[c_string] project_id
         optional[double] retry_limit_seconds
         shared_ptr[const CKeyValueMetadata] default_metadata
         c_bool Equals(const CS3Options& other)
diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py
index 59bbb5a392..a629db73e2 100644
--- a/python/pyarrow/tests/test_fs.py
+++ b/python/pyarrow/tests/test_fs.py
@@ -213,7 +213,8 @@ def gcsfs(request, gcs_server):
         scheme='http',
         # Mock endpoint doesn't check credentials.
         anonymous=True,
-        retry_time_limit=timedelta(seconds=45)
+        retry_time_limit=timedelta(seconds=45),
+        project_id='test-project-id'
     )
     try:
         fs.create_dir(bucket)
@@ -1064,9 +1065,11 @@ def test_gcs_options():
                        target_service_account='service_account@apache',
                        credential_token_expiration=dt,
                        default_bucket_location='us-west2',
-                       scheme='https', endpoint_override='localhost:8999')
+                       scheme='https', endpoint_override='localhost:8999',
+                       project_id='test-project-id')
     assert isinstance(fs, GcsFileSystem)
     assert fs.default_bucket_location == 'us-west2'
+    assert fs.project_id == 'test-project-id'
     assert pickle.loads(pickle.dumps(fs)) == fs
 
     fs = GcsFileSystem()
@@ -1476,7 +1479,7 @@ def test_filesystem_from_uri_gcs(gcs_server):
 
     uri = ("gs://anonymous@" +
            f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}&" +
-           "retry_limit_seconds=5")
+           "retry_limit_seconds=5&project_id=test-project-id")
 
     fs, path = FileSystem.from_uri(uri)
     assert isinstance(fs, GcsFileSystem)

Reply via email to