This is an automated email from the ASF dual-hosted git repository.

xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opendal.git


The following commit(s) were added to refs/heads/main by this push:
     new e05523347 feat(services/gdrive): implement batch recursive listing for 
~200x performance improvement (#7059)
e05523347 is described below

commit e0552334776c7bedcc2b958551ca3b9b9d5e9373
Author: mro68 <[email protected]>
AuthorDate: Fri Dec 19 16:25:12 2025 +0100

    feat(services/gdrive): implement batch recursive listing for ~200x 
performance improvement (#7059)
    
    * feat(services/gdrive): implement batch recursive listing
    
    Implement batch listing using OR queries to combine multiple folder
    lookups into a single API call. This reduces the number of API calls
    from O(n) to O(n/20) where n is the number of folders.
    
    Key changes:
    - Add GdriveFlatLister for recursive listing with batched OR queries
    - Add gdrive_list_batch() to core for handling batch queries
    - Add parents field to GdriveFile for proper folder tracking
    - Use GdriveFlatLister instead of GdriveLister for recursive listing
    
    Performance improvement: ~200x faster for large directory trees
    (e.g., 20000 folders: 10min -> 3sec)
    
    * feat(services/gdrive): enable list_with_recursive capability
    
    This enables the GdriveFlatLister to be used when recursive listing
    is requested, providing ~50x faster directory listing by batching
    up to 50 parent IDs per API call using OR queries.
---
 core/services/gdrive/src/backend.rs |  20 ++-
 core/services/gdrive/src/builder.rs |   1 +
 core/services/gdrive/src/core.rs    |  52 ++++++-
 core/services/gdrive/src/lister.rs  | 287 +++++++++++++++++++++++++++++++++++-
 4 files changed, 348 insertions(+), 12 deletions(-)

diff --git a/core/services/gdrive/src/backend.rs 
b/core/services/gdrive/src/backend.rs
index 6949e9673..cc03fe7d9 100644
--- a/core/services/gdrive/src/backend.rs
+++ b/core/services/gdrive/src/backend.rs
@@ -26,6 +26,7 @@ use super::core::GdriveCore;
 use super::core::GdriveFile;
 use super::deleter::GdriveDeleter;
 use super::error::parse_error;
+use super::lister::GdriveFlatLister;
 use super::lister::GdriveLister;
 use super::writer::GdriveWriter;
 use opendal_core::raw::*;
@@ -36,10 +37,13 @@ pub struct GdriveBackend {
     pub core: Arc<GdriveCore>,
 }
 
+/// Lister type that supports both recursive and non-recursive listing
+pub type GdriveListers = TwoWays<oio::PageLister<GdriveLister>, 
GdriveFlatLister>;
+
 impl Access for GdriveBackend {
     type Reader = HttpBody;
     type Writer = oio::OneShotWriter<GdriveWriter>;
-    type Lister = oio::PageLister<GdriveLister>;
+    type Lister = GdriveListers;
     type Deleter = oio::OneShotDeleter<GdriveDeleter>;
 
     fn info(&self) -> Arc<AccessorInfo> {
@@ -117,10 +121,18 @@ impl Access for GdriveBackend {
         ))
     }
 
-    async fn list(&self, path: &str, _args: OpList) -> Result<(RpList, 
Self::Lister)> {
+    async fn list(&self, path: &str, args: OpList) -> Result<(RpList, 
Self::Lister)> {
         let path = build_abs_path(&self.core.root, path);
-        let l = GdriveLister::new(path, self.core.clone());
-        Ok((RpList::default(), oio::PageLister::new(l)))
+
+        if args.recursive() {
+            // Use optimized batch-query recursive lister
+            let l = GdriveFlatLister::new(path, self.core.clone());
+            Ok((RpList::default(), TwoWays::Two(l)))
+        } else {
+            // Use standard page-based lister for non-recursive
+            let l = GdriveLister::new(path, self.core.clone());
+            Ok((RpList::default(), TwoWays::One(oio::PageLister::new(l))))
+        }
     }
 
     async fn copy(&self, from: &str, to: &str, _args: OpCopy) -> 
Result<RpCopy> {
diff --git a/core/services/gdrive/src/builder.rs 
b/core/services/gdrive/src/builder.rs
index 2f726c1e8..b9a0d4a4d 100644
--- a/core/services/gdrive/src/builder.rs
+++ b/core/services/gdrive/src/builder.rs
@@ -115,6 +115,7 @@ impl Builder for GdriveBuilder {
                 read: true,
 
                 list: true,
+                list_with_recursive: true,
 
                 write: true,
 
diff --git a/core/services/gdrive/src/core.rs b/core/services/gdrive/src/core.rs
index 339a3d34b..5593424c9 100644
--- a/core/services/gdrive/src/core.rs
+++ b/core/services/gdrive/src/core.rs
@@ -104,7 +104,53 @@ impl GdriveCore {
         url = url.push("q", &percent_encode_path(&q));
         url = url.push(
             "fields",
-            "nextPageToken,files(id,name,mimeType,size,modifiedTime)",
+            "nextPageToken,files(id,name,mimeType,size,modifiedTime,parents)",
+        );
+        if !next_page_token.is_empty() {
+            url = url.push("pageToken", next_page_token);
+        };
+
+        let mut req = Request::get(url.finish())
+            .extension(Operation::List)
+            .body(Buffer::new())
+            .map_err(new_request_build_error)?;
+        self.sign(&mut req).await?;
+
+        self.info.http_client().send(req).await
+    }
+
+    /// List multiple directories in a single API call using OR query.
+    /// This is much more efficient than listing directories one by one.
+    /// Google Drive API supports up to ~50 parent IDs in a single query.
+    pub async fn gdrive_list_batch(
+        &self,
+        file_ids: &[String],
+        page_size: i32,
+        next_page_token: &str,
+    ) -> Result<Response<Buffer>> {
+        if file_ids.is_empty() {
+            return Err(Error::new(
+                ErrorKind::Unexpected,
+                "file_ids cannot be empty",
+            ));
+        }
+
+        // Build OR query: ('id1' in parents or 'id2' in parents or ...)
+        let parents_query = file_ids
+            .iter()
+            .map(|id| format!("'{}' in parents", id))
+            .collect::<Vec<_>>()
+            .join(" or ");
+        let q = format!("({}) and trashed = false", parents_query);
+
+        let url = "https://www.googleapis.com/drive/v3/files";;
+        let mut url = QueryPairsWriter::new(url);
+        url = url.push("pageSize", &page_size.to_string());
+        url = url.push("q", &percent_encode_path(&q));
+        // Include 'parents' field to know which directory each file belongs to
+        url = url.push(
+            "fields",
+            "nextPageToken,files(id,name,mimeType,size,modifiedTime,parents)",
         );
         if !next_page_token.is_empty() {
             url = url.push("pageToken", next_page_token);
@@ -486,6 +532,10 @@ pub struct GdriveFile {
     // if other operations(such as search) do not specify the `fields` query 
parameter,
     // try to access this field, it will be `None`.
     pub modified_time: Option<String>,
+    // Parents are returned when using batch listing to identify which 
directory
+    // a file belongs to. This is needed for the OR query optimization.
+    #[serde(default)]
+    pub parents: Vec<String>,
 }
 
 /// refer to 
https://developers.google.com/drive/api/reference/rest/v3/files/list
diff --git a/core/services/gdrive/src/lister.rs 
b/core/services/gdrive/src/lister.rs
index 1ee111eca..643effab0 100644
--- a/core/services/gdrive/src/lister.rs
+++ b/core/services/gdrive/src/lister.rs
@@ -15,11 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::collections::HashMap;
+use std::collections::VecDeque;
 use std::sync::Arc;
 
 use http::StatusCode;
 
 use super::core::GdriveCore;
+use super::core::GdriveFile;
 use super::core::GdriveFileList;
 use super::error::parse_error;
 use opendal_core::raw::*;
@@ -50,7 +53,7 @@ impl oio::PageList for GdriveLister {
 
         let resp = self
             .core
-            .gdrive_list(file_id.as_str(), 100, &ctx.token)
+            .gdrive_list(file_id.as_str(), 1000, &ctx.token)
             .await?;
 
         let bytes = match resp.status() {
@@ -94,12 +97,10 @@ impl oio::PageList for GdriveLister {
             let path = format!("{}{}", &self.path, file.name);
             let normalized_path = build_rel_path(root, &path);
 
-            // Update path cache when path doesn't exist.
-            // When Google Drive converts a format, for example, Microsoft 
PowerPoint,
-            // Google Drive keeps two entries with the same ID.
-            if let Ok(None) = self.core.path_cache.get(&path).await {
-                self.core.path_cache.insert(&path, &file.id).await;
-            }
+            // Update path cache with the file id from list response.
+            // This avoids expensive API calls since insert() is idempotent
+            // and checks internally if the entry already exists.
+            self.core.path_cache.insert(&path, &file.id).await;
 
             let mut metadata = 
Metadata::new(file_type).with_content_type(file.mime_type.clone());
             if let Some(size) = file.size {
@@ -122,3 +123,275 @@ impl oio::PageList for GdriveLister {
         Ok(())
     }
 }
+
+// ============================================================================
+// GdriveFlatLister - Efficient recursive listing implementation
+// ============================================================================
+
+// GdriveFlatLister implements efficient recursive listing for Google Drive.
+//
+// Unlike the generic FlatLister which makes one API call per directory,
+// this implementation uses batch queries with OR conditions to list
+// multiple directories in a single API call (up to 50 parent IDs per query).
+//
+// This optimization reduces API calls from O(n) to O(n/50) where n is the
+// number of directories, matching rclone's performance.
+
+/// Maximum number of parent IDs to include in a single batch query.
+/// Google Drive API has limits on query length, 50 is a safe value.
+const BATCH_SIZE: usize = 50;
+
+/// Page size for API requests. Google Drive allows up to 1000.
+const PAGE_SIZE: i32 = 1000;
+
+pub struct GdriveFlatLister {
+    core: Arc<GdriveCore>,
+    root_path: String,
+
+    /// Queue of directory IDs that still need to be listed
+    pending_dirs: VecDeque<PendingDir>,
+
+    /// Buffer of entries ready to be returned
+    entry_buffer: VecDeque<oio::Entry>,
+
+    /// Current batch of directory IDs being listed
+    current_batch: Vec<String>,
+
+    /// Map from directory ID to its absolute path (for reconstructing file 
paths)
+    dir_id_to_path: HashMap<String, String>,
+
+    /// Pagination token for current batch query
+    page_token: String,
+
+    /// Whether we've started processing
+    started: bool,
+
+    /// Whether listing is complete
+    done: bool,
+}
+
+/// Represents a directory waiting to be listed
+struct PendingDir {
+    id: String,
+    path: String,
+}
+
+impl GdriveFlatLister {
+    pub fn new(root_path: String, core: Arc<GdriveCore>) -> Self {
+        Self {
+            core,
+            root_path,
+            pending_dirs: VecDeque::new(),
+            entry_buffer: VecDeque::new(),
+            current_batch: Vec::new(),
+            dir_id_to_path: HashMap::new(),
+            page_token: String::new(),
+            started: false,
+            done: false,
+        }
+    }
+
+    /// Initialize the lister by resolving the root directory ID
+    async fn initialize(&mut self) -> Result<()> {
+        log::debug!(
+            "GdriveFlatLister: initializing with root path: {:?}",
+            &self.root_path
+        );
+
+        let root_id = self.core.path_cache.get(&self.root_path).await?;
+
+        let root_id = match root_id {
+            Some(id) => {
+                log::debug!("GdriveFlatLister: root path resolved to ID: 
{:?}", &id);
+                id
+            }
+            None => {
+                log::debug!(
+                    "GdriveFlatLister: root path not found: {:?}",
+                    &self.root_path
+                );
+                self.done = true;
+                return Ok(());
+            }
+        };
+
+        // Add the root directory entry first
+        let rel_path = build_rel_path(&self.core.root, &self.root_path);
+        let entry = oio::Entry::new(&rel_path, Metadata::new(EntryMode::DIR));
+        self.entry_buffer.push_back(entry);
+
+        // Queue the root directory for listing
+        self.pending_dirs.push_back(PendingDir {
+            id: root_id.clone(),
+            path: self.root_path.clone(),
+        });
+        self.dir_id_to_path.insert(root_id, self.root_path.clone());
+
+        self.started = true;
+        Ok(())
+    }
+
+    /// Fill the current batch with pending directory IDs
+    fn fill_batch(&mut self) {
+        self.current_batch.clear();
+        while self.current_batch.len() < BATCH_SIZE {
+            if let Some(dir) = self.pending_dirs.pop_front() {
+                self.current_batch.push(dir.id.clone());
+                self.dir_id_to_path.insert(dir.id, dir.path);
+            } else {
+                break;
+            }
+        }
+    }
+
+    /// Process a batch of directories using OR query
+    async fn process_batch(&mut self) -> Result<()> {
+        if self.current_batch.is_empty() {
+            self.done = true;
+            return Ok(());
+        }
+
+        log::debug!(
+            "GdriveFlatLister: processing batch of {} directories: {:?}",
+            self.current_batch.len(),
+            &self.current_batch
+        );
+
+        let resp = self
+            .core
+            .gdrive_list_batch(&self.current_batch, PAGE_SIZE, 
&self.page_token)
+            .await?;
+
+        let bytes = match resp.status() {
+            StatusCode::OK => resp.into_body().to_bytes(),
+            _ => return Err(parse_error(resp)),
+        };
+
+        log::debug!("GdriveFlatLister: response size: {} bytes", bytes.len());
+
+        if bytes.is_empty() {
+            self.page_token.clear();
+            self.fill_batch();
+            return Ok(());
+        }
+
+        let decoded_response =
+            
serde_json::from_slice::<GdriveFileList>(&bytes).map_err(new_json_deserialize_error)?;
+
+        log::debug!(
+            "GdriveFlatLister: got {} files, next_page_token: {:?}",
+            decoded_response.files.len(),
+            decoded_response.next_page_token.is_some()
+        );
+
+        // Process files from the response FIRST (this may add new dirs to 
pending_dirs)
+        for file in decoded_response.files {
+            self.process_file(file).await?;
+        }
+
+        // Update pagination state AFTER processing files
+        if let Some(next_page_token) = decoded_response.next_page_token {
+            self.page_token = next_page_token;
+        } else {
+            self.page_token.clear();
+            // Current batch is done, prepare next batch with newly discovered 
directories
+            self.fill_batch();
+        }
+
+        Ok(())
+    }
+
+    /// Process a single file from the API response
+    async fn process_file(&mut self, mut file: GdriveFile) -> Result<()> {
+        let is_dir = file.mime_type.as_str() == 
"application/vnd.google-apps.folder";
+
+        if is_dir && !file.name.ends_with('/') {
+            file.name.push('/');
+        }
+
+        // Find the parent directory path
+        // The file's parents field contains the parent ID(s)
+        let parent_path = if let Some(parent_id) = file.parents.first() {
+            self.dir_id_to_path
+                .get(parent_id)
+                .cloned()
+                .unwrap_or_else(|| self.root_path.clone())
+        } else {
+            self.root_path.clone()
+        };
+
+        let abs_path = format!("{}{}", parent_path, file.name);
+        let rel_path = build_rel_path(&self.core.root, &abs_path);
+
+        // Update path cache
+        self.core.path_cache.insert(&abs_path, &file.id).await;
+
+        // Build metadata
+        let entry_mode = if is_dir {
+            EntryMode::DIR
+        } else {
+            EntryMode::FILE
+        };
+
+        let mut metadata = 
Metadata::new(entry_mode).with_content_type(file.mime_type.clone());
+
+        if let Some(size) = file.size {
+            if let Ok(size) = size.parse::<u64>() {
+                metadata = metadata.with_content_length(size);
+            }
+        }
+
+        if let Some(modified_time) = file.modified_time {
+            if let Ok(ts) = modified_time.parse::<Timestamp>() {
+                metadata = metadata.with_last_modified(ts);
+            }
+        }
+
+        let entry = oio::Entry::new(&rel_path, metadata);
+        self.entry_buffer.push_back(entry);
+
+        // If it's a directory, queue it for recursive listing
+        if is_dir {
+            self.pending_dirs.push_back(PendingDir {
+                id: file.id.clone(),
+                path: abs_path.clone(),
+            });
+            self.dir_id_to_path.insert(file.id, abs_path);
+        }
+
+        Ok(())
+    }
+}
+
+impl oio::List for GdriveFlatLister {
+    async fn next(&mut self) -> Result<Option<oio::Entry>> {
+        // Initialize on first call
+        if !self.started {
+            self.initialize().await?;
+            self.fill_batch();
+        }
+
+        loop {
+            // Return buffered entries first
+            if let Some(entry) = self.entry_buffer.pop_front() {
+                return Ok(Some(entry));
+            }
+
+            // If we're done, return None
+            if self.done {
+                return Ok(None);
+            }
+
+            // Process more directories
+            self.process_batch().await?;
+
+            // Check if we got any entries or if we're done
+            if self.entry_buffer.is_empty()
+                && self.current_batch.is_empty()
+                && self.pending_dirs.is_empty()
+            {
+                self.done = true;
+            }
+        }
+    }
+}

Reply via email to