siomporas commented on code in PR #6836: URL: https://github.com/apache/opendal/pull/6836#discussion_r2573176501
########## core/src/services/git/core.rs: ########## @@ -0,0 +1,575 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; + +use base64::Engine; + +use crate::raw::*; +use crate::*; + +// Constants for Git references and LFS +const GIT_HEAD_REF: &str = "HEAD"; +const LFS_VERSION_PREFIX: &str = "version https://git-lfs.github.com/spec/v1"; +const LFS_CONTENT_TYPE: &str = "application/vnd.git-lfs+json"; +const LFS_OID_PREFIX: &str = "oid sha256:"; +const LFS_SIZE_PREFIX: &str = "size "; + +/// Git LFS pointer information +#[derive(Debug, Clone)] +pub struct LfsPointer { + pub oid: String, + pub size: u64, +} + +/// Wrapper for gix repository with auto-cleanup temp dir +#[derive(Clone)] +struct RepoHolder { + repo: gix::Repository, + _tempdir: Arc<tempfile::TempDir>, +} + +/// Cached LFS download URL from batch API +#[derive(Clone, Debug)] +struct LfsDownloadUrl { + url: String, +} + +/// Core functionality for Git operations. 
+pub struct GitCore { + pub repository: String, + pub reference: String, + pub root: String, + pub username: Option<String>, + pub password: Option<String>, + pub resolve_lfs: bool, + repo_cell: Arc<Mutex<Option<RepoHolder>>>, + lfs_url_cache: Arc<Mutex<HashMap<String, LfsDownloadUrl>>>, +} + +impl Debug for GitCore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("GitCore") + .field("repository", &self.repository) + .field("reference", &self.reference) + .field("root", &self.root) + .field("username", &self.username) + .field("password", &"<redacted>") + .field("resolve_lfs", &self.resolve_lfs) + .finish_non_exhaustive() + } +} + +impl GitCore { + /// Create a new GitCore instance + pub fn new( + repository: String, + reference: String, + root: String, + username: Option<String>, + password: Option<String>, + resolve_lfs: bool, + ) -> Self { + Self { + repository, + reference, + root, + username, + password, + resolve_lfs, + repo_cell: Arc::new(Mutex::new(None)), + lfs_url_cache: Arc::new(Mutex::new(HashMap::new())), + } + } + + fn get_or_create_repo(&self) -> Result<gix::Repository> { + let mut cell = self + .repo_cell + .lock() + .map_err(|_| Error::new(ErrorKind::Unexpected, "repository lock poisoned"))?; + + if cell.is_none() { + let temp_dir = tempfile::tempdir().map_err(|e| { + Error::new(ErrorKind::Unexpected, "failed to create temp dir").set_source(e) + })?; + + // Build URL with credentials if provided + let clone_url = + if let (Some(username), Some(password)) = (&self.username, &self.password) { + // Parse the URL and inject credentials + let url = self.repository.as_str(); + if let Some(scheme_end) = url.find("://") { + let (scheme, rest) = url.split_at(scheme_end + 3); + format!("{}{}:{}@{}", scheme, username, password, rest) + } else { + self.repository.clone() + } + } else { + self.repository.clone() + }; + + let mut prepare = + gix::prepare_clone(clone_url.as_str(), temp_dir.path()).map_err(|e| { + 
Error::new(ErrorKind::Unexpected, "failed to prepare clone").set_source(e) + })?; + + // Use full clone instead of shallow to support arbitrary commit SHAs + // Shallow clone only gets the tip of the default branch + let (repo, _) = prepare + .fetch_only(gix::progress::Discard, &gix::interrupt::IS_INTERRUPTED) Review Comment: The clone operation is for the underlying git repository, and it is done in a temp folder. This is necessary because of how git's internal object database and revisions are stored and served in packs. Dealing with individual files requires access to the packs, which is how services like GitHub serve individual files. This service is meant to use the least common denominator, which is git's protocol, so we need to download the packs for an oid to get the files, which is what gix provides. The value isn't in providing the contents of the core git repo (these contents are loaded into memory when the repo is cloned); it is in offering streams for the LFS objects, which are generally huge, exist completely outside and on top of the git repository, and are directly accessible over HTTP. Hopefully that makes sense. If it still doesn't, I suggest you clone the example project I linked in the description and then clone an AI model off of Hugging Face while watching the resource utilisation of the process. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
