Copilot commented on code in PR #63814:
URL: https://github.com/apache/airflow/pull/63814#discussion_r3025334117
##########
providers/git/src/airflow/providers/git/bundles/git.py:
##########
@@ -123,16 +123,37 @@ def _is_pruned_worktree(self) -> bool:
return False
return not (self.repo_path / ".git").exists()
+ def _local_repo_has_version(self) -> bool:
+ """Check if the local repo already has the correct version checked
out."""
+ if not self.version or not self.repo_path.is_dir() or not
(self.repo_path / ".git").exists():
+ return False
+ try:
+ repo = Repo(self.repo_path)
+ has_version = repo.head.commit.hexsha == self.version
+ repo.close()
+ return has_version
+ except Exception: # general exception just to catch anything wrong
with repo
+ return False
+
def _initialize(self):
with self.lock():
- # Avoids re-cloning on every task run when
prune_dotgit_folder=True.
+ # Avoids re-cloning on every task run when:
+ # 1. prune_dotgit_folder=True
+ # 2. When local version matches expected version
Review Comment:
The comment is misleading: `_is_pruned_worktree()` checks for “no `.git`
folder”, not whether `prune_dotgit_folder=True`. Since the condition can also
be true if `.git` is missing for other reasons (or if `prune_dotgit_folder` has
changed between runs), consider rewording to describe the actual state-based
behavior (e.g., 'when a pruned worktree already exists') rather than
referencing the configuration flag.
```suggestion
# 1. A pruned worktree (no .git directory) already exists at
repo_path
# 2. The local repo already has the expected version checked out
```
##########
providers/git/src/airflow/providers/git/bundles/git.py:
##########
@@ -123,16 +123,37 @@ def _is_pruned_worktree(self) -> bool:
return False
return not (self.repo_path / ".git").exists()
+ def _local_repo_has_version(self) -> bool:
+ """Check if the local repo already has the correct version checked
out."""
+ if not self.version or not self.repo_path.is_dir() or not
(self.repo_path / ".git").exists():
+ return False
+ try:
+ repo = Repo(self.repo_path)
+ has_version = repo.head.commit.hexsha == self.version
+ repo.close()
+ return has_version
+ except Exception: # general exception just to catch anything wrong
with repo
+ return False
Review Comment:
This uses a broad `except Exception` and manually calls `repo.close()` only
on the success path. It’s better to ensure the repo handle is closed even when
an exception occurs (e.g., via `with Repo(...) as repo:` or a `finally` block),
and to narrow the exception types to expected GitPython failures (e.g.,
`InvalidGitRepositoryError`, `NoSuchPathError`, `BadName`, `ValueError`) so
unexpected errors aren’t silently swallowed.
```suggestion
repo = None
try:
repo = Repo(self.repo_path)
return repo.head.commit.hexsha == self.version
except (InvalidGitRepositoryError, NoSuchPathError, BadName,
GitCommandError, ValueError):
# Any expected git-related error means we can't trust the local
repo version.
return False
finally:
if repo is not None:
repo.close()
```
##########
providers/git/src/airflow/providers/git/bundles/git.py:
##########
@@ -123,16 +123,37 @@ def _is_pruned_worktree(self) -> bool:
return False
return not (self.repo_path / ".git").exists()
+ def _local_repo_has_version(self) -> bool:
+ """Check if the local repo already has the correct version checked
out."""
+ if not self.version or not self.repo_path.is_dir() or not
(self.repo_path / ".git").exists():
+ return False
+ try:
+ repo = Repo(self.repo_path)
+ has_version = repo.head.commit.hexsha == self.version
+ repo.close()
+ return has_version
Review Comment:
`self.version` in `GitDagBundle` is treated elsewhere as a general
“commit-ish” (e.g., tag name, abbreviated SHA), but here it’s compared directly
to `repo.head.commit.hexsha` (full 40-char SHA). This prevents the new
optimization from working for tags/short SHAs and can lead to inconsistent
behavior vs `_has_version()`/`repo.commit(version)` usage. Consider resolving
`self.version` to a commit and comparing hexshas (e.g.,
`repo.commit(self.version).hexsha`).
```suggestion
try:
expected_commit = repo.commit(self.version)
has_version = repo.head.commit.hexsha ==
expected_commit.hexsha
return has_version
finally:
repo.close()
```
##########
providers/git/tests/unit/git/bundles/test_git.py:
##########
@@ -308,6 +308,81 @@ def
test_second_initialize_reuses_pruned_worktree_without_recloning(self, mock_g
files_in_repo = {f.name for f in bundle2.path.iterdir() if f.is_file()}
assert {"test_dag.py"} == files_in_repo
+ @mock.patch("airflow.providers.git.bundles.git.GitHook")
+ def test_second_initialize_skips_clone_when_local_repo_has_version(self,
mock_githook, git_repo):
+ """When the local repo already has the correct version checked out
(with .git intact), skip re-cloning."""
+ repo_path, repo = git_repo
+ mock_githook.return_value.repo_url = repo_path
+ version = repo.head.commit.hexsha
+ bundle_name = "test_version_reuse"
+
+ # Clone with prune_dotgit_folder=False so .git is preserved
+ bundle1 = GitDagBundle(
+ name=bundle_name,
+ git_conn_id=CONN_HTTPS,
+ version=version,
+ tracking_ref=GIT_DEFAULT_BRANCH,
+ prune_dotgit_folder=False,
+ )
+ bundle1.initialize()
+ assert (bundle1.repo_path / ".git").exists()
+ assert bundle1.get_current_version() == version
+
+ # Should detect local repo has correct version and skip clone
+ with (
+ patch.object(GitDagBundle, "_clone_bare_repo_if_required") as
mock_bare_clone,
+ patch.object(GitDagBundle, "_clone_repo_if_required") as
mock_clone,
+ ):
+ bundle2 = GitDagBundle(
+ name=bundle_name,
+ git_conn_id=CONN_HTTPS,
+ version=version,
+ tracking_ref=GIT_DEFAULT_BRANCH,
+ prune_dotgit_folder=False,
+ )
+ bundle2.initialize()
+ mock_bare_clone.assert_not_called()
+ mock_clone.assert_not_called()
+
+ @mock.patch("airflow.providers.git.bundles.git.GitHook")
+ def test_local_repo_with_wrong_version_does_not_skip_clone(self,
mock_githook, git_repo):
+ """When the local repo has a different version, do not skip cloning."""
Review Comment:
This test name/docstring suggests it validates the new 'do not skip when
local repo is on a different version' behavior, but the setup uses a different
`version` value on the second initialization, which changes `repo_path`
(because `repo_path = versions_dir / version`). That means the test doesn’t
exercise the 'same path but wrong HEAD' scenario that
`_local_repo_has_version()` is intended to gate. Consider renaming this test to
what it actually verifies, or adjust it to mutate the existing repo at the same
version-path (e.g., checkout a different commit in-place) and then assert
initialization does not short-circuit.
```suggestion
def test_initialize_with_different_versions_creates_separate_repos(self,
mock_githook, git_repo):
"""Initializing the same bundle with different versions creates
separate repos, each at the requested version."""
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]