This is an automated email from the ASF dual-hosted git repository.
sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
The following commit(s) were added to refs/heads/sbp by this push:
new 342cefb Use rsync to compare GitHub trees with extracted archives
342cefb is described below
commit 342cefb093bfa2b7d66d8d597afc21b7d62d40db
Author: Sean B. Palmer <[email protected]>
AuthorDate: Thu Feb 5 16:22:21 2026 +0000
Use rsync to compare GitHub trees with extracted archives
---
atr/storage/__init__.py | 2 +-
atr/tasks/checks/compare.py | 95 ++++++++++++++
tests/unit/test_checks_compare.py | 262 ++++++++++++++++++++++++++++++++++++--
3 files changed, 345 insertions(+), 14 deletions(-)
diff --git a/atr/storage/__init__.py b/atr/storage/__init__.py
index fcca6c0..cf15941 100644
--- a/atr/storage/__init__.py
+++ b/atr/storage/__init__.py
@@ -52,7 +52,7 @@ def audit(**kwargs: basic.JSON) -> None:
kwargs = {"datetime": now, "action": action, **kwargs}
if request_user_id:
kwargs["request_user_id"] = request_user_id
- if admin_user_id and request_user_id != admin_user_id:
+ if admin_user_id and (request_user_id != admin_user_id):
kwargs["admin_user_id"] = admin_user_id
msg = json.dumps(kwargs, allow_nan=False)
# The atr.log logger should give the same name
diff --git a/atr/tasks/checks/compare.py b/atr/tasks/checks/compare.py
index d1afe88..3cf62c7 100644
--- a/atr/tasks/checks/compare.py
+++ b/atr/tasks/checks/compare.py
@@ -17,10 +17,12 @@
import asyncio
import contextlib
+import dataclasses
import json
import os
import pathlib
import shutil
+import subprocess
import time
from collections.abc import Mapping
from typing import Any, Final
@@ -60,6 +62,12 @@ class DetermineWantsForSha:
return [dulwich.objects.ObjectID(self.sha.encode("ascii"))]
[email protected]
+class TreeComparisonResult:
+ invalid: set[str]
+ repo_only: set[str]
+
+
async def source_trees(args: checks.FunctionArguments) -> results.Results | None:
recorder = await args.recorder()
is_source = await recorder.primary_path_is_source()
@@ -89,6 +97,12 @@ async def source_trees(args: checks.FunctionArguments) -> results.Results | None
await aiofiles.os.makedirs(github_dir, exist_ok=True)
await aiofiles.os.makedirs(archive_dir_path, exist_ok=True)
checkout_dir = await _checkout_github_source(payload, github_dir)
+ if checkout_dir is None:
+ await recorder.failure(
+ "Failed to clone GitHub repository for comparison",
+            {"repo_url": f"https://github.com/{payload.repository}.git", "sha": payload.sha},
+ )
+ return None
     if await _decompress_archive(primary_abs_path, archive_dir_path, max_extract_size, chunk_size):
archive_dir = str(archive_dir_path)
else:
@@ -97,6 +111,29 @@ async def source_trees(args: checks.FunctionArguments) -> results.Results | None
             {"archive_path": str(primary_abs_path), "extract_dir": str(archive_dir_path)},
)
return None
+ try:
+ comparison = await _compare_trees(github_dir, archive_dir_path)
+ except RuntimeError as exc:
+ await recorder.failure(
+ "Failed to compare source tree against GitHub checkout",
+ {"error": str(exc)},
+ )
+ return None
+ if comparison.invalid:
+ invalid_list = sorted(comparison.invalid)
+ await recorder.failure(
+            "Source archive contains files not in GitHub checkout or with different content",
+            {"invalid_count": len(invalid_list), "invalid_paths": invalid_list},
+ )
+ return None
+ repo_only_list = sorted(comparison.repo_only)
+ await recorder.success(
+ "Source archive is a valid subset of GitHub checkout",
+ {
+ "repo_only_count": len(repo_only_list),
+ "repo_only_paths_sample": repo_only_list[:5],
+ },
+ )
payload_summary = _payload_summary(payload)
log.info(
"Ran compare.source_trees successfully",
@@ -166,6 +203,64 @@ def _clone_repo(repo_url: str, sha: str, checkout_dir: pathlib.Path) -> None:
shutil.rmtree(git_dir)
+async def _compare_trees(repo_dir: pathlib.Path, archive_dir: pathlib.Path) -> TreeComparisonResult:
+ return await asyncio.to_thread(_compare_trees_rsync, repo_dir, archive_dir)
+
+
+def _compare_trees_rsync(repo_dir: pathlib.Path, archive_dir: pathlib.Path) -> TreeComparisonResult:  # noqa: C901
+ if shutil.which("rsync") is None:
+ raise RuntimeError("rsync is not available on PATH")
+ command = [
+ "rsync",
+ "-a",
+ "--delete",
+ "--dry-run",
+ "--itemize-changes",
+ "--out-format=%i %n",
+ "--no-motd",
+ "--checksum",
+ f"{repo_dir}{os.sep}",
+ f"{archive_dir}{os.sep}",
+ ]
+ result = subprocess.run(
+ command,
+ capture_output=True,
+ text=True,
+ check=False,
+ )
+ if result.returncode != 0:
+        raise RuntimeError(f"rsync failed with code {result.returncode}: {result.stderr.strip()}")
+ invalid: set[str] = set()
+ repo_only: set[str] = set()
+ for line in (result.stdout + "\n" + result.stderr).splitlines():
+ line = line.strip()
+ if not line:
+ continue
+ rel_path: str | None = None
+ is_repo_only = False
+ if line.startswith("*deleting "):
+ rel_path = line.removeprefix("*deleting ").strip().rstrip("/")
+ elif line.startswith("deleting "):
+ rel_path = line.removeprefix("deleting ").strip().rstrip("/")
+ else:
+ parts = line.split(" ", 1)
+ if len(parts) == 2:
+ flags = parts[0]
+ rel_path = parts[1].rstrip("/")
+            if flags.startswith(">f") and (len(flags) >= 3) and (flags[2] == "+"):
+ is_repo_only = True
+ if not rel_path:
+ continue
+ full_repo = repo_dir / rel_path
+ full_archive = archive_dir / rel_path
+ if full_repo.is_file() or full_archive.is_file():
+ if is_repo_only:
+ repo_only.add(rel_path)
+ else:
+ invalid.add(rel_path)
+ return TreeComparisonResult(invalid, repo_only)
+
+
async def _decompress_archive(
archive_path: pathlib.Path,
extract_dir: pathlib.Path,
diff --git a/tests/unit/test_checks_compare.py
b/tests/unit/test_checks_compare.py
index 1ee3c76..dbac4fc 100644
--- a/tests/unit/test_checks_compare.py
+++ b/tests/unit/test_checks_compare.py
@@ -17,7 +17,9 @@
import datetime
import pathlib
-from collections.abc import Callable, Mapping
+import shutil
+import subprocess
+from collections.abc import Callable, Iterable, Mapping
import aiofiles.os
import dulwich.objects
@@ -60,6 +62,23 @@ class CommitStub:
self.tree = tree
+class CompareRecorder:
+ def __init__(
+ self,
+ invalid: set[str] | None = None,
+ repo_only: set[str] | None = None,
+ ) -> None:
+ self.calls: list[tuple[pathlib.Path, pathlib.Path]] = []
+ self.invalid = invalid or set()
+ self.repo_only = repo_only or set()
+
+ async def __call__(
+ self, repo_dir: pathlib.Path, archive_dir: pathlib.Path
+ ) -> atr.tasks.checks.compare.TreeComparisonResult:
+ self.calls.append((repo_dir, archive_dir))
+ return
atr.tasks.checks.compare.TreeComparisonResult(set(self.invalid),
set(self.repo_only))
+
+
class DecompressRecorder:
def __init__(self, return_value: bool = True) -> None:
self.archive_path: pathlib.Path | None = None
@@ -83,6 +102,21 @@ class DecompressRecorder:
return self.return_value
+class ExtractErrorRaiser:
+ def __call__(self, *args: object, **kwargs: object) -> tuple[int,
list[str]]:
+ raise atr.tasks.checks.compare.archives.ExtractionError("Extraction
error")
+
+
+class ExtractRecorder:
+ def __init__(self, extracted_size: int = 123) -> None:
+ self.calls: list[tuple[str, str, int, int]] = []
+ self.extracted_size = extracted_size
+
+ def __call__(self, archive_path: str, extract_dir: str, max_size: int,
chunk_size: int) -> tuple[int, list[str]]:
+ self.calls.append((archive_path, extract_dir, max_size, chunk_size))
+ return self.extracted_size, []
+
+
class GitClientStub:
def __init__(self) -> None:
self.closed = False
@@ -130,19 +164,20 @@ class ParseCommitRecorder:
return self.commit
-class ExtractErrorRaiser:
- def __call__(self, *args: object, **kwargs: object) -> tuple[int,
list[str]]:
- raise atr.tasks.checks.compare.archives.ExtractionError("Extraction
error")
+class RunRecorder:
+ def __init__(self, completed: subprocess.CompletedProcess[str]) -> None:
+ self.calls: list[list[str]] = []
+ self.completed = completed
-
-class ExtractRecorder:
- def __init__(self, extracted_size: int = 123) -> None:
- self.calls: list[tuple[str, str, int, int]] = []
- self.extracted_size = extracted_size
-
- def __call__(self, archive_path: str, extract_dir: str, max_size: int,
chunk_size: int) -> tuple[int, list[str]]:
- self.calls.append((archive_path, extract_dir, max_size, chunk_size))
- return self.extracted_size, []
+ def __call__(
+ self,
+ args: list[str],
+ capture_output: bool,
+ text: bool,
+ check: bool,
+ ) -> subprocess.CompletedProcess[str]:
+ self.calls.append(args)
+ return self.completed
class PayloadLoader:
@@ -191,6 +226,7 @@ class RecorderStub(atr.tasks.checks.Recorder):
afresh=False,
)
self.failure_calls: list[tuple[str, object]] = []
+ self.success_calls: list[tuple[str, object]] = []
self._is_source = is_source
async def primary_path_is_source(self) -> bool:
@@ -213,6 +249,23 @@ class RecorderStub(atr.tasks.checks.Recorder):
cached=False,
)
+ async def success(
+ self, message: str, data: object, primary_rel_path: str | None = None,
member_rel_path: str | None = None
+ ) -> atr.models.sql.CheckResult:
+ self.success_calls.append((message, data))
+ return atr.models.sql.CheckResult(
+ release_name=self.release_name,
+ revision_number=self.revision_number,
+ checker=self.checker,
+ primary_rel_path=primary_rel_path or self.primary_rel_path,
+ member_rel_path=member_rel_path,
+ created=datetime.datetime.now(datetime.UTC),
+ status=atr.models.sql.CheckResultStatus.SUCCESS,
+ message=message,
+ data=data,
+ cached=False,
+ )
+
class RepoStub:
def __init__(self, controldir: pathlib.Path, worktree: object) -> None:
@@ -329,6 +382,116 @@ def
test_clone_repo_raises_when_commit_missing(monkeypatch: pytest.MonkeyPatch,
assert git_client.closed is True
+def test_compare_trees_rsync_archive_has_extra_file(monkeypatch:
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
+ repo_dir = tmp_path / "repo"
+ archive_dir = tmp_path / "archive"
+ _make_tree(repo_dir, ["a.txt"])
+ _make_tree(archive_dir, ["a.txt", "b.txt"])
+ completed = subprocess.CompletedProcess(
+ args=["rsync"],
+ returncode=0,
+ stdout="",
+ stderr="*deleting b.txt\n",
+ )
+ run_recorder = RunRecorder(completed)
+
+ monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+ monkeypatch.setattr(subprocess, "run", run_recorder)
+
+ result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir,
archive_dir)
+
+ assert result.invalid == {"b.txt"}
+ assert result.repo_only == set()
+
+
+def test_compare_trees_rsync_content_differs(monkeypatch: pytest.MonkeyPatch,
tmp_path: pathlib.Path) -> None:
+ repo_dir = tmp_path / "repo"
+ archive_dir = tmp_path / "archive"
+ _make_tree(repo_dir, ["a.txt", "b.txt"])
+ _make_tree(archive_dir, ["a.txt", "b.txt"])
+ completed = subprocess.CompletedProcess(
+ args=["rsync"],
+ returncode=0,
+ stdout=">fc......... b.txt\n",
+ stderr="",
+ )
+ run_recorder = RunRecorder(completed)
+
+ monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+ monkeypatch.setattr(subprocess, "run", run_recorder)
+
+ result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir,
archive_dir)
+
+ assert result.invalid == {"b.txt"}
+ assert result.repo_only == set()
+
+
+def test_compare_trees_rsync_distinct_files(monkeypatch: pytest.MonkeyPatch,
tmp_path: pathlib.Path) -> None:
+ repo_dir = tmp_path / "repo"
+ archive_dir = tmp_path / "archive"
+ _make_tree(repo_dir, ["a.txt"])
+ _make_tree(archive_dir, ["b.txt"])
+ completed = subprocess.CompletedProcess(
+ args=["rsync"],
+ returncode=0,
+ stdout=">f+++++++++ a.txt\n",
+ stderr="*deleting b.txt\n",
+ )
+ run_recorder = RunRecorder(completed)
+
+ monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+ monkeypatch.setattr(subprocess, "run", run_recorder)
+
+ result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir,
archive_dir)
+
+ assert result.invalid == {"b.txt"}
+ assert result.repo_only == {"a.txt"}
+
+
+def test_compare_trees_rsync_repo_has_extra_file(monkeypatch:
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
+ repo_dir = tmp_path / "repo"
+ archive_dir = tmp_path / "archive"
+ _make_tree(repo_dir, ["a.txt", "b.txt"])
+ _make_tree(archive_dir, ["a.txt"])
+ completed = subprocess.CompletedProcess(
+ args=["rsync"],
+ returncode=0,
+ stdout=">f+++++++++ b.txt\n",
+ stderr="",
+ )
+ run_recorder = RunRecorder(completed)
+
+ monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+ monkeypatch.setattr(subprocess, "run", run_recorder)
+
+ result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir,
archive_dir)
+
+ assert result.invalid == set()
+ assert result.repo_only == {"b.txt"}
+
+
+def test_compare_trees_rsync_trees_match(monkeypatch: pytest.MonkeyPatch,
tmp_path: pathlib.Path) -> None:
+ repo_dir = tmp_path / "repo"
+ archive_dir = tmp_path / "archive"
+ _make_tree(repo_dir, ["a.txt", "b.txt"])
+ _make_tree(archive_dir, ["a.txt", "b.txt"])
+ completed = subprocess.CompletedProcess(
+ args=["rsync"],
+ returncode=0,
+ stdout="",
+ stderr="",
+ )
+ run_recorder = RunRecorder(completed)
+
+ monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+ monkeypatch.setattr(subprocess, "run", run_recorder)
+
+ result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir,
archive_dir)
+
+ assert result.invalid == set()
+ assert result.repo_only == set()
+
+
@pytest.mark.asyncio
async def test_decompress_archive_calls_extract(monkeypatch:
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
archive_path = tmp_path / "artifact.tar.gz"
@@ -368,11 +531,13 @@ async def
test_source_trees_creates_temp_workspace_and_cleans_up(
payload = _make_payload()
checkout = CheckoutRecorder()
decompress = DecompressRecorder()
+ compare = CompareRecorder(repo_only={"extra1.txt", "extra2.txt"})
tmp_root = tmp_path / "temporary-root"
monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload",
PayloadLoader(payload))
monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source",
checkout)
monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive",
decompress)
+ monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir",
ReturnValue(tmp_root))
await atr.tasks.checks.compare.source_trees(args)
@@ -386,6 +551,12 @@ async def
test_source_trees_creates_temp_workspace_and_cleans_up(
assert checkout_dir.parent.name.startswith("trees-")
assert await aiofiles.os.path.exists(tmp_root)
assert not await aiofiles.os.path.exists(checkout_dir.parent)
+ assert len(recorder.success_calls) == 1
+ message, data = recorder.success_calls[0]
+ assert message == "Source archive is a valid subset of GitHub checkout"
+ assert isinstance(data, dict)
+ assert data["repo_only_count"] == 2
+ assert data["repo_only_paths_sample"] == ["extra1.txt", "extra2.txt"]
@pytest.mark.asyncio
@@ -409,6 +580,35 @@ async def
test_source_trees_payload_none_skips_temp_workspace(monkeypatch: pytes
await atr.tasks.checks.compare.source_trees(args)
[email protected]
+async def test_source_trees_records_failure_when_archive_has_invalid_files(
+ monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+) -> None:
+ recorder = RecorderStub(True)
+ args = _make_args(recorder)
+ payload = _make_payload()
+ checkout = CheckoutRecorder()
+ decompress = DecompressRecorder()
+ compare = CompareRecorder(invalid={"bad1.txt", "bad2.txt"},
repo_only={"ok.txt"})
+ tmp_root = tmp_path / "temporary-root"
+
+ monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload",
PayloadLoader(payload))
+ monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source",
checkout)
+ monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive",
decompress)
+ monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
+ monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir",
ReturnValue(tmp_root))
+
+ await atr.tasks.checks.compare.source_trees(args)
+
+ assert len(recorder.failure_calls) == 1
+ assert len(recorder.success_calls) == 0
+ message, data = recorder.failure_calls[0]
+ assert message == "Source archive contains files not in GitHub checkout or
with different content"
+ assert isinstance(data, dict)
+ assert data["invalid_count"] == 2
+ assert data["invalid_paths"] == ["bad1.txt", "bad2.txt"]
+
+
@pytest.mark.asyncio
async def test_source_trees_records_failure_when_decompress_fails(
monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
@@ -435,6 +635,34 @@ async def
test_source_trees_records_failure_when_decompress_fails(
assert data["extract_dir"] == str(decompress.extract_dir)
[email protected]
+async def test_source_trees_reports_repo_only_sample_limited_to_five(
+ monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+) -> None:
+ recorder = RecorderStub(True)
+ args = _make_args(recorder)
+ payload = _make_payload()
+ checkout = CheckoutRecorder()
+ decompress = DecompressRecorder()
+ repo_only_files = {f"file{i}.txt" for i in range(10)}
+ compare = CompareRecorder(repo_only=repo_only_files)
+ tmp_root = tmp_path / "temporary-root"
+
+ monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload",
PayloadLoader(payload))
+ monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source",
checkout)
+ monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive",
decompress)
+ monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
+ monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir",
ReturnValue(tmp_root))
+
+ await atr.tasks.checks.compare.source_trees(args)
+
+ assert len(recorder.success_calls) == 1
+ _message, data = recorder.success_calls[0]
+ assert isinstance(data, dict)
+ assert data["repo_only_count"] == 10
+ assert len(data["repo_only_paths_sample"]) == 5
+
+
@pytest.mark.asyncio
async def test_source_trees_skips_when_not_source(monkeypatch:
pytest.MonkeyPatch) -> None:
recorder = RecorderStub(False)
@@ -497,3 +725,11 @@ def _make_payload(
"workflow_sha": "ffffffffffffffffffffffffffffffffffffffff",
}
return
atr.sbom.models.github.TrustedPublisherPayload.model_validate(payload)
+
+
+def _make_tree(root: pathlib.Path, files: Iterable[str]) -> None:
+ root.mkdir(parents=True, exist_ok=True)
+ for rel_path in files:
+ target = root / rel_path
+ target.parent.mkdir(parents=True, exist_ok=True)
+ target.write_text(rel_path, encoding="utf-8")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]