This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git


The following commit(s) were added to refs/heads/sbp by this push:
     new 342cefb  Use rsync to compare GitHub trees with extracted archives
342cefb is described below

commit 342cefb093bfa2b7d66d8d597afc21b7d62d40db
Author: Sean B. Palmer <[email protected]>
AuthorDate: Thu Feb 5 16:22:21 2026 +0000

    Use rsync to compare GitHub trees with extracted archives
---
 atr/storage/__init__.py           |   2 +-
 atr/tasks/checks/compare.py       |  95 ++++++++++++++
 tests/unit/test_checks_compare.py | 262 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 345 insertions(+), 14 deletions(-)

diff --git a/atr/storage/__init__.py b/atr/storage/__init__.py
index fcca6c0..cf15941 100644
--- a/atr/storage/__init__.py
+++ b/atr/storage/__init__.py
@@ -52,7 +52,7 @@ def audit(**kwargs: basic.JSON) -> None:
     kwargs = {"datetime": now, "action": action, **kwargs}
     if request_user_id:
         kwargs["request_user_id"] = request_user_id
-    if admin_user_id and request_user_id != admin_user_id:
+    if admin_user_id and (request_user_id != admin_user_id):
         kwargs["admin_user_id"] = admin_user_id
     msg = json.dumps(kwargs, allow_nan=False)
     # The atr.log logger should give the same name
diff --git a/atr/tasks/checks/compare.py b/atr/tasks/checks/compare.py
index d1afe88..3cf62c7 100644
--- a/atr/tasks/checks/compare.py
+++ b/atr/tasks/checks/compare.py
@@ -17,10 +17,12 @@
 
 import asyncio
 import contextlib
+import dataclasses
 import json
 import os
 import pathlib
 import shutil
+import subprocess
 import time
 from collections.abc import Mapping
 from typing import Any, Final
@@ -60,6 +62,12 @@ class DetermineWantsForSha:
         return [dulwich.objects.ObjectID(self.sha.encode("ascii"))]
 
 
[email protected]
+class TreeComparisonResult:
+    invalid: set[str]
+    repo_only: set[str]
+
+
 async def source_trees(args: checks.FunctionArguments) -> results.Results | 
None:
     recorder = await args.recorder()
     is_source = await recorder.primary_path_is_source()
@@ -89,6 +97,12 @@ async def source_trees(args: checks.FunctionArguments) -> 
results.Results | None
             await aiofiles.os.makedirs(github_dir, exist_ok=True)
             await aiofiles.os.makedirs(archive_dir_path, exist_ok=True)
             checkout_dir = await _checkout_github_source(payload, github_dir)
+            if checkout_dir is None:
+                await recorder.failure(
+                    "Failed to clone GitHub repository for comparison",
+                    {"repo_url": 
f"https://github.com/{payload.repository}.git", "sha": payload.sha},
+                )
+                return None
             if await _decompress_archive(primary_abs_path, archive_dir_path, 
max_extract_size, chunk_size):
                 archive_dir = str(archive_dir_path)
             else:
@@ -97,6 +111,29 @@ async def source_trees(args: checks.FunctionArguments) -> 
results.Results | None
                     {"archive_path": str(primary_abs_path), "extract_dir": 
str(archive_dir_path)},
                 )
                 return None
+            try:
+                comparison = await _compare_trees(github_dir, archive_dir_path)
+            except RuntimeError as exc:
+                await recorder.failure(
+                    "Failed to compare source tree against GitHub checkout",
+                    {"error": str(exc)},
+                )
+                return None
+            if comparison.invalid:
+                invalid_list = sorted(comparison.invalid)
+                await recorder.failure(
+                    "Source archive contains files not in GitHub checkout or 
with different content",
+                    {"invalid_count": len(invalid_list), "invalid_paths": 
invalid_list},
+                )
+                return None
+            repo_only_list = sorted(comparison.repo_only)
+            await recorder.success(
+                "Source archive is a valid subset of GitHub checkout",
+                {
+                    "repo_only_count": len(repo_only_list),
+                    "repo_only_paths_sample": repo_only_list[:5],
+                },
+            )
     payload_summary = _payload_summary(payload)
     log.info(
         "Ran compare.source_trees successfully",
@@ -166,6 +203,64 @@ def _clone_repo(repo_url: str, sha: str, checkout_dir: 
pathlib.Path) -> None:
         shutil.rmtree(git_dir)
 
 
+async def _compare_trees(repo_dir: pathlib.Path, archive_dir: pathlib.Path) -> 
TreeComparisonResult:
+    return await asyncio.to_thread(_compare_trees_rsync, repo_dir, archive_dir)
+
+
+def _compare_trees_rsync(repo_dir: pathlib.Path, archive_dir: pathlib.Path) -> 
TreeComparisonResult:  # noqa: C901
+    if shutil.which("rsync") is None:
+        raise RuntimeError("rsync is not available on PATH")
+    command = [
+        "rsync",
+        "-a",
+        "--delete",
+        "--dry-run",
+        "--itemize-changes",
+        "--out-format=%i %n",
+        "--no-motd",
+        "--checksum",
+        f"{repo_dir}{os.sep}",
+        f"{archive_dir}{os.sep}",
+    ]
+    result = subprocess.run(
+        command,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"rsync failed with code {result.returncode}: 
{result.stderr.strip()}")
+    invalid: set[str] = set()
+    repo_only: set[str] = set()
+    for line in (result.stdout + "\n" + result.stderr).splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        rel_path: str | None = None
+        is_repo_only = False
+        if line.startswith("*deleting "):
+            rel_path = line.removeprefix("*deleting ").strip().rstrip("/")
+        elif line.startswith("deleting "):
+            rel_path = line.removeprefix("deleting ").strip().rstrip("/")
+        else:
+            parts = line.split(" ", 1)
+            if len(parts) == 2:
+                flags = parts[0]
+                rel_path = parts[1].rstrip("/")
+                if flags.startswith(">f") and (len(flags) >= 3) and (flags[2] 
== "+"):
+                    is_repo_only = True
+        if not rel_path:
+            continue
+        full_repo = repo_dir / rel_path
+        full_archive = archive_dir / rel_path
+        if full_repo.is_file() or full_archive.is_file():
+            if is_repo_only:
+                repo_only.add(rel_path)
+            else:
+                invalid.add(rel_path)
+    return TreeComparisonResult(invalid, repo_only)
+
+
 async def _decompress_archive(
     archive_path: pathlib.Path,
     extract_dir: pathlib.Path,
diff --git a/tests/unit/test_checks_compare.py 
b/tests/unit/test_checks_compare.py
index 1ee3c76..dbac4fc 100644
--- a/tests/unit/test_checks_compare.py
+++ b/tests/unit/test_checks_compare.py
@@ -17,7 +17,9 @@
 
 import datetime
 import pathlib
-from collections.abc import Callable, Mapping
+import shutil
+import subprocess
+from collections.abc import Callable, Iterable, Mapping
 
 import aiofiles.os
 import dulwich.objects
@@ -60,6 +62,23 @@ class CommitStub:
         self.tree = tree
 
 
+class CompareRecorder:
+    def __init__(
+        self,
+        invalid: set[str] | None = None,
+        repo_only: set[str] | None = None,
+    ) -> None:
+        self.calls: list[tuple[pathlib.Path, pathlib.Path]] = []
+        self.invalid = invalid or set()
+        self.repo_only = repo_only or set()
+
+    async def __call__(
+        self, repo_dir: pathlib.Path, archive_dir: pathlib.Path
+    ) -> atr.tasks.checks.compare.TreeComparisonResult:
+        self.calls.append((repo_dir, archive_dir))
+        return 
atr.tasks.checks.compare.TreeComparisonResult(set(self.invalid), 
set(self.repo_only))
+
+
 class DecompressRecorder:
     def __init__(self, return_value: bool = True) -> None:
         self.archive_path: pathlib.Path | None = None
@@ -83,6 +102,21 @@ class DecompressRecorder:
         return self.return_value
 
 
+class ExtractErrorRaiser:
+    def __call__(self, *args: object, **kwargs: object) -> tuple[int, 
list[str]]:
+        raise atr.tasks.checks.compare.archives.ExtractionError("Extraction 
error")
+
+
+class ExtractRecorder:
+    def __init__(self, extracted_size: int = 123) -> None:
+        self.calls: list[tuple[str, str, int, int]] = []
+        self.extracted_size = extracted_size
+
+    def __call__(self, archive_path: str, extract_dir: str, max_size: int, 
chunk_size: int) -> tuple[int, list[str]]:
+        self.calls.append((archive_path, extract_dir, max_size, chunk_size))
+        return self.extracted_size, []
+
+
 class GitClientStub:
     def __init__(self) -> None:
         self.closed = False
@@ -130,19 +164,20 @@ class ParseCommitRecorder:
         return self.commit
 
 
-class ExtractErrorRaiser:
-    def __call__(self, *args: object, **kwargs: object) -> tuple[int, 
list[str]]:
-        raise atr.tasks.checks.compare.archives.ExtractionError("Extraction 
error")
+class RunRecorder:
+    def __init__(self, completed: subprocess.CompletedProcess[str]) -> None:
+        self.calls: list[list[str]] = []
+        self.completed = completed
 
-
-class ExtractRecorder:
-    def __init__(self, extracted_size: int = 123) -> None:
-        self.calls: list[tuple[str, str, int, int]] = []
-        self.extracted_size = extracted_size
-
-    def __call__(self, archive_path: str, extract_dir: str, max_size: int, 
chunk_size: int) -> tuple[int, list[str]]:
-        self.calls.append((archive_path, extract_dir, max_size, chunk_size))
-        return self.extracted_size, []
+    def __call__(
+        self,
+        args: list[str],
+        capture_output: bool,
+        text: bool,
+        check: bool,
+    ) -> subprocess.CompletedProcess[str]:
+        self.calls.append(args)
+        return self.completed
 
 
 class PayloadLoader:
@@ -191,6 +226,7 @@ class RecorderStub(atr.tasks.checks.Recorder):
             afresh=False,
         )
         self.failure_calls: list[tuple[str, object]] = []
+        self.success_calls: list[tuple[str, object]] = []
         self._is_source = is_source
 
     async def primary_path_is_source(self) -> bool:
@@ -213,6 +249,23 @@ class RecorderStub(atr.tasks.checks.Recorder):
             cached=False,
         )
 
+    async def success(
+        self, message: str, data: object, primary_rel_path: str | None = None, 
member_rel_path: str | None = None
+    ) -> atr.models.sql.CheckResult:
+        self.success_calls.append((message, data))
+        return atr.models.sql.CheckResult(
+            release_name=self.release_name,
+            revision_number=self.revision_number,
+            checker=self.checker,
+            primary_rel_path=primary_rel_path or self.primary_rel_path,
+            member_rel_path=member_rel_path,
+            created=datetime.datetime.now(datetime.UTC),
+            status=atr.models.sql.CheckResultStatus.SUCCESS,
+            message=message,
+            data=data,
+            cached=False,
+        )
+
 
 class RepoStub:
     def __init__(self, controldir: pathlib.Path, worktree: object) -> None:
@@ -329,6 +382,116 @@ def 
test_clone_repo_raises_when_commit_missing(monkeypatch: pytest.MonkeyPatch,
     assert git_client.closed is True
 
 
+def test_compare_trees_rsync_archive_has_extra_file(monkeypatch: 
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
+    repo_dir = tmp_path / "repo"
+    archive_dir = tmp_path / "archive"
+    _make_tree(repo_dir, ["a.txt"])
+    _make_tree(archive_dir, ["a.txt", "b.txt"])
+    completed = subprocess.CompletedProcess(
+        args=["rsync"],
+        returncode=0,
+        stdout="",
+        stderr="*deleting b.txt\n",
+    )
+    run_recorder = RunRecorder(completed)
+
+    monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+    monkeypatch.setattr(subprocess, "run", run_recorder)
+
+    result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir, 
archive_dir)
+
+    assert result.invalid == {"b.txt"}
+    assert result.repo_only == set()
+
+
+def test_compare_trees_rsync_content_differs(monkeypatch: pytest.MonkeyPatch, 
tmp_path: pathlib.Path) -> None:
+    repo_dir = tmp_path / "repo"
+    archive_dir = tmp_path / "archive"
+    _make_tree(repo_dir, ["a.txt", "b.txt"])
+    _make_tree(archive_dir, ["a.txt", "b.txt"])
+    completed = subprocess.CompletedProcess(
+        args=["rsync"],
+        returncode=0,
+        stdout=">fc......... b.txt\n",
+        stderr="",
+    )
+    run_recorder = RunRecorder(completed)
+
+    monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+    monkeypatch.setattr(subprocess, "run", run_recorder)
+
+    result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir, 
archive_dir)
+
+    assert result.invalid == {"b.txt"}
+    assert result.repo_only == set()
+
+
+def test_compare_trees_rsync_distinct_files(monkeypatch: pytest.MonkeyPatch, 
tmp_path: pathlib.Path) -> None:
+    repo_dir = tmp_path / "repo"
+    archive_dir = tmp_path / "archive"
+    _make_tree(repo_dir, ["a.txt"])
+    _make_tree(archive_dir, ["b.txt"])
+    completed = subprocess.CompletedProcess(
+        args=["rsync"],
+        returncode=0,
+        stdout=">f+++++++++ a.txt\n",
+        stderr="*deleting b.txt\n",
+    )
+    run_recorder = RunRecorder(completed)
+
+    monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+    monkeypatch.setattr(subprocess, "run", run_recorder)
+
+    result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir, 
archive_dir)
+
+    assert result.invalid == {"b.txt"}
+    assert result.repo_only == {"a.txt"}
+
+
+def test_compare_trees_rsync_repo_has_extra_file(monkeypatch: 
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
+    repo_dir = tmp_path / "repo"
+    archive_dir = tmp_path / "archive"
+    _make_tree(repo_dir, ["a.txt", "b.txt"])
+    _make_tree(archive_dir, ["a.txt"])
+    completed = subprocess.CompletedProcess(
+        args=["rsync"],
+        returncode=0,
+        stdout=">f+++++++++ b.txt\n",
+        stderr="",
+    )
+    run_recorder = RunRecorder(completed)
+
+    monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+    monkeypatch.setattr(subprocess, "run", run_recorder)
+
+    result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir, 
archive_dir)
+
+    assert result.invalid == set()
+    assert result.repo_only == {"b.txt"}
+
+
+def test_compare_trees_rsync_trees_match(monkeypatch: pytest.MonkeyPatch, 
tmp_path: pathlib.Path) -> None:
+    repo_dir = tmp_path / "repo"
+    archive_dir = tmp_path / "archive"
+    _make_tree(repo_dir, ["a.txt", "b.txt"])
+    _make_tree(archive_dir, ["a.txt", "b.txt"])
+    completed = subprocess.CompletedProcess(
+        args=["rsync"],
+        returncode=0,
+        stdout="",
+        stderr="",
+    )
+    run_recorder = RunRecorder(completed)
+
+    monkeypatch.setattr(shutil, "which", lambda _name: "/usr/bin/rsync")
+    monkeypatch.setattr(subprocess, "run", run_recorder)
+
+    result = atr.tasks.checks.compare._compare_trees_rsync(repo_dir, 
archive_dir)
+
+    assert result.invalid == set()
+    assert result.repo_only == set()
+
+
 @pytest.mark.asyncio
 async def test_decompress_archive_calls_extract(monkeypatch: 
pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
     archive_path = tmp_path / "artifact.tar.gz"
@@ -368,11 +531,13 @@ async def 
test_source_trees_creates_temp_workspace_and_cleans_up(
     payload = _make_payload()
     checkout = CheckoutRecorder()
     decompress = DecompressRecorder()
+    compare = CompareRecorder(repo_only={"extra1.txt", "extra2.txt"})
     tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
     monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
     monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir", 
ReturnValue(tmp_root))
 
     await atr.tasks.checks.compare.source_trees(args)
@@ -386,6 +551,12 @@ async def 
test_source_trees_creates_temp_workspace_and_cleans_up(
     assert checkout_dir.parent.name.startswith("trees-")
     assert await aiofiles.os.path.exists(tmp_root)
     assert not await aiofiles.os.path.exists(checkout_dir.parent)
+    assert len(recorder.success_calls) == 1
+    message, data = recorder.success_calls[0]
+    assert message == "Source archive is a valid subset of GitHub checkout"
+    assert isinstance(data, dict)
+    assert data["repo_only_count"] == 2
+    assert data["repo_only_paths_sample"] == ["extra1.txt", "extra2.txt"]
 
 
 @pytest.mark.asyncio
@@ -409,6 +580,35 @@ async def 
test_source_trees_payload_none_skips_temp_workspace(monkeypatch: pytes
     await atr.tasks.checks.compare.source_trees(args)
 
 
[email protected]
+async def test_source_trees_records_failure_when_archive_has_invalid_files(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+) -> None:
+    recorder = RecorderStub(True)
+    args = _make_args(recorder)
+    payload = _make_payload()
+    checkout = CheckoutRecorder()
+    decompress = DecompressRecorder()
+    compare = CompareRecorder(invalid={"bad1.txt", "bad2.txt"}, 
repo_only={"ok.txt"})
+    tmp_root = tmp_path / "temporary-root"
+
+    monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
+    monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
+    monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir", 
ReturnValue(tmp_root))
+
+    await atr.tasks.checks.compare.source_trees(args)
+
+    assert len(recorder.failure_calls) == 1
+    assert len(recorder.success_calls) == 0
+    message, data = recorder.failure_calls[0]
+    assert message == "Source archive contains files not in GitHub checkout or 
with different content"
+    assert isinstance(data, dict)
+    assert data["invalid_count"] == 2
+    assert data["invalid_paths"] == ["bad1.txt", "bad2.txt"]
+
+
 @pytest.mark.asyncio
 async def test_source_trees_records_failure_when_decompress_fails(
     monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
@@ -435,6 +635,34 @@ async def 
test_source_trees_records_failure_when_decompress_fails(
     assert data["extract_dir"] == str(decompress.extract_dir)
 
 
[email protected]
+async def test_source_trees_reports_repo_only_sample_limited_to_five(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+) -> None:
+    recorder = RecorderStub(True)
+    args = _make_args(recorder)
+    payload = _make_payload()
+    checkout = CheckoutRecorder()
+    decompress = DecompressRecorder()
+    repo_only_files = {f"file{i}.txt" for i in range(10)}
+    compare = CompareRecorder(repo_only=repo_only_files)
+    tmp_root = tmp_path / "temporary-root"
+
+    monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", 
PayloadLoader(payload))
+    monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", 
checkout)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", 
decompress)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_compare_trees", compare)
+    monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir", 
ReturnValue(tmp_root))
+
+    await atr.tasks.checks.compare.source_trees(args)
+
+    assert len(recorder.success_calls) == 1
+    _message, data = recorder.success_calls[0]
+    assert isinstance(data, dict)
+    assert data["repo_only_count"] == 10
+    assert len(data["repo_only_paths_sample"]) == 5
+
+
 @pytest.mark.asyncio
 async def test_source_trees_skips_when_not_source(monkeypatch: 
pytest.MonkeyPatch) -> None:
     recorder = RecorderStub(False)
@@ -497,3 +725,11 @@ def _make_payload(
         "workflow_sha": "ffffffffffffffffffffffffffffffffffffffff",
     }
     return 
atr.sbom.models.github.TrustedPublisherPayload.model_validate(payload)
+
+
+def _make_tree(root: pathlib.Path, files: Iterable[str]) -> None:
+    root.mkdir(parents=True, exist_ok=True)
+    for rel_path in files:
+        target = root / rel_path
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_text(rel_path, encoding="utf-8")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to