This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git


The following commit(s) were added to refs/heads/sbp by this push:
     new 77a47ec  Extract archives for comparison with GitHub trees
77a47ec is described below

commit 77a47ec24fc37d7c946aab6c30a8547da30e819e
Author: Sean B. Palmer <[email protected]>
AuthorDate: Thu Feb 5 15:30:50 2026 +0000

    Extract archives for comparison with GitHub trees
---
 atr/tasks/checks/compare.py       |  54 +++++++++++++++++
 tests/unit/test_checks_compare.py | 123 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/atr/tasks/checks/compare.py b/atr/tasks/checks/compare.py
index 8f1464c..d1afe88 100644
--- a/atr/tasks/checks/compare.py
+++ b/atr/tasks/checks/compare.py
@@ -34,13 +34,16 @@ import dulwich.porcelain
 import dulwich.refs
 import pydantic
 
+import atr.archives as archives
 import atr.attestable as attestable
+import atr.config as config
 import atr.log as log
 import atr.models.results as results
 import atr.sbom.models.github as github_models
 import atr.tasks.checks as checks
 import atr.util as util
 
+_CONFIG: Final = config.get()
 _DEFAULT_EMAIL: Final[str] = "atr@localhost"
 _DEFAULT_USER: Final[str] = "atr"
 
@@ -72,13 +75,28 @@ async def source_trees(args: checks.FunctionArguments) -> results.Results | None
 
     payload = await _load_tp_payload(args.project_name, args.version_name, args.revision_number)
     checkout_dir: str | None = None
+    archive_dir: str | None = None
     if payload is not None:
+        if not (primary_abs_path := await recorder.abs_path()):
+            return None
+        max_extract_size = args.extra_args.get("max_extract_size", _CONFIG.MAX_EXTRACT_SIZE)
+        chunk_size = args.extra_args.get("chunk_size", _CONFIG.EXTRACT_CHUNK_SIZE)
         tmp_dir = util.get_tmp_dir()
         await aiofiles.os.makedirs(tmp_dir, exist_ok=True)
         async with util.async_temporary_directory(prefix="trees-", dir=tmp_dir) as temp_dir:
             github_dir = temp_dir / "github"
+            archive_dir_path = temp_dir / "archive"
             await aiofiles.os.makedirs(github_dir, exist_ok=True)
+            await aiofiles.os.makedirs(archive_dir_path, exist_ok=True)
             checkout_dir = await _checkout_github_source(payload, github_dir)
+            if await _decompress_archive(primary_abs_path, archive_dir_path, max_extract_size, chunk_size):
+                archive_dir = str(archive_dir_path)
+            else:
+                await recorder.failure(
+                    "Failed to extract source archive for comparison",
+                    {"archive_path": str(primary_abs_path), "extract_dir": str(archive_dir_path)},
+                )
+                return None
     payload_summary = _payload_summary(payload)
     log.info(
         "Ran compare.source_trees successfully",
@@ -88,6 +106,7 @@ async def source_trees(args: checks.FunctionArguments) -> results.Results | None
         path=args.primary_rel_path,
         github_payload=payload_summary,
         github_checkout=checkout_dir,
+        archive_checkout=archive_dir,
     )
     return None
 
@@ -147,6 +166,41 @@ def _clone_repo(repo_url: str, sha: str, checkout_dir: pathlib.Path) -> None:
         shutil.rmtree(git_dir)
 
 
+async def _decompress_archive(
+    archive_path: pathlib.Path,
+    extract_dir: pathlib.Path,
+    max_extract_size: int,
+    chunk_size: int,
+) -> bool:
+    started_ns = time.perf_counter_ns()
+    try:
+        extracted_size, _extracted_paths = await asyncio.to_thread(
+            archives.extract,
+            str(archive_path),
+            str(extract_dir),
+            max_size=max_extract_size,
+            chunk_size=chunk_size,
+        )
+    except (archives.ExtractionError, OSError):
+        elapsed_ms = (time.perf_counter_ns() - started_ns) / 1_000_000.0
+        log.exception(
+            "Failed to extract source archive for compare.source_trees",
+            archive_path=str(archive_path),
+            extract_dir=str(extract_dir),
+            extract_ms=elapsed_ms,
+        )
+        return False
+    elapsed_ms = (time.perf_counter_ns() - started_ns) / 1_000_000.0
+    log.info(
+        "Extracted source archive for compare.source_trees",
+        archive_path=str(archive_path),
+        extract_dir=str(extract_dir),
+        extracted_bytes=extracted_size,
+        extract_ms=elapsed_ms,
+    )
+    return True
+
+
 def _ensure_clone_identity_env() -> None:
     os.environ["USER"] = _DEFAULT_USER
     os.environ["EMAIL"] = _DEFAULT_EMAIL
diff --git a/tests/unit/test_checks_compare.py b/tests/unit/test_checks_compare.py
index b57ee97..1ee3c76 100644
--- a/tests/unit/test_checks_compare.py
+++ b/tests/unit/test_checks_compare.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import datetime
 import pathlib
 from collections.abc import Callable, Mapping
 
@@ -23,6 +24,7 @@ import dulwich.objects
 import dulwich.refs
 import pytest
 
+import atr.models.sql
 import atr.sbom.models.github
 import atr.tasks.checks
 import atr.tasks.checks.compare
@@ -58,6 +60,29 @@ class CommitStub:
         self.tree = tree
 
 
+class DecompressRecorder:
+    def __init__(self, return_value: bool = True) -> None:
+        self.archive_path: pathlib.Path | None = None
+        self.extract_dir: pathlib.Path | None = None
+        self.max_extract_size: int | None = None
+        self.chunk_size: int | None = None
+        self.return_value = return_value
+
+    async def __call__(
+        self,
+        archive_path: pathlib.Path,
+        extract_dir: pathlib.Path,
+        max_extract_size: int,
+        chunk_size: int,
+    ) -> bool:
+        self.archive_path = archive_path
+        self.extract_dir = extract_dir
+        self.max_extract_size = max_extract_size
+        self.chunk_size = chunk_size
+        assert await aiofiles.os.path.exists(extract_dir)
+        return self.return_value
+
+
 class GitClientStub:
     def __init__(self) -> None:
         self.closed = False
@@ -105,6 +130,21 @@ class ParseCommitRecorder:
         return self.commit
 
 
+class ExtractErrorRaiser:
+    def __call__(self, *args: object, **kwargs: object) -> tuple[int, list[str]]:
+        raise atr.tasks.checks.compare.archives.ExtractionError("Extraction error")
+
+
+class ExtractRecorder:
+    def __init__(self, extracted_size: int = 123) -> None:
+        self.calls: list[tuple[str, str, int, int]] = []
+        self.extracted_size = extracted_size
+
+    def __call__(self, archive_path: str, extract_dir: str, max_size: int, chunk_size: int) -> tuple[int, list[str]]:
+        self.calls.append((archive_path, extract_dir, max_size, chunk_size))
+        return self.extracted_size, []
+
+
 class PayloadLoader:
     def __init__(self, payload: atr.sbom.models.github.TrustedPublisherPayload | None) -> None:
         self.payload = payload
@@ -150,11 +190,29 @@ class RecorderStub(atr.tasks.checks.Recorder):
             member_rel_path=None,
             afresh=False,
         )
+        self.failure_calls: list[tuple[str, object]] = []
         self._is_source = is_source
 
     async def primary_path_is_source(self) -> bool:
         return self._is_source
 
+    async def failure(
+        self, message: str, data: object, primary_rel_path: str | None = None, member_rel_path: str | None = None
+    ) -> atr.models.sql.CheckResult:
+        self.failure_calls.append((message, data))
+        return atr.models.sql.CheckResult(
+            release_name=self.release_name,
+            revision_number=self.revision_number,
+            checker=self.checker,
+            primary_rel_path=primary_rel_path or self.primary_rel_path,
+            member_rel_path=member_rel_path,
+            created=datetime.datetime.now(datetime.UTC),
+            status=atr.models.sql.CheckResultStatus.FAILURE,
+            message=message,
+            data=data,
+            cached=False,
+        )
+
 
 class RepoStub:
     def __init__(self, controldir: pathlib.Path, worktree: object) -> None:
@@ -271,6 +329,36 @@ def test_clone_repo_raises_when_commit_missing(monkeypatch: pytest.MonkeyPatch,
     assert git_client.closed is True
 
 
+@pytest.mark.asyncio
+async def test_decompress_archive_calls_extract(monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path) -> None:
+    archive_path = tmp_path / "artifact.tar.gz"
+    extract_dir = tmp_path / "extracted"
+    extract_dir.mkdir()
+    extract_recorder = ExtractRecorder()
+
+    monkeypatch.setattr(atr.tasks.checks.compare.archives, "extract", extract_recorder)
+
+    result = await atr.tasks.checks.compare._decompress_archive(archive_path, extract_dir, 10, 20)
+
+    assert result is True
+    assert extract_recorder.calls == [(str(archive_path), str(extract_dir), 10, 20)]
+
+
+@pytest.mark.asyncio
+async def test_decompress_archive_handles_extraction_error(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+) -> None:
+    archive_path = tmp_path / "artifact.tar.gz"
+    extract_dir = tmp_path / "extracted"
+    extract_dir.mkdir()
+
+    monkeypatch.setattr(atr.tasks.checks.compare.archives, "extract", ExtractErrorRaiser())
+
+    result = await atr.tasks.checks.compare._decompress_archive(archive_path, extract_dir, 10, 20)
+
+    assert result is False
+
+
 @pytest.mark.asyncio
 async def test_source_trees_creates_temp_workspace_and_cleans_up(
     monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
@@ -279,10 +367,12 @@ async def test_source_trees_creates_temp_workspace_and_cleans_up(
     args = _make_args(recorder)
     payload = _make_payload()
     checkout = CheckoutRecorder()
+    decompress = DecompressRecorder()
     tmp_root = tmp_path / "temporary-root"
 
     monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", PayloadLoader(payload))
     monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", checkout)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", decompress)
     monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir", ReturnValue(tmp_root))
 
     await atr.tasks.checks.compare.source_trees(args)
@@ -290,6 +380,8 @@ async def test_source_trees_creates_temp_workspace_and_cleans_up(
     assert checkout.checkout_dir is not None
     checkout_dir = checkout.checkout_dir
     assert checkout_dir.name == "github"
+    assert decompress.extract_dir is not None
+    assert decompress.extract_dir.name == "archive"
     assert checkout_dir.parent.parent == tmp_root
     assert checkout_dir.parent.name.startswith("trees-")
     assert await aiofiles.os.path.exists(tmp_root)
@@ -307,11 +399,42 @@ async def test_source_trees_payload_none_skips_temp_workspace(monkeypatch: pytes
         "_checkout_github_source",
         RaiseAsync("_checkout_github_source should not be called"),
     )
+    monkeypatch.setattr(
+        atr.tasks.checks.compare,
+        "_decompress_archive",
+        RaiseAsync("_decompress_archive should not be called"),
+    )
     monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir", RaiseSync("get_tmp_dir should not be called"))
 
     await atr.tasks.checks.compare.source_trees(args)
 
 
+@pytest.mark.asyncio
+async def test_source_trees_records_failure_when_decompress_fails(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path
+) -> None:
+    recorder = RecorderStub(True)
+    args = _make_args(recorder)
+    payload = _make_payload()
+    checkout = CheckoutRecorder()
+    decompress = DecompressRecorder(return_value=False)
+    tmp_root = tmp_path / "temporary-root"
+
+    monkeypatch.setattr(atr.tasks.checks.compare, "_load_tp_payload", PayloadLoader(payload))
+    monkeypatch.setattr(atr.tasks.checks.compare, "_checkout_github_source", checkout)
+    monkeypatch.setattr(atr.tasks.checks.compare, "_decompress_archive", decompress)
+    monkeypatch.setattr(atr.tasks.checks.compare.util, "get_tmp_dir", ReturnValue(tmp_root))
+
+    await atr.tasks.checks.compare.source_trees(args)
+
+    assert len(recorder.failure_calls) == 1
+    message, data = recorder.failure_calls[0]
+    assert message == "Failed to extract source archive for comparison"
+    assert isinstance(data, dict)
+    assert data["archive_path"] == str(await recorder.abs_path())
+    assert data["extract_dir"] == str(decompress.extract_dir)
+
+
 @pytest.mark.asyncio
 async def test_source_trees_skips_when_not_source(monkeypatch: pytest.MonkeyPatch) -> None:
     recorder = RecorderStub(False)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to