This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git

commit fa2ec3002f6febd0944c7b5c02147db67750547d
Author: Sean B. Palmer <[email protected]>
AuthorDate: Mon Mar 16 16:48:39 2026 +0000

    Add file classifications to attestable data
---
 atr/attestable.py               | 86 ++++++++++++++++++++++++++++++++---------
 atr/detection.py                |  2 +-
 atr/models/attestable.py        | 15 +++++++
 atr/storage/writers/revision.py | 11 +++---
 atr/tasks/checks/__init__.py    |  2 +-
 atr/tasks/quarantine.py         |  2 +-
 tests/unit/test_attestable.py   | 84 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 176 insertions(+), 26 deletions(-)

diff --git a/atr/attestable.py b/atr/attestable.py
index a020776d..1cf40c08 100644
--- a/atr/attestable.py
+++ b/atr/attestable.py
@@ -18,12 +18,14 @@
 from __future__ import annotations
 
 import json
-from typing import TYPE_CHECKING, Any
+import pathlib
+from typing import Any
 
 import aiofiles
 import aiofiles.os
 import pydantic
 
+import atr.classify as classify
 import atr.hashes as hashes
 import atr.log as log
 import atr.models.attestable as models
@@ -31,9 +33,6 @@ import atr.models.safe as safe
 import atr.paths as paths
 import atr.util as util
 
-if TYPE_CHECKING:
-    import pathlib
-
 
 def attestable_checks_path(
     project_name: safe.ProjectName, version_name: safe.VersionName, revision_number: safe.RevisionNumber
@@ -73,14 +72,14 @@ async def load(
     project_name: safe.ProjectName,
     version_name: safe.VersionName,
     revision_number: safe.RevisionNumber,
-) -> models.AttestableV1 | None:
+) -> models.Attestable | None:
     file_path = attestable_path(project_name, version_name, revision_number)
     if not await aiofiles.os.path.isfile(file_path):
         return None
     try:
         async with aiofiles.open(file_path, encoding="utf-8") as f:
             data = json.loads(await f.read())
-        return models.AttestableV1.model_validate(data)
+        return _parse_attestable(data)
     except (json.JSONDecodeError, pydantic.ValidationError) as e:
         log.warning(f"Could not parse {file_path}, starting fresh: {e}")
         return None
@@ -124,7 +123,7 @@ async def load_paths(
             log.warning(f"Could not parse {file_path}: {e}")
     # combined = await load(project_name, version_name, revision_number)
     # if combined is not None:
-    #     return combined.paths
+    #     return path_hashes(combined)
     return None
 
 
@@ -148,8 +147,8 @@ def migrate_to_paths_files() -> int:
                 try:
                     with open(json_file, encoding="utf-8") as f:
                         data = json.loads(f.read())
-                    validated = models.AttestableV1.model_validate(data)
-                    paths_result = models.AttestablePathsV1(paths=validated.paths)
+                    validated = _parse_attestable(data)
+                    paths_result = models.AttestablePathsV1(paths=path_hashes(validated))
                     tmp = target.with_suffix(".tmp")
                     with open(tmp, "w", encoding="utf-8") as f:
                         f.write(paths_result.model_dump_json(indent=2))
@@ -160,6 +159,26 @@ def migrate_to_paths_files() -> int:
     return count
 
 
+def path_classification(attestable: models.Attestable, path_key: str) -> str | None:
+    if isinstance(attestable, models.AttestableV2):
+        entry = attestable.paths.get(path_key)
+        return entry.classification if (entry is not None) else None
+    return None
+
+
+def path_hash(attestable: models.Attestable, path_key: str) -> str | None:
+    if isinstance(attestable, models.AttestableV2):
+        entry = attestable.paths.get(path_key)
+        return entry.content_hash if (entry is not None) else None
+    return attestable.paths.get(path_key)
+
+
+def path_hashes(attestable: models.Attestable) -> dict[str, str]:
+    if isinstance(attestable, models.AttestableV2):
+        return {path_key: entry.content_hash for path_key, entry in attestable.paths.items()}
+    return dict(attestable.paths)
+
+
 async def paths_to_hashes_and_sizes(directory: pathlib.Path) -> tuple[dict[str, str], dict[str, int]]:
     path_to_hash: dict[str, str] = {}
     path_to_size: dict[str, int] = {}
@@ -204,14 +223,17 @@ async def write_files_data(
     revision_number: safe.RevisionNumber,
     release_policy: dict[str, Any] | None,
     uploader_uid: str,
-    previous: models.AttestableV1 | None,
+    previous: models.Attestable | None,
     path_to_hash: dict[str, str],
     path_to_size: dict[str, int],
+    base_path: pathlib.Path,
 ) -> None:
-    result = _generate_files_data(path_to_hash, path_to_size, revision_number, release_policy, uploader_uid, previous)
+    result = _generate_files_data(
+        path_to_hash, path_to_size, revision_number, release_policy, uploader_uid, previous, base_path
+    )
     file_path = attestable_path(project_name, version_name, revision_number)
     await util.atomic_write_file(file_path, result.model_dump_json(indent=2))
-    paths_result = models.AttestablePathsV1(paths=result.paths)
+    paths_result = models.AttestablePathsV1(paths=path_hashes(result))
     paths_file_path = attestable_paths_path(project_name, version_name, revision_number)
     await util.atomic_write_file(paths_file_path, paths_result.model_dump_json(indent=2))
     checks_file_path = attestable_checks_path(project_name, version_name, revision_number)
@@ -220,16 +242,33 @@ async def write_files_data(
             await f.write(models.AttestableChecksV2().model_dump_json(indent=2))
 
 
+def _compute_classifications(
+    path_to_hash: dict[str, str],
+    release_policy: dict[str, Any] | None,
+    base_path: pathlib.Path,
+) -> dict[str, str]:
+    policy = release_policy or {}
+    source_matcher, binary_matcher = classify.matchers_from_policy(
+        policy.get("source_artifact_paths", []),
+        policy.get("binary_artifact_paths", []),
+        base_path,
+    )
+    return {
+        path_key: classify.classify(pathlib.Path(path_key), base_path, source_matcher, binary_matcher).value
+        for path_key in path_to_hash
+    }
+
+
 def _compute_hashes_with_attribution(  # noqa: C901
     current_hash_to_paths: dict[str, set[str]],
     path_to_size: dict[str, int],
-    previous: models.AttestableV1 | None,
+    previous: models.Attestable | None,
     uploader_uid: str,
     revision_number: safe.RevisionNumber,
 ) -> dict[str, models.HashEntry]:
     previous_hash_to_paths: dict[str, set[str]] = {}
     if previous is not None:
-        for path_key, hash_ref in previous.paths.items():
+        for path_key, hash_ref in path_hashes(previous).items():
             previous_hash_to_paths.setdefault(hash_ref, set()).add(path_key)
 
     new_hashes: dict[str, models.HashEntry] = {}
@@ -271,8 +310,9 @@ def _generate_files_data(
     revision_number: safe.RevisionNumber,
     release_policy: dict[str, Any] | None,
     uploader_uid: str,
-    previous: models.AttestableV1 | None,
-) -> models.AttestableV1:
+    previous: models.Attestable | None,
+    base_path: pathlib.Path,
+) -> models.AttestableV2:
     current_hash_to_paths: dict[str, set[str]] = {}
     for path_key, hash_ref in path_to_hash.items():
         current_hash_to_paths.setdefault(hash_ref, set()).add(path_key)
@@ -281,12 +321,22 @@ def _generate_files_data(
         current_hash_to_paths, path_to_size, previous, uploader_uid, revision_number
     )
 
-    return models.AttestableV1(
-        paths=dict(path_to_hash),
+    classifications = _compute_classifications(path_to_hash, release_policy, base_path)
+    return models.AttestableV2(
         hashes=dict(new_hashes),
+        paths={
+            path_key: models.PathEntryV2(content_hash=hash_ref, classification=classifications[path_key])
+            for path_key, hash_ref in path_to_hash.items()
+        },
         policy=release_policy or {},
     )
 
 
+def _parse_attestable(data: dict[str, object]) -> models.Attestable:
+    if data.get("version") == 2:
+        return models.AttestableV2.model_validate(data)
+    return models.AttestableV1.model_validate(data)
+
+
 def _path_basename(path_key: str) -> str:
     return path_key.rsplit("/", maxsplit=1)[-1]
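
A note on the accessors added above: path_hash, path_hashes, and path_classification dispatch on the concrete model type, so callers stay agnostic about whether a revision was recorded as V1 or V2. A minimal sketch of the call pattern, using hypothetical hash values and the constructor signatures from atr/models/attestable.py below:

    import atr.attestable as attestable
    import atr.models.attestable as models

    v1 = models.AttestableV1(paths={"a.tar.gz": "h1"}, hashes={}, policy={})
    v2 = models.AttestableV2(
        paths={"a.tar.gz": models.PathEntryV2(content_hash="h1", classification="source")},
        hashes={},
        policy={},
    )

    # Both versions resolve content hashes through the same helpers.
    assert attestable.path_hash(v1, "a.tar.gz") == "h1"
    assert attestable.path_hash(v2, "a.tar.gz") == "h1"
    assert attestable.path_hashes(v1) == attestable.path_hashes(v2) == {"a.tar.gz": "h1"}

    # Classifications exist only in V2; V1 reports None.
    assert attestable.path_classification(v2, "a.tar.gz") == "source"
    assert attestable.path_classification(v1, "a.tar.gz") is None
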
diff --git a/atr/detection.py b/atr/detection.py
index 7b13efe5..22e73c5c 100644
--- a/atr/detection.py
+++ b/atr/detection.py
@@ -100,7 +100,7 @@ def deduplicate_quarantine_archives(archive_paths: list[str], path_to_hash: dict
 
 
 def detect_archives_requiring_quarantine(
-    path_to_hash: dict[str, str], previous_attestable: models.AttestableV1 | None
+    path_to_hash: dict[str, str], previous_attestable: models.Attestable | None
 ) -> list[str]:
     quarantine_paths: list[str] = []
     for path_key, hash_ref in path_to_hash.items():
diff --git a/atr/models/attestable.py b/atr/models/attestable.py
index 1e1bd7bb..bf1b5866 100644
--- a/atr/models/attestable.py
+++ b/atr/models/attestable.py
@@ -48,3 +48,18 @@ class AttestableV1(schema.Strict):
     paths: dict[str, str] = schema.factory(dict)
     hashes: dict[str, HashEntry] = schema.factory(dict)
     policy: dict[str, Any] = schema.factory(dict)
+
+
+class PathEntryV2(schema.Strict):
+    content_hash: str
+    classification: str
+
+
+class AttestableV2(schema.Strict):
+    version: Literal[2] = 2
+    hashes: dict[str, HashEntry] = schema.factory(dict)
+    paths: dict[str, PathEntryV2] = schema.factory(dict)
+    policy: dict[str, Any] = schema.factory(dict)
+
+
+type Attestable = AttestableV1 | AttestableV2
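
Because readers dispatch on the "version" field (see _parse_attestable in atr/attestable.py above), the V2 wire format is self-describing, and V1 files continue to validate against the old model. A sketch of the JSON a minimal V2 record serialises to, assuming the schema defaults shown here; the values are illustrative:

    record = models.AttestableV2(
        paths={"a.tar.gz": models.PathEntryV2(content_hash="h1", classification="source")},
    )
    print(record.model_dump_json(indent=2))
    # {
    #   "version": 2,
    #   "hashes": {},
    #   "paths": {
    #     "a.tar.gz": {
    #       "content_hash": "h1",
    #       "classification": "source"
    #     }
    #   },
    #   "policy": {}
    # }
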
diff --git a/atr/storage/writers/revision.py b/atr/storage/writers/revision.py
index d5c02dfa..0acf071a 100644
--- a/atr/storage/writers/revision.py
+++ b/atr/storage/writers/revision.py
@@ -82,7 +82,7 @@ async def finalise_revision(
     old_revision: sql.Revision | None,
     path_to_hash: dict[str, str],
     path_to_size: dict[str, int],
-    previous_attestable: atr.models.attestable.AttestableV1 | None,
+    previous_attestable: atr.models.attestable.Attestable | None,
     project_name: safe.ProjectName,
     release: sql.Release,
     release_name: safe.ReleaseName,
@@ -137,7 +137,7 @@ async def _commit_new_revision(
     merge_base_revision_name: str | None,
     path_to_hash: dict[str, str],
     path_to_size: dict[str, int],
-    previous_attestable: atr.models.attestable.AttestableV1 | None,
+    previous_attestable: atr.models.attestable.Attestable | None,
     project_name: safe.ProjectName,
     release: sql.Release,
     release_name: str,
@@ -202,6 +202,7 @@ async def _commit_new_revision(
         previous_attestable,
         path_to_hash,
         path_to_size,
+        new_revision_dir,
     )
 
     # Commit to end the transaction started by data.begin_immediate
@@ -241,13 +242,13 @@ async def _lock_and_merge(
     old_revision: sql.Revision | None,
     path_to_hash: dict[str, str],
     path_to_size: dict[str, int],
-    previous_attestable: atr.models.attestable.AttestableV1 | None,
+    previous_attestable: atr.models.attestable.Attestable | None,
     project_name: safe.ProjectName,
     release: sql.Release,
     _release_name: safe.ReleaseName,
     temp_dir_path: pathlib.Path,
     version_name: safe.VersionName,
-) -> tuple[atr.models.attestable.AttestableV1 | None, str | None, str | None, sql.Release]:
+) -> tuple[atr.models.attestable.Attestable | None, str | None, str | None, sql.Release]:
     # Acquire the write lock
     # We need this write lock for moving the directory afterwards atomically
     # But it also helps to make models.populate_revision_sequence_and_name safe against races
@@ -434,7 +435,7 @@ class CommitteeParticipant(FoundationCommitter):
             if merge_enabled and (old_revision is not None):
                 base_dir = old_release_dir
                 base_inodes = await asyncio.to_thread(util.paths_to_inodes, base_dir)
-                base_hashes = dict(previous_attestable.paths) if (previous_attestable is not None) else {}
+                base_hashes = attestable.path_hashes(previous_attestable) if (previous_attestable is not None) else {}
             n_inodes = await asyncio.to_thread(util.paths_to_inodes, temp_dir_path)
         except Exception:
             await aioshutil.rmtree(temp_dir)
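
The base_path threaded through here (new_revision_dir) is what _compute_classifications passes to classify.matchers_from_policy, so classification follows the release policy's artifact globs; the same version-agnostic base_hashes pattern recurs in atr/tasks/quarantine.py below. A hedged sketch of the classification flow, with an illustrative policy (the matcher behaviour of atr/classify.py is assumed, not shown in this commit):

    import pathlib

    import atr.classify as classify

    policy = {
        "source_artifact_paths": ["*-src.tar.gz"],
        "binary_artifact_paths": ["*-bin.tar.gz"],
    }
    base_path = pathlib.Path("/srv/releases/widget/1.0/00001")  # hypothetical
    source_matcher, binary_matcher = classify.matchers_from_policy(
        policy["source_artifact_paths"],
        policy["binary_artifact_paths"],
        base_path,
    )
    kind = classify.classify(
        pathlib.Path("apache-widget-1.0-src.tar.gz"), base_path, source_matcher, binary_matcher
    )
    print(kind.value)  # presumably "source" for a path matching the source glob
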
diff --git a/atr/tasks/checks/__init__.py b/atr/tasks/checks/__init__.py
index 16f1bb8c..d10dd4a8 100644
--- a/atr/tasks/checks/__init__.py
+++ b/atr/tasks/checks/__init__.py
@@ -357,7 +357,7 @@ async def resolve_cache_key(
         policy_dict = _coerce_policy_nulls(attestable_data.policy)
         policy = sql.ReleasePolicy.model_validate(policy_dict)
         if not ignore_path:
-            file_hash = attestable_data.paths.get(file) if file else None
+            file_hash = attestable.path_hash(attestable_data, file) if file else None
     else:
         # TODO: Is this fallback valid / necessary? Or should we bail out if there's no attestable data?
         policy = release.release_policy or release.project.release_policy
diff --git a/atr/tasks/quarantine.py b/atr/tasks/quarantine.py
index 643e1af8..d042df0c 100644
--- a/atr/tasks/quarantine.py
+++ b/atr/tasks/quarantine.py
@@ -326,7 +326,7 @@ async def _promote(
     if old_revision is not None:
         old_release_dir = paths.release_directory_base(release) / old_revision.number
         base_inodes = await asyncio.to_thread(util.paths_to_inodes, old_release_dir)
-        base_hashes = dict(previous_attestable.paths) if (previous_attestable is not None) else {}
+        base_hashes = attestable.path_hashes(previous_attestable) if (previous_attestable is not None) else {}
     n_inodes = await asyncio.to_thread(util.paths_to_inodes, quarantine_dir_path)
 
     async with revision.SafeSession(quarantine_dir) as data:
diff --git a/tests/unit/test_attestable.py b/tests/unit/test_attestable.py
index 24bd4a6a..0d8619c8 100644
--- a/tests/unit/test_attestable.py
+++ b/tests/unit/test_attestable.py
@@ -15,10 +15,51 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import pathlib
+
 import atr.attestable as attestable
 import atr.models.attestable as models
 
 
+def test_attestable_v2_round_trip():
+    original = models.AttestableV2(
+        hashes={"h1": models.HashEntry(size=100, uploaders=[("alice", "00001")])},
+        paths={
+            "a.tar.gz": models.PathEntryV2(content_hash="h1", classification="source"),
+            "a.tar.gz.sha512": models.PathEntryV2(content_hash="h2", classification="metadata"),
+        },
+        policy={"min_hours": 72},
+    )
+
+    loaded = models.AttestableV2.model_validate_json(original.model_dump_json())
+
+    assert loaded == original
+    assert loaded.version == 2
+    assert loaded.paths["a.tar.gz"].content_hash == "h1"
+    assert loaded.paths["a.tar.gz"].classification == "source"
+    assert loaded.paths["a.tar.gz.sha512"].content_hash == "h2"
+    assert loaded.paths["a.tar.gz.sha512"].classification == "metadata"
+
+
+def test_generate_files_data_returns_attestable_v2():
+    data = attestable._generate_files_data(
+        path_to_hash={"apache-widget-1.0-src.tar.gz": "h1", "apache-widget-1.0-src.tar.gz.sha512": "h2"},
+        path_to_size={"apache-widget-1.0-src.tar.gz": 100, "apache-widget-1.0-src.tar.gz.sha512": 64},
+        revision_number="00001",
+        release_policy=None,
+        uploader_uid="alice",
+        previous=None,
+        base_path=pathlib.Path("/test"),
+    )
+
+    assert isinstance(data, models.AttestableV2)
+    assert data.version == 2
+    assert data.paths["apache-widget-1.0-src.tar.gz"].content_hash == "h1"
+    assert data.paths["apache-widget-1.0-src.tar.gz"].classification == "source"
+    assert data.paths["apache-widget-1.0-src.tar.gz.sha512"].content_hash == "h2"
+    assert data.paths["apache-widget-1.0-src.tar.gz.sha512"].classification == "metadata"
+
+
 def test_hash_entry_basenames_round_trip():
     entry = models.HashEntry(
         size=123,
@@ -64,8 +105,51 @@ def test_hash_metadata_basenames_are_cumulative_and_unique():
         release_policy=None,
         uploader_uid="bob",
         previous=previous,
+        base_path=pathlib.Path("/test"),
     )
 
     assert data.hashes["h1"].basenames == ["apache-widget-1.0-src.tar.gz", "apache-widget-1.0.zip"]
     assert data.hashes["h1"].uploaders == [("alice", "00001"), ("bob", "00002")]
     assert data.hashes["h2"].basenames == ["readme.txt"]
+
+
+def test_parse_attestable_v1():
+    data = {"version": 1, "paths": {"a.tar.gz": "h1"}, "hashes": {}, "policy": {}}
+
+    result = attestable._parse_attestable(data)
+
+    assert isinstance(result, models.AttestableV1)
+    assert result.version == 1
+    assert result.paths == {"a.tar.gz": "h1"}
+
+
+def test_parse_attestable_v2():
+    data = {
+        "version": 2,
+        "paths": {
+            "a.tar.gz": {"content_hash": "h1", "classification": "source"},
+        },
+        "hashes": {},
+        "policy": {},
+    }
+
+    result = attestable._parse_attestable(data)
+
+    assert isinstance(result, models.AttestableV2)
+    assert result.version == 2
+    assert result.paths["a.tar.gz"].content_hash == "h1"
+    assert result.paths["a.tar.gz"].classification == "source"
+
+
+def test_path_hashes_support_v1_and_v2():
+    v1 = models.AttestableV1(paths={"a.tar.gz": "h1"}, hashes={}, policy={})
+    v2 = models.AttestableV2(
+        paths={"a.tar.gz": models.PathEntryV2(content_hash="h1", classification="source")},
+        hashes={},
+        policy={},
+    )
+
+    assert attestable.path_hashes(v1) == {"a.tar.gz": "h1"}
+    assert attestable.path_hashes(v2) == {"a.tar.gz": "h1"}
+    assert attestable.path_hash(v2, "a.tar.gz") == "h1"
+    assert attestable.path_classification(v2, "a.tar.gz") == "source"

