(tooling-trusted-releases) 01/01: Classify files by counting classification markers

sbp Wed, 18 Mar 2026 13:43:03 -0700

This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch sbp
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git


commit f3f490654b1d1d70320164d4ebc4c36d341ec655
Author: Sean B. Palmer <[email protected]>
AuthorDate: Wed Mar 18 20:42:19 2026 +0000

    Classify files by counting classification markers
---
 atr/classify.py | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 177 insertions(+), 15 deletions(-)

diff --git a/atr/classify.py b/atr/classify.py
index 9e57dac3..12b99a1c 100644
--- a/atr/classify.py
+++ b/atr/classify.py
@@ -23,10 +23,86 @@ from typing import Final
 
 import atr.analysis as analysis
 import atr.detection as detection
+import atr.models.safe as safe
 import atr.util as util
 
-_BINARY_STEM: Final[re.Pattern[str]] = re.compile(r"[-_](binary-assembly|binary|bin)(?=[-_]|$)")
-_SOURCE_STEM: Final[re.Pattern[str]] = re.compile(r"[-_](source-release|sources|source|src)(?=[-_]|$)")
+# Binary markers (of 2553 classified as binary):
+#   _bin_binary_token           1378   54.0%
+#   _bin_distribution_token      202    7.9%
+#   _bin_path_binary            1062   41.6%
+#   _bin_platform_token          719   28.2%
+#   _bin_path_platform            24    0.9%
+#
+# Docs markers (of 2553 classified as binary):
+#   _doc_extra_token              37    1.4%
+#   _doc_filename_token           78    3.1%
+#   _doc_path_token               38    1.5%
+#
+# Source markers (of 3797 classified as source):
+#   _src_source_release         1028   27.1%
+#   _src_source_token           1917   50.5%
+#   _src_path_source             502   13.2%
+
+# Binary markers
+
+# 54.0%
+_BIN_BINARY_RE: Final[re.Pattern[str]] = re.compile(r"(^|[-_.])(binary-assembly|binary|bin)(?=[-_.]|$)", re.IGNORECASE)
+
+# 7.9%
+_BIN_DISTRIBUTION_TOKENS: Final[frozenset[str]] = frozenset(
+    {"classic", "cli", "debug", "lib", "plugin", "plugins", "portable", "sdk", "vagrant", "war"}
+)
+
+# 41.6%
+_BIN_PATH_PARTS: Final[frozenset[str]] = frozenset({"bin", "binaries", "binary"})
+
+# 0.9% (path), 28.2% (filename)
+_BIN_PLATFORM_TOKENS: Final[frozenset[str]] = frozenset(
+    {
+        "64bit",
+        "aarch64",
+        "amd64",
+        "apk",
+        "arm64",
+        "arm64bit",
+        "darwin",
+        "linux",
+        "mac",
+        "macos",
+        "osx",
+        "win",
+        "windows",
+        "win32",
+        "win64",
+        "x64",
+        "x86",
+        "x86_64",
+    }
+)
+
+# Docs markers
+
+# 1.4%
+_DOC_EXTRA_TOKENS: Final[frozenset[str]] = frozenset({"apidocs", "markdown", "wikipages"})
+
+# 1.5%
+_DOC_PATH_PARTS: Final[frozenset[str]] = frozenset({"doc", "docs", "javadoc", "manual", "site", "wikipages"})
+
+# 3.1%
+_DOC_TOKEN_RE: Final[re.Pattern[str]] = re.compile(r"(^|[-_.])(doc|docs|javadoc|manual|site)(?=[-_.]|$)", re.IGNORECASE)
+
+# Source markers
+
+# 13.2%
+_SRC_PATH_PARTS: Final[frozenset[str]] = frozenset({"source", "src"})
+
+# 27.1%
+_SRC_RELEASE_RE: Final[re.Pattern[str]] = re.compile(r"(^|[-_.])source-release(?=[-_.]|$)", re.IGNORECASE)
+
+# 50.5%
+_SRC_SOURCE_RE: Final[re.Pattern[str]] = re.compile(r"(^|[-_.])(project|source|sources|src)(?=[-_.]|$)", re.IGNORECASE)
+
+_TOKEN_SPLIT_RE: Final[re.Pattern[str]] = re.compile(r"[-_.]+")
 
 
 class FileType(enum.Enum):
@@ -36,11 +112,31 @@ class FileType(enum.Enum):
     SOURCE = "source"
 
 
+def archive_marker_counts(stem: str, path: pathlib.PurePath) -> tuple[int, int, int]:
+    name = pathlib.PurePosixPath(stem).name
+    tokens = _get_stem_tokens(name)
+    ptokens = _get_path_tokens(path)
+    release = _src_source_release(name)
+    source = _src_source_token(name)
+    explicit = release or source
+    source_count = release + source + _src_path_source(ptokens)
+    binary_count = (
+        _bin_binary_token(name)
+        + _bin_distribution_token(tokens, explicit)
+        + _bin_path_binary(ptokens)
+        + _bin_platform_token(tokens)
+        + _bin_path_platform(ptokens)
+    )
+    docs_count = _doc_extra_token(tokens) + _doc_filename_token(name) + _doc_path_token(ptokens)
+    return source_count, binary_count, docs_count
+
+
 def classify(
     path: pathlib.Path,
     base_path: pathlib.Path | None = None,
     source_matcher: Callable[[str], bool] | None = None,
     binary_matcher: Callable[[str], bool] | None = None,
+    _project_key: safe.ProjectKey | None = None,
 ) -> FileType:
     if (path.name in analysis.DISALLOWED_FILENAMES) or (path.suffix in analysis.DISALLOWED_SUFFIXES):
         return FileType.DISALLOWED
@@ -54,20 +150,27 @@ def classify(
     if any(path_str.endswith(s) for s in analysis.STANDALONE_METADATA_SUFFIXES):
         return FileType.METADATA
 
-    if search and search.group("artifact"):
-        abs_str = str(base_path / path) if (base_path is not None) else None
-        if (source_matcher is not None) and (abs_str is not None) and source_matcher(abs_str):
-            return FileType.SOURCE
-        if (binary_matcher is not None) and (abs_str is not None) and binary_matcher(abs_str):
-            return FileType.BINARY
-        stem = path_str[: search.start()]
-        if _SOURCE_STEM.search(stem):
-            return FileType.SOURCE
-        if _BINARY_STEM.search(stem):
-            return FileType.BINARY
-        if any(path_str.endswith(suffix) for suffix in detection.QUARANTINE_ARCHIVE_SUFFIXES):
-            return FileType.SOURCE
+    if (not search) or (not search.group("artifact")):
+        return FileType.BINARY
+
+    abs_str = str(base_path / path) if (base_path is not None) else None
+    if (source_matcher is not None) and (abs_str is not None) and source_matcher(abs_str):
+        return FileType.SOURCE
+    if (binary_matcher is not None) and (abs_str is not None) and binary_matcher(abs_str):
+        return FileType.BINARY
+    stem = path_str[: search.start()]
+    if not any(path_str.endswith(suffix) for suffix in detection.QUARANTINE_ARCHIVE_SUFFIXES):
+        return FileType.BINARY
+    return classify_from_counts(*archive_marker_counts(stem, path))
+
 
+def classify_from_counts(source_count: int, binary_count: int, docs_count: int) -> FileType:
+    if (source_count == 0) and (binary_count == 0):
+        if docs_count > 0:
+            return FileType.BINARY
+        return FileType.SOURCE
+    if source_count >= binary_count:
+        return FileType.SOURCE
     return FileType.BINARY
 
 
@@ -81,3 +184,62 @@ def matchers_from_policy(
     source_matcher = util.create_path_matcher(source_artifact_paths, None, base_path) if source_artifact_paths else None
     binary_matcher = util.create_path_matcher(binary_artifact_paths, None, base_path) if binary_artifact_paths else None
     return source_matcher, binary_matcher
+
+
+def _bin_binary_token(name: str) -> bool:
+    return bool(_BIN_BINARY_RE.search(name))
+
+
+def _bin_distribution_token(tokens: frozenset[str], explicit_source: bool) -> bool:
+    if explicit_source:
+        return False
+    return bool(tokens & _BIN_DISTRIBUTION_TOKENS)
+
+
+def _bin_path_binary(ptokens: frozenset[str]) -> bool:
+    return bool(ptokens & _BIN_PATH_PARTS)
+
+
+def _bin_path_platform(ptokens: frozenset[str]) -> bool:
+    return bool(ptokens & _BIN_PLATFORM_TOKENS)
+
+
+def _bin_platform_token(tokens: frozenset[str]) -> bool:
+    return bool(tokens & _BIN_PLATFORM_TOKENS)
+
+
+def _doc_extra_token(tokens: frozenset[str]) -> bool:
+    return bool(tokens & _DOC_EXTRA_TOKENS)
+
+
+def _doc_filename_token(name: str) -> bool:
+    return bool(_DOC_TOKEN_RE.search(name))
+
+
+def _doc_path_token(ptokens: frozenset[str]) -> bool:
+    return bool(ptokens & _DOC_PATH_PARTS)
+
+
+def _get_path_tokens(path: pathlib.PurePath) -> frozenset[str]:
+    tokens: set[str] = set()
+    for part in path.parent.parts:
+        lower = part.lower()
+        tokens.add(lower)
+        tokens.update(token for token in _TOKEN_SPLIT_RE.split(lower) if token)
+    return frozenset(tokens)
+
+
+def _get_stem_tokens(name: str) -> frozenset[str]:
+    return frozenset(token.lower() for token in _TOKEN_SPLIT_RE.split(name) if token)
+
+
+def _src_path_source(ptokens: frozenset[str]) -> bool:
+    return bool(ptokens & _SRC_PATH_PARTS)
+
+
+def _src_source_release(name: str) -> bool:
+    return bool(_SRC_RELEASE_RE.search(name))
+
+
+def _src_source_token(name: str) -> bool:
+    return bool(_SRC_SOURCE_RE.search(name)) and (not bool(_SRC_RELEASE_RE.search(name)))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(tooling-trusted-releases) 01/01: Classify files by counting classification markers

Reply via email to