This is an automated email from the ASF dual-hosted git repository.
sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-atr-experiments.git
The following commit(s) were added to refs/heads/main by this push:
new ecb2dc1 Split archive task functions into their own module
ecb2dc1 is described below
commit ecb2dc17d91a9b99049fb8203230e7113ea43919
Author: Sean B. Palmer <[email protected]>
AuthorDate: Mon Mar 10 16:05:36 2025 +0200
Split archive task functions into their own module
---
atr/tasks/archive.py | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++
atr/tasks/task.py | 28 +++++++++++++
atr/verify.py | 53 +----------------------
atr/worker.py | 50 ++++++++--------------
docs/conventions.html | 4 ++
docs/conventions.md | 8 ++++
docs/plan.html | 1 +
docs/plan.md | 1 +
8 files changed, 173 insertions(+), 85 deletions(-)
diff --git a/atr/tasks/archive.py b/atr/tasks/archive.py
new file mode 100644
index 0000000..a05fe1d
--- /dev/null
+++ b/atr/tasks/archive.py
@@ -0,0 +1,113 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+import os.path
+import tarfile
+from typing import Any, Final
+
+import atr.tasks.task as task
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def check_integrity(args: list[str]) -> tuple[task.TaskStatus, str | None, tuple[Any, ...]]:
+ """Check the integrity of a .tar.gz file."""
+ # TODO: We should standardise the "ERROR" mechanism here in the data
+ # Then we can have a single task wrapper for all tasks
+    # TODO: We should use task.TaskError as standard, and maybe typeguard each function
+ # First argument should be the path, second is optional chunk_size
+ path = args[0]
+ chunk_size = int(args[1]) if len(args) > 1 else 4096
+ task_results = _wrap_results(_check_integrity_core(path, chunk_size))
+ _LOGGER.info(f"Verified {args} and computed size {task_results[0]}")
+ return task.COMPLETED, None, task_results
+
+
+def check_structure(args: list[str]) -> tuple[task.TaskStatus, str | None, tuple[Any, ...]]:
+ """Check the structure of a .tar.gz file."""
+ task_results = _wrap_results(_check_structure_core(*args))
+ _LOGGER.info(f"Verified archive structure for {args}")
+ status = task.FAILED if not task_results[0]["valid"] else task.COMPLETED
+    error = task_results[0]["message"] if not task_results[0]["valid"] else None
+ return status, error, task_results
+
+
+def root_directory(tgz_path: str) -> str:
+    """Find the root directory in a tar archive and validate that it has only one root dir."""
+ root = None
+
+ with tarfile.open(tgz_path, mode="r|gz") as tf:
+ for member in tf:
+ parts = member.name.split("/", 1)
+ if len(parts) >= 1:
+ if not root:
+ root = parts[0]
+ elif parts[0] != root:
+                    raise ValueError(f"Multiple root directories found: {root}, {parts[0]}")
+
+ if not root:
+ raise ValueError("No root directory found in archive")
+
+ return root
+
+
+def _check_integrity_core(tgz_path: str, chunk_size: int = 4096) -> int:
+ """Verify a .tar.gz file and compute its uncompressed size."""
+ total_size = 0
+
+ with tarfile.open(tgz_path, mode="r|gz") as tf:
+ for member in tf:
+ total_size += member.size
+ # Verify file by extraction
+ if member.isfile():
+ f = tf.extractfile(member)
+ if f is not None:
+ while True:
+ data = f.read(chunk_size)
+ if not data:
+ break
+ return total_size
+
+
+def _check_structure_core(tgz_path: str, filename: str) -> dict[str, Any]:
+ """
+    Verify that the archive contains exactly one root directory named after the package.
+    The package name should match the archive filename without the .tar.gz extension.
+ """
+    expected_root: Final[str] = os.path.splitext(os.path.splitext(filename)[0])[0]
+
+ try:
+ root = root_directory(tgz_path)
+ except ValueError as e:
+ return {"valid": False, "root_dirs": [], "message": str(e)}
+
+ if root != expected_root:
+ return {
+ "valid": False,
+ "root_dirs": [root],
+            "message": f"Root directory '{root}' does not match expected name '{expected_root}'",
+ }
+
+    return {"valid": True, "root_dirs": [root], "message": "Archive structure is valid"}
+
+
+def _wrap_results(item: Any) -> tuple[Any, ...]:
+ """Ensure that returned results are structured as a tuple."""
+ if not isinstance(item, tuple):
+ return (item,)
+ return item
diff --git a/atr/tasks/task.py b/atr/tasks/task.py
new file mode 100644
index 0000000..4a14900
--- /dev/null
+++ b/atr/tasks/task.py
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from enum import Enum
+from typing import Final, Literal
+
+
+class TaskStatus(Enum):
+ COMPLETED = "completed"
+ FAILED = "failed"
+
+
+COMPLETED: Final[Literal[TaskStatus.COMPLETED]] = TaskStatus.COMPLETED
+FAILED: Final[Literal[TaskStatus.FAILED]] = TaskStatus.FAILED
diff --git a/atr/verify.py b/atr/verify.py
index df2507a..635f8c4 100644
--- a/atr/verify.py
+++ b/atr/verify.py
@@ -69,6 +69,7 @@ class VerifyError(Exception):
def utility_archive_root_dir_find(artifact_path: str) -> tuple[str | None, str
| None]:
"""Find the root directory in a tar archive and validate that it has only
one root dir."""
+ # TODO: Replace instances of this with archive.root_directory()
root_dir = None
error_msg = None
@@ -88,58 +89,6 @@ def utility_archive_root_dir_find(artifact_path: str) ->
tuple[str | None, str |
return root_dir, error_msg
-def archive_integrity(path: str, chunk_size: int = 4096) -> int:
- """Verify a .tar.gz file and compute its uncompressed size."""
- total_size = 0
-
- with tarfile.open(path, mode="r|gz") as tf:
- for member in tf:
- total_size += member.size
- # Verify file by extraction
- if member.isfile():
- f = tf.extractfile(member)
- if f is not None:
- while True:
- data = f.read(chunk_size)
- if not data:
- break
- return total_size
-
-
-def archive_structure(path: str, filename: str) -> dict[str, Any]:
- """
-    Verify that the archive contains exactly one root directory named after the package.
-    The package name should match the archive filename without the .tar.gz extension.
- """
- expected_dirname = os.path.splitext(os.path.splitext(filename)[0])[0]
- root_dirs = set()
-
- with tarfile.open(path, mode="r|gz") as tf:
- for member in tf:
- parts = member.name.split("/", 1)
- if len(parts) >= 1:
- root_dirs.add(parts[0])
-
- if len(root_dirs) == 0:
-        return {"valid": False, "root_dirs": list(root_dirs), "message": "Archive contains no directories"}
- elif len(root_dirs) > 1:
- return {
- "valid": False,
- "root_dirs": list(root_dirs),
-            "message": f"Archive contains multiple root directories: {', '.join(root_dirs)}",
- }
-
- root_dir = root_dirs.pop()
- if root_dir != expected_dirname:
- return {
- "valid": False,
- "root_dirs": [root_dir],
-            "message": f"Root directory '{root_dir}' does not match expected name '{expected_dirname}'",
- }
-
-    return {"valid": True, "root_dirs": [root_dir], "message": "Archive structure is valid"}
-
-
def license_files_license(tf: tarfile.TarFile, member: tarfile.TarInfo) ->
bool:
"""Verify that the LICENSE file matches the Apache 2.0 license."""
import hashlib
diff --git a/atr/worker.py b/atr/worker.py
index dbc2c8f..d01d1c2 100644
--- a/atr/worker.py
+++ b/atr/worker.py
@@ -35,6 +35,11 @@ from typing import Any
from sqlalchemy import text
+import atr.tasks.archive as archive
+import atr.tasks.bulk as bulk
+import atr.tasks.mailtest as mailtest
+import atr.tasks.task as task
+import atr.tasks.vote as vote
import atr.verify as verify
from atr.db import create_sync_db_engine, create_sync_db_session
@@ -174,27 +179,6 @@ def task_result_process(
)
-def task_verify_archive_integrity(args: list[str]) -> tuple[str, str | None, tuple[Any, ...]]:
- """Process verify_archive_integrity task."""
- # TODO: We should standardise the "ERROR" mechanism here in the data
- # Then we can have a single task wrapper for all tasks
- # First argument should be the path, second is optional chunk_size
- path = args[0]
- chunk_size = int(args[1]) if len(args) > 1 else 4096
-    task_results = task_process_wrap(verify.archive_integrity(path, chunk_size))
- _LOGGER.info(f"Verified {args} and computed size {task_results[0]}")
- return "COMPLETED", None, task_results
-
-
-def task_verify_archive_structure(args: list[str]) -> tuple[str, str | None, tuple[Any, ...]]:
- """Process verify_archive_structure task."""
- task_results = task_process_wrap(verify.archive_structure(*args))
- _LOGGER.info(f"Verified archive structure for {args}")
- status = "FAILED" if not task_results[0]["valid"] else "COMPLETED"
-    error = task_results[0]["message"] if not task_results[0]["valid"] else None
- return status, error, task_results
-
-
def task_verify_license_files(args: list[str]) -> tuple[str, str | None,
tuple[Any, ...]]:
"""Process verify_license_files task."""
task_results = task_process_wrap(verify.license_files(*args))
@@ -393,12 +377,6 @@ def task_bulk_download_debug(args: list[str] | dict) ->
tuple[str, str | None, t
def task_process(task_id: int, task_type: str, task_args: str) -> None:
"""Process a claimed task."""
- # TODO: This does not go here permanently
- # We need to move the other tasks into atr.tasks
- from atr.tasks.bulk import download as bulk_download
- from atr.tasks.mailtest import send as mailtest_send
- from atr.tasks.vote import initiate as vote_initiate
-
_LOGGER.info(f"Processing task {task_id} ({task_type}) with args
{task_args}")
try:
args = json.loads(task_args)
@@ -406,16 +384,16 @@ def task_process(task_id: int, task_type: str, task_args:
str) -> None:
# Map task types to their handler functions
# TODO: We should use a decorator to register these automatically
task_handlers = {
- "verify_archive_integrity": task_verify_archive_integrity,
- "verify_archive_structure": task_verify_archive_structure,
+ "verify_archive_integrity": archive.check_integrity,
+ "verify_archive_structure": archive.check_structure,
"verify_license_files": task_verify_license_files,
"verify_signature": task_verify_signature,
"verify_license_headers": task_verify_license_headers,
"verify_rat_license": task_verify_rat_license,
"generate_cyclonedx_sbom": task_generate_cyclonedx_sbom,
- "package_bulk_download": bulk_download,
- "mailtest_send": mailtest_send,
- "vote_initiate": vote_initiate,
+ "package_bulk_download": bulk.download,
+ "mailtest_send": mailtest.send,
+ "vote_initiate": vote.initiate,
}
handler = task_handlers.get(task_type)
@@ -424,7 +402,13 @@ def task_process(task_id: int, task_type: str, task_args:
str) -> None:
_LOGGER.error(msg)
raise Exception(msg)
- status, error, task_results = handler(args)
+ raw_status, error, task_results = handler(args)
+ if isinstance(raw_status, task.TaskStatus):
+ status = raw_status.value
+ elif isinstance(raw_status, str):
+ status = raw_status
+ else:
+ raise Exception(f"Unknown task status type: {type(raw_status)}")
task_result_process(task_id, task_results, status=status, error=error)
except Exception as e:
diff --git a/docs/conventions.html b/docs/conventions.html
index b4d87c3..0961f46 100644
--- a/docs/conventions.html
+++ b/docs/conventions.html
@@ -28,6 +28,10 @@ import a.b.c as c
import a.b.c
</code></pre>
<p>This convention aligns with Go's package naming practices. Follow <a
href="https://go.dev/blog/package-names">Go naming rules</a> for all
modules.</p>
+<p>This only applies to modules outside of the Python standard library. The standard library module <code>os.path</code>, for example, must always be imported using the form <code>import os.path</code>, and <em>not</em> <code>import os.path as path</code>.</p>
+<h3>Avoid duplicated module names</h3>
+<p>Try to avoid using, for example, <code>baking/apple/pie.py</code> and <code>baking/cherry/pie.py</code> because these will both be imported as <code>pie</code> and one will have to be renamed.</p>
+<p>If there are duplicates imported within a single file, they should be disambiguated by the next level up. In the pie example, that would be <code>import baking.apple as apple</code> and then <code>apple.pie</code>, and <code>import baking.cherry as cherry</code> and <code>cherry.pie</code>.</p>
<h3>Never import names directly from modules</h3>
<p>Avoid importing specific names from modules:</p>
<pre><code class="language-python"># Preferred
diff --git a/docs/conventions.md b/docs/conventions.md
index f8fef0a..bfe7efb 100644
--- a/docs/conventions.md
+++ b/docs/conventions.md
@@ -48,6 +48,14 @@ import a.b.c
This convention aligns with Go's package naming practices. Follow [Go naming
rules](https://go.dev/blog/package-names) for all modules.
+This only applies to modules outside of the Python standard library. The standard library module `os.path`, for example, must always be imported using the form `import os.path`, and _not_ `import os.path as path`.
+
+### Avoid duplicated module names
+
+Try to avoid using, for example, `baking/apple/pie.py` and `baking/cherry/pie.py` because these will both be imported as `pie` and one will have to be renamed.
+
+If there are duplicates imported within a single file, they should be disambiguated by the next level up. In the pie example, that would be `import baking.apple as apple` and then `apple.pie`, and `import baking.cherry as cherry` and `cherry.pie`.
+
### Never import names directly from modules
Avoid importing specific names from modules:
diff --git a/docs/plan.html b/docs/plan.html
index e896c82..2125724 100644
--- a/docs/plan.html
+++ b/docs/plan.html
@@ -116,6 +116,7 @@
<li>[DONE] Improve the proprietary platform patch in ASFQuart</li>
<li>[DONE] Ensure that all errors are caught and logged or displayed</li>
<li>Add further tests</li>
+<li>Decide whether to use Alembic and, if not, remove <code>alembic.cfg</code></li>
</ul>
</li>
<li>
diff --git a/docs/plan.md b/docs/plan.md
index 4fdd6ab..8df95fb 100644
--- a/docs/plan.md
+++ b/docs/plan.md
@@ -89,6 +89,7 @@ Advanced tasks, possibly deferred
- [DONE] Improve the proprietary platform patch in ASFQuart
- [DONE] Ensure that all errors are caught and logged or displayed
- Add further tests
+ - Decide whether to use Alembic and, if not, remove `alembic.cfg`
2. Ensure that performance is optimal
- [DONE] Add page load timing metrics to a log
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]