This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-atr-experiments.git


The following commit(s) were added to refs/heads/main by this push:
     new ecb2dc1  Split archive task functions into their own module
ecb2dc1 is described below

commit ecb2dc17d91a9b99049fb8203230e7113ea43919
Author: Sean B. Palmer <[email protected]>
AuthorDate: Mon Mar 10 16:05:36 2025 +0200

    Split archive task functions into their own module
---
 atr/tasks/archive.py  | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++
 atr/tasks/task.py     |  28 +++++++++++++
 atr/verify.py         |  53 +----------------------
 atr/worker.py         |  50 ++++++++--------------
 docs/conventions.html |   4 ++
 docs/conventions.md   |   8 ++++
 docs/plan.html        |   1 +
 docs/plan.md          |   1 +
 8 files changed, 173 insertions(+), 85 deletions(-)

diff --git a/atr/tasks/archive.py b/atr/tasks/archive.py
new file mode 100644
index 0000000..a05fe1d
--- /dev/null
+++ b/atr/tasks/archive.py
@@ -0,0 +1,113 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+import os.path
+import tarfile
+from typing import Any, Final
+
+import atr.tasks.task as task
+
+_LOGGER = logging.getLogger(__name__)
+
+
+def check_integrity(args: list[str]) -> tuple[task.TaskStatus, str | None, tuple[Any, ...]]:
+    """Check the integrity of a .tar.gz file."""
+    # TODO: We should standardise the "ERROR" mechanism here in the data
+    # Then we can have a single task wrapper for all tasks
+    # TODO: We should use task.TaskError as standard, and maybe typeguard each function
+    # First argument should be the path, second is optional chunk_size
+    path = args[0]
+    chunk_size = int(args[1]) if len(args) > 1 else 4096
+    task_results = _wrap_results(_check_integrity_core(path, chunk_size))
+    _LOGGER.info(f"Verified {args} and computed size {task_results[0]}")
+    return task.COMPLETED, None, task_results
+
+
+def check_structure(args: list[str]) -> tuple[task.TaskStatus, str | None, tuple[Any, ...]]:
+    """Check the structure of a .tar.gz file."""
+    task_results = _wrap_results(_check_structure_core(*args))
+    _LOGGER.info(f"Verified archive structure for {args}")
+    status = task.FAILED if not task_results[0]["valid"] else task.COMPLETED
+    error = task_results[0]["message"] if not task_results[0]["valid"] else None
+    return status, error, task_results
+
+
+def root_directory(tgz_path: str) -> str:
+    """Find the root directory in a tar archive and validate that it has only one root dir."""
+    root = None
+
+    with tarfile.open(tgz_path, mode="r|gz") as tf:
+        for member in tf:
+            parts = member.name.split("/", 1)
+            if len(parts) >= 1:
+                if not root:
+                    root = parts[0]
+                elif parts[0] != root:
+                    raise ValueError(f"Multiple root directories found: {root}, {parts[0]}")
+
+    if not root:
+        raise ValueError("No root directory found in archive")
+
+    return root
+
+
+def _check_integrity_core(tgz_path: str, chunk_size: int = 4096) -> int:
+    """Verify a .tar.gz file and compute its uncompressed size."""
+    total_size = 0
+
+    with tarfile.open(tgz_path, mode="r|gz") as tf:
+        for member in tf:
+            total_size += member.size
+            # Verify file by extraction
+            if member.isfile():
+                f = tf.extractfile(member)
+                if f is not None:
+                    while True:
+                        data = f.read(chunk_size)
+                        if not data:
+                            break
+    return total_size
+
+
+def _check_structure_core(tgz_path: str, filename: str) -> dict[str, Any]:
+    """
+    Verify that the archive contains exactly one root directory named after the package.
+    The package name should match the archive filename without the .tar.gz extension.
+    """
+    expected_root: Final[str] = os.path.splitext(os.path.splitext(filename)[0])[0]
+
+    try:
+        root = root_directory(tgz_path)
+    except ValueError as e:
+        return {"valid": False, "root_dirs": [], "message": str(e)}
+
+    if root != expected_root:
+        return {
+            "valid": False,
+            "root_dirs": [root],
+            "message": f"Root directory '{root}' does not match expected name '{expected_root}'",
+        }
+
+    return {"valid": True, "root_dirs": [root], "message": "Archive structure is valid"}
+
+
+def _wrap_results(item: Any) -> tuple[Any, ...]:
+    """Ensure that returned results are structured as a tuple."""
+    if not isinstance(item, tuple):
+        return (item,)
+    return item
diff --git a/atr/tasks/task.py b/atr/tasks/task.py
new file mode 100644
index 0000000..4a14900
--- /dev/null
+++ b/atr/tasks/task.py
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from enum import Enum
+from typing import Final, Literal
+
+
+class TaskStatus(Enum):
+    COMPLETED = "completed"
+    FAILED = "failed"
+
+
+COMPLETED: Final[Literal[TaskStatus.COMPLETED]] = TaskStatus.COMPLETED
+FAILED: Final[Literal[TaskStatus.FAILED]] = TaskStatus.FAILED
diff --git a/atr/verify.py b/atr/verify.py
index df2507a..635f8c4 100644
--- a/atr/verify.py
+++ b/atr/verify.py
@@ -69,6 +69,7 @@ class VerifyError(Exception):
 
 def utility_archive_root_dir_find(artifact_path: str) -> tuple[str | None, str | None]:
     """Find the root directory in a tar archive and validate that it has only one root dir."""
+    # TODO: Replace instances of this with archive.root_directory()
     root_dir = None
     error_msg = None
 
@@ -88,58 +89,6 @@ def utility_archive_root_dir_find(artifact_path: str) -> tuple[str | None, str |
     return root_dir, error_msg
 
 
-def archive_integrity(path: str, chunk_size: int = 4096) -> int:
-    """Verify a .tar.gz file and compute its uncompressed size."""
-    total_size = 0
-
-    with tarfile.open(path, mode="r|gz") as tf:
-        for member in tf:
-            total_size += member.size
-            # Verify file by extraction
-            if member.isfile():
-                f = tf.extractfile(member)
-                if f is not None:
-                    while True:
-                        data = f.read(chunk_size)
-                        if not data:
-                            break
-    return total_size
-
-
-def archive_structure(path: str, filename: str) -> dict[str, Any]:
-    """
-    Verify that the archive contains exactly one root directory named after the package.
-    The package name should match the archive filename without the .tar.gz extension.
-    """
-    expected_dirname = os.path.splitext(os.path.splitext(filename)[0])[0]
-    root_dirs = set()
-
-    with tarfile.open(path, mode="r|gz") as tf:
-        for member in tf:
-            parts = member.name.split("/", 1)
-            if len(parts) >= 1:
-                root_dirs.add(parts[0])
-
-    if len(root_dirs) == 0:
-        return {"valid": False, "root_dirs": list(root_dirs), "message": "Archive contains no directories"}
-    elif len(root_dirs) > 1:
-        return {
-            "valid": False,
-            "root_dirs": list(root_dirs),
-            "message": f"Archive contains multiple root directories: {', '.join(root_dirs)}",
-        }
-
-    root_dir = root_dirs.pop()
-    if root_dir != expected_dirname:
-        return {
-            "valid": False,
-            "root_dirs": [root_dir],
-            "message": f"Root directory '{root_dir}' does not match expected name '{expected_dirname}'",
-        }
-
-    return {"valid": True, "root_dirs": [root_dir], "message": "Archive structure is valid"}
-
-
 def license_files_license(tf: tarfile.TarFile, member: tarfile.TarInfo) -> bool:
     """Verify that the LICENSE file matches the Apache 2.0 license."""
     import hashlib
diff --git a/atr/worker.py b/atr/worker.py
index dbc2c8f..d01d1c2 100644
--- a/atr/worker.py
+++ b/atr/worker.py
@@ -35,6 +35,11 @@ from typing import Any
 
 from sqlalchemy import text
 
+import atr.tasks.archive as archive
+import atr.tasks.bulk as bulk
+import atr.tasks.mailtest as mailtest
+import atr.tasks.task as task
+import atr.tasks.vote as vote
 import atr.verify as verify
 from atr.db import create_sync_db_engine, create_sync_db_session
 
@@ -174,27 +179,6 @@ def task_result_process(
                 )
 
 
-def task_verify_archive_integrity(args: list[str]) -> tuple[str, str | None, tuple[Any, ...]]:
-    """Process verify_archive_integrity task."""
-    # TODO: We should standardise the "ERROR" mechanism here in the data
-    # Then we can have a single task wrapper for all tasks
-    # First argument should be the path, second is optional chunk_size
-    path = args[0]
-    chunk_size = int(args[1]) if len(args) > 1 else 4096
-    task_results = task_process_wrap(verify.archive_integrity(path, chunk_size))
-    _LOGGER.info(f"Verified {args} and computed size {task_results[0]}")
-    return "COMPLETED", None, task_results
-
-
-def task_verify_archive_structure(args: list[str]) -> tuple[str, str | None, tuple[Any, ...]]:
-    """Process verify_archive_structure task."""
-    task_results = task_process_wrap(verify.archive_structure(*args))
-    _LOGGER.info(f"Verified archive structure for {args}")
-    status = "FAILED" if not task_results[0]["valid"] else "COMPLETED"
-    error = task_results[0]["message"] if not task_results[0]["valid"] else None
-    return status, error, task_results
-
-
 def task_verify_license_files(args: list[str]) -> tuple[str, str | None, tuple[Any, ...]]:
     """Process verify_license_files task."""
     task_results = task_process_wrap(verify.license_files(*args))
@@ -393,12 +377,6 @@ def task_bulk_download_debug(args: list[str] | dict) -> tuple[str, str | None, t
 
 def task_process(task_id: int, task_type: str, task_args: str) -> None:
     """Process a claimed task."""
-    # TODO: This does not go here permanently
-    # We need to move the other tasks into atr.tasks
-    from atr.tasks.bulk import download as bulk_download
-    from atr.tasks.mailtest import send as mailtest_send
-    from atr.tasks.vote import initiate as vote_initiate
-
     _LOGGER.info(f"Processing task {task_id} ({task_type}) with args {task_args}")
     try:
         args = json.loads(task_args)
@@ -406,16 +384,16 @@ def task_process(task_id: int, task_type: str, task_args: str) -> None:
         # Map task types to their handler functions
         # TODO: We should use a decorator to register these automatically
         task_handlers = {
-            "verify_archive_integrity": task_verify_archive_integrity,
-            "verify_archive_structure": task_verify_archive_structure,
+            "verify_archive_integrity": archive.check_integrity,
+            "verify_archive_structure": archive.check_structure,
             "verify_license_files": task_verify_license_files,
             "verify_signature": task_verify_signature,
             "verify_license_headers": task_verify_license_headers,
             "verify_rat_license": task_verify_rat_license,
             "generate_cyclonedx_sbom": task_generate_cyclonedx_sbom,
-            "package_bulk_download": bulk_download,
-            "mailtest_send": mailtest_send,
-            "vote_initiate": vote_initiate,
+            "package_bulk_download": bulk.download,
+            "mailtest_send": mailtest.send,
+            "vote_initiate": vote.initiate,
         }
 
         handler = task_handlers.get(task_type)
@@ -424,7 +402,13 @@ def task_process(task_id: int, task_type: str, task_args: str) -> None:
             _LOGGER.error(msg)
             raise Exception(msg)
 
-        status, error, task_results = handler(args)
+        raw_status, error, task_results = handler(args)
+        if isinstance(raw_status, task.TaskStatus):
+            status = raw_status.value
+        elif isinstance(raw_status, str):
+            status = raw_status
+        else:
+            raise Exception(f"Unknown task status type: {type(raw_status)}")
         task_result_process(task_id, task_results, status=status, error=error)
 
     except Exception as e:
diff --git a/docs/conventions.html b/docs/conventions.html
index b4d87c3..0961f46 100644
--- a/docs/conventions.html
+++ b/docs/conventions.html
@@ -28,6 +28,10 @@ import a.b.c as c
 import a.b.c
 </code></pre>
 <p>This convention aligns with Go's package naming practices. Follow <a href="https://go.dev/blog/package-names">Go naming rules</a> for all modules.</p>
+<p>This only applies to modules outside of the Python standard library. The standard library module <code>os.path</code>, for example, must always be imported using the form <code>import os.path</code>, and <em>not</em> <code>import os.path as path</code>.</p>
+<h3>Avoid duplicated module names</h3>
+<p>Try to avoid using, for example, <code>baking/apple/pie.py</code> and <code>baking/cherry/pie.py</code> because these will both be imported as <code>pie</code> and one will have to be renamed.</p>
+<p>If there are duplicates imported within a single file, they should be disambiguated by the next level up. In the pie example, that would be <code>import baking.apple as apple</code> and then <code>apple.pie</code>, and <code>import baking.cherry as cherry</code> and <code>cherry.pie</code>.</p>
 <h3>Never import names directly from modules</h3>
 <p>Avoid importing specific names from modules:</p>
 <pre><code class="language-python"># Preferred
diff --git a/docs/conventions.md b/docs/conventions.md
index f8fef0a..bfe7efb 100644
--- a/docs/conventions.md
+++ b/docs/conventions.md
@@ -48,6 +48,14 @@ import a.b.c
 
 This convention aligns with Go's package naming practices. Follow [Go naming rules](https://go.dev/blog/package-names) for all modules.
 
+This only applies to modules outside of the Python standard library. The standard library module `os.path`, for example, must always be imported using the form `import os.path`, and _not_ `import os.path as path`.
+
+### Avoid duplicated module names
+
+Try to avoid using, for example, `baking/apple/pie.py` and `baking/cherry/pie.py` because these will both be imported as `pie` and one will have to be renamed.
+
+If there are duplicates imported within a single file, they should be disambiguated by the next level up. In the pie example, that would be `import baking.apple as apple` and then `apple.pie`, and `import baking.cherry as cherry` and `cherry.pie`.
+
 ### Never import names directly from modules
 
 Avoid importing specific names from modules:
diff --git a/docs/plan.html b/docs/plan.html
index e896c82..2125724 100644
--- a/docs/plan.html
+++ b/docs/plan.html
@@ -116,6 +116,7 @@
 <li>[DONE] Improve the proprietary platform patch in ASFQuart</li>
 <li>[DONE] Ensure that all errors are caught and logged or displayed</li>
 <li>Add further tests</li>
+<li>Decide whether to use Alembic and, if not, remove <code>alembic.cfg</code></li>
 </ul>
 </li>
 <li>
diff --git a/docs/plan.md b/docs/plan.md
index 4fdd6ab..8df95fb 100644
--- a/docs/plan.md
+++ b/docs/plan.md
@@ -89,6 +89,7 @@ Advanced tasks, possibly deferred
    - [DONE] Improve the proprietary platform patch in ASFQuart
    - [DONE] Ensure that all errors are caught and logged or displayed
    - Add further tests
+   - Decide whether to use Alembic and, if not, remove `alembic.cfg`
 
 2. Ensure that performance is optimal
    - [DONE] Add page load timing metrics to a log


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to