This is an automated email from the ASF dual-hosted git repository.

akm pushed a commit to branch archive-member-count-604
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git

commit b728116dd72d9d38290211b1cb2b8c277ff33f8f
Author: Andrew K. Musselman <[email protected]>
AuthorDate: Tue Jan 27 19:55:20 2026 -0800

    Archive member count limit #604
---
 atr/archives.py               | 71 +++++++++++++++++++++++------------------
 atr/tarzip.py                 | 73 ++++++++++++++++++++++++++++++++++++++++---
 atr/tasks/checks/targz.py     |  6 ++--
 atr/tasks/checks/zipformat.py | 30 +++++++++++-------
 atr/util.py                   | 23 +++++---------
 5 files changed, 137 insertions(+), 66 deletions(-)

diff --git a/atr/archives.py b/atr/archives.py
index 0f87d88..0ed6982 100644
--- a/atr/archives.py
+++ b/atr/archives.py
@@ -43,10 +43,19 @@ def extract(
     try:
         with tarzip.open_archive(archive_path) as archive:
             match archive.specific():
-                case tarfile.TarFile() as tf:
-                    for member in tf:
-                        total_extracted, extracted_paths = 
_archive_extract_member(
-                            tf, member, extract_dir, total_extracted, 
max_size, chunk_size, track_files, extracted_paths
+                case tarfile.TarFile():
+                    for member in archive:
+                        if not isinstance(member, tarzip.TarMember):
+                            continue
+                        total_extracted, extracted_paths = 
_tar_archive_extract_member(
+                            archive,
+                            member,
+                            extract_dir,
+                            total_extracted,
+                            max_size,
+                            chunk_size,
+                            track_files,
+                            extracted_paths,
                         )
 
                 case zipfile.ZipFile():
@@ -64,7 +73,7 @@ def extract(
                             extracted_paths,
                         )
 
-    except (tarfile.TarError, zipfile.BadZipFile, ValueError) as e:
+    except (tarfile.TarError, zipfile.BadZipFile, ValueError, 
tarzip.ArchiveMemberLimitExceededError) as e:
         raise ExtractionError(f"Failed to read archive: {e}", {"archive_path": 
archive_path}) from e
 
     return total_extracted, extracted_paths
@@ -73,8 +82,8 @@ def extract(
 def total_size(tgz_path: str, chunk_size: int = 4096) -> int:
     with tarzip.open_archive(tgz_path) as archive:
         match archive.specific():
-            case tarfile.TarFile() as tf:
-                total_size = _size_tar(tf, chunk_size)
+            case tarfile.TarFile():
+                total_size = _size_tar(archive, chunk_size)
 
             case zipfile.ZipFile():
                 total_size = _size_zip(archive, chunk_size)
@@ -82,9 +91,9 @@ def total_size(tgz_path: str, chunk_size: int = 4096) -> int:
     return total_size
 
 
-def _archive_extract_member(  # noqa: C901
-    tf: tarfile.TarFile,
-    member: tarfile.TarInfo,
+def _tar_archive_extract_member(  # noqa: C901
+    archive: tarzip.Archive,
+    member: tarzip.TarMember,
     extract_dir: str,
     total_extracted: int,
     max_size: int,
@@ -107,7 +116,7 @@ def _archive_extract_member(  # noqa: C901
             extracted_paths.append(member.name)
 
     # Check whether extraction would exceed the size limit
-    if member.isreg() and ((total_extracted + member.size) > max_size):
+    if member.isfile() and ((total_extracted + member.size) > max_size):
         raise ExtractionError(
             f"Extraction would exceed maximum size limit of {max_size} bytes",
             {"max_size": max_size, "current_size": total_extracted, 
"file_size": member.size},
@@ -119,32 +128,32 @@ def _archive_extract_member(  # noqa: C901
         if _safe_path(extract_dir, member.name) is None:
             log.warning(f"Skipping potentially unsafe path: {member.name}")
             return 0, extracted_paths
-        tf.extract(member, extract_dir, numeric_owner=True, 
filter="fully_trusted")
+        archive.extract_member(member, extract_dir, numeric_owner=True, 
tar_filter="fully_trusted")
 
-    elif member.isreg():
-        extracted_size = _archive_extract_safe_process_file(
-            tf, member, extract_dir, total_extracted, max_size, chunk_size
+    elif member.isfile():
+        extracted_size = _tar_archive_extract_safe_process_file(
+            archive, member, extract_dir, total_extracted, max_size, chunk_size
         )
         total_extracted += extracted_size
 
     elif member.issym():
-        _archive_extract_safe_process_symlink(member, extract_dir)
+        _tar_archive_extract_safe_process_symlink(member, extract_dir)
 
     elif member.islnk():
-        _archive_extract_safe_process_hardlink(member, extract_dir)
+        _tar_archive_extract_safe_process_hardlink(member, extract_dir)
 
     return total_extracted, extracted_paths
 
 
-def _archive_extract_safe_process_file(
-    tf: tarfile.TarFile,
-    member: tarfile.TarInfo,
+def _tar_archive_extract_safe_process_file(
+    archive: tarzip.Archive,
+    member: tarzip.TarMember,
     extract_dir: str,
     total_extracted: int,
     max_size: int,
     chunk_size: int,
 ) -> int:
-    """Process a single file member during safe archive extraction."""
+    """Process a single file member during safe tar archive extraction."""
     target_path = _safe_path(extract_dir, member.name)
     if target_path is None:
         log.warning(f"Skipping potentially unsafe path: {member.name}")
@@ -152,9 +161,9 @@ def _archive_extract_safe_process_file(
 
     os.makedirs(os.path.dirname(target_path), exist_ok=True)
 
-    source = tf.extractfile(member)
+    source = archive.extractfile(member)
     if source is None:
-        # Should not happen if member.isreg() is true
+        # Should not happen if member.isfile() is true
         log.warning(f"Could not extract file object for member: {member.name}")
         return 0
 
@@ -180,8 +189,8 @@ def _archive_extract_safe_process_file(
     return extracted_file_size
 
 
-def _archive_extract_safe_process_hardlink(member: tarfile.TarInfo, 
extract_dir: str) -> None:
-    """Safely create a hard link from the TarInfo entry."""
+def _tar_archive_extract_safe_process_hardlink(member: tarzip.TarMember, 
extract_dir: str) -> None:
+    """Safely create a hard link from the TarMember entry."""
     target_path = _safe_path(extract_dir, member.name)
     if target_path is None:
         log.warning(f"Skipping potentially unsafe hard link path: 
{member.name}")
@@ -203,8 +212,8 @@ def _archive_extract_safe_process_hardlink(member: 
tarfile.TarInfo, extract_dir:
         log.warning(f"Failed to create hard link {target_path} -> 
{source_path}: {e}")
 
 
-def _archive_extract_safe_process_symlink(member: tarfile.TarInfo, 
extract_dir: str) -> None:
-    """Safely create a symbolic link from the TarInfo entry."""
+def _tar_archive_extract_safe_process_symlink(member: tarzip.TarMember, 
extract_dir: str) -> None:
+    """Safely create a symbolic link from the TarMember entry."""
     target_path = _safe_path(extract_dir, member.name)
     if target_path is None:
         log.warning(f"Skipping potentially unsafe symlink path: {member.name}")
@@ -242,12 +251,14 @@ def _safe_path(base_dir: str, *paths: str) -> str | None:
     return None
 
 
-def _size_tar(tf: tarfile.TarFile, chunk_size: int) -> int:
+def _size_tar(archive: tarzip.Archive, chunk_size: int) -> int:
     total_size = 0
-    for member in tf:
+    for member in archive:
+        if not isinstance(member, tarzip.TarMember):
+            continue
         total_size += member.size
         if member.isfile():
-            fileobj = tf.extractfile(member)
+            fileobj = archive.extractfile(member)
             if fileobj is not None:
                 while fileobj.read(chunk_size):
                     pass
diff --git a/atr/tarzip.py b/atr/tarzip.py
index 391b1cb..7ad6a66 100644
--- a/atr/tarzip.py
+++ b/atr/tarzip.py
@@ -19,9 +19,16 @@ import contextlib
 import tarfile
 import zipfile
 from collections.abc import Generator, Iterator
-from typing import IO, TypeVar
+from typing import IO, Final, Literal, TypeVar
 from typing import Protocol as TypingProtocol
 
+MAX_ARCHIVE_MEMBERS: Final[int] = 100_000
+
+
+class ArchiveMemberLimitExceededError(Exception):
+    pass
+
+
 ArchiveT = TypeVar("ArchiveT", tarfile.TarFile, zipfile.ZipFile)
 # If you set this covariant=True, then mypy says an "invariant one is expected"
 # But pyright warns, "should be covariant" if you don't
@@ -96,19 +103,32 @@ type Member = TarMember | ZipMember
 
 class ArchiveContext[ArchiveT: (tarfile.TarFile, zipfile.ZipFile)]:
     _archive_obj: ArchiveT
+    _max_members: int
 
-    def __init__(self, archive_obj: ArchiveT):
+    def __init__(self, archive_obj: ArchiveT, max_members: int = 
MAX_ARCHIVE_MEMBERS):
         self._archive_obj = archive_obj
+        self._max_members = max_members
 
     def __iter__(self) -> Iterator[TarMember | ZipMember]:
+        count = 0
         match self._archive_obj:
             case tarfile.TarFile() as tf:
                 for member_orig in tf:
                     if member_orig.isdev():
                         continue
+                    count += 1
+                    if count > self._max_members:
+                        raise ArchiveMemberLimitExceededError(
+                            f"Archive contains too many members: exceeded 
limit of {self._max_members}"
+                        )
                     yield TarMember(member_orig)
             case zipfile.ZipFile() as zf:
                 for member_orig in zf.infolist():
+                    count += 1
+                    if count > self._max_members:
+                        raise ArchiveMemberLimitExceededError(
+                            f"Archive contains too many members: exceeded 
limit of {self._max_members}"
+                        )
                     yield ZipMember(member_orig)
 
     def extractfile(self, member_wrapper: Member) -> IO[bytes] | None:
@@ -125,6 +145,34 @@ class ArchiveContext[ArchiveT: (tarfile.TarFile, 
zipfile.ZipFile)]:
         except (KeyError, AttributeError, Exception):
             return None
 
+    def extract_member(
+        self,
+        member_wrapper: Member,
+        path: str,
+        *,
+        numeric_owner: bool = True,
+        tar_filter: Literal["fully_trusted", "tar", "data"] | None = 
"fully_trusted",
+    ) -> None:
+        """Extract a single member to the specified path.
+
+        For tar archives, this uses the tarfile extract method with the 
specified filter.
+        For zip archives, this is a no-op as zip extraction is handled 
differently
+        (directories are created via os.makedirs in the calling code).
+        """
+        match self._archive_obj:
+            case tarfile.TarFile() as tf:
+                if not isinstance(member_wrapper, TarMember):
+                    raise TypeError("Archive is TarFile, but member_wrapper is 
not TarMember")
+                tf.extract(
+                    member_wrapper._original_info,
+                    path,
+                    numeric_owner=numeric_owner,
+                    filter=tar_filter,
+                )
+            case zipfile.ZipFile():
+                # Zip extraction is handled differently in the calling code
+                pass
+
     def specific(self) -> tarfile.TarFile | zipfile.ZipFile:
         return self._archive_obj
 
@@ -135,7 +183,22 @@ type Archive = TarArchive | ZipArchive
 
 
 @contextlib.contextmanager
-def open_archive(archive_path: str) -> Generator[Archive]:
+def open_archive(
+    archive_path: str,
+    max_members: int = MAX_ARCHIVE_MEMBERS,
+) -> Generator[Archive]:
+    """Open an archive file (tar or zip) and yield an ArchiveContext.
+
+    Args:
+        archive_path: Path to the archive file.
+        max_members: Maximum number of members allowed in the archive.
+            Defaults to MAX_ARCHIVE_MEMBERS (100,000).
+            Set to 0 or negative to disable the limit.
+
+    Raises:
+        ValueError: If the archive format is unsupported or corrupted.
+        ArchiveMemberLimitExceededError: If the archive exceeds max_members 
during iteration.
+    """
     archive_file: tarfile.TarFile | zipfile.ZipFile | None = None
     try:
         try:
@@ -148,9 +211,9 @@ def open_archive(archive_path: str) -> Generator[Archive]:
 
         match archive_file:
             case tarfile.TarFile() as tf_concrete:
-                yield ArchiveContext[tarfile.TarFile](tf_concrete)
+                yield ArchiveContext[tarfile.TarFile](tf_concrete, max_members)
             case zipfile.ZipFile() as zf_concrete:
-                yield ArchiveContext[zipfile.ZipFile](zf_concrete)
+                yield ArchiveContext[zipfile.ZipFile](zf_concrete, max_members)
 
     finally:
         if archive_file:
diff --git a/atr/tasks/checks/targz.py b/atr/tasks/checks/targz.py
index c3302e9..18ae3f4 100644
--- a/atr/tasks/checks/targz.py
+++ b/atr/tasks/checks/targz.py
@@ -16,12 +16,12 @@
 # under the License.
 
 import asyncio
-import tarfile
 from typing import Final
 
 import atr.archives as archives
 import atr.log as log
 import atr.models.results as results
+import atr.tarzip as tarzip
 import atr.tasks.checks as checks
 
 
@@ -52,8 +52,8 @@ def root_directory(tgz_path: str) -> str:
     """Find the root directory in a tar archive and validate that it has only 
one root dir."""
     root = None
 
-    with tarfile.open(tgz_path, mode="r|gz") as tf:
-        for member in tf:
+    with tarzip.open_archive(tgz_path) as archive:
+        for member in archive:
             if member.name and member.name.split("/")[-1].startswith("._"):
                 # Metadata convention
                 continue
diff --git a/atr/tasks/checks/zipformat.py b/atr/tasks/checks/zipformat.py
index cdce89e..d69138f 100644
--- a/atr/tasks/checks/zipformat.py
+++ b/atr/tasks/checks/zipformat.py
@@ -22,6 +22,7 @@ from typing import Any
 
 import atr.log as log
 import atr.models.results as results
+import atr.tarzip as tarzip
 import atr.tasks.checks as checks
 import atr.util as util
 
@@ -76,11 +77,13 @@ async def structure(args: checks.FunctionArguments) -> 
results.Results | None:
 def _integrity_check_core_logic(artifact_path: str) -> dict[str, Any]:
     """Verify that a zip file can be opened and its members listed."""
     try:
-        with zipfile.ZipFile(artifact_path, "r") as zf:
+        with tarzip.open_archive(artifact_path) as archive:
             # This is a simple check using list members
             # We can use zf.testzip() for CRC checks if needed, though this 
will be slower
-            member_list = zf.namelist()
-            return {"member_count": len(member_list)}
+            member_count = sum(1 for _ in archive)
+            return {"member_count": member_count}
+    except tarzip.ArchiveMemberLimitExceededError as e:
+        return {"error": f"Archive has too many members: {e}"}
     except zipfile.BadZipFile as e:
         return {"error": f"Bad zip file: {e}"}
     except FileNotFoundError:
@@ -92,8 +95,8 @@ def _integrity_check_core_logic(artifact_path: str) -> 
dict[str, Any]:
 def _structure_check_core_logic(artifact_path: str) -> dict[str, Any]:
     """Verify the internal structure of the zip archive."""
     try:
-        with zipfile.ZipFile(artifact_path, "r") as zf:
-            members = zf.namelist()
+        with tarzip.open_archive(artifact_path) as archive:
+            members: list[tarzip.Member] = list(archive)
             if not members:
                 return {"error": "Archive is empty"}
 
@@ -107,9 +110,10 @@ def _structure_check_core_logic(artifact_path: str) -> 
dict[str, Any]:
             #     name_part = "-".join(name_part.split("-")[:-1])
             expected_root = name_part
 
-            root_dirs, non_rooted_files = 
_structure_check_core_logic_find_roots(zf, members)
+            root_dirs, non_rooted_files = 
_structure_check_core_logic_find_roots(members)
+            member_names = [m.name for m in members]
             actual_root, error_msg = _structure_check_core_logic_validate_root(
-                members, root_dirs, non_rooted_files, expected_root
+                member_names, root_dirs, non_rooted_files, expected_root
             )
 
             if error_msg:
@@ -121,6 +125,8 @@ def _structure_check_core_logic(artifact_path: str) -> 
dict[str, Any]:
                 return {"root_dir": actual_root}
             return {"error": "Unknown structure validation error"}
 
+    except tarzip.ArchiveMemberLimitExceededError as e:
+        return {"error": f"Archive has too many members: {e}"}
     except zipfile.BadZipFile as e:
         return {"error": f"Bad zip file: {e}"}
     except FileNotFoundError:
@@ -129,15 +135,15 @@ def _structure_check_core_logic(artifact_path: str) -> 
dict[str, Any]:
         return {"error": f"Unexpected error: {e}"}
 
 
-def _structure_check_core_logic_find_roots(zf: zipfile.ZipFile, members: 
list[str]) -> tuple[set[str], list[str]]:
+def _structure_check_core_logic_find_roots(members: list[tarzip.Member]) -> 
tuple[set[str], list[str]]:
     """Identify root directories and non-rooted files in a zip archive."""
     root_dirs: set[str] = set()
     non_rooted_files: list[str] = []
     for member in members:
-        if "/" in member:
-            root_dirs.add(member.split("/", 1)[0])
-        elif not zipfile.Path(zf, member).is_dir():
-            non_rooted_files.append(member)
+        if "/" in member.name:
+            root_dirs.add(member.name.split("/", 1)[0])
+        elif not member.isdir():
+            non_rooted_files.append(member.name)
     return root_dirs, non_rooted_files
 
 
diff --git a/atr/util.py b/atr/util.py
index 4b83f96..4095440 100644
--- a/atr/util.py
+++ b/atr/util.py
@@ -50,6 +50,7 @@ import atr.ldap as ldap
 import atr.log as log
 import atr.models.sql as sql
 import atr.registry as registry
+import atr.tarzip as tarzip
 import atr.user as user
 
 T = TypeVar("T")
@@ -95,26 +96,16 @@ async def archive_listing(file_path: pathlib.Path) -> 
list[str] | None:
         return None
 
     with contextlib.suppress(Exception):
-        if file_path.name.endswith((".tar.gz", ".tgz")):
+        if file_path.name.endswith((".tar.gz", ".tgz", ".zip")):
 
-            def _read_tar() -> list[str] | None:
-                with contextlib.suppress(tarfile.ReadError, EOFError, 
ValueError, OSError):
-                    with tarfile.open(file_path, mode="r:*") as tf:
+            def _read_archive() -> list[str] | None:
+                with contextlib.suppress(tarfile.ReadError, 
zipfile.BadZipFile, EOFError, ValueError, OSError):
+                    with tarzip.open_archive(str(file_path)) as archive:
                         # TODO: Skip metadata files
-                        return sorted(tf.getnames())
+                        return sorted(member.name for member in archive)
                 return None
 
-            return await asyncio.to_thread(_read_tar)
-
-        elif file_path.name.endswith(".zip"):
-
-            def _read_zip() -> list[str] | None:
-                with contextlib.suppress(zipfile.BadZipFile, EOFError, 
ValueError, OSError):
-                    with zipfile.ZipFile(file_path, "r") as zf:
-                        return sorted(zf.namelist())
-                return None
-
-            return await asyncio.to_thread(_read_zip)
+            return await asyncio.to_thread(_read_archive)
 
     return None
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to