This is an automated email from the ASF dual-hosted git repository. akm pushed a commit to branch archive-member-count-604 in repository https://gitbox.apache.org/repos/asf/tooling-trusted-releases.git
commit b728116dd72d9d38290211b1cb2b8c277ff33f8f Author: Andrew K. Musselman <[email protected]> AuthorDate: Tue Jan 27 19:55:20 2026 -0800 Archive member count limit #604 --- atr/archives.py | 71 +++++++++++++++++++++++------------------ atr/tarzip.py | 73 ++++++++++++++++++++++++++++++++++++++++--- atr/tasks/checks/targz.py | 6 ++-- atr/tasks/checks/zipformat.py | 30 +++++++++++------- atr/util.py | 23 +++++--------- 5 files changed, 137 insertions(+), 66 deletions(-) diff --git a/atr/archives.py b/atr/archives.py index 0f87d88..0ed6982 100644 --- a/atr/archives.py +++ b/atr/archives.py @@ -43,10 +43,19 @@ def extract( try: with tarzip.open_archive(archive_path) as archive: match archive.specific(): - case tarfile.TarFile() as tf: - for member in tf: - total_extracted, extracted_paths = _archive_extract_member( - tf, member, extract_dir, total_extracted, max_size, chunk_size, track_files, extracted_paths + case tarfile.TarFile(): + for member in archive: + if not isinstance(member, tarzip.TarMember): + continue + total_extracted, extracted_paths = _tar_archive_extract_member( + archive, + member, + extract_dir, + total_extracted, + max_size, + chunk_size, + track_files, + extracted_paths, ) case zipfile.ZipFile(): @@ -64,7 +73,7 @@ def extract( extracted_paths, ) - except (tarfile.TarError, zipfile.BadZipFile, ValueError) as e: + except (tarfile.TarError, zipfile.BadZipFile, ValueError, tarzip.ArchiveMemberLimitExceededError) as e: raise ExtractionError(f"Failed to read archive: {e}", {"archive_path": archive_path}) from e return total_extracted, extracted_paths @@ -73,8 +82,8 @@ def extract( def total_size(tgz_path: str, chunk_size: int = 4096) -> int: with tarzip.open_archive(tgz_path) as archive: match archive.specific(): - case tarfile.TarFile() as tf: - total_size = _size_tar(tf, chunk_size) + case tarfile.TarFile(): + total_size = _size_tar(archive, chunk_size) case zipfile.ZipFile(): total_size = _size_zip(archive, chunk_size) @@ -82,9 +91,9 @@ def total_size(tgz_path: str, chunk_size: int = 4096) -> int: return total_size -def _archive_extract_member( # noqa: C901 - tf: tarfile.TarFile, - member: tarfile.TarInfo, +def _tar_archive_extract_member( # noqa: C901 + archive: tarzip.Archive, + member: tarzip.TarMember, extract_dir: str, total_extracted: int, max_size: int, @@ -107,7 +116,7 @@ def _archive_extract_member( # noqa: C901 extracted_paths.append(member.name) # Check whether extraction would exceed the size limit - if member.isreg() and ((total_extracted + member.size) > max_size): + if member.isfile() and ((total_extracted + member.size) > max_size): raise ExtractionError( f"Extraction would exceed maximum size limit of {max_size} bytes", {"max_size": max_size, "current_size": total_extracted, "file_size": member.size}, @@ -119,32 +128,32 @@ def _archive_extract_member( # noqa: C901 if _safe_path(extract_dir, member.name) is None: log.warning(f"Skipping potentially unsafe path: {member.name}") return 0, extracted_paths - tf.extract(member, extract_dir, numeric_owner=True, filter="fully_trusted") + archive.extract_member(member, extract_dir, numeric_owner=True, tar_filter="fully_trusted") - elif member.isreg(): - extracted_size = _archive_extract_safe_process_file( - tf, member, extract_dir, total_extracted, max_size, chunk_size + elif member.isfile(): + extracted_size = _tar_archive_extract_safe_process_file( + archive, member, extract_dir, total_extracted, max_size, chunk_size ) total_extracted += extracted_size elif member.issym(): - _archive_extract_safe_process_symlink(member, extract_dir) + _tar_archive_extract_safe_process_symlink(member, extract_dir) elif member.islnk(): - _archive_extract_safe_process_hardlink(member, extract_dir) + _tar_archive_extract_safe_process_hardlink(member, extract_dir) return total_extracted, extracted_paths -def _archive_extract_safe_process_file( - tf: tarfile.TarFile, - member: tarfile.TarInfo, +def _tar_archive_extract_safe_process_file( + archive: tarzip.Archive, + member: tarzip.TarMember, extract_dir: str, total_extracted: int, max_size: int, chunk_size: int, ) -> int: - """Process a single file member during safe archive extraction.""" + """Process a single file member during safe tar archive extraction.""" target_path = _safe_path(extract_dir, member.name) if target_path is None: log.warning(f"Skipping potentially unsafe path: {member.name}") @@ -152,9 +161,9 @@ def _archive_extract_safe_process_file( os.makedirs(os.path.dirname(target_path), exist_ok=True) - source = tf.extractfile(member) + source = archive.extractfile(member) if source is None: - # Should not happen if member.isreg() is true + # Should not happen if member.isfile() is true log.warning(f"Could not extract file object for member: {member.name}") return 0 @@ -180,8 +189,8 @@ def _archive_extract_safe_process_file( return extracted_file_size -def _archive_extract_safe_process_hardlink(member: tarfile.TarInfo, extract_dir: str) -> None: - """Safely create a hard link from the TarInfo entry.""" +def _tar_archive_extract_safe_process_hardlink(member: tarzip.TarMember, extract_dir: str) -> None: + """Safely create a hard link from the TarMember entry.""" target_path = _safe_path(extract_dir, member.name) if target_path is None: log.warning(f"Skipping potentially unsafe hard link path: {member.name}") @@ -203,8 +212,8 @@ def _archive_extract_safe_process_hardlink(member: tarfile.TarInfo, extract_dir: log.warning(f"Failed to create hard link {target_path} -> {source_path}: {e}") -def _archive_extract_safe_process_symlink(member: tarfile.TarInfo, extract_dir: str) -> None: - """Safely create a symbolic link from the TarInfo entry.""" +def _tar_archive_extract_safe_process_symlink(member: tarzip.TarMember, extract_dir: str) -> None: + """Safely create a symbolic link from the TarMember entry.""" target_path = _safe_path(extract_dir, member.name) if target_path is None: log.warning(f"Skipping potentially unsafe symlink path: {member.name}") @@ -242,12 +251,14 @@ def _safe_path(base_dir: str, *paths: str) -> str | None: return None -def _size_tar(tf: tarfile.TarFile, chunk_size: int) -> int: +def _size_tar(archive: tarzip.Archive, chunk_size: int) -> int: total_size = 0 - for member in tf: + for member in archive: + if not isinstance(member, tarzip.TarMember): + continue total_size += member.size if member.isfile(): - fileobj = tf.extractfile(member) + fileobj = archive.extractfile(member) if fileobj is not None: while fileobj.read(chunk_size): pass diff --git a/atr/tarzip.py b/atr/tarzip.py index 391b1cb..7ad6a66 100644 --- a/atr/tarzip.py +++ b/atr/tarzip.py @@ -19,9 +19,16 @@ import contextlib import tarfile import zipfile from collections.abc import Generator, Iterator -from typing import IO, TypeVar +from typing import IO, Final, Literal, TypeVar from typing import Protocol as TypingProtocol +MAX_ARCHIVE_MEMBERS: Final[int] = 100_000 + + +class ArchiveMemberLimitExceededError(Exception): + pass + + ArchiveT = TypeVar("ArchiveT", tarfile.TarFile, zipfile.ZipFile) # If you set this covariant=True, then mypy says an "invariant one is expected" # But pyright warns, "should be covariant" if you don't @@ -96,19 +103,32 @@ type Member = TarMember | ZipMember class ArchiveContext[ArchiveT: (tarfile.TarFile, zipfile.ZipFile)]: _archive_obj: ArchiveT + _max_members: int - def __init__(self, archive_obj: ArchiveT): + def __init__(self, archive_obj: ArchiveT, max_members: int = MAX_ARCHIVE_MEMBERS): self._archive_obj = archive_obj + self._max_members = max_members def __iter__(self) -> Iterator[TarMember | ZipMember]: + count = 0 match self._archive_obj: case tarfile.TarFile() as tf: for member_orig in tf: if member_orig.isdev(): continue + count += 1 + if count > self._max_members: + raise ArchiveMemberLimitExceededError( + f"Archive contains too many members: exceeded limit of {self._max_members}" + ) yield TarMember(member_orig) case zipfile.ZipFile() as zf: for member_orig in zf.infolist(): + count += 1 + if count > self._max_members: + raise ArchiveMemberLimitExceededError( + f"Archive contains too many members: exceeded limit of {self._max_members}" + ) yield ZipMember(member_orig) def extractfile(self, member_wrapper: Member) -> IO[bytes] | None: @@ -125,6 +145,34 @@ class ArchiveContext[ArchiveT: (tarfile.TarFile, zipfile.ZipFile)]: except (KeyError, AttributeError, Exception): return None + def extract_member( + self, + member_wrapper: Member, + path: str, + *, + numeric_owner: bool = True, + tar_filter: Literal["fully_trusted", "tar", "data"] | None = "fully_trusted", + ) -> None: + """Extract a single member to the specified path. + + For tar archives, this uses the tarfile extract method with the specified filter. + For zip archives, this is a no-op as zip extraction is handled differently + (directories are created via os.makedirs in the calling code). + """ + match self._archive_obj: + case tarfile.TarFile() as tf: + if not isinstance(member_wrapper, TarMember): + raise TypeError("Archive is TarFile, but member_wrapper is not TarMember") + tf.extract( + member_wrapper._original_info, + path, + numeric_owner=numeric_owner, + filter=tar_filter, + ) + case zipfile.ZipFile(): + # Zip extraction is handled differently in the calling code + pass + def specific(self) -> tarfile.TarFile | zipfile.ZipFile: return self._archive_obj @@ -135,7 +183,22 @@ type Archive = TarArchive | ZipArchive @contextlib.contextmanager -def open_archive(archive_path: str) -> Generator[Archive]: +def open_archive( + archive_path: str, + max_members: int = MAX_ARCHIVE_MEMBERS, +) -> Generator[Archive]: + """Open an archive file (tar or zip) and yield an ArchiveContext. + + Args: + archive_path: Path to the archive file. + max_members: Maximum number of members allowed in the archive. + Defaults to MAX_ARCHIVE_MEMBERS (100,000). + Set to 0 or negative to disable the limit. + + Raises: + ValueError: If the archive format is unsupported or corrupted. + ArchiveMemberLimitExceededError: If the archive exceeds max_members during iteration. + """ archive_file: tarfile.TarFile | zipfile.ZipFile | None = None try: try: @@ -148,9 +211,9 @@ def open_archive(archive_path: str) -> Generator[Archive]: match archive_file: case tarfile.TarFile() as tf_concrete: - yield ArchiveContext[tarfile.TarFile](tf_concrete) + yield ArchiveContext[tarfile.TarFile](tf_concrete, max_members) case zipfile.ZipFile() as zf_concrete: - yield ArchiveContext[zipfile.ZipFile](zf_concrete) + yield ArchiveContext[zipfile.ZipFile](zf_concrete, max_members) finally: if archive_file: diff --git a/atr/tasks/checks/targz.py b/atr/tasks/checks/targz.py index c3302e9..18ae3f4 100644 --- a/atr/tasks/checks/targz.py +++ b/atr/tasks/checks/targz.py @@ -16,12 +16,12 @@ # under the License. import asyncio -import tarfile from typing import Final import atr.archives as archives import atr.log as log import atr.models.results as results +import atr.tarzip as tarzip import atr.tasks.checks as checks @@ -52,8 +52,8 @@ def root_directory(tgz_path: str) -> str: """Find the root directory in a tar archive and validate that it has only one root dir.""" root = None - with tarfile.open(tgz_path, mode="r|gz") as tf: - for member in tf: + with tarzip.open_archive(tgz_path) as archive: + for member in archive: if member.name and member.name.split("/")[-1].startswith("._"): # Metadata convention continue diff --git a/atr/tasks/checks/zipformat.py b/atr/tasks/checks/zipformat.py index cdce89e..d69138f 100644 --- a/atr/tasks/checks/zipformat.py +++ b/atr/tasks/checks/zipformat.py @@ -22,6 +22,7 @@ from typing import Any import atr.log as log import atr.models.results as results +import atr.tarzip as tarzip import atr.tasks.checks as checks import atr.util as util @@ -76,11 +77,13 @@ async def structure(args: checks.FunctionArguments) -> results.Results | None: def _integrity_check_core_logic(artifact_path: str) -> dict[str, Any]: """Verify that a zip file can be opened and its members listed.""" try: - with zipfile.ZipFile(artifact_path, "r") as zf: + with tarzip.open_archive(artifact_path) as archive: # This is a simple check using list members # We can use zf.testzip() for CRC checks if needed, though this will be slower - member_list = zf.namelist() - return {"member_count": len(member_list)} + member_count = sum(1 for _ in archive) + return {"member_count": member_count} + except tarzip.ArchiveMemberLimitExceededError as e: + return {"error": f"Archive has too many members: {e}"} except zipfile.BadZipFile as e: return {"error": f"Bad zip file: {e}"} except FileNotFoundError: @@ -92,8 +95,8 @@ def _integrity_check_core_logic(artifact_path: str) -> dict[str, Any]: def _structure_check_core_logic(artifact_path: str) -> dict[str, Any]: """Verify the internal structure of the zip archive.""" try: - with zipfile.ZipFile(artifact_path, "r") as zf: - members = zf.namelist() + with tarzip.open_archive(artifact_path) as archive: + members: list[tarzip.Member] = list(archive) if not members: return {"error": "Archive is empty"} @@ -107,9 +110,10 @@ def _structure_check_core_logic(artifact_path: str) -> dict[str, Any]: # name_part = "-".join(name_part.split("-")[:-1]) expected_root = name_part - root_dirs, non_rooted_files = _structure_check_core_logic_find_roots(zf, members) + root_dirs, non_rooted_files = _structure_check_core_logic_find_roots(members) + member_names = [m.name for m in members] actual_root, error_msg = _structure_check_core_logic_validate_root( - members, root_dirs, non_rooted_files, expected_root + member_names, root_dirs, non_rooted_files, expected_root ) if error_msg: @@ -121,6 +125,8 @@ def _structure_check_core_logic(artifact_path: str) -> dict[str, Any]: return {"root_dir": actual_root} return {"error": "Unknown structure validation error"} + except tarzip.ArchiveMemberLimitExceededError as e: + return {"error": f"Archive has too many members: {e}"} except zipfile.BadZipFile as e: return {"error": f"Bad zip file: {e}"} except FileNotFoundError: @@ -129,15 +135,15 @@ def _structure_check_core_logic(artifact_path: str) -> dict[str, Any]: return {"error": f"Unexpected error: {e}"} -def _structure_check_core_logic_find_roots(zf: zipfile.ZipFile, members: list[str]) -> tuple[set[str], list[str]]: +def _structure_check_core_logic_find_roots(members: list[tarzip.Member]) -> tuple[set[str], list[str]]: """Identify root directories and non-rooted files in a zip archive.""" root_dirs: set[str] = set() non_rooted_files: list[str] = [] for member in members: - if "/" in member: - root_dirs.add(member.split("/", 1)[0]) - elif not zipfile.Path(zf, member).is_dir(): - non_rooted_files.append(member) + if "/" in member.name: + root_dirs.add(member.name.split("/", 1)[0]) + elif not member.isdir(): + non_rooted_files.append(member.name) return root_dirs, non_rooted_files diff --git a/atr/util.py b/atr/util.py index 4b83f96..4095440 100644 --- a/atr/util.py +++ b/atr/util.py @@ -50,6 +50,7 @@ import atr.ldap as ldap import atr.log as log import atr.models.sql as sql import atr.registry as registry +import atr.tarzip as tarzip import atr.user as user T = TypeVar("T") @@ -95,26 +96,16 @@ async def archive_listing(file_path: pathlib.Path) -> list[str] | None: return None with contextlib.suppress(Exception): - if file_path.name.endswith((".tar.gz", ".tgz")): + if file_path.name.endswith((".tar.gz", ".tgz", ".zip")): - def _read_tar() -> list[str] | None: - with contextlib.suppress(tarfile.ReadError, EOFError, ValueError, OSError): - with tarfile.open(file_path, mode="r:*") as tf: + def _read_archive() -> list[str] | None: + with contextlib.suppress(tarfile.ReadError, zipfile.BadZipFile, EOFError, ValueError, OSError): + with tarzip.open_archive(str(file_path)) as archive: # TODO: Skip metadata files - return sorted(tf.getnames()) + return sorted(member.name for member in archive) return None - return await asyncio.to_thread(_read_tar) - - elif file_path.name.endswith(".zip"): - - def _read_zip() -> list[str] | None: - with contextlib.suppress(zipfile.BadZipFile, EOFError, ValueError, OSError): - with zipfile.ZipFile(file_path, "r") as zf: - return sorted(zf.namelist()) - return None - - return await asyncio.to_thread(_read_zip) + return await asyncio.to_thread(_read_archive) return None --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
