kevinjqliu commented on code in PR #2951:
URL: https://github.com/apache/iceberg-python/pull/2951#discussion_r2725929380


##########
pyiceberg/manifest.py:
##########
@@ -892,15 +891,52 @@ def __hash__(self) -> int:
         return hash(self.manifest_path)
 
 
-# Global cache for manifest lists
-_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = 
LRUCache(maxsize=128)
+# Global cache for individual ManifestFile objects, keyed by manifest_path.
+# This avoids duplicating ManifestFile objects when multiple manifest lists
+# share the same manifests (which is common after appends).
+_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512)
+
+# Lock for thread-safe cache access
+_manifest_cache_lock = threading.RLock()
 
 
-@cached(cache=_manifest_cache, key=lambda io, manifest_list: 
hashkey(manifest_list), lock=threading.RLock())
 def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
-    """Read and cache manifests from the given manifest list, returning a 
tuple to prevent modification."""
+    """Read manifests from the given manifest list, caching individual 
ManifestFile objects.
+
+    Unlike caching entire manifest lists, this approach caches individual 
ManifestFile
+    objects by their manifest_path. This is more memory-efficient because:
+    - ManifestList1 contains: (ManifestFile1)
+    - ManifestList2 contains: (ManifestFile1, ManifestFile2)
+    - ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3)
+
+    With per-ManifestFile caching, ManifestFile1 is stored only once and 
reused,
+    instead of being duplicated in each manifest list's cached tuple.
+
+    Args:
+        io: The FileIO to read the manifest list.
+        manifest_list: The path to the manifest list file.
+
+    Returns:
+        A tuple of ManifestFile objects (tuple to prevent modification).
+    """
+    # Read manifest list outside the lock to avoid blocking other threads 
during I/O
     file = io.new_input(manifest_list)
-    return tuple(read_manifest_list(file))
+    manifest_files = list(read_manifest_list(file))
+

Review Comment:
   this is intentional



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to