geruh commented on code in PR #2951:
URL: https://github.com/apache/iceberg-python/pull/2951#discussion_r2725866181
##########
pyiceberg/manifest.py:
##########
@@ -892,15 +891,49 @@ def __hash__(self) -> int:
return hash(self.manifest_path)
-# Global cache for manifest lists
-_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] =
LRUCache(maxsize=128)
+# Global cache for individual ManifestFile objects, keyed by manifest_path.
+# This avoids duplicating ManifestFile objects when multiple manifest lists
+# share the same manifests (which is common after appends).
+_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512)
+
+# Lock for thread-safe cache access
+_manifest_cache_lock = threading.RLock()
-@cached(cache=_manifest_cache, key=lambda io, manifest_list:
hashkey(manifest_list), lock=threading.RLock())
def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
- """Read and cache manifests from the given manifest list, returning a
tuple to prevent modification."""
+ """Read manifests from the given manifest list, caching individual
ManifestFile objects.
+
+ Unlike caching entire manifest lists, this approach caches individual
ManifestFile
+ objects by their manifest_path. This is more memory-efficient because:
+ - ManifestList1 contains: (ManifestFile1)
+ - ManifestList2 contains: (ManifestFile1, ManifestFile2)
+ - ManifestList3 contains: (ManifestFile1, ManifestFile2, ManifestFile3)
+
+ With per-ManifestFile caching, ManifestFile1 is stored only once and
reused,
+ instead of being duplicated in each manifest list's cached tuple.
+
+ Args:
+ io: The FileIO to read the manifest list.
+ manifest_list: The path to the manifest list file.
+
+ Returns:
+ A tuple of ManifestFile objects (tuple to prevent modification).
+ """
file = io.new_input(manifest_list)
- return tuple(read_manifest_list(file))
+ result = []
+
+ for manifest_file in read_manifest_list(file):
+ manifest_path = manifest_file.manifest_path
+ with _manifest_cache_lock:
Review Comment:
Now that the lock is moved out of the decorator, should we acquire the lock
once around the entire loop instead of each iteration?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]