This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new a89924de feat: add snapshot cached manifests (#444)
a89924de is described below

commit a89924de07963d35271ee243c505cf8301e75ffb
Author: Junwang Zhao <[email protected]>
AuthorDate: Tue Dec 30 13:52:00 2025 +0800

    feat: add snapshot cached manifests (#444)
---
 src/iceberg/manifest/manifest_list.h    |  3 +-
 src/iceberg/manifest/manifest_writer.cc |  1 +
 src/iceberg/snapshot.cc                 | 61 +++++++++++++++++++++++++++++++++
 src/iceberg/snapshot.h                  | 55 +++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/iceberg/manifest/manifest_list.h 
b/src/iceberg/manifest/manifest_list.h
index 47a7ad48..da70fb69 100644
--- a/src/iceberg/manifest/manifest_list.h
+++ b/src/iceberg/manifest/manifest_list.h
@@ -31,7 +31,6 @@
 #include "iceberg/partition_spec.h"
 #include "iceberg/result.h"
 #include "iceberg/schema_field.h"
-#include "iceberg/snapshot.h"
 #include "iceberg/table_metadata.h"
 #include "iceberg/type.h"
 
@@ -107,7 +106,7 @@ struct ICEBERG_EXPORT ManifestFile {
   int64_t min_sequence_number = TableMetadata::kInitialSequenceNumber;
   /// Field id: 503
   /// ID of the snapshot where the manifest file was added
-  int64_t added_snapshot_id = Snapshot::kInvalidSnapshotId;
+  int64_t added_snapshot_id = -1;  // Snapshot::kInvalidSnapshotId
   /// Field id: 504
   /// Number of entries in the manifest that have status ADDED (1), when null 
this is
   /// assumed to be non-zero
diff --git a/src/iceberg/manifest/manifest_writer.cc 
b/src/iceberg/manifest/manifest_writer.cc
index e3d2564a..52ad807e 100644
--- a/src/iceberg/manifest/manifest_writer.cc
+++ b/src/iceberg/manifest/manifest_writer.cc
@@ -29,6 +29,7 @@
 #include "iceberg/partition_summary_internal.h"
 #include "iceberg/result.h"
 #include "iceberg/schema.h"
+#include "iceberg/snapshot.h"
 #include "iceberg/table_metadata.h"
 #include "iceberg/util/macros.h"
 
diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc
index fb994f8b..f421e838 100644
--- a/src/iceberg/snapshot.cc
+++ b/src/iceberg/snapshot.cc
@@ -19,6 +19,11 @@
 
 #include "iceberg/snapshot.h"
 
+#include "iceberg/file_io.h"
+#include "iceberg/manifest/manifest_list.h"
+#include "iceberg/manifest/manifest_reader.h"
+#include "iceberg/util/macros.h"
+
 namespace iceberg {
 
 bool SnapshotRef::Branch::Equals(const SnapshotRef::Branch& other) const {
@@ -80,4 +85,60 @@ bool Snapshot::Equals(const Snapshot& other) const {
          schema_id == other.schema_id;
 }
 
+Result<CachedSnapshot::ManifestsCache> CachedSnapshot::InitManifestsCache(
+    const Snapshot& snapshot, std::shared_ptr<FileIO> file_io) {
+  if (file_io == nullptr) {
+    return InvalidArgument("Cannot cache manifests: FileIO is null");
+  }
+
+  // Read manifest list
+  ICEBERG_ASSIGN_OR_RAISE(auto reader,
+                          ManifestListReader::Make(snapshot.manifest_list, 
file_io));
+  ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, reader->Files());
+
+  std::vector<ManifestFile> manifests;
+  manifests.reserve(manifest_files.size());
+
+  // Partition manifests: data manifests first, then delete manifests
+  // First pass: collect data manifests
+  for (const auto& manifest_file : manifest_files) {
+    if (manifest_file.content == ManifestContent::kData) {
+      manifests.push_back(manifest_file);
+    }
+  }
+  size_t data_manifests_count = manifests.size();
+
+  // Second pass: append delete manifests
+  for (const auto& manifest_file : manifest_files) {
+    if (manifest_file.content == ManifestContent::kDeletes) {
+      manifests.push_back(manifest_file);
+    }
+  }
+
+  return std::make_pair(std::move(manifests), data_manifests_count);
+}
+
+Result<std::span<ManifestFile>> CachedSnapshot::Manifests(
+    std::shared_ptr<FileIO> file_io) const {
+  ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, 
file_io));
+  auto& cache = cache_ref.get();
+  return std::span<ManifestFile>(cache.first.data(), cache.first.size());
+}
+
+Result<std::span<ManifestFile>> CachedSnapshot::DataManifests(
+    std::shared_ptr<FileIO> file_io) const {
+  ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, 
file_io));
+  auto& cache = cache_ref.get();
+  return std::span<ManifestFile>(cache.first.data(), cache.second);
+}
+
+Result<std::span<ManifestFile>> CachedSnapshot::DeleteManifests(
+    std::shared_ptr<FileIO> file_io) const {
+  ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_, 
file_io));
+  auto& cache = cache_ref.get();
+  const size_t delete_start = cache.second;
+  const size_t delete_count = cache.first.size() - delete_start;
+  return std::span<ManifestFile>(cache.first.data() + delete_start, 
delete_count);
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h
index 5afe2d22..a047c76b 100644
--- a/src/iceberg/snapshot.h
+++ b/src/iceberg/snapshot.h
@@ -19,7 +19,9 @@
 
 #pragma once
 
+#include <memory>
 #include <optional>
+#include <span>
 #include <string>
 #include <string_view>
 #include <unordered_map>
@@ -27,7 +29,10 @@
 #include <variant>
 
 #include "iceberg/iceberg_export.h"
+#include "iceberg/manifest/manifest_list.h"
 #include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+#include "iceberg/util/lazy.h"
 #include "iceberg/util/timepoint.h"
 
 namespace iceberg {
@@ -260,4 +265,54 @@ struct ICEBERG_EXPORT Snapshot {
   bool Equals(const Snapshot& other) const;
 };
 
+/// \brief A snapshot with cached manifest loading capabilities.
+///
+/// This class wraps a Snapshot reference and provides lazy-loading of 
manifests.
+class ICEBERG_EXPORT CachedSnapshot {
+ public:
+  explicit CachedSnapshot(const Snapshot& snapshot) : snapshot_(snapshot) {}
+
+  /// \brief Get the underlying Snapshot reference
+  const Snapshot& snapshot() const { return snapshot_; }
+
+  /// \brief Returns all ManifestFile instances for either data or delete 
manifests
+  /// in this snapshot.
+  ///
+  /// \param file_io The FileIO instance to use for reading the manifest list
+  /// \return A span of ManifestFile instances, or an error
+  Result<std::span<ManifestFile>> Manifests(std::shared_ptr<FileIO> file_io) 
const;
+
+  /// \brief Returns a ManifestFile for each data manifest in this snapshot.
+  ///
+  /// \param file_io The FileIO instance to use for reading the manifest list
+  /// \return A span of ManifestFile instances, or an error
+  Result<std::span<ManifestFile>> DataManifests(std::shared_ptr<FileIO> 
file_io) const;
+
+  /// \brief Returns a ManifestFile for each delete manifest in this snapshot.
+  ///
+  /// \param file_io The FileIO instance to use for reading the manifest list
+  /// \return A span of ManifestFile instances, or an error
+  Result<std::span<ManifestFile>> DeleteManifests(std::shared_ptr<FileIO> 
file_io) const;
+
+ private:
+  /// \brief Cache structure for storing loaded manifests
+  ///
+  /// \note Manifests are stored in a single vector with data manifests at the 
head
+  /// and delete manifests at the tail, separated by the number of data 
manifests.
+  using ManifestsCache = std::pair<std::vector<ManifestFile>, size_t>;
+
+  /// \brief Initialize manifests cache by loading them from the manifest list 
file.
+  /// \param snapshot The snapshot to initialize the manifests cache for
+  /// \param file_io The FileIO instance to use for reading the manifest list
+  /// \return A result containing the manifests cache
+  static Result<ManifestsCache> InitManifestsCache(const Snapshot& snapshot,
+                                                   std::shared_ptr<FileIO> 
file_io);
+
+  /// The underlying snapshot data
+  const Snapshot& snapshot_;
+
+  /// Lazy-loaded manifests cache
+  Lazy<InitManifestsCache> manifests_cache_;
+};
+
 }  // namespace iceberg

Reply via email to