This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 80f7408f refactor: separate lazy-initialized fields into dedicated
cache classes (#477)
80f7408f is described below
commit 80f7408f25707cb738bbfa622c74f7d8fa311798
Author: Junwang Zhao <[email protected]>
AuthorDate: Fri Jan 2 17:13:29 2026 +0800
refactor: separate lazy-initialized fields into dedicated cache classes
(#477)
- Split Schema lazy initialized fields into SchemaCache class
- Rename CachedSnapshot to SnapshotCache
This refactoring improves consistency with TableMetadataCache
---
src/iceberg/schema.cc | 140 ++++++++++++++++++++++++++++--------------------
src/iceberg/schema.h | 61 +++++++++++++++------
src/iceberg/snapshot.cc | 12 ++---
src/iceberg/snapshot.h | 12 ++---
4 files changed, 138 insertions(+), 87 deletions(-)
diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index 1cb263c4..00905378 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -36,7 +36,9 @@
namespace iceberg {
Schema::Schema(std::vector<SchemaField> fields, int32_t schema_id)
- : StructType(std::move(fields)), schema_id_(schema_id) {}
+ : StructType(std::move(fields)),
+ schema_id_(schema_id),
+ cache_(std::make_unique<SchemaCache>(this)) {}
Result<std::unique_ptr<Schema>> Schema::Make(std::vector<SchemaField> fields,
int32_t schema_id,
@@ -156,14 +158,14 @@ bool Schema::Equals(const Schema& other) const {
Result<std::optional<std::reference_wrapper<const SchemaField>>>
Schema::FindFieldByName(
std::string_view name, bool case_sensitive) const {
if (case_sensitive) {
- ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
+ ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, cache_->GetNameIdMap());
auto it = name_id_map.get().name_to_id.find(name);
if (it == name_id_map.get().name_to_id.end()) {
return std::nullopt;
};
return FindFieldById(it->second);
}
- ICEBERG_ASSIGN_OR_RAISE(auto lowercase_name_to_id,
lowercase_name_to_id_.Get(*this));
+ ICEBERG_ASSIGN_OR_RAISE(auto lowercase_name_to_id,
cache_->GetLowercaseNameToIdMap());
auto it = lowercase_name_to_id.get().find(StringUtils::ToLower(name));
if (it == lowercase_name_to_id.get().end()) {
return std::nullopt;
@@ -171,39 +173,9 @@ Result<std::optional<std::reference_wrapper<const
SchemaField>>> Schema::FindFie
return FindFieldById(it->second);
}
-Result<std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>>
-Schema::InitIdToFieldMap(const Schema& self) {
- std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
id_to_field;
- IdToFieldVisitor visitor(id_to_field);
- ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor));
- return id_to_field;
-}
-
-Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
- NameIdMap name_id_map;
- NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
- /*case_sensitive=*/true);
- ICEBERG_RETURN_UNEXPECTED(
- VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
- visitor.Finish();
- return name_id_map;
-}
-
-Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
-Schema::InitLowerCaseNameToIdMap(const Schema& self) {
- std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
- lowercase_name_to_id;
- NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
- /*case_sensitive=*/false);
- ICEBERG_RETURN_UNEXPECTED(
- VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
- visitor.Finish();
- return lowercase_name_to_id;
-}
-
Result<std::optional<std::reference_wrapper<const SchemaField>>>
Schema::FindFieldById(
int32_t field_id) const {
- ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, id_to_field_.Get(*this));
+ ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, cache_->GetIdToFieldMap());
auto it = id_to_field.get().find(field_id);
if (it == id_to_field.get().end()) {
return std::nullopt;
@@ -213,7 +185,7 @@ Result<std::optional<std::reference_wrapper<const
SchemaField>>> Schema::FindFie
Result<std::optional<std::string_view>> Schema::FindColumnNameById(
int32_t field_id) const {
- ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
+ ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, cache_->GetNameIdMap());
auto it = name_id_map.get().id_to_name.find(field_id);
if (it == name_id_map.get().id_to_name.end()) {
return std::nullopt;
@@ -221,30 +193,9 @@ Result<std::optional<std::string_view>>
Schema::FindColumnNameById(
return it->second;
}
-Result<std::unordered_map<int32_t, std::vector<size_t>>>
Schema::InitIdToPositionPath(
- const Schema& self) {
- PositionPathVisitor visitor;
- ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &visitor));
- return visitor.Finish();
-}
-
-Result<int32_t> Schema::InitHighestFieldId(const Schema& self) {
- ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, self.id_to_field_.Get(self));
-
- if (id_to_field.get().empty()) {
- return kInitialColumnId;
- }
-
- auto max_it = std::ranges::max_element(
- id_to_field.get(),
- [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
-
- return max_it->first;
-}
-
Result<std::unique_ptr<StructLikeAccessor>> Schema::GetAccessorById(
int32_t field_id) const {
- ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path,
id_to_position_path_.Get(*this));
+ ICEBERG_ASSIGN_OR_RAISE(auto id_to_position_path,
cache_->GetIdToPositionPathMap());
if (auto it = id_to_position_path.get().find(field_id);
it != id_to_position_path.get().cend()) {
ICEBERG_ASSIGN_OR_RAISE(auto field, FindFieldById(field_id));
@@ -322,7 +273,7 @@ Result<std::vector<std::string>>
Schema::IdentifierFieldNames() const {
return names;
}
-Result<int32_t> Schema::HighestFieldId() const { return
highest_field_id_.Get(*this); }
+Result<int32_t> Schema::HighestFieldId() const { return
cache_->GetHighestFieldId(); }
bool Schema::SameSchema(const Schema& other) const {
return fields_ == other.fields_ && identifier_field_ids_ ==
other.identifier_field_ids_;
@@ -330,7 +281,7 @@ bool Schema::SameSchema(const Schema& other) const {
Status Schema::Validate(int32_t format_version) const {
// Get all fields including nested ones
- ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, id_to_field_.Get(*this));
+ ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, cache_->GetIdToFieldMap());
// Check each field's type and defaults
for (const auto& [field_id, field_ref] : id_to_field.get()) {
@@ -351,4 +302,75 @@ Status Schema::Validate(int32_t format_version) const {
return {};
}
+Result<SchemaCache::IdToFieldMapRef> SchemaCache::GetIdToFieldMap() const {
+ return id_to_field_.Get(schema_);
+}
+
+Result<SchemaCache::NameIdMapRef> SchemaCache::GetNameIdMap() const {
+ return name_id_map_.Get(schema_);
+}
+
+Result<SchemaCache::LowercaseNameToIdMapRef>
SchemaCache::GetLowercaseNameToIdMap()
+ const {
+ return lowercase_name_to_id_.Get(schema_);
+}
+
+Result<SchemaCache::IdToPositionPathMapRef>
SchemaCache::GetIdToPositionPathMap() const {
+ return id_to_position_path_.Get(schema_);
+}
+
+Result<int32_t> SchemaCache::GetHighestFieldId() const {
+ return highest_field_id_.Get(schema_);
+}
+
+Result<SchemaCache::IdToFieldMap> SchemaCache::InitIdToFieldMap(const Schema*
schema) {
+ std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
id_to_field;
+ IdToFieldVisitor visitor(id_to_field);
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*schema, &visitor));
+ return id_to_field;
+}
+
+Result<SchemaCache::NameIdMap> SchemaCache::InitNameIdMap(const Schema*
schema) {
+ NameIdMap name_id_map;
+ NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
+ /*case_sensitive=*/true);
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*schema, &visitor, /*path=*/"", /*short_path=*/""));
+ visitor.Finish();
+ return name_id_map;
+}
+
+Result<SchemaCache::LowercaseNameToIdMap>
SchemaCache::InitLowerCaseNameToIdMap(
+ const Schema* schema) {
+ std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
+ lowercase_name_to_id;
+ NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
+ /*case_sensitive=*/false);
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*schema, &visitor, /*path=*/"", /*short_path=*/""));
+ visitor.Finish();
+ return lowercase_name_to_id;
+}
+
+Result<SchemaCache::IdToPositionPathMap> SchemaCache::InitIdToPositionPath(
+ const Schema* schema) {
+ PositionPathVisitor visitor;
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*schema, &visitor));
+ return visitor.Finish();
+}
+
+Result<int32_t> SchemaCache::InitHighestFieldId(const Schema* schema) {
+ ICEBERG_ASSIGN_OR_RAISE(auto id_to_field, InitIdToFieldMap(schema));
+
+ if (id_to_field.empty()) {
+ return Schema::kInitialColumnId;
+ }
+
+ auto max_it = std::ranges::max_element(
+ id_to_field,
+ [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
+
+ return max_it->first;
+}
+
} // namespace iceberg
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index 1445a793..bcaccaa1 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -26,6 +26,7 @@
#include <cstdint>
#include <optional>
#include <string>
+#include <unordered_map>
#include <unordered_set>
#include <vector>
@@ -38,6 +39,8 @@
namespace iceberg {
+class SchemaCache;
+
/// \brief A schema for a Table.
///
/// A schema is a list of typed columns, along with a unique integer ID. A
@@ -187,6 +190,22 @@ class ICEBERG_EXPORT Schema : public StructType {
/// \brief Compare two schemas for equality.
bool Equals(const Schema& other) const;
+ const int32_t schema_id_;
+ // Field IDs that uniquely identify rows in the table.
+ std::vector<int32_t> identifier_field_ids_;
+ // Cache for schema mappings to facilitate fast lookups.
+ std::unique_ptr<SchemaCache> cache_;
+};
+
+// Cache for schema mappings to facilitate fast lookups.
+class ICEBERG_EXPORT SchemaCache {
+ public:
+ explicit SchemaCache(const Schema* schema) : schema_(schema) {}
+
+ using IdToFieldMap =
+ std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>;
+ using IdToFieldMapRef = std::reference_wrapper<const IdToFieldMap>;
+
struct NameIdMap {
/// \brief Mapping from canonical field name to ID
///
@@ -201,28 +220,38 @@ class ICEBERG_EXPORT Schema : public StructType {
/// 'list.element.field' instead of 'list.field'.
std::unordered_map<int32_t, std::string> id_to_name;
};
+ using NameIdMapRef = std::reference_wrapper<const NameIdMap>;
- static Result<std::unordered_map<int32_t, std::reference_wrapper<const
SchemaField>>>
- InitIdToFieldMap(const Schema&);
- static Result<NameIdMap> InitNameIdMap(const Schema&);
- static Result<std::unordered_map<std::string, int32_t, StringHash,
std::equal_to<>>>
- InitLowerCaseNameToIdMap(const Schema&);
- static Result<std::unordered_map<int32_t, std::vector<size_t>>>
InitIdToPositionPath(
- const Schema&);
- static Result<int32_t> InitHighestFieldId(const Schema&);
+ using LowercaseNameToIdMap =
+ std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>;
+ using LowercaseNameToIdMapRef = std::reference_wrapper<const
LowercaseNameToIdMap>;
- const int32_t schema_id_;
- /// Field IDs that uniquely identify rows in the table.
- std::vector<int32_t> identifier_field_ids_;
- /// Mapping from field id to field.
+ using IdToPositionPathMap = std::unordered_map<int32_t, std::vector<size_t>>;
+ using IdToPositionPathMapRef = std::reference_wrapper<const
IdToPositionPathMap>;
+
+ Result<IdToFieldMapRef> GetIdToFieldMap() const;
+ Result<NameIdMapRef> GetNameIdMap() const;
+ Result<LowercaseNameToIdMapRef> GetLowercaseNameToIdMap() const;
+ Result<IdToPositionPathMapRef> GetIdToPositionPathMap() const;
+ Result<int32_t> GetHighestFieldId() const;
+
+ private:
+ static Result<IdToFieldMap> InitIdToFieldMap(const Schema* schema);
+ static Result<NameIdMap> InitNameIdMap(const Schema* schema);
+ static Result<LowercaseNameToIdMap> InitLowerCaseNameToIdMap(const Schema*
schema);
+ static Result<IdToPositionPathMap> InitIdToPositionPath(const Schema*
schema);
+ static Result<int32_t> InitHighestFieldId(const Schema* schema);
+
+ const Schema* schema_;
+ // Mapping from field id to field.
Lazy<InitIdToFieldMap> id_to_field_;
- /// Mapping from field name to field id.
+ // Mapping from field name to field id.
Lazy<InitNameIdMap> name_id_map_;
- /// Mapping from lowercased field name to field id
+ // Mapping from lowercased field name to field id.
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
- /// Mapping from field id to (nested) position path to access the field.
+ // Mapping from field id to (nested) position path to access the field.
Lazy<InitIdToPositionPath> id_to_position_path_;
- /// Highest field ID in the schema.
+ // Highest field ID in the schema.
Lazy<InitHighestFieldId> highest_field_id_;
};
diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc
index f421e838..5b6aabc1 100644
--- a/src/iceberg/snapshot.cc
+++ b/src/iceberg/snapshot.cc
@@ -85,15 +85,15 @@ bool Snapshot::Equals(const Snapshot& other) const {
schema_id == other.schema_id;
}
-Result<CachedSnapshot::ManifestsCache> CachedSnapshot::InitManifestsCache(
- const Snapshot& snapshot, std::shared_ptr<FileIO> file_io) {
+Result<SnapshotCache::ManifestsCache> SnapshotCache::InitManifestsCache(
+ const Snapshot* snapshot, std::shared_ptr<FileIO> file_io) {
if (file_io == nullptr) {
return InvalidArgument("Cannot cache manifests: FileIO is null");
}
// Read manifest list
ICEBERG_ASSIGN_OR_RAISE(auto reader,
- ManifestListReader::Make(snapshot.manifest_list,
file_io));
+ ManifestListReader::Make(snapshot->manifest_list,
file_io));
ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, reader->Files());
std::vector<ManifestFile> manifests;
@@ -118,21 +118,21 @@ Result<CachedSnapshot::ManifestsCache>
CachedSnapshot::InitManifestsCache(
return std::make_pair(std::move(manifests), data_manifests_count);
}
-Result<std::span<ManifestFile>> CachedSnapshot::Manifests(
+Result<std::span<ManifestFile>> SnapshotCache::Manifests(
std::shared_ptr<FileIO> file_io) const {
ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_,
file_io));
auto& cache = cache_ref.get();
return std::span<ManifestFile>(cache.first.data(), cache.first.size());
}
-Result<std::span<ManifestFile>> CachedSnapshot::DataManifests(
+Result<std::span<ManifestFile>> SnapshotCache::DataManifests(
std::shared_ptr<FileIO> file_io) const {
ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_,
file_io));
auto& cache = cache_ref.get();
return std::span<ManifestFile>(cache.first.data(), cache.second);
}
-Result<std::span<ManifestFile>> CachedSnapshot::DeleteManifests(
+Result<std::span<ManifestFile>> SnapshotCache::DeleteManifests(
std::shared_ptr<FileIO> file_io) const {
ICEBERG_ASSIGN_OR_RAISE(auto cache_ref, manifests_cache_.Get(snapshot_,
file_io));
auto& cache = cache_ref.get();
diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h
index 3889607f..86a7fe20 100644
--- a/src/iceberg/snapshot.h
+++ b/src/iceberg/snapshot.h
@@ -265,13 +265,13 @@ struct ICEBERG_EXPORT Snapshot {
/// \brief A snapshot with cached manifest loading capabilities.
///
-/// This class wraps a Snapshot reference and provides lazy-loading of
manifests.
-class ICEBERG_EXPORT CachedSnapshot {
+/// This class wraps a Snapshot pointer and provides lazy-loading of manifests.
+class ICEBERG_EXPORT SnapshotCache {
public:
- explicit CachedSnapshot(const Snapshot& snapshot) : snapshot_(snapshot) {}
+ explicit SnapshotCache(const Snapshot* snapshot) : snapshot_(snapshot) {}
/// \brief Get the underlying Snapshot reference
- const Snapshot& snapshot() const { return snapshot_; }
+ const Snapshot& snapshot() const { return *snapshot_; }
/// \brief Returns all ManifestFile instances for either data or delete
manifests
/// in this snapshot.
@@ -303,11 +303,11 @@ class ICEBERG_EXPORT CachedSnapshot {
/// \param snapshot The snapshot to initialize the manifests cache for
/// \param file_io The FileIO instance to use for reading the manifest list
/// \return A result containing the manifests cache
- static Result<ManifestsCache> InitManifestsCache(const Snapshot& snapshot,
+ static Result<ManifestsCache> InitManifestsCache(const Snapshot* snapshot,
std::shared_ptr<FileIO>
file_io);
/// The underlying snapshot data
- const Snapshot& snapshot_;
+ const Snapshot* snapshot_;
/// Lazy-loaded manifests cache
Lazy<InitManifestsCache> manifests_cache_;