This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new a94cf21e feat: assign fresh field ids for new schema (#430)
a94cf21e is described below
commit a94cf21ef1dee2622d78529d1f3110eb3a71661d
Author: wzhuo <[email protected]>
AuthorDate: Wed Dec 24 11:42:42 2025 +0800
feat: assign fresh field ids for new schema (#430)
This PR mainly includes the following changes:
1. Modify NameToIdVisitor so that the visit method can populate both the
name_to_id and id_to_name maps simultaneously.
2. Add identifier_field_ids as a member of Schema to specify
identifier_fields.
3. Add AssignFreshIdVisitor to assign entirely new field IDs to a
Schema. This will be used later when creating a new table.
---
src/iceberg/schema.cc | 73 +++++++++--
src/iceberg/schema.h | 48 +++++++-
src/iceberg/test/CMakeLists.txt | 1 +
src/iceberg/test/assign_id_visitor_test.cc | 188 +++++++++++++++++++++++++++++
src/iceberg/test/meson.build | 1 +
src/iceberg/test/schema_test.cc | 55 ++++++++-
src/iceberg/util/type_util.cc | 78 +++++++++++-
src/iceberg/util/type_util.h | 29 ++++-
8 files changed, 452 insertions(+), 21 deletions(-)
diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index ca0d943f..414219f8 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -33,8 +33,28 @@
namespace iceberg {
-Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t>
schema_id)
- : StructType(std::move(fields)), schema_id_(schema_id) {}
+Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t>
schema_id,
+ std::vector<int32_t> identifier_field_ids)
+ : StructType(std::move(fields)),
+ schema_id_(schema_id),
+ identifier_field_ids_(std::move(identifier_field_ids)) {}
+
+Result<std::unique_ptr<Schema>> Schema::Make(
+ std::vector<SchemaField> fields, std::optional<int32_t> schema_id,
+ const std::vector<std::string>& identifier_field_names) {
+ auto schema = std::make_unique<Schema>(std::move(fields), schema_id);
+
+ std::vector<int32_t> fresh_identifier_ids;
+ for (const auto& name : identifier_field_names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto field, schema->FindFieldByName(name));
+ if (!field) {
+ return InvalidSchema("Cannot find identifier field: {}", name);
+ }
+ fresh_identifier_ids.push_back(field.value().get().field_id());
+ }
+ schema->identifier_field_ids_ = std::move(fresh_identifier_ids);
+ return schema;
+}
std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
@@ -48,15 +68,16 @@ std::string Schema::ToString() const {
}
bool Schema::Equals(const Schema& other) const {
- return schema_id_ == other.schema_id_ && fields_ == other.fields_;
+ return schema_id_ == other.schema_id_ && fields_ == other.fields_ &&
+ identifier_field_ids_ == other.identifier_field_ids_;
}
Result<std::optional<std::reference_wrapper<const SchemaField>>>
Schema::FindFieldByName(
std::string_view name, bool case_sensitive) const {
if (case_sensitive) {
- ICEBERG_ASSIGN_OR_RAISE(auto name_to_id, name_to_id_.Get(*this));
- auto it = name_to_id.get().find(name);
- if (it == name_to_id.get().end()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
+ auto it = name_id_map.get().name_to_id.find(name);
+ if (it == name_id_map.get().name_to_id.end()) {
return std::nullopt;
};
return FindFieldById(it->second);
@@ -77,21 +98,22 @@ Schema::InitIdToFieldMap(const Schema& self) {
return id_to_field;
}
-Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
-Schema::InitNameToIdMap(const Schema& self) {
- std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
name_to_id;
- NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
+Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
+ NameIdMap name_id_map;
+ NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
+ /*case_sensitive=*/true);
ICEBERG_RETURN_UNEXPECTED(
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
visitor.Finish();
- return name_to_id;
+ return name_id_map;
}
Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
Schema::InitLowerCaseNameToIdMap(const Schema& self) {
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
lowercase_name_to_id;
- NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
+ NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
+ /*case_sensitive=*/false);
ICEBERG_RETURN_UNEXPECTED(
VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
visitor.Finish();
@@ -108,6 +130,16 @@ Result<std::optional<std::reference_wrapper<const
SchemaField>>> Schema::FindFie
return it->second;
}
+Result<std::optional<std::string_view>> Schema::FindColumnNameById(
+ int32_t field_id) const {
+ ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
+ auto it = name_id_map.get().id_to_name.find(field_id);
+ if (it == name_id_map.get().id_to_name.end()) {
+ return std::nullopt;
+ }
+ return it->second;
+}
+
Result<std::unordered_map<int32_t, std::vector<size_t>>>
Schema::InitIdToPositionPath(
const Schema& self) {
PositionPathVisitor visitor;
@@ -179,4 +211,21 @@ Result<std::unique_ptr<Schema>> Schema::Project(
std::nullopt);
}
+const std::vector<int32_t>& Schema::IdentifierFieldIds() const {
+ return identifier_field_ids_;
+}
+
+Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
+ std::vector<std::string> names;
+ names.reserve(identifier_field_ids_.size());
+ for (auto id : identifier_field_ids_) {
+ ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id));
+ if (!name.has_value()) {
+ return InvalidSchema("Cannot find the field of the specified field id:
{}", id);
+ }
+ names.emplace_back(name.value());
+ }
+ return names;
+}
+
} // namespace iceberg
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index f6c459d8..bb983962 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -49,7 +49,18 @@ class ICEBERG_EXPORT Schema : public StructType {
static constexpr int32_t kInvalidColumnId = -1;
explicit Schema(std::vector<SchemaField> fields,
- std::optional<int32_t> schema_id = std::nullopt);
+ std::optional<int32_t> schema_id = std::nullopt,
+ std::vector<int32_t> identifier_field_ids = {});
+
+ /// \brief Create a schema.
+ ///
+ /// \param fields The fields that make up the schema.
+ /// \param schema_id The unique identifier for this schema (default:
std::nullopt).
+ /// \param identifier_field_names Canonical names of fields that uniquely
identify rows
+ /// in the table (default: empty). \return A new Schema instance or Status
if failed.
+ static Result<std::unique_ptr<Schema>> Make(
+ std::vector<SchemaField> fields, std::optional<int32_t> schema_id =
std::nullopt,
+ const std::vector<std::string>& identifier_field_names = {});
/// \brief Get the schema ID.
///
@@ -78,6 +89,13 @@ class ICEBERG_EXPORT Schema : public StructType {
Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
int32_t field_id) const;
+ /// \brief Returns the canonical field name for the given id.
+ ///
+ /// \param field_id The id of the field to get the canonical name for.
+ /// \return The canonical column name of the field with the given id, or
std::nullopt if
+ /// not found.
+ Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id)
const;
+
/// \brief Get the accessor to access the field by field id.
///
/// \param field_id The id of the field to get the accessor for.
@@ -103,26 +121,48 @@ class ICEBERG_EXPORT Schema : public StructType {
Result<std::unique_ptr<Schema>> Project(
const std::unordered_set<int32_t>& field_ids) const;
+ /// \brief Return the field IDs of the identifier fields.
+ const std::vector<int32_t>& IdentifierFieldIds() const;
+
+ /// \brief Return the canonical field names of the identifier fields.
+ Result<std::vector<std::string>> IdentifierFieldNames() const;
+
friend bool operator==(const Schema& lhs, const Schema& rhs) { return
lhs.Equals(rhs); }
private:
/// \brief Compare two schemas for equality.
bool Equals(const Schema& other) const;
+ struct NameIdMap {
+ /// \brief Mapping from canonical field name to ID
+ ///
+ /// \note Short names for maps and lists are included for any name that
does not
+ /// conflict with a canonical name. For example, a list, 'l', of structs
with field
+ /// 'x' will produce short name 'l.x' in addition to canonical name
'l.element.x'.
+ std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
name_to_id;
+
+ /// \brief Mapping from field ID to canonical name
+ ///
+ /// \note Only canonical names are set, not short names; for example
+ /// 'list.element.field' instead of 'list.field'.
+ std::unordered_map<int32_t, std::string> id_to_name;
+ };
+
static Result<std::unordered_map<int32_t, std::reference_wrapper<const
SchemaField>>>
InitIdToFieldMap(const Schema&);
- static Result<std::unordered_map<std::string, int32_t, StringHash,
std::equal_to<>>>
- InitNameToIdMap(const Schema&);
+ static Result<NameIdMap> InitNameIdMap(const Schema&);
static Result<std::unordered_map<std::string, int32_t, StringHash,
std::equal_to<>>>
InitLowerCaseNameToIdMap(const Schema&);
static Result<std::unordered_map<int32_t, std::vector<size_t>>>
InitIdToPositionPath(
const Schema&);
const std::optional<int32_t> schema_id_;
+ /// Field IDs that uniquely identify rows in the table.
+ std::vector<int32_t> identifier_field_ids_;
/// Mapping from field id to field.
Lazy<InitIdToFieldMap> id_to_field_;
/// Mapping from field name to field id.
- Lazy<InitNameToIdMap> name_to_id_;
+ Lazy<InitNameIdMap> name_id_map_;
/// Mapping from lowercased field name to field id
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
/// Mapping from field id to (nested) position path to access the field.
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 28178b88..fef63efe 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -54,6 +54,7 @@ endfunction()
add_iceberg_test(schema_test
SOURCES
+ assign_id_visitor_test.cc
name_mapping_test.cc
partition_field_test.cc
partition_spec_test.cc
diff --git a/src/iceberg/test/assign_id_visitor_test.cc
b/src/iceberg/test/assign_id_visitor_test.cc
new file mode 100644
index 00000000..f9290d7f
--- /dev/null
+++ b/src/iceberg/test/assign_id_visitor_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/schema.h"
+#include "iceberg/schema_field.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+#include "iceberg/util/type_util.h"
+
+namespace iceberg {
+
+namespace {
+
+Schema CreateFlatSchema() {
+ return Schema({
+ SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/20, "name", iceberg::string()),
+ SchemaField::MakeOptional(/*field_id=*/30, "age", iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/40, "data", iceberg::float64()),
+ });
+}
+
+std::shared_ptr<Type> CreateListOfStruct() {
+ return std::make_shared<ListType>(SchemaField::MakeOptional(
+ /*field_id=*/101, "element",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeOptional(/*field_id=*/102, "x", iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/103, "y", iceberg::string()),
+ })));
+}
+
+std::shared_ptr<Type> CreateMapWithStructValue() {
+ return std::make_shared<MapType>(
+ SchemaField::MakeRequired(/*field_id=*/201, "key", iceberg::string()),
+ SchemaField::MakeRequired(
+ /*field_id=*/202, "value",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeRequired(/*field_id=*/203, "id",
iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/204, "name",
iceberg::string()),
+ })));
+}
+
+std::shared_ptr<Type> CreateNestedStruct() {
+ return std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeRequired(/*field_id=*/301, "outer_id",
iceberg::int64()),
+ SchemaField::MakeRequired(
+ /*field_id=*/302, "nested",
+ std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeOptional(/*field_id=*/303, "inner_id",
iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/304, "inner_name",
+ iceberg::string()),
+ })),
+ });
+}
+
+Schema CreateNestedSchema(std::vector<int32_t> identifier_field_ids = {}) {
+ return Schema(
+ {
+ SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/20, "list",
CreateListOfStruct()),
+ SchemaField::MakeOptional(/*field_id=*/30, "map",
CreateMapWithStructValue()),
+ SchemaField::MakeRequired(/*field_id=*/40, "struct",
CreateNestedStruct()),
+ },
+ Schema::kInitialSchemaId, std::move(identifier_field_ids));
+}
+
+} // namespace
+
+TEST(AssignFreshIdVisitorTest, FlatSchema) {
+ Schema schema = CreateFlatSchema();
+
+ std::atomic<int32_t> id = 0;
+ auto next_id = [&id]() { return ++id; };
+ ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
+ AssignFreshIds(Schema::kInitialSchemaId, schema,
next_id));
+
+ ASSERT_EQ(fresh_schema->fields().size(), schema.fields().size());
+ EXPECT_EQ(Schema(
+ {
+ SchemaField::MakeRequired(/*field_id=*/1, "id",
iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/2, "name",
iceberg::string()),
+ SchemaField::MakeOptional(/*field_id=*/3, "age",
iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/4, "data",
iceberg::float64()),
+ },
+ Schema::kInitialSchemaId),
+ *fresh_schema);
+}
+
+TEST(AssignFreshIdVisitorTest, NestedSchema) {
+ Schema schema = CreateNestedSchema();
+ std::atomic<int32_t> id = 0;
+ auto next_id = [&id]() { return ++id; };
+ ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
+ AssignFreshIds(Schema::kInitialSchemaId, schema,
next_id));
+
+ ASSERT_EQ(4, fresh_schema->fields().size());
+ for (int32_t i = 0; i < fresh_schema->fields().size(); ++i) {
+ EXPECT_EQ(i + 1, fresh_schema->fields()[i].field_id());
+ }
+
+ auto list_field = fresh_schema->fields()[1];
+ auto list_type = std::dynamic_pointer_cast<ListType>(list_field.type());
+ ASSERT_TRUE(list_type);
+ auto list_element_field = list_type->fields()[0];
+ EXPECT_EQ(5, list_element_field.field_id());
+ auto list_element_type =
+ std::dynamic_pointer_cast<StructType>(list_element_field.type());
+ ASSERT_TRUE(list_element_type);
+ EXPECT_EQ(StructType(std::vector<SchemaField>{
+ SchemaField::MakeOptional(/*field_id=*/6, "x",
iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/7, "y",
iceberg::string()),
+ }),
+ *list_element_type);
+
+ auto map_field = fresh_schema->fields()[2];
+ auto map_type = std::dynamic_pointer_cast<MapType>(map_field.type());
+ ASSERT_TRUE(map_type);
+ EXPECT_EQ(8, map_type->fields()[0].field_id());
+ auto map_value_field = map_type->fields()[1];
+ EXPECT_EQ(9, map_value_field.field_id());
+ auto map_value_type =
std::dynamic_pointer_cast<StructType>(map_value_field.type());
+ ASSERT_TRUE(map_value_type);
+ EXPECT_EQ(StructType(std::vector<SchemaField>{
+ SchemaField::MakeRequired(/*field_id=*/10, "id",
iceberg::int64()),
+ SchemaField::MakeOptional(/*field_id=*/11, "name",
iceberg::string()),
+ }),
+ *map_value_type);
+
+ auto struct_field = fresh_schema->fields()[3];
+ auto struct_type =
std::dynamic_pointer_cast<StructType>(struct_field.type());
+ ASSERT_TRUE(struct_type);
+
+ auto expect_nested_struct_type =
std::make_shared<StructType>(std::vector<SchemaField>{
+ SchemaField::MakeOptional(/*field_id=*/14, "inner_id", iceberg::int32()),
+ SchemaField::MakeRequired(/*field_id=*/15, "inner_name",
iceberg::string()),
+ });
+ EXPECT_EQ(StructType(std::vector<SchemaField>{
+ SchemaField::MakeRequired(/*field_id=*/12, "outer_id",
iceberg::int64()),
+ SchemaField::MakeRequired(
+ /*field_id=*/13, "nested", expect_nested_struct_type)}),
+ *struct_type);
+
+ auto nested_struct_field = struct_type->fields()[1];
+ auto nested_struct_type =
+ std::dynamic_pointer_cast<StructType>(nested_struct_field.type());
+ ASSERT_TRUE(nested_struct_type);
+ EXPECT_EQ(*expect_nested_struct_type, *nested_struct_type);
+}
+
+TEST(AssignFreshIdVisitorTest, RefreshIdentifierId) {
+ std::atomic<int32_t> id = 0;
+ auto next_id = [&id]() { return ++id; };
+
+ Schema invalid_schema = CreateNestedSchema({10, 400});
+ // Invalid identifier field id
+ auto result = AssignFreshIds(Schema::kInitialSchemaId, invalid_schema,
next_id);
+ EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema));
+ EXPECT_THAT(result, HasErrorMessage("Cannot find"));
+
+ id = 0;
+ Schema schema = CreateNestedSchema({10, 301});
+ ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
+ AssignFreshIds(Schema::kInitialSchemaId, schema,
next_id));
+ EXPECT_THAT(fresh_schema->IdentifierFieldIds(), testing::ElementsAre(1, 12));
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index fcd397b9..37818281 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -30,6 +30,7 @@ configure_file(
iceberg_tests = {
'schema_test': {
'sources': files(
+ 'assign_id_visitor_test.cc',
'name_mapping_test.cc',
'partition_field_test.cc',
'partition_spec_test.cc',
diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc
index 89a8d54b..ff6bf060 100644
--- a/src/iceberg/test/schema_test.cc
+++ b/src/iceberg/test/schema_test.cc
@@ -70,6 +70,21 @@ TEST(SchemaTest, Basics) {
ASSERT_THAT(result,
iceberg::HasErrorMessage("Invalid index -1 to get field from
struct"));
ASSERT_EQ(std::nullopt, schema.GetFieldByName("element"));
+ ASSERT_EQ(0, schema.IdentifierFieldIds().size());
+ auto identifier_field_names = schema.IdentifierFieldNames();
+ ASSERT_THAT(identifier_field_names, iceberg::IsOk());
+ ASSERT_THAT(identifier_field_names.value(), ::testing::IsEmpty());
+ }
+
+ {
+ // identifier fields not empty
+ iceberg::SchemaField field1(5, "foo", iceberg::int32(), true);
+ iceberg::SchemaField field2(7, "bar", iceberg::string(), true);
+ iceberg::Schema schema({field1, field2}, 100, {5, 7});
+ ASSERT_THAT(schema.IdentifierFieldIds(), testing::ElementsAre(5, 7));
+ auto result = schema.IdentifierFieldNames();
+ ASSERT_THAT(result, iceberg::IsOk());
+ ASSERT_THAT(result.value(), testing::ElementsAre("foo", "bar"));
}
}
@@ -82,6 +97,9 @@ TEST(SchemaTest, Equality) {
iceberg::Schema schema3({field1}, 101);
iceberg::Schema schema4({field3, field2}, 101);
iceberg::Schema schema5({field1, field2}, 100);
+ iceberg::Schema schema6({field1, field2}, 100, {5});
+ iceberg::Schema schema7({field1, field2}, 100, {5});
+ iceberg::Schema schema8({field1, field2}, 100, {7});
ASSERT_EQ(schema1, schema1);
ASSERT_NE(schema1, schema2);
@@ -92,6 +110,10 @@ TEST(SchemaTest, Equality) {
ASSERT_NE(schema4, schema1);
ASSERT_EQ(schema1, schema5);
ASSERT_EQ(schema5, schema1);
+
+ ASSERT_NE(schema5, schema6);
+ ASSERT_EQ(schema6, schema7);
+ ASSERT_NE(schema6, schema8);
}
class BasicShortNameTest : public ::testing::Test {
@@ -215,8 +237,8 @@ class ComplexShortNameTest : public ::testing::Test {
field9_ = std::make_unique<iceberg::SchemaField>(9, "Map", maptype, false);
- schema_ =
-
std::make_unique<iceberg::Schema>(std::vector<iceberg::SchemaField>{*field9_},
1);
+ schema_ = std::make_unique<iceberg::Schema>(
+ std::vector<iceberg::SchemaField>{*field9_}, 1,
std::vector<int32_t>{1, 2});
}
std::unique_ptr<iceberg::Schema> schema_;
@@ -245,6 +267,27 @@ TEST_F(ComplexShortNameTest, TestFindById) {
ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt));
}
+TEST_F(ComplexShortNameTest, TestFindColumnNameById) {
+ ASSERT_THAT(schema_->FindColumnNameById(0),
::testing::Optional(std::nullopt));
+ ASSERT_THAT(schema_->FindColumnNameById(1),
+
::testing::Optional(std::string("Map.value.Second_child.element.Foo")));
+ ASSERT_THAT(schema_->FindColumnNameById(2),
+
::testing::Optional(std::string("Map.value.Second_child.element.Bar")));
+ ASSERT_THAT(schema_->FindColumnNameById(3),
+
::testing::Optional(std::string("Map.value.Second_child.element.Foobar")));
+ ASSERT_THAT(schema_->FindColumnNameById(4),
+
::testing::Optional(std::string("Map.value.Second_child.element")));
+ ASSERT_THAT(schema_->FindColumnNameById(5),
+ ::testing::Optional(std::string("Map.value.First_child")));
+ ASSERT_THAT(schema_->FindColumnNameById(6),
+ ::testing::Optional(std::string("Map.value.Second_child")));
+ ASSERT_THAT(schema_->FindColumnNameById(7),
+ ::testing::Optional(std::string("Map.key")));
+ ASSERT_THAT(schema_->FindColumnNameById(8),
+ ::testing::Optional(std::string("Map.value")));
+ ASSERT_THAT(schema_->FindColumnNameById(9),
::testing::Optional(std::string("Map")));
+}
+
TEST_F(ComplexShortNameTest, TestFindByName) {
ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_));
ASSERT_THAT(schema_->FindFieldByName("Map.value"),
::testing::Optional(*field8_));
@@ -315,6 +358,14 @@ TEST_F(ComplexShortNameTest,
TestFindByShortNameCaseInsensitive) {
::testing::Optional(std::nullopt));
}
+TEST_F(ComplexShortNameTest, TestIdentifierFieldNames) {
+ auto result = schema_->IdentifierFieldNames();
+ ASSERT_THAT(result, iceberg::IsOk());
+ ASSERT_THAT(result.value(),
+ ::testing::ElementsAre("Map.value.Second_child.element.Foo",
+ "Map.value.Second_child.element.Bar"));
+}
+
class ComplexMapStructShortNameTest : public ::testing::Test {
protected:
void SetUp() override {
diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc
index 016397f0..a6cfd645 100644
--- a/src/iceberg/util/type_util.cc
+++ b/src/iceberg/util/type_util.cc
@@ -22,6 +22,7 @@
#include <stack>
#include "iceberg/result.h"
+#include "iceberg/schema.h"
#include "iceberg/util/checked_cast.h"
#include "iceberg/util/formatter_internal.h"
#include "iceberg/util/string_util.h"
@@ -50,9 +51,11 @@ Status IdToFieldVisitor::Visit(const NestedType& type) {
NameToIdVisitor::NameToIdVisitor(
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>&
name_to_id,
- bool case_sensitive, std::function<std::string(std::string_view)>
quoting_func)
+ std::unordered_map<int32_t, std::string>* id_to_name, bool case_sensitive,
+ std::function<std::string(std::string_view)> quoting_func)
: case_sensitive_(case_sensitive),
name_to_id_(name_to_id),
+ id_to_name_(id_to_name),
quoting_func_(std::move(quoting_func)) {}
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
@@ -140,6 +143,11 @@ std::string NameToIdVisitor::BuildPath(std::string_view
prefix,
}
void NameToIdVisitor::Finish() {
+ if (id_to_name_) {
+ for (auto& [name, id] : name_to_id_) {
+ id_to_name_->try_emplace(id, name);
+ }
+ }
for (auto&& it : short_name_to_id_) {
name_to_id_.try_emplace(it.first, it.second);
}
@@ -294,4 +302,72 @@ std::unordered_map<int32_t, int32_t> IndexParents(const
StructType& root_struct)
return id_to_parent;
}
+AssignFreshIdVisitor::AssignFreshIdVisitor(std::function<int32_t()> next_id)
+ : next_id_(std::move(next_id)) {}
+
+std::shared_ptr<Type> AssignFreshIdVisitor::Visit(
+ const std::shared_ptr<Type>& type) const {
+ switch (type->type_id()) {
+ case TypeId::kStruct:
+ return Visit(*internal::checked_pointer_cast<StructType>(type));
+ case TypeId::kMap:
+ return Visit(*internal::checked_pointer_cast<MapType>(type));
+ case TypeId::kList:
+ return Visit(*internal::checked_pointer_cast<ListType>(type));
+ default:
+ return type;
+ }
+}
+
+std::shared_ptr<StructType> AssignFreshIdVisitor::Visit(const StructType&
type) const {
+ auto fresh_ids =
+ type.fields() |
+ std::views::transform([&](const auto& /* unused */) { return next_id_();
}) |
+ std::ranges::to<std::vector<int32_t>>();
+ std::vector<SchemaField> fresh_fields;
+ for (size_t i = 0; i < type.fields().size(); ++i) {
+ const auto& field = type.fields()[i];
+ fresh_fields.emplace_back(fresh_ids[i], std::string(field.name()),
+ Visit(field.type()), field.optional(),
+ std::string(field.doc()));
+ }
+ return std::make_shared<StructType>(std::move(fresh_fields));
+}
+
+std::shared_ptr<ListType> AssignFreshIdVisitor::Visit(const ListType& type)
const {
+ const auto& elem_field = type.fields()[0];
+ int32_t fresh_id = next_id_();
+ SchemaField fresh_elem_field(fresh_id, std::string(elem_field.name()),
+ Visit(elem_field.type()), elem_field.optional(),
+ std::string(elem_field.doc()));
+ return std::make_shared<ListType>(std::move(fresh_elem_field));
+}
+
+std::shared_ptr<MapType> AssignFreshIdVisitor::Visit(const MapType& type)
const {
+ const auto& key_field = type.fields()[0];
+ const auto& value_field = type.fields()[1];
+
+ int32_t fresh_key_id = next_id_();
+ int32_t fresh_value_id = next_id_();
+
+ SchemaField fresh_key_field(fresh_key_id, std::string(key_field.name()),
+ Visit(key_field.type()), key_field.optional(),
+ std::string(key_field.doc()));
+ SchemaField fresh_value_field(fresh_value_id,
std::string(value_field.name()),
+ Visit(value_field.type()),
value_field.optional(),
+ std::string(value_field.doc()));
+ return std::make_shared<MapType>(std::move(fresh_key_field),
+ std::move(fresh_value_field));
+}
+
+Result<std::shared_ptr<Schema>> AssignFreshIds(int32_t schema_id, const
Schema& schema,
+ std::function<int32_t()>
next_id) {
+ auto fresh_type = AssignFreshIdVisitor(std::move(next_id))
+ .Visit(internal::checked_cast<const
StructType&>(schema));
+ std::vector<SchemaField> fields =
+ fresh_type->fields() | std::ranges::to<std::vector<SchemaField>>();
+ ICEBERG_ASSIGN_OR_RAISE(auto identifier_field_names,
schema.IdentifierFieldNames());
+ return Schema::Make(std::move(fields), schema_id, identifier_field_names);
+}
+
} // namespace iceberg
diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h
index 7cc274b0..959bdb9f 100644
--- a/src/iceberg/util/type_util.h
+++ b/src/iceberg/util/type_util.h
@@ -51,12 +51,13 @@ class IdToFieldVisitor {
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field_;
};
-/// \brief Visitor for building a map from field name to field ID.
+/// \brief Visitor for building maps from field name to field ID and field ID
to field
+/// name.
class NameToIdVisitor {
public:
explicit NameToIdVisitor(
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>&
name_to_id,
- bool case_sensitive = true,
+ std::unordered_map<int32_t, std::string>* id_to_name, bool
case_sensitive = true,
std::function<std::string(std::string_view)> quoting_func = {});
Status Visit(const ListType& type, const std::string& path,
const std::string& short_path);
@@ -75,6 +76,7 @@ class NameToIdVisitor {
private:
bool case_sensitive_;
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>&
name_to_id_;
+ std::unordered_map<int32_t, std::string>* id_to_name_;
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
short_name_to_id_;
std::function<std::string(std::string_view)> quoting_func_;
};
@@ -131,4 +133,27 @@ class PruneColumnVisitor {
ICEBERG_EXPORT std::unordered_map<int32_t, int32_t> IndexParents(
const StructType& root_struct);
+/// \brief Assigns fresh IDs to all fields in the schema.
+class AssignFreshIdVisitor {
+ public:
+ explicit AssignFreshIdVisitor(std::function<int32_t()> next_id);
+
+ std::shared_ptr<Type> Visit(const std::shared_ptr<Type>& type) const;
+ std::shared_ptr<StructType> Visit(const StructType& type) const;
+ std::shared_ptr<ListType> Visit(const ListType& type) const;
+ std::shared_ptr<MapType> Visit(const MapType& type) const;
+
+ private:
+ std::function<int32_t()> next_id_;
+};
+
+/// \brief Assigns fresh IDs to all fields in a schema.
+///
+/// \param schema_id An ID assigned to this schema
+/// \param schema The schema to assign IDs to.
+/// \param next_id An id assignment function, which returns the next ID to
assign.
+/// \return A schema with new ids assigned by the next_id function.
+ICEBERG_EXPORT Result<std::shared_ptr<Schema>> AssignFreshIds(
+ int32_t schema_id, const Schema& schema, std::function<int32_t()> next_id);
+
} // namespace iceberg