This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new a94cf21e feat: assign fresh field ids for new schema (#430)
a94cf21e is described below

commit a94cf21ef1dee2622d78529d1f3110eb3a71661d
Author: wzhuo <[email protected]>
AuthorDate: Wed Dec 24 11:42:42 2025 +0800

    feat: assign fresh field ids for new schema (#430)
    
    This PR mainly includes the following changes:
    
    1. Modify NameToIdVisitor so that the visit method can set both
    name_to_id and id_to_name maps simultaneously.
    2. Add identifier_field_ids as a member of Schema to specify
    identifier_fields.
    3. Add AssignFreshIdVisitor to assign entirely new field IDs to a
    Schema. This will be used later when creating a new table.
---
 src/iceberg/schema.cc                      |  73 +++++++++--
 src/iceberg/schema.h                       |  48 +++++++-
 src/iceberg/test/CMakeLists.txt            |   1 +
 src/iceberg/test/assign_id_visitor_test.cc | 188 +++++++++++++++++++++++++++++
 src/iceberg/test/meson.build               |   1 +
 src/iceberg/test/schema_test.cc            |  55 ++++++++-
 src/iceberg/util/type_util.cc              |  78 +++++++++++-
 src/iceberg/util/type_util.h               |  29 ++++-
 8 files changed, 452 insertions(+), 21 deletions(-)

diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index ca0d943f..414219f8 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -33,8 +33,28 @@
 
 namespace iceberg {
 
-Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> 
schema_id)
-    : StructType(std::move(fields)), schema_id_(schema_id) {}
+Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> 
schema_id,
+               std::vector<int32_t> identifier_field_ids)
+    : StructType(std::move(fields)),
+      schema_id_(schema_id),
+      identifier_field_ids_(std::move(identifier_field_ids)) {}
+
+Result<std::unique_ptr<Schema>> Schema::Make(
+    std::vector<SchemaField> fields, std::optional<int32_t> schema_id,
+    const std::vector<std::string>& identifier_field_names) {
+  auto schema = std::make_unique<Schema>(std::move(fields), schema_id);
+
+  std::vector<int32_t> fresh_identifier_ids;
+  for (const auto& name : identifier_field_names) {
+    ICEBERG_ASSIGN_OR_RAISE(auto field, schema->FindFieldByName(name));
+    if (!field) {
+      return InvalidSchema("Cannot find identifier field: {}", name);
+    }
+    fresh_identifier_ids.push_back(field.value().get().field_id());
+  }
+  schema->identifier_field_ids_ = std::move(fresh_identifier_ids);
+  return schema;
+}
 
 std::optional<int32_t> Schema::schema_id() const { return schema_id_; }
 
@@ -48,15 +68,16 @@ std::string Schema::ToString() const {
 }
 
 bool Schema::Equals(const Schema& other) const {
-  return schema_id_ == other.schema_id_ && fields_ == other.fields_;
+  return schema_id_ == other.schema_id_ && fields_ == other.fields_ &&
+         identifier_field_ids_ == other.identifier_field_ids_;
 }
 
 Result<std::optional<std::reference_wrapper<const SchemaField>>> 
Schema::FindFieldByName(
     std::string_view name, bool case_sensitive) const {
   if (case_sensitive) {
-    ICEBERG_ASSIGN_OR_RAISE(auto name_to_id, name_to_id_.Get(*this));
-    auto it = name_to_id.get().find(name);
-    if (it == name_to_id.get().end()) {
+    ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
+    auto it = name_id_map.get().name_to_id.find(name);
+    if (it == name_id_map.get().name_to_id.end()) {
       return std::nullopt;
     };
     return FindFieldById(it->second);
@@ -77,21 +98,22 @@ Schema::InitIdToFieldMap(const Schema& self) {
   return id_to_field;
 }
 
-Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
-Schema::InitNameToIdMap(const Schema& self) {
-  std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> 
name_to_id;
-  NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
+Result<Schema::NameIdMap> Schema::InitNameIdMap(const Schema& self) {
+  NameIdMap name_id_map;
+  NameToIdVisitor visitor(name_id_map.name_to_id, &name_id_map.id_to_name,
+                          /*case_sensitive=*/true);
   ICEBERG_RETURN_UNEXPECTED(
       VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
   visitor.Finish();
-  return name_to_id;
+  return name_id_map;
 }
 
 Result<std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>>
 Schema::InitLowerCaseNameToIdMap(const Schema& self) {
   std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
       lowercase_name_to_id;
-  NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
+  NameToIdVisitor visitor(lowercase_name_to_id, /*id_to_name=*/nullptr,
+                          /*case_sensitive=*/false);
   ICEBERG_RETURN_UNEXPECTED(
       VisitTypeInline(self, &visitor, /*path=*/"", /*short_path=*/""));
   visitor.Finish();
@@ -108,6 +130,16 @@ Result<std::optional<std::reference_wrapper<const 
SchemaField>>> Schema::FindFie
   return it->second;
 }
 
+Result<std::optional<std::string_view>> Schema::FindColumnNameById(
+    int32_t field_id) const {
+  ICEBERG_ASSIGN_OR_RAISE(auto name_id_map, name_id_map_.Get(*this));
+  auto it = name_id_map.get().id_to_name.find(field_id);
+  if (it == name_id_map.get().id_to_name.end()) {
+    return std::nullopt;
+  }
+  return it->second;
+}
+
 Result<std::unordered_map<int32_t, std::vector<size_t>>> 
Schema::InitIdToPositionPath(
     const Schema& self) {
   PositionPathVisitor visitor;
@@ -179,4 +211,21 @@ Result<std::unique_ptr<Schema>> Schema::Project(
                         std::nullopt);
 }
 
+const std::vector<int32_t>& Schema::IdentifierFieldIds() const {
+  return identifier_field_ids_;
+}
+
+Result<std::vector<std::string>> Schema::IdentifierFieldNames() const {
+  std::vector<std::string> names;
+  names.reserve(identifier_field_ids_.size());
+  for (auto id : identifier_field_ids_) {
+    ICEBERG_ASSIGN_OR_RAISE(auto name, FindColumnNameById(id));
+    if (!name.has_value()) {
+      return InvalidSchema("Cannot find the field of the specified field id: 
{}", id);
+    }
+    names.emplace_back(name.value());
+  }
+  return names;
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index f6c459d8..bb983962 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -49,7 +49,18 @@ class ICEBERG_EXPORT Schema : public StructType {
   static constexpr int32_t kInvalidColumnId = -1;
 
   explicit Schema(std::vector<SchemaField> fields,
-                  std::optional<int32_t> schema_id = std::nullopt);
+                  std::optional<int32_t> schema_id = std::nullopt,
+                  std::vector<int32_t> identifier_field_ids = {});
+
+  /// \brief Create a schema.
+  ///
+  /// \param fields The fields that make up the schema.
+  /// \param schema_id The unique identifier for this schema (default: std::nullopt).
+  /// \param identifier_field_names Canonical names of fields that uniquely 
identify rows
+  /// in the table (default: empty). \return A new Schema instance or Status 
if failed.
+  static Result<std::unique_ptr<Schema>> Make(
+      std::vector<SchemaField> fields, std::optional<int32_t> schema_id = 
std::nullopt,
+      const std::vector<std::string>& identifier_field_names = {});
 
   /// \brief Get the schema ID.
   ///
@@ -78,6 +89,13 @@ class ICEBERG_EXPORT Schema : public StructType {
   Result<std::optional<std::reference_wrapper<const SchemaField>>> 
FindFieldById(
       int32_t field_id) const;
 
+  /// \brief Returns the canonical field name for the given id.
+  ///
+  /// \param field_id The id of the field to get the canonical name for.
+  /// \return The canonical column name of the field with the given id, or std::nullopt if
+  /// not found.
+  Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) 
const;
+
   /// \brief Get the accessor to access the field by field id.
   ///
   /// \param field_id The id of the field to get the accessor for.
@@ -103,26 +121,48 @@ class ICEBERG_EXPORT Schema : public StructType {
   Result<std::unique_ptr<Schema>> Project(
       const std::unordered_set<int32_t>& field_ids) const;
 
+  /// \brief Return the field IDs of the identifier fields.
+  const std::vector<int32_t>& IdentifierFieldIds() const;
+
+  /// \brief Return the canonical field names of the identifier fields.
+  Result<std::vector<std::string>> IdentifierFieldNames() const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
  private:
   /// \brief Compare two schemas for equality.
   bool Equals(const Schema& other) const;
 
+  struct NameIdMap {
+    /// \brief Mapping from canonical field name to ID
+    ///
+    /// \note Short names for maps and lists are included for any name that 
does not
+    /// conflict with a canonical name. For example, a list, 'l', of structs 
with field
+    /// 'x' will produce short name 'l.x' in addition to canonical name 
'l.element.x'.
+    std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> 
name_to_id;
+
+    /// \brief Mapping from field ID to canonical name
+    ///
+    /// \note Canonical names, but not short names are set, for example
+    /// 'list.element.field' instead of 'list.field'.
+    std::unordered_map<int32_t, std::string> id_to_name;
+  };
+
   static Result<std::unordered_map<int32_t, std::reference_wrapper<const 
SchemaField>>>
   InitIdToFieldMap(const Schema&);
-  static Result<std::unordered_map<std::string, int32_t, StringHash, 
std::equal_to<>>>
-  InitNameToIdMap(const Schema&);
+  static Result<NameIdMap> InitNameIdMap(const Schema&);
   static Result<std::unordered_map<std::string, int32_t, StringHash, 
std::equal_to<>>>
   InitLowerCaseNameToIdMap(const Schema&);
   static Result<std::unordered_map<int32_t, std::vector<size_t>>> 
InitIdToPositionPath(
       const Schema&);
 
   const std::optional<int32_t> schema_id_;
+  /// Field IDs that uniquely identify rows in the table.
+  std::vector<int32_t> identifier_field_ids_;
   /// Mapping from field id to field.
   Lazy<InitIdToFieldMap> id_to_field_;
   /// Mapping from field name to field id.
-  Lazy<InitNameToIdMap> name_to_id_;
+  Lazy<InitNameIdMap> name_id_map_;
   /// Mapping from lowercased field name to field id
   Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
   /// Mapping from field id to (nested) position path to access the field.
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 28178b88..fef63efe 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -54,6 +54,7 @@ endfunction()
 
 add_iceberg_test(schema_test
                  SOURCES
+                 assign_id_visitor_test.cc
                  name_mapping_test.cc
                  partition_field_test.cc
                  partition_spec_test.cc
diff --git a/src/iceberg/test/assign_id_visitor_test.cc 
b/src/iceberg/test/assign_id_visitor_test.cc
new file mode 100644
index 00000000..f9290d7f
--- /dev/null
+++ b/src/iceberg/test/assign_id_visitor_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/schema.h"
+#include "iceberg/schema_field.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
+#include "iceberg/util/type_util.h"
+
+namespace iceberg {
+
+namespace {
+
+Schema CreateFlatSchema() {
+  return Schema({
+      SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
+      SchemaField::MakeOptional(/*field_id=*/20, "name", iceberg::string()),
+      SchemaField::MakeOptional(/*field_id=*/30, "age", iceberg::int32()),
+      SchemaField::MakeRequired(/*field_id=*/40, "data", iceberg::float64()),
+  });
+}
+
+std::shared_ptr<Type> CreateListOfStruct() {
+  return std::make_shared<ListType>(SchemaField::MakeOptional(
+      /*field_id=*/101, "element",
+      std::make_shared<StructType>(std::vector<SchemaField>{
+          SchemaField::MakeOptional(/*field_id=*/102, "x", iceberg::int32()),
+          SchemaField::MakeRequired(/*field_id=*/103, "y", iceberg::string()),
+      })));
+}
+
+std::shared_ptr<Type> CreateMapWithStructValue() {
+  return std::make_shared<MapType>(
+      SchemaField::MakeRequired(/*field_id=*/201, "key", iceberg::string()),
+      SchemaField::MakeRequired(
+          /*field_id=*/202, "value",
+          std::make_shared<StructType>(std::vector<SchemaField>{
+              SchemaField::MakeRequired(/*field_id=*/203, "id", 
iceberg::int64()),
+              SchemaField::MakeOptional(/*field_id=*/204, "name", 
iceberg::string()),
+          })));
+}
+
+std::shared_ptr<Type> CreateNestedStruct() {
+  return std::make_shared<StructType>(std::vector<SchemaField>{
+      SchemaField::MakeRequired(/*field_id=*/301, "outer_id", 
iceberg::int64()),
+      SchemaField::MakeRequired(
+          /*field_id=*/302, "nested",
+          std::make_shared<StructType>(std::vector<SchemaField>{
+              SchemaField::MakeOptional(/*field_id=*/303, "inner_id", 
iceberg::int32()),
+              SchemaField::MakeRequired(/*field_id=*/304, "inner_name",
+                                        iceberg::string()),
+          })),
+  });
+}
+
+Schema CreateNestedSchema(std::vector<int32_t> identifier_field_ids = {}) {
+  return Schema(
+      {
+          SchemaField::MakeRequired(/*field_id=*/10, "id", iceberg::int64()),
+          SchemaField::MakeOptional(/*field_id=*/20, "list", 
CreateListOfStruct()),
+          SchemaField::MakeOptional(/*field_id=*/30, "map", 
CreateMapWithStructValue()),
+          SchemaField::MakeRequired(/*field_id=*/40, "struct", 
CreateNestedStruct()),
+      },
+      Schema::kInitialSchemaId, std::move(identifier_field_ids));
+}
+
+}  // namespace
+
+TEST(AssignFreshIdVisitorTest, FlatSchema) {
+  Schema schema = CreateFlatSchema();
+
+  std::atomic<int32_t> id = 0;
+  auto next_id = [&id]() { return ++id; };
+  ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
+                         AssignFreshIds(Schema::kInitialSchemaId, schema, 
next_id));
+
+  ASSERT_EQ(fresh_schema->fields().size(), schema.fields().size());
+  EXPECT_EQ(Schema(
+                {
+                    SchemaField::MakeRequired(/*field_id=*/1, "id", 
iceberg::int64()),
+                    SchemaField::MakeOptional(/*field_id=*/2, "name", 
iceberg::string()),
+                    SchemaField::MakeOptional(/*field_id=*/3, "age", 
iceberg::int32()),
+                    SchemaField::MakeRequired(/*field_id=*/4, "data", 
iceberg::float64()),
+                },
+                Schema::kInitialSchemaId),
+            *fresh_schema);
+}
+
+TEST(AssignFreshIdVisitorTest, NestedSchema) {
+  Schema schema = CreateNestedSchema();
+  std::atomic<int32_t> id = 0;
+  auto next_id = [&id]() { return ++id; };
+  ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
+                         AssignFreshIds(Schema::kInitialSchemaId, schema, 
next_id));
+
+  ASSERT_EQ(4, fresh_schema->fields().size());
+  for (int32_t i = 0; i < fresh_schema->fields().size(); ++i) {
+    EXPECT_EQ(i + 1, fresh_schema->fields()[i].field_id());
+  }
+
+  auto list_field = fresh_schema->fields()[1];
+  auto list_type = std::dynamic_pointer_cast<ListType>(list_field.type());
+  ASSERT_TRUE(list_type);
+  auto list_element_field = list_type->fields()[0];
+  EXPECT_EQ(5, list_element_field.field_id());
+  auto list_element_type =
+      std::dynamic_pointer_cast<StructType>(list_element_field.type());
+  ASSERT_TRUE(list_element_type);
+  EXPECT_EQ(StructType(std::vector<SchemaField>{
+                SchemaField::MakeOptional(/*field_id=*/6, "x", 
iceberg::int32()),
+                SchemaField::MakeRequired(/*field_id=*/7, "y", 
iceberg::string()),
+            }),
+            *list_element_type);
+
+  auto map_field = fresh_schema->fields()[2];
+  auto map_type = std::dynamic_pointer_cast<MapType>(map_field.type());
+  ASSERT_TRUE(map_type);
+  EXPECT_EQ(8, map_type->fields()[0].field_id());
+  auto map_value_field = map_type->fields()[1];
+  EXPECT_EQ(9, map_value_field.field_id());
+  auto map_value_type = 
std::dynamic_pointer_cast<StructType>(map_value_field.type());
+  ASSERT_TRUE(map_value_type);
+  EXPECT_EQ(StructType(std::vector<SchemaField>{
+                SchemaField::MakeRequired(/*field_id=*/10, "id", 
iceberg::int64()),
+                SchemaField::MakeOptional(/*field_id=*/11, "name", 
iceberg::string()),
+            }),
+            *map_value_type);
+
+  auto struct_field = fresh_schema->fields()[3];
+  auto struct_type = 
std::dynamic_pointer_cast<StructType>(struct_field.type());
+  ASSERT_TRUE(struct_type);
+
+  auto expect_nested_struct_type = 
std::make_shared<StructType>(std::vector<SchemaField>{
+      SchemaField::MakeOptional(/*field_id=*/14, "inner_id", iceberg::int32()),
+      SchemaField::MakeRequired(/*field_id=*/15, "inner_name", 
iceberg::string()),
+  });
+  EXPECT_EQ(StructType(std::vector<SchemaField>{
+                SchemaField::MakeRequired(/*field_id=*/12, "outer_id", 
iceberg::int64()),
+                SchemaField::MakeRequired(
+                    /*field_id=*/13, "nested", expect_nested_struct_type)}),
+            *struct_type);
+
+  auto nested_struct_field = struct_type->fields()[1];
+  auto nested_struct_type =
+      std::dynamic_pointer_cast<StructType>(nested_struct_field.type());
+  ASSERT_TRUE(nested_struct_type);
+  EXPECT_EQ(*expect_nested_struct_type, *nested_struct_type);
+}
+
+TEST(AssignFreshIdVisitorTest, RefreshIdentifierId) {
+  std::atomic<int32_t> id = 0;
+  auto next_id = [&id]() { return ++id; };
+
+  Schema invalid_schema = CreateNestedSchema({10, 400});
+  // Invalid identifier field id
+  auto result = AssignFreshIds(Schema::kInitialSchemaId, invalid_schema, 
next_id);
+  EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema));
+  EXPECT_THAT(result, HasErrorMessage("Cannot find"));
+
+  id = 0;
+  Schema schema = CreateNestedSchema({10, 301});
+  ICEBERG_UNWRAP_OR_FAIL(auto fresh_schema,
+                         AssignFreshIds(Schema::kInitialSchemaId, schema, 
next_id));
+  EXPECT_THAT(fresh_schema->IdentifierFieldIds(), testing::ElementsAre(1, 12));
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index fcd397b9..37818281 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -30,6 +30,7 @@ configure_file(
 iceberg_tests = {
     'schema_test': {
         'sources': files(
+            'assign_id_visitor_test.cc',
             'name_mapping_test.cc',
             'partition_field_test.cc',
             'partition_spec_test.cc',
diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc
index 89a8d54b..ff6bf060 100644
--- a/src/iceberg/test/schema_test.cc
+++ b/src/iceberg/test/schema_test.cc
@@ -70,6 +70,21 @@ TEST(SchemaTest, Basics) {
     ASSERT_THAT(result,
                 iceberg::HasErrorMessage("Invalid index -1 to get field from 
struct"));
     ASSERT_EQ(std::nullopt, schema.GetFieldByName("element"));
+    ASSERT_EQ(0, schema.IdentifierFieldIds().size());
+    auto identifier_field_names = schema.IdentifierFieldNames();
+    ASSERT_THAT(identifier_field_names, iceberg::IsOk());
+    ASSERT_THAT(identifier_field_names.value(), ::testing::IsEmpty());
+  }
+
+  {
+    // identifier fields not empty
+    iceberg::SchemaField field1(5, "foo", iceberg::int32(), true);
+    iceberg::SchemaField field2(7, "bar", iceberg::string(), true);
+    iceberg::Schema schema({field1, field2}, 100, {5, 7});
+    ASSERT_THAT(schema.IdentifierFieldIds(), testing::ElementsAre(5, 7));
+    auto result = schema.IdentifierFieldNames();
+    ASSERT_THAT(result, iceberg::IsOk());
+    ASSERT_THAT(result.value(), testing::ElementsAre("foo", "bar"));
   }
 }
 
@@ -82,6 +97,9 @@ TEST(SchemaTest, Equality) {
   iceberg::Schema schema3({field1}, 101);
   iceberg::Schema schema4({field3, field2}, 101);
   iceberg::Schema schema5({field1, field2}, 100);
+  iceberg::Schema schema6({field1, field2}, 100, {5});
+  iceberg::Schema schema7({field1, field2}, 100, {5});
+  iceberg::Schema schema8({field1, field2}, 100, {7});
 
   ASSERT_EQ(schema1, schema1);
   ASSERT_NE(schema1, schema2);
@@ -92,6 +110,10 @@ TEST(SchemaTest, Equality) {
   ASSERT_NE(schema4, schema1);
   ASSERT_EQ(schema1, schema5);
   ASSERT_EQ(schema5, schema1);
+
+  ASSERT_NE(schema5, schema6);
+  ASSERT_EQ(schema6, schema7);
+  ASSERT_NE(schema6, schema8);
 }
 
 class BasicShortNameTest : public ::testing::Test {
@@ -215,8 +237,8 @@ class ComplexShortNameTest : public ::testing::Test {
 
     field9_ = std::make_unique<iceberg::SchemaField>(9, "Map", maptype, false);
 
-    schema_ =
-        
std::make_unique<iceberg::Schema>(std::vector<iceberg::SchemaField>{*field9_}, 
1);
+    schema_ = std::make_unique<iceberg::Schema>(
+        std::vector<iceberg::SchemaField>{*field9_}, 1, 
std::vector<int32_t>{1, 2});
   }
 
   std::unique_ptr<iceberg::Schema> schema_;
@@ -245,6 +267,27 @@ TEST_F(ComplexShortNameTest, TestFindById) {
   ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt));
 }
 
+TEST_F(ComplexShortNameTest, TestFindColumnNameById) {
+  ASSERT_THAT(schema_->FindColumnNameById(0), 
::testing::Optional(std::nullopt));
+  ASSERT_THAT(schema_->FindColumnNameById(1),
+              
::testing::Optional(std::string("Map.value.Second_child.element.Foo")));
+  ASSERT_THAT(schema_->FindColumnNameById(2),
+              
::testing::Optional(std::string("Map.value.Second_child.element.Bar")));
+  ASSERT_THAT(schema_->FindColumnNameById(3),
+              
::testing::Optional(std::string("Map.value.Second_child.element.Foobar")));
+  ASSERT_THAT(schema_->FindColumnNameById(4),
+              
::testing::Optional(std::string("Map.value.Second_child.element")));
+  ASSERT_THAT(schema_->FindColumnNameById(5),
+              ::testing::Optional(std::string("Map.value.First_child")));
+  ASSERT_THAT(schema_->FindColumnNameById(6),
+              ::testing::Optional(std::string("Map.value.Second_child")));
+  ASSERT_THAT(schema_->FindColumnNameById(7),
+              ::testing::Optional(std::string("Map.key")));
+  ASSERT_THAT(schema_->FindColumnNameById(8),
+              ::testing::Optional(std::string("Map.value")));
+  ASSERT_THAT(schema_->FindColumnNameById(9), 
::testing::Optional(std::string("Map")));
+}
+
 TEST_F(ComplexShortNameTest, TestFindByName) {
   ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_));
   ASSERT_THAT(schema_->FindFieldByName("Map.value"), 
::testing::Optional(*field8_));
@@ -315,6 +358,14 @@ TEST_F(ComplexShortNameTest, 
TestFindByShortNameCaseInsensitive) {
               ::testing::Optional(std::nullopt));
 }
 
+TEST_F(ComplexShortNameTest, TestIdentifierFieldNames) {
+  auto result = schema_->IdentifierFieldNames();
+  ASSERT_THAT(result, iceberg::IsOk());
+  ASSERT_THAT(result.value(),
+              ::testing::ElementsAre("Map.value.Second_child.element.Foo",
+                                     "Map.value.Second_child.element.Bar"));
+}
+
 class ComplexMapStructShortNameTest : public ::testing::Test {
  protected:
   void SetUp() override {
diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc
index 016397f0..a6cfd645 100644
--- a/src/iceberg/util/type_util.cc
+++ b/src/iceberg/util/type_util.cc
@@ -22,6 +22,7 @@
 #include <stack>
 
 #include "iceberg/result.h"
+#include "iceberg/schema.h"
 #include "iceberg/util/checked_cast.h"
 #include "iceberg/util/formatter_internal.h"
 #include "iceberg/util/string_util.h"
@@ -50,9 +51,11 @@ Status IdToFieldVisitor::Visit(const NestedType& type) {
 
 NameToIdVisitor::NameToIdVisitor(
     std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& 
name_to_id,
-    bool case_sensitive, std::function<std::string(std::string_view)> 
quoting_func)
+    std::unordered_map<int32_t, std::string>* id_to_name, bool case_sensitive,
+    std::function<std::string(std::string_view)> quoting_func)
     : case_sensitive_(case_sensitive),
       name_to_id_(name_to_id),
+      id_to_name_(id_to_name),
       quoting_func_(std::move(quoting_func)) {}
 
 Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
@@ -140,6 +143,11 @@ std::string NameToIdVisitor::BuildPath(std::string_view 
prefix,
 }
 
 void NameToIdVisitor::Finish() {
+  if (id_to_name_) {
+    for (auto& [name, id] : name_to_id_) {
+      id_to_name_->try_emplace(id, name);
+    }
+  }
   for (auto&& it : short_name_to_id_) {
     name_to_id_.try_emplace(it.first, it.second);
   }
@@ -294,4 +302,72 @@ std::unordered_map<int32_t, int32_t> IndexParents(const 
StructType& root_struct)
   return id_to_parent;
 }
 
+AssignFreshIdVisitor::AssignFreshIdVisitor(std::function<int32_t()> next_id)
+    : next_id_(std::move(next_id)) {}
+
+std::shared_ptr<Type> AssignFreshIdVisitor::Visit(
+    const std::shared_ptr<Type>& type) const {
+  switch (type->type_id()) {
+    case TypeId::kStruct:
+      return Visit(*internal::checked_pointer_cast<StructType>(type));
+    case TypeId::kMap:
+      return Visit(*internal::checked_pointer_cast<MapType>(type));
+    case TypeId::kList:
+      return Visit(*internal::checked_pointer_cast<ListType>(type));
+    default:
+      return type;
+  }
+}
+
+std::shared_ptr<StructType> AssignFreshIdVisitor::Visit(const StructType& 
type) const {
+  auto fresh_ids =
+      type.fields() |
+      std::views::transform([&](const auto& /* unused */) { return next_id_(); 
}) |
+      std::ranges::to<std::vector<int32_t>>();
+  std::vector<SchemaField> fresh_fields;
+  for (size_t i = 0; i < type.fields().size(); ++i) {
+    const auto& field = type.fields()[i];
+    fresh_fields.emplace_back(fresh_ids[i], std::string(field.name()),
+                              Visit(field.type()), field.optional(),
+                              std::string(field.doc()));
+  }
+  return std::make_shared<StructType>(std::move(fresh_fields));
+}
+
+std::shared_ptr<ListType> AssignFreshIdVisitor::Visit(const ListType& type) 
const {
+  const auto& elem_field = type.fields()[0];
+  int32_t fresh_id = next_id_();
+  SchemaField fresh_elem_field(fresh_id, std::string(elem_field.name()),
+                               Visit(elem_field.type()), elem_field.optional(),
+                               std::string(elem_field.doc()));
+  return std::make_shared<ListType>(std::move(fresh_elem_field));
+}
+
+std::shared_ptr<MapType> AssignFreshIdVisitor::Visit(const MapType& type) 
const {
+  const auto& key_field = type.fields()[0];
+  const auto& value_field = type.fields()[1];
+
+  int32_t fresh_key_id = next_id_();
+  int32_t fresh_value_id = next_id_();
+
+  SchemaField fresh_key_field(fresh_key_id, std::string(key_field.name()),
+                              Visit(key_field.type()), key_field.optional(),
+                              std::string(key_field.doc()));
+  SchemaField fresh_value_field(fresh_value_id, 
std::string(value_field.name()),
+                                Visit(value_field.type()), 
value_field.optional(),
+                                std::string(value_field.doc()));
+  return std::make_shared<MapType>(std::move(fresh_key_field),
+                                   std::move(fresh_value_field));
+}
+
+Result<std::shared_ptr<Schema>> AssignFreshIds(int32_t schema_id, const 
Schema& schema,
+                                               std::function<int32_t()> 
next_id) {
+  auto fresh_type = AssignFreshIdVisitor(std::move(next_id))
+                        .Visit(internal::checked_cast<const 
StructType&>(schema));
+  std::vector<SchemaField> fields =
+      fresh_type->fields() | std::ranges::to<std::vector<SchemaField>>();
+  ICEBERG_ASSIGN_OR_RAISE(auto identifier_field_names, 
schema.IdentifierFieldNames());
+  return Schema::Make(std::move(fields), schema_id, identifier_field_names);
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h
index 7cc274b0..959bdb9f 100644
--- a/src/iceberg/util/type_util.h
+++ b/src/iceberg/util/type_util.h
@@ -51,12 +51,13 @@ class IdToFieldVisitor {
   std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& 
id_to_field_;
 };
 
-/// \brief Visitor for building a map from field name to field ID.
+/// \brief Visitor for building maps from field name to field ID and field ID 
to field
+/// name.
 class NameToIdVisitor {
  public:
   explicit NameToIdVisitor(
       std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& 
name_to_id,
-      bool case_sensitive = true,
+      std::unordered_map<int32_t, std::string>* id_to_name, bool 
case_sensitive = true,
       std::function<std::string(std::string_view)> quoting_func = {});
   Status Visit(const ListType& type, const std::string& path,
                const std::string& short_path);
@@ -75,6 +76,7 @@ class NameToIdVisitor {
  private:
   bool case_sensitive_;
   std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& 
name_to_id_;
+  std::unordered_map<int32_t, std::string>* id_to_name_;
   std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> 
short_name_to_id_;
   std::function<std::string(std::string_view)> quoting_func_;
 };
@@ -131,4 +133,27 @@ class PruneColumnVisitor {
 ICEBERG_EXPORT std::unordered_map<int32_t, int32_t> IndexParents(
     const StructType& root_struct);
 
+/// \brief Assigns fresh IDs to all fields in the schema.
+class AssignFreshIdVisitor {
+ public:
+  explicit AssignFreshIdVisitor(std::function<int32_t()> next_id);
+
+  std::shared_ptr<Type> Visit(const std::shared_ptr<Type>& type) const;
+  std::shared_ptr<StructType> Visit(const StructType& type) const;
+  std::shared_ptr<ListType> Visit(const ListType& type) const;
+  std::shared_ptr<MapType> Visit(const MapType& type) const;
+
+ private:
+  std::function<int32_t()> next_id_;
+};
+
+/// \brief Assigns fresh IDs to all fields in a schema.
+///
+/// \param schema_id An ID assigned to this schema
+/// \param schema The schema to assign IDs to.
+/// \param next_id An id assignment function, which returns the next ID to 
assign.
+/// \return A schema with new ids assigned by the next_id function.
+ICEBERG_EXPORT Result<std::shared_ptr<Schema>> AssignFreshIds(
+    int32_t schema_id, const Schema& schema, std::function<int32_t()> next_id);
+
 }  // namespace iceberg

Reply via email to