This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 1cea1e9  feat: avro schema add sanitize field name (#190)
1cea1e9 is described below

commit 1cea1e9b6bd6ab3c146b94e1f1b4c4c65a0d3bbe
Author: liuxiaoyu <[email protected]>
AuthorDate: Thu Sep 25 17:43:47 2025 +0800

    feat: avro schema add sanitize field name (#190)
---
 src/iceberg/avro/avro_schema_util.cc         |  69 +++++++++++++-
 src/iceberg/avro/avro_schema_util_internal.h |  33 +++++++
 test/avro_schema_test.cc                     | 129 ++++++++++++++++++++++++++-
 3 files changed, 227 insertions(+), 4 deletions(-)

diff --git a/src/iceberg/avro/avro_schema_util.cc 
b/src/iceberg/avro/avro_schema_util.cc
index fd40cc7..d315eb2 100644
--- a/src/iceberg/avro/avro_schema_util.cc
+++ b/src/iceberg/avro/avro_schema_util.cc
@@ -20,6 +20,7 @@
 #include <charconv>
 #include <format>
 #include <mutex>
+#include <sstream>
 #include <string_view>
 
 #include <arrow/type.h>
@@ -56,8 +57,60 @@ namespace {
   return attributes;
 }
 
+void SanitizeChar(char c, std::ostringstream& os) {
+  if (std::isdigit(c)) {
+    os << '_' << c;
+  } else {
+    os << "_x" << std::uppercase << std::hex << static_cast<int>(c);
+  }
+}
+
 }  // namespace
 
+bool ValidAvroName(std::string_view name) {
+  if (name.empty()) {
+    return false;
+  }
+
+  char first = name[0];
+  if (!(std::isalpha(first) || first == '_')) {
+    return false;
+  }
+
+  for (size_t i = 1; i < name.length(); i++) {
+    char character = name[i];
+    if (!(std::isalnum(character) || character == '_')) {
+      return false;
+    }
+  }
+  return true;
+}
+
+std::string SanitizeFieldName(std::string_view field_name) {
+  if (field_name.empty()) {
+    return "";
+  }
+
+  std::ostringstream result;
+
+  if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
+    SanitizeChar(field_name[0], result);
+  } else {
+    result << field_name[0];
+  }
+
+  for (size_t i = 1; i < field_name.size(); ++i) {
+    char c = field_name[i];
+    if (std::isalnum(c) || c == '_') {
+      result << c;
+    } else {
+      SanitizeChar(c, result);
+    }
+  }
+
+  return result.str();
+}
+
 std::string ToString(const ::avro::NodePtr& node) {
   std::stringstream ss;
   ss << *node;
@@ -181,10 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, 
::avro::NodePtr* node) {
     ::avro::NodePtr field_node;
     ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
 
-    // TODO(gangwu): sanitize field name
-    (*node)->addName(std::string(sub_field.name()));
+    bool is_valid_field_name = ValidAvroName(sub_field.name());
+    std::string field_name = is_valid_field_name ? 
std::string(sub_field.name())
+                                                 : 
SanitizeFieldName(sub_field.name());
+
+    (*node)->addName(field_name);
     (*node)->addLeaf(field_node);
-    
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
+
+    ::avro::CustomAttributes attributes = 
GetAttributesWithFieldId(sub_field.field_id());
+    if (!is_valid_field_name) {
+      attributes.addAttribute(std::string(kIcebergFieldNameProp),
+                              std::string(sub_field.name()),
+                              /*addQuotes=*/true);
+    }
+    (*node)->addCustomAttributesForField(attributes);
   }
   return {};
 }
diff --git a/src/iceberg/avro/avro_schema_util_internal.h 
b/src/iceberg/avro/avro_schema_util_internal.h
index 1647870..bdfbf13 100644
--- a/src/iceberg/avro/avro_schema_util_internal.h
+++ b/src/iceberg/avro/avro_schema_util_internal.h
@@ -149,6 +149,16 @@ std::string ToString(const ::avro::LogicalType::Type& 
logical_type);
 /// \return True if the node has a map logical type, false otherwise.
 bool HasMapLogicalType(const ::avro::NodePtr& node);
 
+/// \brief Check if a string is a valid Avro name.
+///
+/// Valid Avro names must:
+/// 1. Start with a letter or underscore
+/// 2. Contain only letters, digits, or underscores
+///
+/// \param name The name to check.
+/// \return True if the name is valid, false otherwise.
+bool ValidAvroName(std::string_view name);
+
 /// \brief Create a new Avro node with field IDs from name mapping.
 /// \param original_node The original Avro node to copy.
 /// \param mapping The name mapping to apply field IDs from.
@@ -156,4 +166,27 @@ bool HasMapLogicalType(const ::avro::NodePtr& node);
 Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& 
original_node,
                                                  const NameMapping& mapping);
 
+/// \brief Sanitize a field name to make it compatible with Avro field name 
requirements.
+///
+/// Converts names that are not valid Avro names to valid Avro names.
+/// Conversion rules:
+/// 1. If the first character is not a letter or underscore, it is specially 
handled:
+///    - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
+///    - Other characters: Converted to '_x' followed by the uppercase 
hexadecimal
+///    representation of the character (e.g., '$' -> '_x24')
+/// 2. For characters other than the first:
+///    - If it's a letter, digit, or underscore, it remains unchanged
+///    - Other characters: Converted to '_x' followed by the uppercase 
hexadecimal
+///    representation
+///
+/// Examples:
+/// - "123field" -> "_123field"
+/// - "user-name" -> "user_x2Dname"
+/// - "$price" -> "_x24price"
+/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
+///
+/// \param field_name The original field name to sanitize.
+/// \return A sanitized field name that follows Avro naming conventions.
+std::string SanitizeFieldName(std::string_view field_name);
+
 }  // namespace iceberg::avro
diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc
index 6fbbb85..6870ee2 100644
--- a/test/avro_schema_test.cc
+++ b/test/avro_schema_test.cc
@@ -47,8 +47,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t 
index, int32_t field_id,
   ASSERT_EQ(attrs.getAttribute(key), 
std::make_optional(std::to_string(field_id)));
 }
 
+// Helper function to check if a custom attribute exists for a field name 
preservation
+void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
+                           const std::string& original_name) {
+  ASSERT_LT(index, node->customAttributes());
+  const auto& attrs = node->customAttributesAt(index);
+  ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), 
std::make_optional(original_name));
+}
+
 }  // namespace
 
+TEST(ValidAvroNameTest, ValidNames) {
+  // Valid field names should return true
+  EXPECT_TRUE(ValidAvroName("valid_field"));
+  EXPECT_TRUE(ValidAvroName("field123"));
+  EXPECT_TRUE(ValidAvroName("_private"));
+  EXPECT_TRUE(ValidAvroName("CamelCase"));
+  EXPECT_TRUE(ValidAvroName("field_with_underscores"));
+}
+
+TEST(ValidAvroNameTest, InvalidNames) {
+  // Names starting with numbers should return false
+  EXPECT_FALSE(ValidAvroName("123field"));
+  EXPECT_FALSE(ValidAvroName("0value"));
+
+  // Names with special characters should return false
+  EXPECT_FALSE(ValidAvroName("field-name"));
+  EXPECT_FALSE(ValidAvroName("field.name"));
+  EXPECT_FALSE(ValidAvroName("field name"));
+  EXPECT_FALSE(ValidAvroName("field@name"));
+  EXPECT_FALSE(ValidAvroName("field#name"));
+}
+
+TEST(ValidAvroNameTest, EmptyName) {
+  // Empty name should return false
+  EXPECT_FALSE(ValidAvroName(""));
+}
+
+TEST(SanitizeFieldNameTest, ValidFieldNames) {
+  // Valid field names should remain unchanged
+  EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
+  EXPECT_EQ(SanitizeFieldName("field123"), "field123");
+  EXPECT_EQ(SanitizeFieldName("_private"), "_private");
+  EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
+  EXPECT_EQ(SanitizeFieldName("field_with_underscores"), 
"field_with_underscores");
+}
+
+TEST(SanitizeFieldNameTest, InvalidFieldNames) {
+  // Field names starting with numbers should be prefixed with underscore
+  EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
+  EXPECT_EQ(SanitizeFieldName("0value"), "_0value");
+
+  // Field names with special characters should be encoded with hex values
+  EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
+  EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
+  EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
+  EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
+  EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");
+
+  // Complex field names with multiple issues
+  EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"),
+            "_1field_x2Dwith_x2Especial_x40chars");
+  EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
+}
+
+TEST(SanitizeFieldNameTest, EdgeCases) {
+  // Empty field name
+  EXPECT_EQ(SanitizeFieldName(""), "");
+
+  // Field name with only special characters
+  EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");
+
+  // Field name starting with special character
+  EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
+  EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
+}
+
 TEST(ToAvroNodeVisitorTest, BooleanType) {
   ::avro::NodePtr node;
   EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
@@ -181,6 +255,60 @@ TEST(ToAvroNodeVisitorTest, StructType) {
   EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
 }
 
+TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) {
+  StructType struct_type{
+      {SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
+                   /*optional=*/false},
+       SchemaField{/*field_id=*/2, "valid_field", iceberg::string(),
+                   /*optional=*/false},
+       SchemaField{/*field_id=*/3, "email.address", iceberg::string(),
+                   /*optional=*/true},
+       SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(),
+                   /*optional=*/true},
+       SchemaField{/*field_id=*/5, "123field", iceberg::int32(),
+                   /*optional=*/false},
+       SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(),
+                   /*optional=*/true}}};
+  ::avro::NodePtr node;
+  EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
+  EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
+
+  ASSERT_EQ(node->names(), 6);
+
+  EXPECT_EQ(node->nameAt(0), "user_x2Dname");  // "user-name" -> "user_x2Dname"
+  EXPECT_EQ(node->nameAt(2),
+            "email_x2Eaddress");            // "email.address" -> 
"email_x2Eaddress"
+  EXPECT_EQ(node->nameAt(4), "_123field");  // "123field" -> "_123field"
+  EXPECT_EQ(
+      node->nameAt(5),
+      "field_x20with_x20spaces");  // "field with spaces" -> 
"field_x20with_x20spaces"
+
+  EXPECT_EQ(node->nameAt(1), "valid_field");
+  EXPECT_EQ(node->nameAt(3), "AnotherField");
+
+  ASSERT_EQ(node->customAttributes(), 6);
+  ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
+  ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
+  ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
+  ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
+  ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5));
+  ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6));
+
+  const auto& attrs1 = node->customAttributesAt(1);  // valid_field
+  const auto& attrs3 = node->customAttributesAt(3);  // AnotherField
+  EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
+  EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value());
+
+  ASSERT_NO_FATAL_FAILURE(
+      CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name"));
+  ASSERT_NO_FATAL_FAILURE(
+      CheckIcebergFieldName(node, /*index=*/2, 
/*original_name=*/"email.address"));
+  ASSERT_NO_FATAL_FAILURE(
+      CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field"));
+  ASSERT_NO_FATAL_FAILURE(
+      CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with 
spaces"));
+}
+
 TEST(ToAvroNodeVisitorTest, ListType) {
   ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
                                  /*optional=*/true}};
@@ -1436,5 +1564,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
   auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
   ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
 }
-
 }  // namespace iceberg::avro

Reply via email to