This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 1cea1e9 feat: avro schema add sanitize field name (#190)
1cea1e9 is described below
commit 1cea1e9b6bd6ab3c146b94e1f1b4c4c65a0d3bbe
Author: liuxiaoyu <[email protected]>
AuthorDate: Thu Sep 25 17:43:47 2025 +0800
feat: avro schema add sanitize field name (#190)
---
src/iceberg/avro/avro_schema_util.cc | 69 +++++++++++++-
src/iceberg/avro/avro_schema_util_internal.h | 33 +++++++
test/avro_schema_test.cc | 129 ++++++++++++++++++++++++++-
3 files changed, 227 insertions(+), 4 deletions(-)
diff --git a/src/iceberg/avro/avro_schema_util.cc
b/src/iceberg/avro/avro_schema_util.cc
index fd40cc7..d315eb2 100644
--- a/src/iceberg/avro/avro_schema_util.cc
+++ b/src/iceberg/avro/avro_schema_util.cc
@@ -20,6 +20,7 @@
#include <charconv>
#include <format>
#include <mutex>
+#include <sstream>
#include <string_view>
#include <arrow/type.h>
@@ -56,8 +57,60 @@ namespace {
return attributes;
}
+void SanitizeChar(char c, std::ostringstream& os) {
+ if (std::isdigit(c)) {
+ os << '_' << c;
+ } else {
+ os << "_x" << std::uppercase << std::hex << static_cast<int>(c);
+ }
+}
+
} // namespace
+bool ValidAvroName(std::string_view name) {
+ if (name.empty()) {
+ return false;
+ }
+
+ char first = name[0];
+ if (!(std::isalpha(first) || first == '_')) {
+ return false;
+ }
+
+ for (size_t i = 1; i < name.length(); i++) {
+ char character = name[i];
+ if (!(std::isalnum(character) || character == '_')) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string SanitizeFieldName(std::string_view field_name) {
+ if (field_name.empty()) {
+ return "";
+ }
+
+ std::ostringstream result;
+
+ if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
+ SanitizeChar(field_name[0], result);
+ } else {
+ result << field_name[0];
+ }
+
+ for (size_t i = 1; i < field_name.size(); ++i) {
+ char c = field_name[i];
+ if (std::isalnum(c) || c == '_') {
+ result << c;
+ } else {
+ SanitizeChar(c, result);
+ }
+ }
+
+ return result.str();
+}
+
std::string ToString(const ::avro::NodePtr& node) {
std::stringstream ss;
ss << *node;
@@ -181,10 +234,20 @@ Status ToAvroNodeVisitor::Visit(const StructType& type,
::avro::NodePtr* node) {
::avro::NodePtr field_node;
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
- // TODO(gangwu): sanitize field name
- (*node)->addName(std::string(sub_field.name()));
+ bool is_valid_field_name = ValidAvroName(sub_field.name());
+ std::string field_name = is_valid_field_name ?
std::string(sub_field.name())
+ :
SanitizeFieldName(sub_field.name());
+
+ (*node)->addName(field_name);
(*node)->addLeaf(field_node);
-
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
+
+ ::avro::CustomAttributes attributes =
GetAttributesWithFieldId(sub_field.field_id());
+ if (!is_valid_field_name) {
+ attributes.addAttribute(std::string(kIcebergFieldNameProp),
+ std::string(sub_field.name()),
+ /*addQuotes=*/true);
+ }
+ (*node)->addCustomAttributesForField(attributes);
}
return {};
}
diff --git a/src/iceberg/avro/avro_schema_util_internal.h
b/src/iceberg/avro/avro_schema_util_internal.h
index 1647870..bdfbf13 100644
--- a/src/iceberg/avro/avro_schema_util_internal.h
+++ b/src/iceberg/avro/avro_schema_util_internal.h
@@ -149,6 +149,16 @@ std::string ToString(const ::avro::LogicalType::Type&
logical_type);
/// \return True if the node has a map logical type, false otherwise.
bool HasMapLogicalType(const ::avro::NodePtr& node);
+/// \brief Check if a string is a valid Avro name.
+///
+/// Valid Avro names must:
+/// 1. Start with a letter or underscore
+/// 2. Contain only letters, digits, or underscores
+///
+/// \param name The name to check.
+/// \return True if the name is valid, false otherwise.
+bool ValidAvroName(std::string_view name);
+
/// \brief Create a new Avro node with field IDs from name mapping.
/// \param original_node The original Avro node to copy.
/// \param mapping The name mapping to apply field IDs from.
@@ -156,4 +166,27 @@ bool HasMapLogicalType(const ::avro::NodePtr& node);
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr&
original_node,
const NameMapping& mapping);
+/// \brief Sanitize a field name to make it compatible with Avro field name
requirements.
+///
+/// Converts names that are not valid Avro names to valid Avro names.
+/// Conversion rules:
+/// 1. If the first character is not a letter or underscore, it is specially
handled:
+/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
+/// - Other characters: Converted to '_x' followed by the uppercase
hexadecimal
+/// representation of the character (e.g., '$' -> '_x24')
+/// 2. For characters other than the first:
+/// - If it's a letter, digit, or underscore, it remains unchanged
+/// - Other characters: Converted to '_x' followed by the uppercase
hexadecimal
+/// representation
+///
+/// Examples:
+/// - "123field" -> "_123field"
+/// - "user-name" -> "user_x2Dname"
+/// - "$price" -> "_x24price"
+/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
+///
+/// \param field_name The original field name to sanitize.
+/// \return A sanitized field name that follows Avro naming conventions.
+std::string SanitizeFieldName(std::string_view field_name);
+
} // namespace iceberg::avro
diff --git a/test/avro_schema_test.cc b/test/avro_schema_test.cc
index 6fbbb85..6870ee2 100644
--- a/test/avro_schema_test.cc
+++ b/test/avro_schema_test.cc
@@ -47,8 +47,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t
index, int32_t field_id,
ASSERT_EQ(attrs.getAttribute(key),
std::make_optional(std::to_string(field_id)));
}
+// Helper function to check if a custom attribute exists for a field name
preservation
+void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
+ const std::string& original_name) {
+ ASSERT_LT(index, node->customAttributes());
+ const auto& attrs = node->customAttributesAt(index);
+ ASSERT_EQ(attrs.getAttribute("iceberg-field-name"),
std::make_optional(original_name));
+}
+
} // namespace
+TEST(ValidAvroNameTest, ValidNames) {
+ // Valid field names should return true
+ EXPECT_TRUE(ValidAvroName("valid_field"));
+ EXPECT_TRUE(ValidAvroName("field123"));
+ EXPECT_TRUE(ValidAvroName("_private"));
+ EXPECT_TRUE(ValidAvroName("CamelCase"));
+ EXPECT_TRUE(ValidAvroName("field_with_underscores"));
+}
+
+TEST(ValidAvroNameTest, InvalidNames) {
+ // Names starting with numbers should return false
+ EXPECT_FALSE(ValidAvroName("123field"));
+ EXPECT_FALSE(ValidAvroName("0value"));
+
+ // Names with special characters should return false
+ EXPECT_FALSE(ValidAvroName("field-name"));
+ EXPECT_FALSE(ValidAvroName("field.name"));
+ EXPECT_FALSE(ValidAvroName("field name"));
+ EXPECT_FALSE(ValidAvroName("field@name"));
+ EXPECT_FALSE(ValidAvroName("field#name"));
+}
+
+TEST(ValidAvroNameTest, EmptyName) {
+ // Empty name should return false
+ EXPECT_FALSE(ValidAvroName(""));
+}
+
+TEST(SanitizeFieldNameTest, ValidFieldNames) {
+ // Valid field names should remain unchanged
+ EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
+ EXPECT_EQ(SanitizeFieldName("field123"), "field123");
+ EXPECT_EQ(SanitizeFieldName("_private"), "_private");
+ EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
+ EXPECT_EQ(SanitizeFieldName("field_with_underscores"),
"field_with_underscores");
+}
+
+TEST(SanitizeFieldNameTest, InvalidFieldNames) {
+ // Field names starting with numbers should be prefixed with underscore
+ EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
+ EXPECT_EQ(SanitizeFieldName("0value"), "_0value");
+
+ // Field names with special characters should be encoded with hex values
+ EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
+ EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
+ EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
+ EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
+ EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");
+
+ // Complex field names with multiple issues
+ EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"),
+ "_1field_x2Dwith_x2Especial_x40chars");
+ EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
+}
+
+TEST(SanitizeFieldNameTest, EdgeCases) {
+ // Empty field name
+ EXPECT_EQ(SanitizeFieldName(""), "");
+
+ // Field name with only special characters
+ EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");
+
+ // Field name starting with special character
+ EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
+ EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
+}
+
TEST(ToAvroNodeVisitorTest, BooleanType) {
::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
@@ -181,6 +255,60 @@ TEST(ToAvroNodeVisitorTest, StructType) {
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
}
+TEST(ToAvroNodeVisitorTest, StructTypeWithFieldNames) {
+ StructType struct_type{
+ {SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
+ /*optional=*/false},
+ SchemaField{/*field_id=*/2, "valid_field", iceberg::string(),
+ /*optional=*/false},
+ SchemaField{/*field_id=*/3, "email.address", iceberg::string(),
+ /*optional=*/true},
+ SchemaField{/*field_id=*/4, "AnotherField", iceberg::int32(),
+ /*optional=*/true},
+ SchemaField{/*field_id=*/5, "123field", iceberg::int32(),
+ /*optional=*/false},
+ SchemaField{/*field_id=*/6, "field with spaces", iceberg::boolean(),
+ /*optional=*/true}}};
+ ::avro::NodePtr node;
+ EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
+ EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
+
+ ASSERT_EQ(node->names(), 6);
+
+ EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
+ EXPECT_EQ(node->nameAt(2),
+ "email_x2Eaddress"); // "email.address" ->
"email_x2Eaddress"
+ EXPECT_EQ(node->nameAt(4), "_123field"); // "123field" -> "_123field"
+ EXPECT_EQ(
+ node->nameAt(5),
+ "field_x20with_x20spaces"); // "field with spaces" ->
"field_x20with_x20spaces"
+
+ EXPECT_EQ(node->nameAt(1), "valid_field");
+ EXPECT_EQ(node->nameAt(3), "AnotherField");
+
+ ASSERT_EQ(node->customAttributes(), 6);
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/4, /*field_id=*/5));
+ ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/5, /*field_id=*/6));
+
+ const auto& attrs1 = node->customAttributesAt(1); // valid_field
+ const auto& attrs3 = node->customAttributesAt(3); // AnotherField
+ EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
+ EXPECT_FALSE(attrs3.getAttribute("iceberg-field-name").has_value());
+
+ ASSERT_NO_FATAL_FAILURE(
+ CheckIcebergFieldName(node, /*index=*/0, /*original_name=*/"user-name"));
+ ASSERT_NO_FATAL_FAILURE(
+ CheckIcebergFieldName(node, /*index=*/2,
/*original_name=*/"email.address"));
+ ASSERT_NO_FATAL_FAILURE(
+ CheckIcebergFieldName(node, /*index=*/4, /*original_name=*/"123field"));
+ ASSERT_NO_FATAL_FAILURE(
+ CheckIcebergFieldName(node, /*index=*/5, /*original_name=*/"field with
spaces"));
+}
+
TEST(ToAvroNodeVisitorTest, ListType) {
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
/*optional=*/true}};
@@ -1436,5 +1564,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
}
-
} // namespace iceberg::avro