wgtmac commented on code in PR #194: URL: https://github.com/apache/iceberg-cpp/pull/194#discussion_r2312980287
########## src/iceberg/type.h: ########## @@ -101,6 +106,7 @@ class ICEBERG_EXPORT NestedType : public Type { /// \brief A data type representing a struct with nested fields. class ICEBERG_EXPORT StructType : public NestedType { public: + using NestedType::GetFieldByName; Review Comment: ```suggestion ``` Move it closer to its friends. ########## src/iceberg/type.h: ########## @@ -109,23 +115,32 @@ class ICEBERG_EXPORT StructType : public NestedType { std::string ToString() const override; std::span<const SchemaField> fields() const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldById( + Result<std::optional<SchemaFieldConstRef>> GetFieldById( int32_t field_id) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex( + Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( int32_t index) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName( - std::string_view name) const override; + Result<std::optional<SchemaFieldConstRef>> GetFieldByName( + std::string_view name, bool case_sensitive) const override; protected: bool Equals(const Type& other) const override; + // TODO(nullccxsy): Lazy initialization has concurrency issues, need to add proper + // synchronization mechanism + Status InitFieldById() const; + Status InitFieldByName() const; + Status InitFieldByLowerCaseName() const; + protected: std::vector<SchemaField> fields_; - std::unordered_map<int32_t, size_t> field_id_to_index_; + mutable std::unordered_map<int32_t, SchemaFieldConstRef> field_by_id_; + mutable std::unordered_map<std::string_view, SchemaFieldConstRef> field_by_name_; + mutable std::unordered_map<std::string, SchemaFieldConstRef> field_by_lowercase_name_; }; /// \brief A data type representing a list of values. class ICEBERG_EXPORT ListType : public NestedType { public: + using NestedType::GetFieldByName; Review Comment: ```suggestion ``` ########## src/iceberg/type.h: ########## @@ -156,6 +171,7 @@ class ICEBERG_EXPORT ListType : public NestedType { /// \brief A data type representing a dictionary of values. class ICEBERG_EXPORT MapType : public NestedType { public: + using NestedType::GetFieldByName; Review Comment: ```suggestion ``` ########## src/iceberg/type.h: ########## @@ -72,26 +73,30 @@ class ICEBERG_EXPORT NestedType : public Type { public: bool is_primitive() const override { return false; } bool is_nested() const override { return true; } + using SchemaFieldConstRef = std::reference_wrapper<const SchemaField>; /// \brief Get a view of the child fields. [[nodiscard]] virtual std::span<const SchemaField> fields() const = 0; /// \brief Get a field by field ID. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>> - GetFieldById(int32_t field_id) const = 0; + [[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldById( + int32_t field_id) const = 0; /// \brief Get a field by index. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>> - GetFieldByIndex(int32_t index) const = 0; - /// \brief Get a field by name (case-sensitive). Behavior is undefined if + [[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( + int32_t index) const = 0; + /// \brief Get a field by name. Return an error Status if /// the field name is not unique; prefer GetFieldById or GetFieldByIndex /// when possible. /// - /// \note This is currently O(n) complexity. - [[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>> - GetFieldByName(std::string_view name) const = 0; + /// \note This is currently O(1) complexity. Review Comment: ```suggestion /// \note This is O(1) complexity. ``` ########## test/schema_test.cc: ########## @@ -47,18 +47,16 @@ TEST(SchemaTest, Basics) { ASSERT_THAT(schema.GetFieldByName("bar"), ::testing::Optional(field2)); ASSERT_EQ(std::nullopt, schema.GetFieldById(0)); - ASSERT_EQ(std::nullopt, schema.GetFieldByIndex(2)); - ASSERT_EQ(std::nullopt, schema.GetFieldByIndex(-1)); + auto result = schema.GetFieldByIndex(2); + ASSERT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidArgument); + ASSERT_THAT(result.error().message, + ::testing::HasSubstr("Invalid index 2 to get field from struct")); Review Comment: ```suggestion ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); ASSERT_THAT(result, HasErrorMessage("Invalid index 2 to get field from struct")); ``` ########## test/schema_test.cc: ########## @@ -47,18 +47,16 @@ TEST(SchemaTest, Basics) { ASSERT_THAT(schema.GetFieldByName("bar"), ::testing::Optional(field2)); ASSERT_EQ(std::nullopt, schema.GetFieldById(0)); - ASSERT_EQ(std::nullopt, schema.GetFieldByIndex(2)); - ASSERT_EQ(std::nullopt, schema.GetFieldByIndex(-1)); + auto result = schema.GetFieldByIndex(2); + ASSERT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidArgument); + ASSERT_THAT(result.error().message, + ::testing::HasSubstr("Invalid index 2 to get field from struct")); Review Comment: Same for other similar change in the test files. ########## src/iceberg/type.h: ########## @@ -109,23 +115,32 @@ class ICEBERG_EXPORT StructType : public NestedType { std::string ToString() const override; std::span<const SchemaField> fields() const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldById( + Result<std::optional<SchemaFieldConstRef>> GetFieldById( int32_t field_id) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex( + Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( int32_t index) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName( - std::string_view name) const override; + Result<std::optional<SchemaFieldConstRef>> GetFieldByName( + std::string_view name, bool case_sensitive) const override; protected: bool Equals(const Type& other) const override; + // TODO(nullccxsy): Lazy initialization has concurrency issues, need to add proper + // synchronization mechanism + Status InitFieldById() const; + Status InitFieldByName() const; + Status InitFieldByLowerCaseName() const; + protected: std::vector<SchemaField> fields_; - std::unordered_map<int32_t, size_t> field_id_to_index_; + mutable std::unordered_map<int32_t, SchemaFieldConstRef> field_by_id_; + mutable std::unordered_map<std::string_view, SchemaFieldConstRef> field_by_name_; + mutable std::unordered_map<std::string, SchemaFieldConstRef> field_by_lowercase_name_; Review Comment: We need to wait for https://github.com/apache/iceberg-cpp/pull/180 and rebase on it to use `StringHash` here. ########## src/iceberg/type.h: ########## @@ -72,26 +73,30 @@ class ICEBERG_EXPORT NestedType : public Type { public: bool is_primitive() const override { return false; } bool is_nested() const override { return true; } + using SchemaFieldConstRef = std::reference_wrapper<const SchemaField>; Review Comment: ``` class ICEBERG_EXPORT NestedType : public Type { using SchemaFieldConstRef = std::reference_wrapper<const SchemaField>; public: bool is_primitive() const override { return false; } ``` It would be better to put the type alias in the non-public scope ########## src/iceberg/type.h: ########## @@ -109,23 +115,32 @@ class ICEBERG_EXPORT StructType : public NestedType { std::string ToString() const override; std::span<const SchemaField> fields() const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldById( + Result<std::optional<SchemaFieldConstRef>> GetFieldById( int32_t field_id) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex( + Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( int32_t index) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName( - std::string_view name) const override; + Result<std::optional<SchemaFieldConstRef>> GetFieldByName( + std::string_view name, bool case_sensitive) const override; Review Comment: ```suggestion Result<std::optional<SchemaFieldConstRef>> GetFieldByName( std::string_view name, bool case_sensitive) const override; using NestedType::GetFieldByName; ``` ########## src/iceberg/type.h: ########## @@ -72,26 +73,30 @@ class ICEBERG_EXPORT NestedType : public Type { public: bool is_primitive() const override { return false; } bool is_nested() const override { return true; } + using SchemaFieldConstRef = std::reference_wrapper<const SchemaField>; /// \brief Get a view of the child fields. [[nodiscard]] virtual std::span<const SchemaField> fields() const = 0; /// \brief Get a field by field ID. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>> - GetFieldById(int32_t field_id) const = 0; + [[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldById( + int32_t field_id) const = 0; /// \brief Get a field by index. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>> - GetFieldByIndex(int32_t index) const = 0; - /// \brief Get a field by name (case-sensitive). Behavior is undefined if + [[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( + int32_t index) const = 0; + /// \brief Get a field by name. Return an error Status if /// the field name is not unique; prefer GetFieldById or GetFieldByIndex /// when possible. /// - /// \note This is currently O(n) complexity. - [[nodiscard]] virtual std::optional<std::reference_wrapper<const SchemaField>> - GetFieldByName(std::string_view name) const = 0; + /// \note This is currently O(1) complexity. + [[nodiscard]] virtual Result<std::optional<SchemaFieldConstRef>> GetFieldByName( + std::string_view name, bool case_sensitive) const = 0; + /// \brief Get a field by name(case-sensitive). Review Comment: ```suggestion /// \brief Get a field by name (case-sensitive). ``` ########## src/iceberg/type.h: ########## @@ -172,12 +188,12 @@ class ICEBERG_EXPORT MapType : public NestedType { std::string ToString() const override; std::span<const SchemaField> fields() const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldById( + Result<std::optional<SchemaFieldConstRef>> GetFieldById( int32_t field_id) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex( + Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( int32_t index) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName( - std::string_view name) const override; + Result<std::optional<SchemaFieldConstRef>> GetFieldByName( + std::string_view name, bool case_sensitive) const override; Review Comment: ```suggestion Result<std::optional<SchemaFieldConstRef>> GetFieldByName( std::string_view name, bool case_sensitive) const override; using NestedType::GetFieldByName; ``` ########## src/iceberg/type.h: ########## @@ -140,12 +155,12 @@ class ICEBERG_EXPORT ListType : public NestedType { std::string ToString() const override; std::span<const SchemaField> fields() const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldById( + Result<std::optional<SchemaFieldConstRef>> GetFieldById( int32_t field_id) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByIndex( + Result<std::optional<SchemaFieldConstRef>> GetFieldByIndex( int32_t index) const override; - std::optional<std::reference_wrapper<const SchemaField>> GetFieldByName( - std::string_view name) const override; + Result<std::optional<SchemaFieldConstRef>> GetFieldByName( + std::string_view name, bool case_sensitive) const override; Review Comment: ```suggestion Result<std::optional<SchemaFieldConstRef>> GetFieldByName( std::string_view name, bool case_sensitive) const override; using NestedType::GetFieldByName; ``` ########## src/iceberg/util/macros.h: ########## @@ -19,10 +19,13 @@ #pragma once -#define ICEBERG_RETURN_UNEXPECTED(result) \ - if (!result) [[unlikely]] { \ - return std::unexpected<Error>(result.error()); \ - } +#define ICEBERG_RETURN_UNEXPECTED(result) \ + do { \ + auto&& iceberg_temp_result = (result); \ + if (!iceberg_temp_result) [[unlikely]] { \ + return std::unexpected<Error>(iceberg_temp_result.error()); \ Review Comment: ```suggestion auto&& result_name = (result); \ if (!result_name) [[unlikely]] { \ return std::unexpected<Error>(result_name.error()); \ ``` ########## src/iceberg/type.cc: ########## @@ -84,6 +86,48 @@ bool StructType::Equals(const Type& other) const { const auto& struct_ = static_cast<const StructType&>(other); return fields_ == struct_.fields_; } +Status StructType::InitFieldById() const { + if (!field_by_id_.empty()) { + return {}; + } + for (const auto& field : fields_) { + auto it = field_by_id_.try_emplace(field.field_id(), field); + if (!it.second) { + return InvalidSchema("Duplicate field id found: {} (prev name: {}, curr name: {})", + field.field_id(), it.first->second.get().name(), field.name()); + } + } + return {}; +} +Status StructType::InitFieldByName() const { + if (!field_by_name_.empty()) { + return {}; + } + for (const auto& field : fields_) { + auto it = field_by_name_.try_emplace(field.name(), field); + if (!it.second) { + return InvalidSchema("Duplicate field name found: {} (prev id: {}, curr id: {})", + it.first->first, it.first->second.get().field_id(), + field.field_id()); + } + } + return {}; +} +Status StructType::InitFieldByLowerCaseName() const { + if (!field_by_lowercase_name_.empty()) { + return {}; + } + for (const auto& field : fields_) { + auto it = + field_by_lowercase_name_.try_emplace(StringUtils::ToLower(field.name()), field); + if (!it.second) { + return InvalidSchema("Duplicate field name found: {} (prev id: {}, curr id: {})", Review Comment: ```suggestion return InvalidSchema("Duplicate lowercase field name found: {} (prev id: {}, curr id: {})", ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org