wgtmac commented on code in PR #159: URL: https://github.com/apache/iceberg-cpp/pull/159#discussion_r2263131645
########## src/iceberg/parquet/parquet_schema_util.cc: ########## @@ -17,20 +17,393 @@ * under the License. */ +#include <charconv> + +#include <arrow/type.h> +#include <arrow/type_fwd.h> +#include <arrow/util/key_value_metadata.h> +#include <parquet/arrow/schema.h> #include <parquet/schema.h> +#include "iceberg/metadata_columns.h" #include "iceberg/parquet/parquet_schema_util_internal.h" +#include "iceberg/result.h" +#include "iceberg/schema_util_internal.h" #include "iceberg/util/checked_cast.h" +#include "iceberg/util/formatter.h" +#include "iceberg/util/macros.h" namespace iceberg::parquet { +namespace { + +constexpr std::string_view kParquetFieldIdKey = "PARQUET:field_id"; + +std::optional<int32_t> FieldIdFromMetadata( + const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) { + if (!metadata) { + return std::nullopt; + } + int key = metadata->FindKey(kParquetFieldIdKey); + if (key < 0) { + return std::nullopt; + } + std::string field_id_str = metadata->value(key); + int32_t field_id = -1; + auto [_, ec] = std::from_chars(field_id_str.data(), + field_id_str.data() + field_id_str.size(), field_id); + if (ec != std::errc() || field_id < 0) { + return std::nullopt; + } + return field_id; +} + +std::optional<int32_t> GetFieldId(const ::parquet::arrow::SchemaField& parquet_field) { + return FieldIdFromMetadata(parquet_field.field->metadata()); +} + +// TODO(gangwu): support v3 unknown type +Status ValidateParquetSchemaEvolution( + const Type& expected_type, const ::parquet::arrow::SchemaField& parquet_field) { + const auto& arrow_type = parquet_field.field->type(); + switch (expected_type.type_id()) { + case TypeId::kBoolean: + if (arrow_type->id() == ::arrow::Type::BOOL) { + return {}; + } + break; + case TypeId::kInt: + if (arrow_type->id() == ::arrow::Type::INT32) { + return {}; + } + break; + case TypeId::kLong: + if (arrow_type->id() == ::arrow::Type::INT64 || + arrow_type->id() == ::arrow::Type::INT32) { + return {}; + } + break; + case TypeId::kFloat: + if (arrow_type->id() == ::arrow::Type::FLOAT) { + return {}; + } + break; + case TypeId::kDouble: + if (arrow_type->id() == ::arrow::Type::DOUBLE || + arrow_type->id() == ::arrow::Type::FLOAT) { + return {}; + } + break; + case TypeId::kDate: + if (arrow_type->id() == ::arrow::Type::DATE32) { + return {}; + } + break; + case TypeId::kTime: + if (arrow_type->id() == ::arrow::Type::TIME64) { Review Comment: Good catch! I have added an exhaustive test case to make sure I don't miss any primitive type. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org