wgtmac commented on code in PR #159:
URL: https://github.com/apache/iceberg-cpp/pull/159#discussion_r2263131645


##########
src/iceberg/parquet/parquet_schema_util.cc:
##########
@@ -17,20 +17,393 @@
  * under the License.
  */
 
+#include <charconv>
+
+#include <arrow/type.h>
+#include <arrow/type_fwd.h>
+#include <arrow/util/key_value_metadata.h>
+#include <parquet/arrow/schema.h>
 #include <parquet/schema.h>
 
+#include "iceberg/metadata_columns.h"
 #include "iceberg/parquet/parquet_schema_util_internal.h"
+#include "iceberg/result.h"
+#include "iceberg/schema_util_internal.h"
 #include "iceberg/util/checked_cast.h"
+#include "iceberg/util/formatter.h"
+#include "iceberg/util/macros.h"
 
 namespace iceberg::parquet {
 
+namespace {
+
+constexpr std::string_view kParquetFieldIdKey = "PARQUET:field_id";
+
+std::optional<int32_t> FieldIdFromMetadata(
+    const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+  if (!metadata) {
+    return std::nullopt;
+  }
+  int key = metadata->FindKey(kParquetFieldIdKey);
+  if (key < 0) {
+    return std::nullopt;
+  }
+  std::string field_id_str = metadata->value(key);
+  int32_t field_id = -1;
+  auto [_, ec] = std::from_chars(field_id_str.data(),
+                                 field_id_str.data() + field_id_str.size(), 
field_id);
+  if (ec != std::errc() || field_id < 0) {
+    return std::nullopt;
+  }
+  return field_id;
+}
+
+std::optional<int32_t> GetFieldId(const ::parquet::arrow::SchemaField& 
parquet_field) {
+  return FieldIdFromMetadata(parquet_field.field->metadata());
+}
+
+// TODO(gangwu): support v3 unknown type
+Status ValidateParquetSchemaEvolution(
+    const Type& expected_type, const ::parquet::arrow::SchemaField& 
parquet_field) {
+  const auto& arrow_type = parquet_field.field->type();
+  switch (expected_type.type_id()) {
+    case TypeId::kBoolean:
+      if (arrow_type->id() == ::arrow::Type::BOOL) {
+        return {};
+      }
+      break;
+    case TypeId::kInt:
+      if (arrow_type->id() == ::arrow::Type::INT32) {
+        return {};
+      }
+      break;
+    case TypeId::kLong:
+      if (arrow_type->id() == ::arrow::Type::INT64 ||
+          arrow_type->id() == ::arrow::Type::INT32) {
+        return {};
+      }
+      break;
+    case TypeId::kFloat:
+      if (arrow_type->id() == ::arrow::Type::FLOAT) {
+        return {};
+      }
+      break;
+    case TypeId::kDouble:
+      if (arrow_type->id() == ::arrow::Type::DOUBLE ||
+          arrow_type->id() == ::arrow::Type::FLOAT) {
+        return {};
+      }
+      break;
+    case TypeId::kDate:
+      if (arrow_type->id() == ::arrow::Type::DATE32) {
+        return {};
+      }
+      break;
+    case TypeId::kTime:
+      if (arrow_type->id() == ::arrow::Type::TIME64) {

Review Comment:
   Good catch! I have added an exhaustive test case to make sure I don't miss 
any primitive type.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to