rok commented on code in PR #13901:
URL: https://github.com/apache/arrow/pull/13901#discussion_r1741226634


##########
cpp/src/parquet/arrow/schema.cc:
##########
@@ -984,21 +992,51 @@ Result<bool> ApplyOriginalMetadata(const Field& 
origin_field, SchemaField* infer
   bool modified = false;
 
   auto& origin_type = origin_field.type();
+  auto inferred_type = inferred->field->type();
 
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
     const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*origin_type);
-    auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
+        ex_type.extension_name() == 
::arrow::extension::JsonExtensionType::type_name()) {
+      // Schema mismatch.
+      //
+      // Arrow extensions are DISABLED in Parquet.
+      // origin_type is ::arrow::extension::json()
+      // inferred_type is ::arrow::binary()
+      //
+      // Origin type is restored as Arrow should be considered the source of 
truth.
+      DCHECK_EQ(inferred_type->id(), ::arrow::Type::BINARY);
+      inferred->field = inferred->field->WithType(origin_type);
+      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
+    } else {
+      auto origin_storage_field = 
origin_field.WithType(ex_type.storage_type());
 
-    // Apply metadata recursively to storage type
-    RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
+      // Apply metadata recursively to storage type
+      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, 
inferred));
 
-    // Restore extension type, if the storage type is the same as inferred
-    // from the Parquet type
-    if (ex_type.storage_type()->Equals(*inferred->field->type())) {
-      inferred->field = inferred->field->WithType(origin_type);
+      // Restore extension type, if the storage type is the same as inferred
+      // from the Parquet type
+      if (ex_type.storage_type()->Equals(*inferred->field->type())) {
+        inferred->field = inferred->field->WithType(origin_type);
+      }
     }
     modified = true;
   } else {
+    if (inferred_type->id() == ::arrow::Type::EXTENSION) {
+      const auto& ex_type = checked_cast<const 
::arrow::ExtensionType&>(*inferred_type);
+      if (ex_type.extension_name() ==
+          ::arrow::extension::JsonExtensionType::type_name()) {
+        // Schema mismatch.
+        //
+        // Arrow extensions are ENABLED in Parquet.
+        // origin_type is ::arrow::binary()
+        // inferred_type is ::arrow::extension::json()
+        //
+        // Origin type is restored as Arrow should be considered the source of 
truth.

Review Comment:
   I don't have a strong opinion on this either way. The example I can think of 
is a another implementation without canonical types writing this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to