rok commented on code in PR #13901: URL: https://github.com/apache/arrow/pull/13901#discussion_r1741226634
########## cpp/src/parquet/arrow/schema.cc: ########## @@ -984,21 +992,51 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer bool modified = false; auto& origin_type = origin_field.type(); + auto inferred_type = inferred->field->type(); if (origin_type->id() == ::arrow::Type::EXTENSION) { const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type); - auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); + if (inferred_type->id() != ::arrow::Type::EXTENSION && + ex_type.extension_name() == ::arrow::extension::JsonExtensionType::type_name()) { + // Schema mismatch. + // + // Arrow extensions are DISABLED in Parquet. + // origin_type is ::arrow::extension::json() + // inferred_type is ::arrow::binary() + // + // Origin type is restored as Arrow should be considered the source of truth. + DCHECK_EQ(inferred_type->id(), ::arrow::Type::BINARY); + inferred->field = inferred->field->WithType(origin_type); + RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); + } else { + auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); - // Apply metadata recursively to storage type - RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); + // Apply metadata recursively to storage type + RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); - // Restore extension type, if the storage type is the same as inferred - // from the Parquet type - if (ex_type.storage_type()->Equals(*inferred->field->type())) { - inferred->field = inferred->field->WithType(origin_type); + // Restore extension type, if the storage type is the same as inferred + // from the Parquet type + if (ex_type.storage_type()->Equals(*inferred->field->type())) { + inferred->field = inferred->field->WithType(origin_type); + } } modified = true; } else { + if (inferred_type->id() == ::arrow::Type::EXTENSION) { + const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*inferred_type); + if (ex_type.extension_name() == + ::arrow::extension::JsonExtensionType::type_name()) { + // Schema mismatch. + // + // Arrow extensions are ENABLED in Parquet. + // origin_type is ::arrow::binary() + // inferred_type is ::arrow::extension::json() + // + // Origin type is restored as Arrow should be considered the source of truth. Review Comment: I don't have a strong opinion on this either way. The example I can think of is a another implementation without canonical types writing this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org