rok commented on code in PR #13901:
URL: https://github.com/apache/arrow/pull/13901#discussion_r1737558446


##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,85 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as 
json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled();
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata = 
::arrow::key_value_metadata({}, {});
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // Arrow schema has precedence. json_1 should be returned as a json() 
field even
+    // though extensions are not enabled.
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", ::arrow::extension::json(), true, 
field_metadata),
+         ::arrow::field("json_2", UTF8, true)});
+
+    ASSERT_OK_AND_ASSIGN(
+        std::shared_ptr<Buffer> serialized,
+        ::arrow::ipc::SerializeSchema(*arrow_schema, 
::arrow::default_memory_pool()));
+    std::string schema_as_string = serialized->ToString();
+    std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+    std::shared_ptr<KeyValueMetadata> metadata =
+        ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to