pitrou commented on code in PR #13901:
URL: https://github.com/apache/arrow/pull/13901#discussion_r1752275863


##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,86 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_schema,
+                                    std::shared_ptr<KeyValueMetadata>& metadata) {
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Buffer> serialized,
+      ::arrow::ipc::SerializeSchema(*arrow_schema, ::arrow::default_memory_pool()));
+  std::string schema_as_string = serialized->ToString();
+  std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+  metadata = ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});
+  return Status::OK();
+}
+
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // Parquet logical type has precedence. Both json_1 and json_2 should be returned
+    // as a json() field even though extensions are not enabled.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", ::arrow::extension::json(), true, 
field_metadata),
+         ::arrow::field("json_2", ::arrow::extension::json(), true)});
+
+    std::shared_ptr<KeyValueMetadata> metadata;
+    ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema, true /* check_metadata */);
+  }
+
+  {

Review Comment:
   This looks like the exact same case as above, am I mistaken? Perhaps remove 
this?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -1428,6 +1435,42 @@ TEST_F(TestLargeStringParquetIO, Basics) {
   this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
 }
 
+using TestJsonParquetIO = TestParquetIO<::arrow::extension::JsonExtensionType>;
+
+TEST_F(TestJsonParquetIO, JsonExtension) {
+  const char* json = R"([
+    "null",
+    "1234",
+    "3.14159",
+    "true",
+    "false",
+    "\"a json string\"",
+    "[\"a\", \"json\", \"array\"]",
+    "{\"obj\": \"a simple json object\"}"
+  ])";
+
+  const auto json_type = ::arrow::extension::json();
+  const auto json_string_array = ::arrow::ArrayFromJSON(::arrow::utf8(), json);

Review Comment:
   The other discussion got me thinking: should we also have a test where the 
original storage type is `large_utf8`?
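
A minimal sketch of what such a case might look like, reusing the `TestJsonParquetIO` fixture from the diff above (the test name, the storage-parameterized `::arrow::extension::json(...)` call, and the exact values are assumptions on my part, not code from this PR):

```cpp
// Hypothetical companion test: a JSON extension column whose storage type is
// large_utf8 rather than the default utf8.
TEST_F(TestJsonParquetIO, JsonExtensionLargeUtf8Storage) {
  const char* json = R"(["null", "1234", "{\"obj\": \"a simple json object\"}"])";

  // Assumes ::arrow::extension::json() can be parameterized with a storage
  // type, as discussed elsewhere in this review thread.
  const auto json_type = ::arrow::extension::json(::arrow::large_utf8());
  const auto storage_array = ::arrow::ArrayFromJSON(::arrow::large_utf8(), json);
  const auto json_array = ::arrow::ExtensionType::WrapArray(json_type, storage_array);

  // Store the Arrow schema so the large_utf8 storage can be recovered on read.
  const auto arrow_properties =
      ::parquet::ArrowWriterProperties::Builder().store_schema()->build();
  this->RoundTripSingleColumn(json_array, json_array, arrow_properties);
}
```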



##########
cpp/src/parquet/properties.h:
##########
@@ -941,6 +942,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
     return coerce_int96_timestamp_unit_;
   }
 
+  /// Enable Parquet supported Arrow ExtensionTypes.
+  ///
+  /// When enabled, Parquet will use supported Arrow ExtensionTypes by mapping correctly
+  /// mapping them to Arrow types at read time. Currently only arrow::extension::json()

Review Comment:
   ```suggestion
     /// When enabled, Parquet logical types will be mapped to their corresponding Arrow
     /// extension types at read time, if such exist. Currently only arrow::extension::json()
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -941,6 +942,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
     return coerce_int96_timestamp_unit_;
   }
 
+  /// Enable Parquet supported Arrow ExtensionTypes.
+  ///
+  /// When enabled, Parquet will use supported Arrow ExtensionTypes by mapping correctly
+  /// mapping them to Arrow types at read time. Currently only arrow::extension::json()
+  /// extension type is supported. Columns whose LogicalType is JSON will be interpreted
+  /// as arrow::extension::json() ExtensionType with storage type utf8, large_utf8 or
+  /// utf8_view at parquet read time.

Review Comment:
   ```suggestion
     /// extension type is supported. Columns whose LogicalType is JSON will be interpreted
     /// as arrow::extension::json(), with storage type inferred from the serialized Arrow
     /// schema if present, or `utf8` by default.
   ```
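
For readers following this thread, a hedged usage sketch of the option being documented here (the helper function and I/O plumbing are illustrative only; `FileReaderBuilder`, `ArrowReaderProperties`, and `ReadTable` are existing Parquet C++ APIs, while `set_arrow_extensions_enabled` is the setter added by this PR):

```cpp
#include "arrow/io/api.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"
#include "parquet/properties.h"

// Illustrative helper: read a Parquet file with Arrow extension types enabled,
// so JSON-annotated columns come back as arrow::extension::json().
::arrow::Status ReadTableWithJsonExtension(
    std::shared_ptr<::arrow::io::RandomAccessFile> input,
    std::shared_ptr<::arrow::Table>* out) {
  parquet::ArrowReaderProperties arrow_props;
  arrow_props.set_arrow_extensions_enabled(true);  // option documented above

  parquet::arrow::FileReaderBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Open(std::move(input)));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(builder.properties(arrow_props)->Build(&reader));
  return reader->ReadTable(out);
}
```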



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,86 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_schema,
+                                    std::shared_ptr<KeyValueMetadata>& metadata) {
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Buffer> serialized,
+      ::arrow::ipc::SerializeSchema(*arrow_schema, ::arrow::default_memory_pool()));
+  std::string schema_as_string = serialized->ToString();
+  std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+  metadata = ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});
+  return Status::OK();
+}
+
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});

Review Comment:
   To spice things up, perhaps `json_2` should be backed by a `large_utf8` 
storage type?
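
One way the suggested variant might read, purely as a sketch (it assumes `::arrow::extension::json()` accepts an optional storage type and that the header path shown exists; only the expected Arrow schema changes, the Parquet fields stay as above):

```cpp
#include "arrow/api.h"
#include "arrow/extension/json.h"  // header path assumed

// Hypothetical expected schema if json_2 were backed by large_utf8 storage
// instead of the default utf8 (not part of the PR as posted).
std::shared_ptr<::arrow::Schema> ExpectedMixedJsonStorageSchema() {
  return ::arrow::schema(
      {::arrow::field("json_1", ::arrow::extension::json(), /*nullable=*/true),
       ::arrow::field("json_2", ::arrow::extension::json(::arrow::large_utf8()),
                      /*nullable=*/true)});
}
```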



##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -724,6 +727,86 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema));
 }
 
+Status ArrowSchemaToParquetMetadata(std::shared_ptr<::arrow::Schema>& arrow_schema,
+                                    std::shared_ptr<KeyValueMetadata>& metadata) {
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Buffer> serialized,
+      ::arrow::ipc::SerializeSchema(*arrow_schema, ::arrow::default_memory_pool()));
+  std::string schema_as_string = serialized->ToString();
+  std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+  metadata = ::arrow::key_value_metadata({"ARROW:schema"}, {schema_base64});
+  return Status::OK();
+}
+
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "json_2", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, 
ConvertedType::JSON));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, both fields should be treated as utf8() fields in Arrow.
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", UTF8, true), ::arrow::field("json_2", UTF8, 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, both fields should be treated as json() extension
+    // fields.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("json_1", ::arrow::extension::json(), 
true),
+                         ::arrow::field("json_2", ::arrow::extension::json(), 
true)});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // Parquet logical type has precedence. Both json_1 and json_2 should be returned
+    // as a json() field even though extensions are not enabled.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    std::shared_ptr<KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema(
+        {::arrow::field("json_1", ::arrow::extension::json(), true, 
field_metadata),
+         ::arrow::field("json_2", ::arrow::extension::json(), true)});

Review Comment:
   Same here.



##########
cpp/src/parquet/properties.h:
##########
@@ -941,6 +942,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
     return coerce_int96_timestamp_unit_;
   }
 
+  /// Enable Parquet supported Arrow ExtensionTypes.

Review Comment:
   Nit :-)
   ```suggestion
     /// Enable Parquet-supported Arrow extension types.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org
