westonpace commented on a change in pull request #10289: URL: https://github.com/apache/arrow/pull/10289#discussion_r640049553
########## File path: cpp/src/parquet/arrow/arrow_schema_test.cc ########## @@ -1157,6 +1160,94 @@ TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields)); } +class TestConvertRoundTrip : public ::testing::Test { + public: + ::arrow::Status ConvertSchema( + const std::vector<std::shared_ptr<Field>>& fields, + std::shared_ptr<::parquet::ArrowWriterProperties> arrow_properties = + ::parquet::default_arrow_writer_properties()) { + arrow_schema_ = ::arrow::schema(fields); + std::shared_ptr<::parquet::WriterProperties> properties = + ::parquet::default_writer_properties(); + RETURN_NOT_OK(ToParquetSchema(arrow_schema_.get(), *properties.get(), + *arrow_properties, &parquet_schema_)); + ::parquet::schema::ToParquet(parquet_schema_->group_node(), &parquet_format_schema_); + auto parquet_schema = ::parquet::schema::FromParquet(parquet_format_schema_); + return FromParquetSchema(parquet_schema.get(), &result_schema_); + } + + protected: + std::shared_ptr<::arrow::Schema> arrow_schema_; + std::shared_ptr<SchemaDescriptor> parquet_schema_; + std::vector<SchemaElement> parquet_format_schema_; + std::shared_ptr<::arrow::Schema> result_schema_; +}; + +int GetFieldId(const ::arrow::Field& field) { + if (field.metadata() == nullptr) { + return -1; + } + auto maybe_field = field.metadata()->Get("PARQUET:field_id"); + if (!maybe_field.ok()) { + return -1; + } + return std::stoi(maybe_field.ValueOrDie()); +} + +void GetFieldIdsDfs(const ::arrow::FieldVector& fields, std::vector<int>* field_ids) { + for (const auto& field : fields) { + field_ids->push_back(GetFieldId(*field)); + GetFieldIdsDfs(field->type()->fields(), field_ids); + } +} + +std::vector<int> GetFieldIdsDfs(const ::arrow::FieldVector& fields) { + std::vector<int> field_ids; + GetFieldIdsDfs(fields, &field_ids); + return field_ids; +} + +TEST_F(TestConvertRoundTrip, GroupIdMissingIfNotSpecified) { + std::vector<std::shared_ptr<Field>> arrow_fields; + arrow_fields.push_back(::arrow::field("simple", ::arrow::int32(), false)); + /// { "nested": { "outer": { "inner" }, "sibling" } + arrow_fields.push_back(::arrow::field( + "nested", + ::arrow::struct_({::arrow::field("outer", ::arrow::struct_({::arrow::field( + "inner", ::arrow::utf8())})), + ::arrow::field("sibling", ::arrow::date32())}), + false)); + + ASSERT_OK(ConvertSchema(arrow_fields)); + auto field_ids = GetFieldIdsDfs(result_schema_->fields()); + for (int actual_id : field_ids) { + ASSERT_EQ(actual_id, -1); + } +} + +std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) { + return ::arrow::key_value_metadata({"PARQUET:field_id"}, {std::to_string(field_id)}); +} + +TEST_F(TestConvertRoundTrip, GroupIdPreserveExisting) { Review comment: Perhaps `TestConvertRoundTrip::ConvertSchema` should be renamed to `RoundTripSchema`. It does the following transformations... * `vector<Field> -> arrow::Schema` * `arrow::Schema` -> `parquet::SchemaDescriptor` * `parquet::SchemaDescriptor` -> `vector<parquet::format::SchemaElement>` * `vector<parquet::format::SchemaElement>` -> `parquet::SchemaDescriptor` * `parquet::SchemaDescriptor` -> `arrow::Schema` So I believe it does indeed test the Parquet schema node. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org