This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 0cf32b23c3 GH-49114: [C++][Parquet] Fix converting schema failure with
deep nested two-level encoding list structure (#49125)
0cf32b23c3 is described below
commit 0cf32b23c361d174742befc201617b56040fc095
Author: Zehua Zou <[email protected]>
AuthorDate: Sat Feb 21 13:39:37 2026 +0800
GH-49114: [C++][Parquet] Fix converting schema failure with deep nested
two-level encoding list structure (#49125)
### Rationale for this change
Fix the failure when converting parquet schema with deep nested two-level
encoding list structure to arrow schema.
### What changes are included in this PR?
Modified the `ListToSchemaField` and `MapToSchemaField` methods.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
It allows some legal schemas that were once considered illegal.
* GitHub Issue: #49114
Authored-by: Zehua Zou <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
cpp/src/parquet/arrow/arrow_schema_test.cc | 100 ++++++++++++----
cpp/src/parquet/arrow/schema.cc | 183 ++++++++++++++++-------------
2 files changed, 183 insertions(+), 100 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 721244fdbe..7d9ecb5e64 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -650,6 +650,32 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}
+ // Deep nested two-level encoding List<List<List<Integer>>>:
+ // optional group my_list (LIST) {
+ // repeated group array (LIST) {
+ // repeated group array (LIST) {
+ // repeated int32 array;
+ // }
+ // }
+ // }
+ {
+ auto inner_array =
+ PrimitiveNode::Make("array", Repetition::REPEATED,
ParquetType::INT32);
+ auto middle_array = GroupNode::Make("array", Repetition::REPEATED,
{inner_array},
+ ConvertedType::LIST);
+ auto outer_array = GroupNode::Make("array", Repetition::REPEATED,
{middle_array},
+ ConvertedType::LIST);
+ parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
+ {outer_array},
ConvertedType::LIST));
+ auto arrow_inner_array = ::arrow::field("array", INT32,
/*nullable=*/false);
+ auto arrow_middle_array = ::arrow::field(
+ "array", list_case.type_factory(arrow_inner_array),
/*nullable=*/false);
+ auto arrow_outer_array = ::arrow::field(
+ "array", list_case.type_factory(arrow_middle_array),
/*nullable=*/false);
+ auto arrow_list = list_case.type_factory(arrow_outer_array);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+ }
+
// List<Map<String, String>> in three-level list encoding:
// optional group my_list (LIST) {
// repeated group list {
@@ -682,6 +708,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
}
+ // List<Map<String, String>> in two-level list encoding:
+ //
+ // optional group my_list (LIST) {
+ // repeated group array (MAP) {
+ // repeated group key_value {
+ // required binary key (STRING);
+ // optional binary value (STRING);
+ // }
+ // }
+ // }
+ {
+ auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
+ ConvertedType::UTF8);
+ auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+ ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
+ auto key_value = GroupNode::Make("key_value", Repetition::REPEATED,
{key, value});
+ auto array =
+ GroupNode::Make("array", Repetition::REPEATED, {key_value},
ConvertedType::MAP);
+ parquet_fields.push_back(
+ GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+
+ auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+ auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+ auto arrow_element = ::arrow::field(
+ "array", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
+ /*nullable=*/false);
+ auto arrow_list = list_case.type_factory(arrow_element);
+ arrow_fields.push_back(::arrow::field("my_list", arrow_list,
/*nullable=*/true));
+ }
+
auto arrow_schema = ::arrow::schema(arrow_fields);
ArrowReaderProperties props;
@@ -845,34 +901,39 @@ TEST_F(TestConvertParquetSchema,
ParquetRepeatedNestedSchema) {
}
TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
- // List<Map<String, String>> in two-level list encoding:
+ // Two-level list-annotated group cannot be repeated
//
- // optional group my_list (LIST) {
- // repeated group array (MAP) {
- // repeated group key_value {
- // required binary key (STRING);
- // optional binary value (STRING);
- // }
- // }
+ // repeated group my_list (LIST) {
+ // repeated int32 array;
// }
{
- auto key = PrimitiveNode::Make("key", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY,
- ConvertedType::UTF8);
- auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
- ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
- auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key,
value});
- auto array =
- GroupNode::Make("array", Repetition::REPEATED, {key_value},
ConvertedType::MAP);
+ auto array = PrimitiveNode::Make("array", Repetition::REPEATED,
ParquetType::INT32);
std::vector<NodePtr> parquet_fields;
parquet_fields.push_back(
- GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
+ GroupNode::Make("my_list", Repetition::REPEATED, {array},
ConvertedType::LIST));
EXPECT_RAISES_WITH_MESSAGE_THAT(
- Invalid,
- testing::HasSubstr("Group with one repeated child must be
LIST-annotated."),
+ Invalid, testing::HasSubstr("LIST-annotated groups must not be
repeated."),
ConvertSchema(parquet_fields));
}
+ // Two-level list-annotated group cannot be repeated
+ //
+ // required group my_struct {
+ // repeated group my_list (LIST) {
+ // repeated int32 array;
+ // }
+ // }
+ {
+ auto array = PrimitiveNode::Make("array", Repetition::REPEATED,
ParquetType::INT32);
+ auto list =
+ GroupNode::Make("my_list", Repetition::REPEATED, {array},
ConvertedType::LIST);
+ std::vector<NodePtr> parquet_fields;
+ parquet_fields.push_back(GroupNode::Make("my_struct",
Repetition::REQUIRED, {list}));
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, testing::HasSubstr("LIST-annotated groups must not be
repeated."),
+ ConvertSchema(parquet_fields));
+ }
// List<List<String>>: outer list is two-level encoding, inner list is
three-level
//
// optional group my_list (LIST) {
@@ -913,8 +974,7 @@ TEST_F(TestConvertParquetSchema,
IllegalParquetNestedSchema) {
GroupNode::Make("my_list", Repetition::OPTIONAL, {array},
ConvertedType::LIST));
EXPECT_RAISES_WITH_MESSAGE_THAT(
- Invalid,
- testing::HasSubstr("LIST-annotated groups must have at least one
child."),
+ Invalid, testing::HasSubstr("Group must have at least one child."),
ConvertSchema(parquet_fields));
}
}
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index ed30661f9b..11d5d13e4b 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -28,6 +28,7 @@
#include "arrow/io/memory.h"
#include "arrow/ipc/api.h"
#include "arrow/result.h"
+#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/base64.h"
#include "arrow/util/checked_cast.h"
@@ -623,7 +624,10 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo
current_levels,
if (group.field_count() != 1) {
return Status::Invalid("MAP-annotated groups must have a single child.");
}
- if (group.is_repeated()) {
+ // MAP-annotated groups cannot be repeated unless served as an element
within a
+ // LIST-annotated group with 2-level structure.
+ if (group.is_repeated() &&
+ (group.parent() == nullptr ||
!group.parent()->logical_type()->is_list())) {
return Status::Invalid("MAP-annotated groups must not be repeated.");
}
@@ -654,7 +658,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo
current_levels,
return ListToSchemaField(group, current_levels, ctx, parent, out);
}
- current_levels.Increment(group);
+ if (group.is_optional()) {
+ current_levels.IncrementOptional();
+ }
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
out->children.resize(1);
@@ -697,41 +703,58 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo
current_levels,
return Status::OK();
}
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (group.field_count() != 1) {
- return Status::Invalid("LIST-annotated groups must have a single child.");
- }
- if (group.is_repeated()) {
- return Status::Invalid("LIST-annotated groups must not be repeated.");
- }
-
- current_levels.Increment(group);
+Status ResolveList(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, SchemaField* out) {
+ auto check_two_level_list_repetition = [](const GroupNode& group) -> Status {
+ // When it is repeated, the LIST-annotated 2-level structure can only
serve as an
+ // element within another LIST-annotated 2-level structure.
+ if (group.is_repeated() &&
+ (group.parent() == nullptr ||
!group.parent()->logical_type()->is_list())) {
+ return Status::Invalid("LIST-annotated groups must not be repeated.");
+ }
+ return {};
+ };
- out->children.resize(group.field_count());
SchemaField* child_field = &out->children[0];
-
- ctx->LinkParent(out, parent);
- ctx->LinkParent(child_field, out);
-
const Node& list_node = *group.field(0);
-
if (!list_node.is_repeated()) {
return Status::Invalid(
"Non-repeated nodes in a LIST-annotated group are not supported.");
}
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
if (list_node.is_group()) {
const auto& list_group = static_cast<const GroupNode&>(list_node);
if (list_group.field_count() > 1) {
// The inner type of the list should be a struct when there are multiple
fields
// in the repeated group
- RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out,
child_field));
- } else if (list_group.field_count() == 1) {
- const auto& repeated_field = list_group.field(0);
- if (repeated_field->is_repeated()) {
+ RETURN_NOT_OK(check_two_level_list_repetition(group));
+ return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+ }
+ if (list_group.field_count() == 0) {
+ return Status::Invalid("Group must have at least one child.");
+ }
+
+ if (list_group.logical_type()->is_none() && HasListElementName(list_group,
group)) {
+ // Rule 4 at
+ //
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
+ //
+ // required/optional group name=SOMETHING {
+ // repeated group name=array or $SOMETHING_tuple {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // The inner type of the list should be a struct rather than a primitive
value
+ //
+ // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+ RETURN_NOT_OK(check_two_level_list_repetition(group));
+ return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+ }
+
+ const auto& repeated_field = list_group.field(0);
+ if (!list_group.logical_type()->is_none() ||
repeated_field->is_repeated()) {
+ RETURN_NOT_OK(check_two_level_list_repetition(group));
+ if (list_group.logical_type()->is_list()) {
// Special case where the inner type might be a list with two-level
encoding
// like below:
//
@@ -742,69 +765,69 @@ Status ListToSchemaField(const GroupNode& group,
LevelInfo current_levels,
// }
//
// yields list<item: list<item: TYPE not null> not null> ?nullable
- if (!list_group.logical_type()->is_list()) {
- return Status::Invalid("Group with one repeated child must be
LIST-annotated.");
- }
- // LIST-annotated group with three-level encoding cannot be repeated.
- if (repeated_field->is_group()) {
- auto& repeated_group_field = static_cast<const
GroupNode&>(*repeated_field);
- if (repeated_group_field.field_count() == 0) {
- return Status::Invalid("LIST-annotated groups must have at least
one child.");
- }
- if (!repeated_group_field.field(0)->is_repeated()) {
- return Status::Invalid("LIST-annotated groups must not be
repeated.");
- }
- }
- RETURN_NOT_OK(
- NodeToSchemaField(*repeated_field, current_levels, ctx, out,
child_field));
- } else if (HasListElementName(list_group, group)) {
- // We distinguish the special case that we have
- //
- // required/optional group name=SOMETHING {
- // repeated group name=array or $SOMETHING_tuple {
- // required/optional TYPE item;
- // }
- // }
- //
- // The inner type of the list should be a struct rather than a
primitive value
- //
- // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
- RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out,
child_field));
+ return ListToSchemaField(list_group, current_levels, ctx, out,
child_field);
+ } else if (list_group.logical_type()->is_map()) {
+ return MapToSchemaField(list_group, current_levels, ctx, out,
child_field);
} else {
- // Resolve 3-level encoding
- //
- // required/optional group name=whatever {
- // repeated group name=list {
- // required/optional TYPE item;
- // }
- // }
- //
- // yields list<item: TYPE ?nullable> ?nullable
- RETURN_NOT_OK(
- NodeToSchemaField(*repeated_field, current_levels, ctx, out,
child_field));
+ return GroupToStruct(list_group, current_levels, ctx, out,
child_field);
}
- } else {
- return Status::Invalid("Group must have at least one child.");
}
- } else {
- // Two-level list encoding
+
+ // Resolve normal 3-level encoding
//
- // required/optional group LIST {
- // repeated TYPE;
+ // required/optional group name=whatever {
+ // repeated group name=list {
+ // required/optional TYPE item;
+ // }
// }
//
- // TYPE is a primitive type
- //
- // yields list<item: TYPE not null> ?nullable
- const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
- int column_index = ctx->schema->GetColumnIndex(primitive_node);
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
- GetTypeForNode(column_index, primitive_node, ctx));
- auto item_field = ::arrow::field(list_node.name(), type,
/*nullable=*/false,
- FieldIdMetadata(list_node.field_id()));
- RETURN_NOT_OK(
- PopulateLeaf(column_index, item_field, current_levels, ctx, out,
child_field));
+ // yields list<item: TYPE ?nullable> ?nullable
+ if (group.is_repeated()) {
+ return Status::Invalid("LIST-annotated groups must not be repeated.");
+ }
+
+ return NodeToSchemaField(*repeated_field, current_levels, ctx, out,
child_field);
+ }
+
+ // Two-level list encoding
+ //
+ // required/optional group LIST {
+ // repeated TYPE;
+ // }
+ //
+ // TYPE is a primitive type
+ //
+ // yields list<item: TYPE not null> ?nullable
+ RETURN_NOT_OK(check_two_level_list_repetition(group));
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
+ FieldIdMetadata(list_node.field_id()));
+ return PopulateLeaf(column_index, item_field, current_levels, ctx, out,
child_field);
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("LIST-annotated groups must have a single child.");
+ }
+
+ if (group.is_optional()) {
+ current_levels.IncrementOptional();
}
+
+ out->children.resize(group.field_count());
+ SchemaField* child_field = &out->children[0];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(child_field, out);
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ RETURN_NOT_OK(ResolveList(group, current_levels, ctx, out));
+
ARROW_ASSIGN_OR_RAISE(auto list_type,
MakeArrowList(child_field->field, ctx->properties));
out->field = ::arrow::field(group.name(), std::move(list_type),
group.is_optional(),