This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0cf32b23c3 GH-49114: [C++][Parquet] Fix converting schema failure with 
deep nested two-level encoding list structure (#49125)
0cf32b23c3 is described below

commit 0cf32b23c361d174742befc201617b56040fc095
Author: Zehua Zou <[email protected]>
AuthorDate: Sat Feb 21 13:39:37 2026 +0800

    GH-49114: [C++][Parquet] Fix converting schema failure with deep nested 
two-level encoding list structure (#49125)
    
    ### Rationale for this change
    
    Fix the failure when converting parquet schema with deep nested two-level 
encoding list structure to arrow schema.
    
    ### What changes are included in this PR?
    
    Modified the `ListToSchemaField` and `MapToSchemaField` methods.
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    It allows some legal schemas that were once considered illegal.
    * GitHub Issue: #49114
    
    Authored-by: Zehua Zou <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 cpp/src/parquet/arrow/arrow_schema_test.cc | 100 ++++++++++++----
 cpp/src/parquet/arrow/schema.cc            | 183 ++++++++++++++++-------------
 2 files changed, 183 insertions(+), 100 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc 
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 721244fdbe..7d9ecb5e64 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -650,6 +650,32 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
       arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
     }
 
+    // Deep nested two-level encoding List<List<List<Integer>>>:
+    // optional group my_list (LIST) {
+    //   repeated group array (LIST) {
+    //     repeated group array (LIST) {
+    //       repeated int32 array;
+    //     }
+    //   }
+    // }
+    {
+      auto inner_array =
+          PrimitiveNode::Make("array", Repetition::REPEATED, 
ParquetType::INT32);
+      auto middle_array = GroupNode::Make("array", Repetition::REPEATED, 
{inner_array},
+                                          ConvertedType::LIST);
+      auto outer_array = GroupNode::Make("array", Repetition::REPEATED, 
{middle_array},
+                                         ConvertedType::LIST);
+      parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
+                                               {outer_array}, 
ConvertedType::LIST));
+      auto arrow_inner_array = ::arrow::field("array", INT32, 
/*nullable=*/false);
+      auto arrow_middle_array = ::arrow::field(
+          "array", list_case.type_factory(arrow_inner_array), 
/*nullable=*/false);
+      auto arrow_outer_array = ::arrow::field(
+          "array", list_case.type_factory(arrow_middle_array), 
/*nullable=*/false);
+      auto arrow_list = list_case.type_factory(arrow_outer_array);
+      arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
+    }
+
     // List<Map<String, String>> in three-level list encoding:
     // optional group my_list (LIST) {
     //   repeated group list {
@@ -682,6 +708,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
       arrow_fields.push_back(::arrow::field("my_list", arrow_list, 
/*nullable=*/true));
     }
 
+    // List<Map<String, String>> in two-level list encoding:
+    //
+    // optional group my_list (LIST) {
+    //   repeated group array (MAP) {
+    //     repeated group key_value {
+    //       required binary key (STRING);
+    //       optional binary value (STRING);
+    //     }
+    //   }
+    // }
+    {
+      auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, 
ParquetType::BYTE_ARRAY,
+                                     ConvertedType::UTF8);
+      auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
+                                       ParquetType::BYTE_ARRAY, 
ConvertedType::UTF8);
+      auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, 
{key, value});
+      auto array =
+          GroupNode::Make("array", Repetition::REPEATED, {key_value}, 
ConvertedType::MAP);
+      parquet_fields.push_back(
+          GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, 
ConvertedType::LIST));
+
+      auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
+      auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
+      auto arrow_element = ::arrow::field(
+          "array", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
+          /*nullable=*/false);
+      auto arrow_list = list_case.type_factory(arrow_element);
+      arrow_fields.push_back(::arrow::field("my_list", arrow_list, 
/*nullable=*/true));
+    }
+
     auto arrow_schema = ::arrow::schema(arrow_fields);
 
     ArrowReaderProperties props;
@@ -845,34 +901,39 @@ TEST_F(TestConvertParquetSchema, 
ParquetRepeatedNestedSchema) {
 }
 
 TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
-  // List<Map<String, String>> in two-level list encoding:
+  // Two-level list-annotated group cannot be repeated
   //
-  // optional group my_list (LIST) {
-  //   repeated group array (MAP) {
-  //     repeated group key_value {
-  //       required binary key (STRING);
-  //       optional binary value (STRING);
-  //     }
-  //   }
+  // repeated group my_list (LIST) {
+  //   repeated int32 array;
   // }
   {
-    auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, 
ParquetType::BYTE_ARRAY,
-                                   ConvertedType::UTF8);
-    auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
-                                     ParquetType::BYTE_ARRAY, 
ConvertedType::UTF8);
-    auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, 
value});
-    auto array =
-        GroupNode::Make("array", Repetition::REPEATED, {key_value}, 
ConvertedType::MAP);
+    auto array = PrimitiveNode::Make("array", Repetition::REPEATED, 
ParquetType::INT32);
     std::vector<NodePtr> parquet_fields;
     parquet_fields.push_back(
-        GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, 
ConvertedType::LIST));
+        GroupNode::Make("my_list", Repetition::REPEATED, {array}, 
ConvertedType::LIST));
 
     EXPECT_RAISES_WITH_MESSAGE_THAT(
-        Invalid,
-        testing::HasSubstr("Group with one repeated child must be 
LIST-annotated."),
+        Invalid, testing::HasSubstr("LIST-annotated groups must not be 
repeated."),
         ConvertSchema(parquet_fields));
   }
+  // Two-level list-annotated group cannot be repeated
+  //
+  // required group my_struct {
+  //   repeated group my_list (LIST) {
+  //     repeated int32 array;
+  //   }
+  // }
+  {
+    auto array = PrimitiveNode::Make("array", Repetition::REPEATED, 
ParquetType::INT32);
+    auto list =
+        GroupNode::Make("my_list", Repetition::REPEATED, {array}, 
ConvertedType::LIST);
+    std::vector<NodePtr> parquet_fields;
+    parquet_fields.push_back(GroupNode::Make("my_struct", 
Repetition::REQUIRED, {list}));
 
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, testing::HasSubstr("LIST-annotated groups must not be 
repeated."),
+        ConvertSchema(parquet_fields));
+  }
   // List<List<String>>: outer list is two-level encoding, inner list is 
three-level
   //
   // optional group my_list (LIST) {
@@ -913,8 +974,7 @@ TEST_F(TestConvertParquetSchema, 
IllegalParquetNestedSchema) {
         GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, 
ConvertedType::LIST));
 
     EXPECT_RAISES_WITH_MESSAGE_THAT(
-        Invalid,
-        testing::HasSubstr("LIST-annotated groups must have at least one 
child."),
+        Invalid, testing::HasSubstr("Group must have at least one child."),
         ConvertSchema(parquet_fields));
   }
 }
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index ed30661f9b..11d5d13e4b 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -28,6 +28,7 @@
 #include "arrow/io/memory.h"
 #include "arrow/ipc/api.h"
 #include "arrow/result.h"
+#include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/base64.h"
 #include "arrow/util/checked_cast.h"
@@ -623,7 +624,10 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   if (group.field_count() != 1) {
     return Status::Invalid("MAP-annotated groups must have a single child.");
   }
-  if (group.is_repeated()) {
+  // MAP-annotated groups cannot be repeated unless served as an element 
within a
+  // LIST-annotated group with 2-level structure.
+  if (group.is_repeated() &&
+      (group.parent() == nullptr || 
!group.parent()->logical_type()->is_list())) {
     return Status::Invalid("MAP-annotated groups must not be repeated.");
   }
 
@@ -654,7 +658,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
     return ListToSchemaField(group, current_levels, ctx, parent, out);
   }
 
-  current_levels.Increment(group);
+  if (group.is_optional()) {
+    current_levels.IncrementOptional();
+  }
   int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
 
   out->children.resize(1);
@@ -697,41 +703,58 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo 
current_levels,
   return Status::OK();
 }
 
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
-                         SchemaTreeContext* ctx, const SchemaField* parent,
-                         SchemaField* out) {
-  if (group.field_count() != 1) {
-    return Status::Invalid("LIST-annotated groups must have a single child.");
-  }
-  if (group.is_repeated()) {
-    return Status::Invalid("LIST-annotated groups must not be repeated.");
-  }
-
-  current_levels.Increment(group);
+Status ResolveList(const GroupNode& group, LevelInfo current_levels,
+                   SchemaTreeContext* ctx, SchemaField* out) {
+  auto check_two_level_list_repetition = [](const GroupNode& group) -> Status {
+    // When it is repeated, the LIST-annotated 2-level structure can only 
serve as an
+    // element within another LIST-annotated 2-level structure.
+    if (group.is_repeated() &&
+        (group.parent() == nullptr || 
!group.parent()->logical_type()->is_list())) {
+      return Status::Invalid("LIST-annotated groups must not be repeated.");
+    }
+    return {};
+  };
 
-  out->children.resize(group.field_count());
   SchemaField* child_field = &out->children[0];
-
-  ctx->LinkParent(out, parent);
-  ctx->LinkParent(child_field, out);
-
   const Node& list_node = *group.field(0);
-
   if (!list_node.is_repeated()) {
     return Status::Invalid(
         "Non-repeated nodes in a LIST-annotated group are not supported.");
   }
 
-  int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
   if (list_node.is_group()) {
     const auto& list_group = static_cast<const GroupNode&>(list_node);
     if (list_group.field_count() > 1) {
       // The inner type of the list should be a struct when there are multiple 
fields
       // in the repeated group
-      RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, 
child_field));
-    } else if (list_group.field_count() == 1) {
-      const auto& repeated_field = list_group.field(0);
-      if (repeated_field->is_repeated()) {
+      RETURN_NOT_OK(check_two_level_list_repetition(group));
+      return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+    }
+    if (list_group.field_count() == 0) {
+      return Status::Invalid("Group must have at least one child.");
+    }
+
+    if (list_group.logical_type()->is_none() && HasListElementName(list_group, 
group)) {
+      // Rule 4 at
+      // 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
+      //
+      // required/optional group name=SOMETHING {
+      //   repeated group name=array or $SOMETHING_tuple {
+      //     required/optional TYPE item;
+      //   }
+      // }
+      //
+      // The inner type of the list should be a struct rather than a primitive 
value
+      //
+      // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+      RETURN_NOT_OK(check_two_level_list_repetition(group));
+      return GroupToStruct(list_group, current_levels, ctx, out, child_field);
+    }
+
+    const auto& repeated_field = list_group.field(0);
+    if (!list_group.logical_type()->is_none() || 
repeated_field->is_repeated()) {
+      RETURN_NOT_OK(check_two_level_list_repetition(group));
+      if (list_group.logical_type()->is_list()) {
         // Special case where the inner type might be a list with two-level 
encoding
         // like below:
         //
@@ -742,69 +765,69 @@ Status ListToSchemaField(const GroupNode& group, 
LevelInfo current_levels,
         // }
         //
         // yields list<item: list<item: TYPE not null> not null> ?nullable
-        if (!list_group.logical_type()->is_list()) {
-          return Status::Invalid("Group with one repeated child must be 
LIST-annotated.");
-        }
-        // LIST-annotated group with three-level encoding cannot be repeated.
-        if (repeated_field->is_group()) {
-          auto& repeated_group_field = static_cast<const 
GroupNode&>(*repeated_field);
-          if (repeated_group_field.field_count() == 0) {
-            return Status::Invalid("LIST-annotated groups must have at least 
one child.");
-          }
-          if (!repeated_group_field.field(0)->is_repeated()) {
-            return Status::Invalid("LIST-annotated groups must not be 
repeated.");
-          }
-        }
-        RETURN_NOT_OK(
-            NodeToSchemaField(*repeated_field, current_levels, ctx, out, 
child_field));
-      } else if (HasListElementName(list_group, group)) {
-        // We distinguish the special case that we have
-        //
-        // required/optional group name=SOMETHING {
-        //   repeated group name=array or $SOMETHING_tuple {
-        //     required/optional TYPE item;
-        //   }
-        // }
-        //
-        // The inner type of the list should be a struct rather than a 
primitive value
-        //
-        // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
-        RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, 
child_field));
+        return ListToSchemaField(list_group, current_levels, ctx, out, 
child_field);
+      } else if (list_group.logical_type()->is_map()) {
+        return MapToSchemaField(list_group, current_levels, ctx, out, 
child_field);
       } else {
-        // Resolve 3-level encoding
-        //
-        // required/optional group name=whatever {
-        //   repeated group name=list {
-        //     required/optional TYPE item;
-        //   }
-        // }
-        //
-        // yields list<item: TYPE ?nullable> ?nullable
-        RETURN_NOT_OK(
-            NodeToSchemaField(*repeated_field, current_levels, ctx, out, 
child_field));
+        return GroupToStruct(list_group, current_levels, ctx, out, 
child_field);
       }
-    } else {
-      return Status::Invalid("Group must have at least one child.");
     }
-  } else {
-    // Two-level list encoding
+
+    // Resolve normal 3-level encoding
     //
-    // required/optional group LIST {
-    //   repeated TYPE;
+    // required/optional group name=whatever {
+    //   repeated group name=list {
+    //     required/optional TYPE item;
+    //   }
     // }
     //
-    // TYPE is a primitive type
-    //
-    // yields list<item: TYPE not null> ?nullable
-    const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
-    int column_index = ctx->schema->GetColumnIndex(primitive_node);
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
-                          GetTypeForNode(column_index, primitive_node, ctx));
-    auto item_field = ::arrow::field(list_node.name(), type, 
/*nullable=*/false,
-                                     FieldIdMetadata(list_node.field_id()));
-    RETURN_NOT_OK(
-        PopulateLeaf(column_index, item_field, current_levels, ctx, out, 
child_field));
+    // yields list<item: TYPE ?nullable> ?nullable
+    if (group.is_repeated()) {
+      return Status::Invalid("LIST-annotated groups must not be repeated.");
+    }
+
+    return NodeToSchemaField(*repeated_field, current_levels, ctx, out, 
child_field);
+  }
+
+  // Two-level list encoding
+  //
+  // required/optional group LIST {
+  //   repeated TYPE;
+  // }
+  //
+  // TYPE is a primitive type
+  //
+  // yields list<item: TYPE not null> ?nullable
+  RETURN_NOT_OK(check_two_level_list_repetition(group));
+  const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
+  int column_index = ctx->schema->GetColumnIndex(primitive_node);
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+                        GetTypeForNode(column_index, primitive_node, ctx));
+  auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
+                                   FieldIdMetadata(list_node.field_id()));
+  return PopulateLeaf(column_index, item_field, current_levels, ctx, out, 
child_field);
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+                         SchemaTreeContext* ctx, const SchemaField* parent,
+                         SchemaField* out) {
+  if (group.field_count() != 1) {
+    return Status::Invalid("LIST-annotated groups must have a single child.");
+  }
+
+  if (group.is_optional()) {
+    current_levels.IncrementOptional();
   }
+
+  out->children.resize(group.field_count());
+  SchemaField* child_field = &out->children[0];
+
+  ctx->LinkParent(out, parent);
+  ctx->LinkParent(child_field, out);
+
+  int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+  RETURN_NOT_OK(ResolveList(group, current_levels, ctx, out));
+
   ARROW_ASSIGN_OR_RAISE(auto list_type,
                         MakeArrowList(child_field->field, ctx->properties));
   out->field = ::arrow::field(group.name(), std::move(list_type), 
group.is_optional(),

Reply via email to