pitrou commented on a change in pull request #7973:
URL: https://github.com/apache/arrow/pull/7973#discussion_r477352114
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repitition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ // How many slots a null element consumes.
+ // This is only ever >1 for descendents of
+ // FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to indicate this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater then or equal to this field indicate a present
+ // , possibly null, element.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // definition_level).
+ int16_t rep_level = 0;
+
+ // The definition level indicating the level at which the closest
+ // repeated ancestor was not empty. This is used to discriminate
+ // between a value less than |definition_level|
+ // being null or excluded entirely.
+ // For instance if we have an arrow schema like:
+ // list(struct(f0: int)). Then then there are the following
+ // definition levels:
+ // 0 = null list
+ // 1 = present but empty list.
+ // 2 = a null value in the list
+ // 3 = a non null struct but null integer.
+ // 4 = a present integer.
Review comment:
+1, thanks for this example!
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repitition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ // How many slots a null element consumes.
+ // This is only ever >1 for descendents of
+ // FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to indicate this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater then or equal to this field indicate a present
+ // , possibly null, element.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // definition_level).
+ int16_t rep_level = 0;
+
+ // The definition level indicating the level at which the closest
+ // repeated ancestor was not empty. This is used to discriminate
Review comment:
Do you mean logical ancestor (in Arrow terms)? Or physical ancestor (in
Parquet nesting)?
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repitition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ // How many slots a null element consumes.
+ // This is only ever >1 for descendents of
+ // FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to indicate this value indicate a not-null
Review comment:
"or equal to this value"?
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repitition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ // How many slots a null element consumes.
+ // This is only ever >1 for descendents of
+ // FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to indicate this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater then or equal to this field indicate a present
+ // , possibly null, element.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // definition_level).
Review comment:
Do you mean "def_level"? Or "repeated_ancestor_def_level" perhaps?
##########
File path: cpp/src/parquet/arrow/schema.cc
##########
@@ -477,12 +484,11 @@ Status ListToSchemaField(const GroupNode& group, int16_t
current_def_level,
const Node& list_node = *group.field(0);
if (!list_node.is_repeated()) {
- return Status::NotImplemented(
+ return Status::Invalid(
"Non-repeated nodes in a LIST-annotated group are not supported.");
}
- ++current_def_level;
- ++current_rep_level;
+ int16_t repeated_ancesor_def_level = current_levels.IncrementRepeated();
Review comment:
"ancestor"?
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels(
+ const SchemaManifest& manifest, int column_number) {
+ std::deque<LevelInfo> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front(field->level_info);
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
+ /*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
/*def_level=*/1,
+ /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: list(bool not null) not null
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); // primitive
field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ // Arrow schema: struct(child: struct(inner: boolean not null))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean ))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean)) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ // Arrow schema: list(struct(child: struct(list(bool not null) not null))
non null) not
+ // null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional child
struct
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter field
Review comment:
"inner"
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels(
+ const SchemaManifest& manifest, int column_number) {
+ std::deque<LevelInfo> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front(field->level_info);
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
+ /*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
/*def_level=*/1,
+ /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: list(bool not null) not null
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); // primitive
field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ // Arrow schema: struct(child: struct(inner: boolean not null))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean ))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean)) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ // Arrow schema: list(struct(child: struct(list(bool not null) not null))
non null) not
+ // null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional child
struct
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter field
+
+ // Arrow schema: list(struct(child_list: list(struct(f0: bool f1: bool
no-required )))
+ // not null) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ LogicalType::List())}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_level=2 is handled together with def_level=3
+ // When decoding. Def_level=2 indicate present but empty
+ // list. def_level=3 indicates a present element in the
+ // list.
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner struct
field
+
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/5,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool field
Review comment:
Hmm, ok, now I don't understand why 4 `LevelInfo` entries are returned. I would
expect 5:
* for outer `list`
* for outer `list(struct)`
* for `list(struct(child: list))`
* for `list(struct(child: list(struct)))`
* for `list(struct(child: list(struct: f0)))`
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
Review comment:
"repetition_level"
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repitition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ // How many slots a null element consumes.
+ // This is only ever >1 for descendents of
+ // FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to indicate this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater then or equal to this field indicate a present
+ // , possibly null, element.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // definition_level).
+ int16_t rep_level = 0;
+
+ // The definition level indicating the level at which the closest
+ // repeated ancestor was not empty. This is used to discriminate
+ // between a value less than |definition_level|
+ // being null or excluded entirely.
+ // For instance if we have an arrow schema like:
+ // list(struct(f0: int)). Then then there are the following
+ // definition levels:
+ // 0 = null list
+ // 1 = present but empty list.
+ // 2 = a null value in the list
+ // 3 = a non null struct but null integer.
+ // 4 = a present integer.
+ // When reconstructing the struct and integer Array's
+ // repeated_ancestor_def_level would be 2. Any
Review comment:
The sentence is not clear here; could you rephrase it?
Do you mean something like:
> When reconstructing the struct and integer, their
`repeated_ancestor_def_level` would be 2.
##########
File path: cpp/src/parquet/level_conversion.h
##########
@@ -20,10 +20,117 @@
#include <cstdint>
#include "parquet/platform.h"
+#include "parquet/schema.h"
namespace parquet {
namespace internal {
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0),
repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t
repitition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repitition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ // How many slots a null element consumes.
Review comment:
In other words, is it the number of definition levels per null element?
(I assume "slot" doesn't mean "value slot", because nulls are not encoded in
values.)
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels(
+ const SchemaManifest& manifest, int column_number) {
+ std::deque<LevelInfo> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front(field->level_info);
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
+ /*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
/*def_level=*/1,
+ /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: list(bool not null) not null
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); // primitive
field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ // Arrow schema: struct(child: struct(inner: boolean not null))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean ))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean)) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ // Arrow schema: list(struct(child: struct(list(bool not null) not null))
non null) not
Review comment:
The outer struct is nullable and there seem to be too many nesting
levels, so I'd suggest:
```
list(struct(child: list(bool not null) not null)) not null
```
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels(
+ const SchemaManifest& manifest, int column_number) {
+ std::deque<LevelInfo> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front(field->level_info);
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
+ /*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
/*def_level=*/1,
+ /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: list(bool not null) not null
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); // primitive
field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ // Arrow schema: struct(child: struct(inner: boolean not null))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean ))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean)) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
Review comment:
Sorry to ask more, but can we also have simpler tests with a single
top-level list field?
(e.g. `list(bool not null)`, `list(bool)`, `list(bool) not null`...)
Once we have that, it's easier to reason about the more complicated ones.
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels(
+ const SchemaManifest& manifest, int column_number) {
+ std::deque<LevelInfo> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front(field->level_info);
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
+ /*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
/*def_level=*/1,
+ /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: list(bool not null) not null
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); // primitive
field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ // Arrow schema: struct(child: struct(inner: boolean not null))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean ))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean)) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ // Arrow schema: list(struct(child: struct(list(bool not null) not null))
non null) not
+ // null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional child
struct
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter field
+
+ // Arrow schema: list(struct(child_list: list(struct(f0: bool f1: bool
no-required )))
+ // not null) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ LogicalType::List())}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_level=2 is handled together with def_level=3
+ // When decoding. Def_level=2 indicate present but empty
+ // list. def_level=3 indicates a present element in the
+ // list.
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner struct
field
+
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/5,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_level=2 is handled together with def_level=3
+ // When decoding. Def_level=2 indicate present but empty
+ // list. def_level=3 indicates a present element in the
+ // list.
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner struct
field
+
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/4,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool field
+
+ // Arrow schema: list(struct(child_list: list(bool not null)) not null) not
null
+ // Legacy 2-level necoding (required for backwards compatibility. See
Review comment:
Typo: "necoding" should be "encoding".
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1144,244 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+::arrow::Result<std::deque<LevelInfo>> RootToTreeLeafLevels(
+ const SchemaManifest& manifest, int column_number) {
+ std::deque<LevelInfo> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front(field->level_info);
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
+ /*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(LevelInfo{/*null_slot_usage=*/1,
/*def_level=*/1,
+ /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: list(bool not null) not null
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); // primitive
field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ // Arrow schema: struct(child: struct(inner: boolean not null))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean ))
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ // Arrow schema: struct(child: struct(inner: boolean)) not null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/0,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ // Arrow schema: list(struct(child: struct(list(bool not null) not null))
non null) not
+ // null
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<LevelInfo> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(
+ levels,
+ ElementsAre(LevelInfo{/*null_slot_usage=*/1, /*def_level=*/1,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/2,
/*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional child
struct
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated field
+ LevelInfo{/*null_slot_usage=*/1, /*def_level=*/3,
/*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter field
+
+ // Arrow schema: list(struct(child_list: list(struct(f0: bool f1: bool
no-required )))
+ // not null) not null
Review comment:
The outer struct seems nullable. Also, can we use "not null" everywhere
instead of "no-required"?
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // innter
field
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level necoding
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // Def_ldevl=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
+}
+
+TEST_F(TestLevels, ListErrors) {
+ {
+ ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
+ "child_list", Repetition::REPEATED,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST));
+ EXPECT_TRUE(error.IsInvalid());
+ std::string expected("LIST-annotated groups must not be repeated.");
+ EXPECT_EQ(error.message().substr(0, expected.size()), expected);
+ }
Review comment:
However, I see this kind of comment in `path_internal_test.cc`:
```
// Translates to parquet schema:
// optional group bag {
// repeated group [unseen] (List) {
// required int64 Entries;
// }
// }
```
Should the List annotation be on the top-level "bag" node, rather than on the
middle "[unseen]" node?
(Also, what does "[unseen]" mean? Shouldn't it be named "list" as per the
Parquet spec?)
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {
+ int16_t def_level;
+ int16_t rep_level;
+ int16_t repeated_ancestor_def;
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));
+ while (field != nullptr) {
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {
+ public:
+ virtual void SetUp() {}
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());
+ }
+ void SetParquetSchema(const NodePtr& column) {
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;
+ std::unique_ptr<SchemaManifest> manifest_;
+};
+
+TEST_F(TestLevels, TestPrimitive) {
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {  // Checks the root-to-leaf levels reported for schemas whose top-level group is REPEATED.
+ SetParquetSchema(GroupNode::Make(  // Case 1: repeated parent > optional child > repeated bool leaf.
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
field
+
+ SetParquetSchema(GroupNode::Make(  // Case 2: repeated parent > optional LIST-annotated group > repeated "list" > optional "element" struct {f0 optional, f1 required}.
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // def_level=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // def_level=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level encoding
+ SetParquetSchema(GroupNode::Make(  // Case 3: the LIST-annotated group directly contains the repeated bool, with no intermediate "list"/"element" groups.
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // def_level=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
+}
+
+TEST_F(TestLevels, ListErrors) {
+ {
+ ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
+ "child_list", Repetition::REPEATED,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST));
+ EXPECT_TRUE(error.IsInvalid());
+ std::string expected("LIST-annotated groups must not be repeated.");
+ EXPECT_EQ(error.message().substr(0, expected.size()), expected);
+ }
Review comment:
Ok, I read the spec now, I understand better :-)
##########
File path: cpp/src/parquet/arrow/arrow_schema_test.cc
##########
@@ -1140,5 +1143,231 @@ TEST(TestFromParquetSchema, CorruptMetadata) {
ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props,
&arrow_schema));
}
+struct Levels {  // Test-only triple of level values; one instance is recorded per schema field by RootToTreeLeafLevels.
+ int16_t def_level;  // filled from SchemaField::definition_level
+ int16_t rep_level;  // filled from SchemaField::repetition_level
+ int16_t repeated_ancestor_def;  // filled from SchemaField::repeated_ancestor_definition_level
+ friend std::ostream& operator<<(std::ostream& os, const Levels& levels) {
+ // This print method exists to silence valgrind issues. The exact text
+ // printed is not important because all assertions compare the
+ // members directly.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def << "}";
+ return os;
+ }
+};
+
+bool operator==(const Levels& a, const Levels& b) {  // Member-wise equality: true only when all three level fields match.
+ return a.def_level == b.def_level && a.rep_level == b.rep_level &&
+ a.repeated_ancestor_def == b.repeated_ancestor_def;
+}
+
+::arrow::Result<std::deque<Levels>> RootToTreeLeafLevels(const SchemaManifest&
manifest,
+ int column_number) {  // Collects the Levels of the field for column `column_number` and of each of its ancestors, outermost ancestor first.
+ std::deque<Levels> out;
+ const SchemaField* field;
+ RETURN_NOT_OK(manifest.GetColumnField(column_number, &field));  // NOTE(review): presumably returns the failed Status to the caller - confirm macro semantics.
+ while (field != nullptr) {  // Walk upwards from the leaf until GetParent yields nullptr.
+ out.push_front({field->definition_level, field->repetition_level,
+ field->repeated_ancestor_definition_level});  // push_front keeps ancestors ahead of their descendants.
+ field = manifest.GetParent(field);
+ }
+ return out;
+}
+
+class TestLevels : public ::testing::Test {  // Fixture that builds a SchemaManifest from a single Parquet column node.
+ public:
+ virtual void SetUp() {}  // Nothing to prepare; each test installs its own schema.
+
+ ::arrow::Status MaybeSetParquetSchema(const NodePtr& column) {  // Replaces descriptor_ and manifest_ with fresh objects built from `column`; returns the Status of SchemaManifest::Make.
+ descriptor_.reset(new SchemaDescriptor());
+ manifest_.reset(new SchemaManifest());
+ descriptor_->Init(GroupNode::Make("root", Repetition::REQUIRED, {column}));  // `column` becomes the only child of a REQUIRED group named "root".
+ return SchemaManifest::Make(descriptor_.get(),
+ std::shared_ptr<const
::arrow::KeyValueMetadata>(),
+ ArrowReaderProperties(), manifest_.get());  // Empty metadata pointer and default-constructed reader properties.
+ }
+ void SetParquetSchema(const NodePtr& column) {  // Same as MaybeSetParquetSchema, but checks the result with ASSERT_OK.
+ ASSERT_OK(MaybeSetParquetSchema(column));
+ }
+
+ protected:
+ std::unique_ptr<SchemaDescriptor> descriptor_;  // Passed to SchemaManifest::Make as a raw pointer.
+ std::unique_ptr<SchemaManifest> manifest_;  // Manifest the tests query through RootToTreeLeafLevels.
+};
+
+TEST_F(TestLevels, TestPrimitive) {  // Checks the levels of a single top-level BOOLEAN column declared REQUIRED, then OPTIONAL, then REPEATED.
+ SetParquetSchema(  // Case 1: REQUIRED primitive.
+ PrimitiveNode::Make("node_name", Repetition::REQUIRED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+ SetParquetSchema(  // Case 2: OPTIONAL primitive.
+ PrimitiveNode::Make("node_name", Repetition::OPTIONAL,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(  // Case 3: REPEATED primitive; two entries are expected (list field plus primitive field).
+ PrimitiveNode::Make("node_name", Repetition::REPEATED,
ParquetType::BOOLEAN));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 0}, // List Field
+ Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1})); //
primitive field
+}
+
+TEST_F(TestLevels, TestSimpleGroups) {  // Checks levels through two nested non-repeated groups; every expected rep_level and ancestor level is 0.
+ SetParquetSchema(GroupNode::Make(  // Case 1: optional parent > optional child > required leaf.
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REQUIRED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(  // Case 2: optional parent > optional child > optional leaf.
+ "parent", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/3, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+
+ SetParquetSchema(GroupNode::Make(  // Case 3: required parent > optional child > optional leaf.
+ "parent", Repetition::REQUIRED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::OPTIONAL,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels, ElementsAre(Levels{/*def_level=*/0, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/1, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0},
+ Levels{/*def_level=*/2, /*rep_level=*/0,
+ /*ancestor_list_def_level*/ 0}));
+}
+
+TEST_F(TestLevels, TestRepeatedGroups) {  // Checks the root-to-leaf levels reported for schemas whose top-level group is REPEATED.
+ SetParquetSchema(GroupNode::Make(  // Case 1: repeated parent > optional child > repeated bool leaf.
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("inner", Repetition::REPEATED,
ParquetType::BOOLEAN)})}));
+ ASSERT_OK_AND_ASSIGN(std::deque<Levels> levels,
+ RootToTreeLeafLevels(*manifest_, /*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ Levels{/*def_level=*/2, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1}, // optional
child struct
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // repeated
field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
field
+
+ SetParquetSchema(GroupNode::Make(  // Case 2: repeated parent > optional LIST-annotated group > repeated "list" > optional "element" struct {f0 optional, f1 required}.
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {GroupNode::Make(
+ "list", Repetition::REPEATED,
+ {GroupNode::Make(
+ "element", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("f0", Repetition::OPTIONAL,
ParquetType::BOOLEAN),
+ PrimitiveNode::Make("f1", Repetition::REQUIRED,
+ ParquetType::BOOLEAN)})})},
+ ConvertedType::LIST)}));
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // def_level=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/5, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f0 bool
field
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/1));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // def_level=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3}, // inner
struct field
+
+ Levels{/*def_level=*/4, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // f1 bool
field
+
+ // Legacy 2-level encoding
+ SetParquetSchema(GroupNode::Make(  // Case 3: the LIST-annotated group directly contains the repeated bool, with no intermediate "list"/"element" groups.
+ "parent", Repetition::REPEATED,
+ {GroupNode::Make(
+ "child_list", Repetition::OPTIONAL,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST)}));
+
+ ASSERT_OK_AND_ASSIGN(levels, RootToTreeLeafLevels(*manifest_,
/*column_number=*/0));
+ EXPECT_THAT(levels,
+ ElementsAre(Levels{/*def_level=*/1, /*rep_level=*/1,
+ /*ancestor_list_def_level*/ 1},
+ // def_level=2 is skipped because it represents a
null list.
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 1}, // list field
+ Levels{/*def_level=*/3, /*rep_level=*/2,
+ /*ancestor_list_def_level*/ 3})); // inner
struct field
+}
+
+TEST_F(TestLevels, ListErrors) {
+ {
+ ::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
+ "child_list", Repetition::REPEATED,
+ {PrimitiveNode::Make("bool", Repetition::REPEATED,
ParquetType::BOOLEAN)},
+ ConvertedType::LIST));
+ EXPECT_TRUE(error.IsInvalid());
+ std::string expected("LIST-annotated groups must not be repeated.");
+ EXPECT_EQ(error.message().substr(0, expected.size()), expected);
Review comment:
I see, thank you.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org