stiga-huang commented on a change in pull request #1024:
URL: https://github.com/apache/orc/pull/1024#discussion_r795264673
##########
File path: c++/test/TestReader.cc
##########
@@ -250,4 +252,206 @@ namespace orc {
EXPECT_EQ(3, intArrayArrayArrayBatch.offsets.data()[1]);
EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements.get());
}
+
+ /**
+ * Read TestOrcFile.nestedMap.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_map:map<string,string>,
+ * nested_map:map<string,map<string,map<string,string>>>
+ * >
+ */
+ void verifyMapSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
+ const std::vector<uint32_t>& expectedSelection) {
+ std::string fileName = "TestOrcFile.nestedMap.orc";
+ std::unique_ptr<Reader> reader = createExampleReader(fileName);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testMapAll) {
+ // select all of single_map.
+ verifyMapSelection({{2, ReadIntent_ALL}}, {0, 2, 3, 4});
+ }
+
+ TEST(TestReadIntent, testMapOffsets) {
+ // select only the offsets of single_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}}, {0, 2});
+
+ // select only the offsets of single_map and the outermost offsets of
nested_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}}, {0,
2, 5});
+
+ // select the entire offsets of nested_map without the map items of the
innermost map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_OFFSETS}}, {0,
5, 7, 9});
+ }
+
+ TEST(TestReadIntent, testMapAllAndOffsets) {
+ // select all of single_map and only the outermost offsets of nested_map.
+ verifyMapSelection({{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2,
3, 4, 5});
+ }
+
+ TEST(TestReadIntent, testMapConflictingIntent) {
+ // test conflicting ReadIntent on nested_map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_ALL}}, {0, 5,
7, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_ALL}, {9, ReadIntent_OFFSETS}}, {0, 5,
6, 7, 8, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {7, ReadIntent_ALL}, {9,
ReadIntent_OFFSETS}},
+ {0, 5, 7, 8, 9, 10, 11});
+ }
+
+ TEST(TestReadIntent, testMapRowBatchContent) {
+ std::unique_ptr<Reader> reader =
createExampleReader("TestOrcFile.nestedMap.orc");
+
+ // select all of single_map and only the offsets of nested_map.
+ RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL},
+ {5,
ReadIntent_OFFSETS}};
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Read a row batch.
+ std::unique_ptr<ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(1, batch->numElements);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+ // verify content of int_array selection.
Review comment:
`int_array` should be `single_map`
##########
File path: c++/test/TestReader.cc
##########
@@ -250,4 +252,206 @@ namespace orc {
EXPECT_EQ(3, intArrayArrayArrayBatch.offsets.data()[1]);
EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements.get());
}
+
+ /**
+ * Read TestOrcFile.nestedMap.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_map:map<string,string>,
+ * nested_map:map<string,map<string,map<string,string>>>
+ * >
+ */
+ void verifyMapSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
+ const std::vector<uint32_t>& expectedSelection) {
+ std::string fileName = "TestOrcFile.nestedMap.orc";
+ std::unique_ptr<Reader> reader = createExampleReader(fileName);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testMapAll) {
+ // select all of single_map.
+ verifyMapSelection({{2, ReadIntent_ALL}}, {0, 2, 3, 4});
+ }
+
+ TEST(TestReadIntent, testMapOffsets) {
+ // select only the offsets of single_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}}, {0, 2});
+
+ // select only the offsets of single_map and the outermost offsets of
nested_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}}, {0,
2, 5});
+
+ // select the entire offsets of nested_map without the map items of the
innermost map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_OFFSETS}}, {0,
5, 7, 9});
+ }
+
+ TEST(TestReadIntent, testMapAllAndOffsets) {
+ // select all of single_map and only the outermost offsets of nested_map.
+ verifyMapSelection({{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2,
3, 4, 5});
+ }
+
+ TEST(TestReadIntent, testMapConflictingIntent) {
+ // test conflicting ReadIntent on nested_map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_ALL}}, {0, 5,
7, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_ALL}, {9, ReadIntent_OFFSETS}}, {0, 5,
6, 7, 8, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {7, ReadIntent_ALL}, {9,
ReadIntent_OFFSETS}},
+ {0, 5, 7, 8, 9, 10, 11});
+ }
+
+ TEST(TestReadIntent, testMapRowBatchContent) {
+ std::unique_ptr<Reader> reader =
createExampleReader("TestOrcFile.nestedMap.orc");
+
+ // select all of single_map and only the offsets of nested_map.
+ RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL},
+ {5,
ReadIntent_OFFSETS}};
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Read a row batch.
+ std::unique_ptr<ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(1, batch->numElements);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+ // verify content of int_array selection.
+ auto& mapBatch = dynamic_cast<MapVectorBatch&>(*structBatch.fields[0]);
+ auto& keyBatch = dynamic_cast<StringVectorBatch&>(*mapBatch.keys);
+ auto& valueBatch = dynamic_cast<StringVectorBatch&>(*mapBatch.elements);
+ EXPECT_EQ(1, mapBatch.numElements);
+ EXPECT_EQ(0, mapBatch.offsets.data()[0]);
+ EXPECT_EQ(1, mapBatch.offsets.data()[1]);
+ // verify key content.
+ EXPECT_EQ(1, keyBatch.numElements);
+ EXPECT_EQ(2, keyBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("k0", keyBatch.data.data()[0], 2));
+ // verify value content.
+ EXPECT_EQ(1, valueBatch.numElements);
+ EXPECT_EQ(2, valueBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("v0", valueBatch.data.data()[0], 2));
+
+ // verify content of nested_map selection.
+ auto& nestedMapBatch =
dynamic_cast<MapVectorBatch&>(*structBatch.fields[1]);
+ EXPECT_EQ(1, nestedMapBatch.numElements);
+ EXPECT_EQ(0, nestedMapBatch.offsets.data()[0]);
+ EXPECT_EQ(1, nestedMapBatch.offsets.data()[1]);
+ EXPECT_EQ(nullptr, nestedMapBatch.keys.get());
+ EXPECT_EQ(nullptr, nestedMapBatch.elements.get());
+ }
+
+ /**
+ * Read TestOrcFile.nestedUnion.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_union:uniontype<int,string>,
+ * nested_union:uniontype<uniontype<int,uniontype<int,string>>,int>
+ * >
+ */
+ void verifyUnionSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
Review comment:
Same as for `verifyMapSelection`: can we merge this function with
`verifySelection`?
##########
File path: c++/test/TestReader.cc
##########
@@ -250,4 +252,206 @@ namespace orc {
EXPECT_EQ(3, intArrayArrayArrayBatch.offsets.data()[1]);
EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements.get());
}
+
+ /**
+ * Read TestOrcFile.nestedMap.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
Review comment:
typo: `fie` should be `file`
##########
File path: c++/src/Reader.cc
##########
@@ -120,10 +120,24 @@ namespace orc {
bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const
Type& type) {
size_t id = static_cast<size_t>(type.getColumnId());
bool result = selectedColumns[id];
+ uint64_t numSubtypeSelected = 0;
for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
- result |= selectParents(selectedColumns, *type.getSubtype(c));
+ if (selectParents(selectedColumns, *type.getSubtype(c))) {
+ result = true;
+ numSubtypeSelected++;
+ }
}
selectedColumns[id] = result;
+
+ if (type.getKind() == TypeKind::UNION && selectedColumns[id]) {
+ if (0 < numSubtypeSelected && numSubtypeSelected <
type.getSubtypeCount()) {
+ // Subtypes of UNION should be fully selected or not selected at all.
+ // Override partial subtype selections with full selections.
+ for (uint64_t c = 0; c < type.getSubtypeCount(); ++c) {
Review comment:
Can we use `selectChildren(selectedColumns, type)` directly?
##########
File path: c++/test/TestReader.cc
##########
@@ -250,4 +252,206 @@ namespace orc {
EXPECT_EQ(3, intArrayArrayArrayBatch.offsets.data()[1]);
EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements.get());
}
+
+ /**
+ * Read TestOrcFile.nestedMap.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_map:map<string,string>,
+ * nested_map:map<string,map<string,map<string,string>>>
+ * >
+ */
+ void verifyMapSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
+ const std::vector<uint32_t>& expectedSelection) {
+ std::string fileName = "TestOrcFile.nestedMap.orc";
+ std::unique_ptr<Reader> reader = createExampleReader(fileName);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testMapAll) {
+ // select all of single_map.
+ verifyMapSelection({{2, ReadIntent_ALL}}, {0, 2, 3, 4});
+ }
+
+ TEST(TestReadIntent, testMapOffsets) {
+ // select only the offsets of single_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}}, {0, 2});
+
+ // select only the offsets of single_map and the outermost offsets of
nested_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}}, {0,
2, 5});
+
+ // select the entire offsets of nested_map without the map items of the
innermost map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_OFFSETS}}, {0,
5, 7, 9});
+ }
+
+ TEST(TestReadIntent, testMapAllAndOffsets) {
+ // select all of single_map and only the outermost offsets of nested_map.
+ verifyMapSelection({{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2,
3, 4, 5});
+ }
+
+ TEST(TestReadIntent, testMapConflictingIntent) {
+ // test conflicting ReadIntent on nested_map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_ALL}}, {0, 5,
7, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_ALL}, {9, ReadIntent_OFFSETS}}, {0, 5,
6, 7, 8, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {7, ReadIntent_ALL}, {9,
ReadIntent_OFFSETS}},
+ {0, 5, 7, 8, 9, 10, 11});
+ }
+
+ TEST(TestReadIntent, testMapRowBatchContent) {
+ std::unique_ptr<Reader> reader =
createExampleReader("TestOrcFile.nestedMap.orc");
+
+ // select all of single_map and only the offsets of nested_map.
+ RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL},
+ {5,
ReadIntent_OFFSETS}};
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Read a row batch.
+ std::unique_ptr<ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(1, batch->numElements);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+ // verify content of int_array selection.
+ auto& mapBatch = dynamic_cast<MapVectorBatch&>(*structBatch.fields[0]);
+ auto& keyBatch = dynamic_cast<StringVectorBatch&>(*mapBatch.keys);
+ auto& valueBatch = dynamic_cast<StringVectorBatch&>(*mapBatch.elements);
+ EXPECT_EQ(1, mapBatch.numElements);
+ EXPECT_EQ(0, mapBatch.offsets.data()[0]);
+ EXPECT_EQ(1, mapBatch.offsets.data()[1]);
+ // verify key content.
+ EXPECT_EQ(1, keyBatch.numElements);
+ EXPECT_EQ(2, keyBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("k0", keyBatch.data.data()[0], 2));
+ // verify value content.
+ EXPECT_EQ(1, valueBatch.numElements);
+ EXPECT_EQ(2, valueBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("v0", valueBatch.data.data()[0], 2));
+
+ // verify content of nested_map selection.
+ auto& nestedMapBatch =
dynamic_cast<MapVectorBatch&>(*structBatch.fields[1]);
+ EXPECT_EQ(1, nestedMapBatch.numElements);
+ EXPECT_EQ(0, nestedMapBatch.offsets.data()[0]);
+ EXPECT_EQ(1, nestedMapBatch.offsets.data()[1]);
+ EXPECT_EQ(nullptr, nestedMapBatch.keys.get());
+ EXPECT_EQ(nullptr, nestedMapBatch.elements.get());
+ }
+
+ /**
+ * Read TestOrcFile.nestedUnion.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_union:uniontype<int,string>,
+ * nested_union:uniontype<uniontype<int,uniontype<int,string>>,int>
+ * >
+ */
+ void verifyUnionSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
+ const std::vector<uint32_t>& expectedSelection) {
+ std::string fileName = "TestOrcFile.nestedUnion.orc";
+ std::unique_ptr<Reader> reader = createExampleReader(fileName);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testUnionAll) {
+ // select all of single_union.
+ verifyUnionSelection({{2, ReadIntent_ALL}}, {0, 2, 3, 4});
+ }
+
+ TEST(TestReadIntent, testUnionOffsets) {
+ // select only the offsets of single_union.
+ verifyUnionSelection({{2, ReadIntent_OFFSETS}}, {0, 2});
+
+ // select only the offsets of single_union and the outermost offsets of
nested_union.
+ verifyUnionSelection({{2, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}},
{0, 2, 5});
+
+ // select only the offsets of single_union and the innermost offsets of
nested_union.
+ verifyUnionSelection({{2, ReadIntent_OFFSETS}, {8, ReadIntent_OFFSETS}},
+ {0, 2, 5, 6, 7, 8, 11});
+ }
+
+ TEST(TestReadIntent, testUnionAllAndOffsets) {
+ // select all of single_union and only the outermost offsets of
nested_union.
+ verifyUnionSelection({{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0,
2, 3, 4, 5});
+ }
+
+ TEST(TestReadIntent, testUnionConflictingIntent) {
+ // test conflicting ReadIntent on nested_union.
+ verifyUnionSelection({{5, ReadIntent_OFFSETS}, {8, ReadIntent_ALL}},
+ {0, 5, 6, 7, 8, 9, 10, 11});
+ verifyUnionSelection({{5, ReadIntent_ALL}, {8, ReadIntent_OFFSETS}},
+ {0, 5, 6, 7, 8, 9, 10, 11});
+ verifyUnionSelection({{5, ReadIntent_OFFSETS}, {6, ReadIntent_ALL}, {8,
ReadIntent_OFFSETS}},
+ {0, 5, 6, 7, 8, 9, 10, 11});
+ }
+
+ TEST(TestReadIntent, testUnionRowBatchContent) {
+ std::unique_ptr<Reader> reader =
createExampleReader("TestOrcFile.nestedUnion.orc");
+
+ // select all of single_union and only the offsets of nested_union.
+ RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL},
+ {5,
ReadIntent_OFFSETS}};
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Read a row batch.
+ std::unique_ptr<ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(4, batch->numElements);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+ // verify content of int_array selection.
+ auto& unionBatch = dynamic_cast<UnionVectorBatch&>(*structBatch.fields[0]);
+ EXPECT_EQ(4, unionBatch.numElements);
+ EXPECT_EQ(2, unionBatch.children.size());
+ auto& longBatch = dynamic_cast<LongVectorBatch&>(*unionBatch.children[0]);
+ auto& stringBatch =
dynamic_cast<StringVectorBatch&>(*unionBatch.children[1]);
+ EXPECT_EQ(3, longBatch.numElements);
+ EXPECT_EQ(1, stringBatch.numElements);
+ // verify content of the first row.
+ EXPECT_EQ(0, unionBatch.tags.data()[0]);
+ EXPECT_EQ(0, unionBatch.offsets.data()[0]);
+ EXPECT_EQ(0, longBatch.data.data()[0]);
+ // verify content of the second row.
+ EXPECT_EQ(1, unionBatch.tags.data()[1]);
+ EXPECT_EQ(0, unionBatch.offsets.data()[1]);
+ EXPECT_EQ(2, stringBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("s1", stringBatch.data.data()[0], 2));
Review comment:
I feel like it's more readable if we create the test ORC files on the
fly. E.g.
https://github.com/apache/orc/blob/0622002bd140d7bd88af0ae71d73fd1b3a101c17/c%2B%2B/test/TestPredicatePushdown.cc#L29
Otherwise, each developer has to manually dump the file content to
understand these assertions. So I think we should try creating the ORC files in tests
unless we can't (e.g. for corrupt files). Creating the ORC files on the fly also
reduces the repo size. But I'm not strongly opposed to the current approach.
@dongjoon-hyun Could you share your opinion?
##########
File path: c++/test/TestReader.cc
##########
@@ -250,4 +252,206 @@ namespace orc {
EXPECT_EQ(3, intArrayArrayArrayBatch.offsets.data()[1]);
EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements.get());
}
+
+ /**
+ * Read TestOrcFile.nestedMap.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_map:map<string,string>,
+ * nested_map:map<string,map<string,map<string,string>>>
+ * >
+ */
+ void verifyMapSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
Review comment:
The implementation duplicates `verifySelection` except for the
`fileName`. Can we merge them into one?
##########
File path: c++/test/TestReader.cc
##########
@@ -250,4 +252,206 @@ namespace orc {
EXPECT_EQ(3, intArrayArrayArrayBatch.offsets.data()[1]);
EXPECT_EQ(nullptr, intArrayArrayArrayBatch.elements.get());
}
+
+ /**
+ * Read TestOrcFile.nestedMap.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_map:map<string,string>,
+ * nested_map:map<string,map<string,map<string,string>>>
+ * >
+ */
+ void verifyMapSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
+ const std::vector<uint32_t>& expectedSelection) {
+ std::string fileName = "TestOrcFile.nestedMap.orc";
+ std::unique_ptr<Reader> reader = createExampleReader(fileName);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testMapAll) {
+ // select all of single_map.
+ verifyMapSelection({{2, ReadIntent_ALL}}, {0, 2, 3, 4});
+ }
+
+ TEST(TestReadIntent, testMapOffsets) {
+ // select only the offsets of single_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}}, {0, 2});
+
+ // select only the offsets of single_map and the outermost offsets of
nested_map.
+ verifyMapSelection({{2, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}}, {0,
2, 5});
+
+ // select the entire offsets of nested_map without the map items of the
innermost map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_OFFSETS}}, {0,
5, 7, 9});
+ }
+
+ TEST(TestReadIntent, testMapAllAndOffsets) {
+ // select all of single_map and only the outermost offsets of nested_map.
+ verifyMapSelection({{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0, 2,
3, 4, 5});
+ }
+
+ TEST(TestReadIntent, testMapConflictingIntent) {
+ // test conflicting ReadIntent on nested_map.
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {9, ReadIntent_ALL}}, {0, 5,
7, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_ALL}, {9, ReadIntent_OFFSETS}}, {0, 5,
6, 7, 8, 9, 10, 11});
+ verifyMapSelection({{5, ReadIntent_OFFSETS}, {7, ReadIntent_ALL}, {9,
ReadIntent_OFFSETS}},
+ {0, 5, 7, 8, 9, 10, 11});
+ }
+
+ TEST(TestReadIntent, testMapRowBatchContent) {
+ std::unique_ptr<Reader> reader =
createExampleReader("TestOrcFile.nestedMap.orc");
+
+ // select all of single_map and only the offsets of nested_map.
+ RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL},
+ {5,
ReadIntent_OFFSETS}};
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Read a row batch.
+ std::unique_ptr<ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(1, batch->numElements);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+ // verify content of int_array selection.
+ auto& mapBatch = dynamic_cast<MapVectorBatch&>(*structBatch.fields[0]);
+ auto& keyBatch = dynamic_cast<StringVectorBatch&>(*mapBatch.keys);
+ auto& valueBatch = dynamic_cast<StringVectorBatch&>(*mapBatch.elements);
+ EXPECT_EQ(1, mapBatch.numElements);
+ EXPECT_EQ(0, mapBatch.offsets.data()[0]);
+ EXPECT_EQ(1, mapBatch.offsets.data()[1]);
+ // verify key content.
+ EXPECT_EQ(1, keyBatch.numElements);
+ EXPECT_EQ(2, keyBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("k0", keyBatch.data.data()[0], 2));
+ // verify value content.
+ EXPECT_EQ(1, valueBatch.numElements);
+ EXPECT_EQ(2, valueBatch.length.data()[0]);
+ EXPECT_EQ(0, strncmp("v0", valueBatch.data.data()[0], 2));
+
+ // verify content of nested_map selection.
+ auto& nestedMapBatch =
dynamic_cast<MapVectorBatch&>(*structBatch.fields[1]);
+ EXPECT_EQ(1, nestedMapBatch.numElements);
+ EXPECT_EQ(0, nestedMapBatch.offsets.data()[0]);
+ EXPECT_EQ(1, nestedMapBatch.offsets.data()[1]);
+ EXPECT_EQ(nullptr, nestedMapBatch.keys.get());
+ EXPECT_EQ(nullptr, nestedMapBatch.elements.get());
+ }
+
+ /**
+ * Read TestOrcFile.nestedUnion.orc and verify the resolved selections.
+ *
+ * The ORC fie has the following schema:
+ * struct<
+ * id:int,
+ * single_union:uniontype<int,string>,
+ * nested_union:uniontype<uniontype<int,uniontype<int,string>>,int>
+ * >
+ */
+ void verifyUnionSelection(const RowReaderOptions::IdReadIntentMap&
idReadIntentMap,
+ const std::vector<uint32_t>& expectedSelection) {
+ std::string fileName = "TestOrcFile.nestedUnion.orc";
+ std::unique_ptr<Reader> reader = createExampleReader(fileName);
+
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+ std::vector<bool> expected(reader->getType().getMaximumColumnId() + 1,
false);
+ for (auto id : expectedSelection) {
+ expected[id] = true;
+ }
+ ASSERT_THAT(rowReader->getSelectedColumns(), ElementsAreArray(expected));
+ }
+
+ TEST(TestReadIntent, testUnionAll) {
+ // select all of single_union.
+ verifyUnionSelection({{2, ReadIntent_ALL}}, {0, 2, 3, 4});
+ }
+
+ TEST(TestReadIntent, testUnionOffsets) {
+ // select only the offsets of single_union.
+ verifyUnionSelection({{2, ReadIntent_OFFSETS}}, {0, 2});
+
+ // select only the offsets of single_union and the outermost offsets of
nested_union.
+ verifyUnionSelection({{2, ReadIntent_OFFSETS}, {5, ReadIntent_OFFSETS}},
{0, 2, 5});
+
+ // select only the offsets of single_union and the innermost offsets of
nested_union.
+ verifyUnionSelection({{2, ReadIntent_OFFSETS}, {8, ReadIntent_OFFSETS}},
+ {0, 2, 5, 6, 7, 8, 11});
+ }
+
+ TEST(TestReadIntent, testUnionAllAndOffsets) {
+ // select all of single_union and only the outermost offsets of
nested_union.
+ verifyUnionSelection({{2, ReadIntent_ALL}, {5, ReadIntent_OFFSETS}}, {0,
2, 3, 4, 5});
+ }
+
+ TEST(TestReadIntent, testUnionConflictingIntent) {
+ // test conflicting ReadIntent on nested_union.
+ verifyUnionSelection({{5, ReadIntent_OFFSETS}, {8, ReadIntent_ALL}},
+ {0, 5, 6, 7, 8, 9, 10, 11});
+ verifyUnionSelection({{5, ReadIntent_ALL}, {8, ReadIntent_OFFSETS}},
+ {0, 5, 6, 7, 8, 9, 10, 11});
+ verifyUnionSelection({{5, ReadIntent_OFFSETS}, {6, ReadIntent_ALL}, {8,
ReadIntent_OFFSETS}},
+ {0, 5, 6, 7, 8, 9, 10, 11});
+ }
+
+ TEST(TestReadIntent, testUnionRowBatchContent) {
+ std::unique_ptr<Reader> reader =
createExampleReader("TestOrcFile.nestedUnion.orc");
+
+ // select all of single_union and only the offsets of nested_union.
+ RowReaderOptions::IdReadIntentMap idReadIntentMap = {{2, ReadIntent_ALL},
+ {5,
ReadIntent_OFFSETS}};
+ RowReaderOptions rowReaderOpts;
+ rowReaderOpts.includeTypesWithIntents(idReadIntentMap);
+ std::unique_ptr<RowReader> rowReader =
reader->createRowReader(rowReaderOpts);
+
+ // Read a row batch.
+ std::unique_ptr<ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
+ EXPECT_TRUE(rowReader->next(*batch));
+ EXPECT_EQ(4, batch->numElements);
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+ // verify content of int_array selection.
Review comment:
`int_array` should be `single_union`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]