mathyingzhou commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r567596202
##########
File path: cpp/src/arrow/adapters/orc/adapter_test.cc
##########
@@ -157,4 +197,1930 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) {
EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
}
}
+
+// WriteORC tests
+
+// Trivial
+TEST(TestAdapterWriteTrivial, writeZeroRowsNoConversion) {  // Builds a 16-field table that has no rows and checks it against itself.
+ std::shared_ptr<Table> table = TableFromJSON(
+ schema({field("bool", boolean()), field("int8", int8()), field("int16",
int16()),
+ field("int32", int32()), field("int64", int64()), field("float",
float32()),
+ field("double", float64()), field("decimal128nz", decimal(25,
6)),
+ field("decimal128z", decimal(32, 0)), field("date32", date32()),
+ field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()),
+ field("binary", binary()),
+ field("struct", struct_({field("a", utf8()), field("b",
int64())})),
+ field("list", list(int32())),
+ field("lsl", list(struct_({field("lsl0", list(int32()))})))}),
+ {R"([])"});  // exactly one JSON chunk, and that chunk is an empty array
+ AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16);  // same table as input and expected; helper is defined outside this hunk -- presumably an ORC write-then-read comparison, TODO confirm
+}
+TEST(TestAdapterWriteTrivial, writeChunklessNoConversion) {  // Same 16-field schema as the zero-rows test, but built from an empty list of JSON chunks.
+ std::shared_ptr<Table> table = TableFromJSON(
+ schema({field("bool", boolean()), field("int8", int8()), field("int16",
int16()),
+ field("int32", int32()), field("int64", int64()), field("float",
float32()),
+ field("double", float64()), field("decimal128nz", decimal(25,
6)),
+ field("decimal128z", decimal(32, 0)), field("date32", date32()),
+ field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()),
+ field("binary", binary()),
+ field("struct", struct_({field("a", utf8()), field("b",
int64())})),
+ field("list", list(int32())),
+ field("lsl", list(struct_({field("lsl0", list(int32()))})))}),
+ {});  // no JSON chunks at all, as opposed to one chunk holding an empty array
+ AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16);  // same table as input and expected; helper is defined outside this hunk -- presumably an ORC write-then-read comparison, TODO confirm
+}
+TEST(TestAdapterWriteTrivial, writeZeroRowsWithConversion) {  // Zero-row case where the expected table has a different schema from the input table.
+ std::shared_ptr<Table>
+ input_table = TableFromJSON(  // input: date64, second/milli/micro timestamps, large and fixed-size string/binary/list types, and a map
+ schema({field("date64", date64()), field("ts0",
timestamp(TimeUnit::SECOND)),
+ field("ts1", timestamp(TimeUnit::MILLI)),
+ field("ts2", timestamp(TimeUnit::MICRO)),
+ field("large_string", large_utf8()),
+ field("large_binary", large_binary()),
+ field("fixed_size_binary0", fixed_size_binary(0)),
+ field("fixed_size_binary", fixed_size_binary(5)),
+ field("large_list", large_list(int32())),
+ field("fixed_size_list", fixed_size_list(int32(), 3)),
+ field("map", map(utf8(), utf8()))}),
+ {R"([])"}),  // exactly one JSON chunk, and that chunk is an empty array
+ expected_output_table = TableFromJSON(  // expected: nanosecond timestamps, utf8, binary, list, and list of key/value structs under the same field names
+ schema({field("date64", timestamp(TimeUnit::NANO)),
+ field("ts0", timestamp(TimeUnit::NANO)),
+ field("ts1", timestamp(TimeUnit::NANO)),
+ field("ts2", timestamp(TimeUnit::NANO)),
field("large_string", utf8()),
+ field("large_binary", binary()), field("fixed_size_binary0",
binary()),
+ field("fixed_size_binary", binary()),
+ field("large_list", list(int32())),
+ field("fixed_size_list", list(int32())),
+ field("map",
+ list(struct_({field("key", utf8()), field("value",
utf8())})))}),
+ {R"([])"});  // expected table is also built from a single empty-array chunk
+ AssertTableWriteReadEqual(input_table, expected_output_table,
+ kDefaultSmallMemStreamSize / 16);  // helper is defined outside this hunk -- presumably writes input_table to ORC and compares the read-back table with expected_output_table, TODO confirm
+}
+TEST(TestAdapterWriteTrivial, writeChunklessWithConversion) {  // Same input and expected schemas as the zero-rows conversion test, but both tables are built from an empty list of JSON chunks.
+ std::shared_ptr<Table>
+ input_table = TableFromJSON(  // input: date64, second/milli/micro timestamps, large and fixed-size string/binary/list types, and a map
+ schema({field("date64", date64()), field("ts0",
timestamp(TimeUnit::SECOND)),
+ field("ts1", timestamp(TimeUnit::MILLI)),
+ field("ts2", timestamp(TimeUnit::MICRO)),
+ field("large_string", large_utf8()),
+ field("large_binary", large_binary()),
+ field("fixed_size_binary0", fixed_size_binary(0)),
+ field("fixed_size_binary", fixed_size_binary(5)),
+ field("large_list", large_list(int32())),
+ field("fixed_size_list", fixed_size_list(int32(), 3)),
+ field("map", map(utf8(), utf8()))}),
+ {}),  // no JSON chunks at all
+ expected_output_table = TableFromJSON(  // expected: nanosecond timestamps, utf8, binary, list, and list of key/value structs under the same field names
+ schema({field("date64", timestamp(TimeUnit::NANO)),
+ field("ts0", timestamp(TimeUnit::NANO)),
+ field("ts1", timestamp(TimeUnit::NANO)),
+ field("ts2", timestamp(TimeUnit::NANO)),
field("large_string", utf8()),
+ field("large_binary", binary()), field("fixed_size_binary0",
binary()),
+ field("fixed_size_binary", binary()),
+ field("large_list", list(int32())),
+ field("fixed_size_list", list(int32())),
+ field("map",
+ list(struct_({field("key", utf8()), field("value",
utf8())})))}),
+ {});  // expected table is also built with no JSON chunks
+ AssertTableWriteReadEqual(input_table, expected_output_table,
+ kDefaultSmallMemStreamSize / 16);  // helper is defined outside this hunk -- presumably writes input_table to ORC and compares the read-back table with expected_output_table, TODO confirm
+}
+
+// General
+TEST(TestAdapterWriteGeneral, writeAllNullsNew) {  // Builds an 11-column table with five generated chunks per column and checks it against itself.
+ std::vector<std::shared_ptr<Field>> table_fields{
+ field("bool", boolean()),
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ field("int64", int64()),
+ field("decimal128nz", decimal(33, 4)),
+ field("decimal128z", decimal(35, 0)),
+ field("date32", date32()),
+ field("ts3", timestamp(TimeUnit::NANO)),
+ field("string", utf8()),
+ field("binary", binary())};
+ std::shared_ptr<Schema> table_schema = schema(table_fields);
+ arrow::random::RandomArrayGenerator rand(kRandomSeed);  // generator is seeded with the kRandomSeed constant, which is defined outside this hunk
+
+ int64_t num_rows = 10000;
+ int64_t numCols = table_fields.size();
+
+ ArrayMatrix arrays(numCols, ArrayVector(5, NULLPTR));  // one row of 5 chunk slots per column, every slot initially NULLPTR
+ for (int i = 0; i < numCols; i++) {
+ for (int j = 0; j < 5; j++) {
+ int row_count = j % 2 ? 0 : num_rows / 2;  // odd-numbered chunks are empty; chunks 0, 2 and 4 each get num_rows / 2 rows
+ arrays[i][j] = rand.ArrayOf(table_fields[i]->type(), row_count, 1);  // NOTE(review): the trailing 1 is presumably the null probability (all values null, matching the test name) -- confirm against RandomArrayGenerator::ArrayOf
+ }
+ }
+
+ ChunkedArrayVector cv;
+ cv.reserve(numCols);
+
+ for (int col = 0; col < numCols; col++) {
+ cv.push_back(std::make_shared<ChunkedArray>(arrays[col]));  // wrap each column's five chunks in one ChunkedArray
+ }
+
+ std::shared_ptr<Table> table = Table::Make(table_schema, cv);
+ AssertTableWriteReadEqual(table, table);  // same table as input and expected; unlike the trivial tests, no third argument is passed here
+}
+
+TEST(TestAdapterWriteGeneral, writeAllNulls) {
+ std::vector<std::shared_ptr<Field>> table_fields{
+ field("bool", boolean()),
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ field("int64", int64()),
+ field("decimal128nz", decimal(33, 4)),
+ field("decimal128z", decimal(35, 0)),
+ field("date32", date32()),
+ field("ts3", timestamp(TimeUnit::NANO)),
+ field("string", utf8()),
+ field("binary", binary())};
+ std::shared_ptr<Schema> table_schema =
std::make_shared<Schema>(table_fields);
+
+ int64_t num_rows = 10000;
+ int64_t numCols = table_fields.size();
+
+ ArrayBuilderMatrix builders(numCols, ArrayBuilderVector(5, NULLPTR));
+
+ for (int i = 0; i < 5; i++) {
+ builders[0][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>());
+ builders[1][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>());
+ builders[2][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>());
+ builders[3][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>());
+ builders[4][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>());
+ builders[5][i] = std::static_pointer_cast<ArrayBuilder>(
+ std::make_shared<Decimal128Builder>(decimal(33, 4)));
+ builders[6][i] = std::static_pointer_cast<ArrayBuilder>(
+ std::make_shared<Decimal128Builder>(decimal(35, 0)));
Review comment:
Yup. ORC does not support it at the moment, so we can't. I can file an
ORC ticket, though.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]