Not really. So what’s really going on?!

TEST(TestAdapterWriteNested, writeList) {
  std::shared_ptr<Schema> table_schema = schema({field("list", list(int32()))});
  int64_t num_rows = 10000;
  arrow::random::RandomArrayGenerator rand(kRandomSeed);
  auto value_array = rand.ArrayOf(int32(), 5 * num_rows, 0.6);
  std::shared_ptr<Array> array = rand.List(*value_array, num_rows + 1, 0.8);
  std::shared_ptr<ChunkedArray> chunked_array = std::make_shared<ChunkedArray>(array);
  std::shared_ptr<Table> table = Table::Make(table_schema, {chunked_array});

  std::shared_ptr<io::BufferOutputStream> buffer_output_stream =
      io::BufferOutputStream::Create(kDefaultSmallMemStreamSize * 15).ValueOrDie();
  std::unique_ptr<adapters::orc::ORCFileWriter> writer =
      adapters::orc::ORCFileWriter::Open(*buffer_output_stream).ValueOrDie();
  ARROW_EXPECT_OK(writer->Write(*table));
  ARROW_EXPECT_OK(writer->Close());
  std::shared_ptr<Buffer> buffer = buffer_output_stream->Finish().ValueOrDie();
  std::shared_ptr<io::RandomAccessFile> in_stream(new io::BufferReader(buffer));
  std::unique_ptr<adapters::orc::ORCFileReader> reader;
  ARROW_EXPECT_OK(
      adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader));
  std::shared_ptr<Table> actual_output_table;
  ARROW_EXPECT_OK(reader->Read(&actual_output_table));
  auto actual_array =
      std::static_pointer_cast<ListArray>(actual_output_table->column(0)->chunk(0));
  auto expected_array = std::static_pointer_cast<ListArray>(table->column(0)->chunk(0));
  AssertArraysEqual(*(actual_array->offsets()), *(expected_array->offsets()));
  AssertArraysEqual(*(actual_array->values()), *(expected_array->values()));
  AssertBufferEqual(*(actual_array->null_bitmap()), *(expected_array->null_bitmap()));
  ASSERT_TRUE(actual_array->type()->Equals(*(expected_array->type())));
  RecordProperty("output_type", actual_array->type()->ToString());
  RecordProperty("input_type", expected_array->type()->ToString());
  RecordProperty("array_equality", actual_array->Equals(*expected_array));
}

<testcase name="writeList" status="run" result="completed" time="0.029" timestamp="2021-02-10T12:33:47" classname="TestAdapterWriteNested">
  <properties>
    <property name="output_type" value="list<item: int32>"/>
    <property name="input_type" value="list<item: int32>"/>
    <property name="array_equality" value="0"/>
  </properties>
</testcase>

> On Feb 10, 2021, at 12:10 PM, Antoine Pitrou <anto...@python.org> wrote:
>
>
> Hmm, perhaps the types are unequal, then. Can you print them out
> (including field metadata)?
> > > Le 10/02/2021 à 18:03, Ying Zhou a écrit : >> Thanks! Now we have an even weirder phenomenon. Even the null bitmaps and >> offsets are equal. However the arrays aren’t! Does anyone know why? >> >> TEST(TestAdapterWriteNested, writeList) { >> std::shared_ptr<Schema> table_schema = schema({field("list", >> list(int32()))}); >> int64_t num_rows = 10000; >> arrow::random::RandomArrayGenerator rand(kRandomSeed); >> auto value_array = rand.ArrayOf(int32(), 5 * num_rows, 0.6); >> std::shared_ptr<Array> array = rand.List(*value_array, num_rows + 1, 0.8); >> std::shared_ptr<ChunkedArray> chunked_array = >> std::make_shared<ChunkedArray>(array); >> std::shared_ptr<Table> table = Table::Make(table_schema, {chunked_array}); >> >> std::shared_ptr<io::BufferOutputStream> buffer_output_stream = >> io::BufferOutputStream::Create(kDefaultSmallMemStreamSize * >> 15).ValueOrDie(); >> std::unique_ptr<adapters::orc::ORCFileWriter> writer = >> adapters::orc::ORCFileWriter::Open(*buffer_output_stream).ValueOrDie(); >> ARROW_EXPECT_OK(writer->Write(*table)); >> ARROW_EXPECT_OK(writer->Close()); >> std::shared_ptr<Buffer> buffer = >> buffer_output_stream->Finish().ValueOrDie(); >> std::shared_ptr<io::RandomAccessFile> in_stream(new >> io::BufferReader(buffer)); >> std::unique_ptr<adapters::orc::ORCFileReader> reader; >> ARROW_EXPECT_OK( >> adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), >> &reader)); >> std::shared_ptr<Table> actual_output_table; >> ARROW_EXPECT_OK(reader->Read(&actual_output_table)); >> auto actual_array = >> >> std::static_pointer_cast<ListArray>(actual_output_table->column(0)->chunk(0)); >> auto expected_array = >> std::static_pointer_cast<ListArray>(table->column(0)->chunk(0)); >> AssertArraysEqual(*(actual_array->offsets()), *(expected_array->offsets())); >> AssertArraysEqual(*(actual_array->values()), *(expected_array->values())); >> AssertBufferEqual(*(actual_array->null_bitmap()), >> *(expected_array->null_bitmap())); >> 
RecordProperty("array_equality", actual_array->Equals(*expected_array)); >> } >> >> <testcase name="writeList" status="run" result="completed" time="0.028" >> timestamp="2021-02-10T11:58:23" classname="TestAdapterWriteNested"> >> <properties> >> <property name="array_equality" value="0"/> >> </properties> >> </testcase> >> >>> On Feb 10, 2021, at 3:52 AM, Antoine Pitrou <anto...@python.org> wrote: >>> >>> >>> Hi Ying, >>> >>> Hmm, yes, this may be related to the null bitmaps, or the offsets. >>> Can you try to inspect or pretty-print the offsets arrays for the two >>> list arrays? >>> >>> Regards >>> >>> Antoine. >>> >>> >>> Le 10/02/2021 à 03:26, Ying Zhou a écrit : >>>> Hi, >>>> >>>> This is an extremely weird phenomenon. There are two 2*1 tables that are >>>> supposedly different when I got a confusing error message like this: >>>> >>>> [ RUN ] TestAdapterWriteNested.writeList >>>> /Users/karlkatzen/Documents/code/arrow-dev/arrow/cpp/src/arrow/testing/gtest_util.cc:459: >>>> Failure >>>> Failed >>>> Unequal at absolute position 2 >>>> Expected: >>>> [ >>>> [ >>>> null, >>>> 1074834796, >>>> null, >>>> null >>>> ], >>>> null >>>> ] >>>> Actual: >>>> [ >>>> [ >>>> null, >>>> 1074834796, >>>> null, >>>> null >>>> ], >>>> null >>>> ] >>>> [ FAILED ] TestAdapterWriteNested.writeList (2 ms) >>>> >>>> Here is the code that causes the issue: >>>> >>>> TEST(TestAdapterWriteNested, writeList) { >>>> std::shared_ptr<Schema> table_schema = schema({field("list", >>>> list(int32()))}); >>>> int64_t num_rows = 2; >>>> arrow::random::RandomArrayGenerator rand(kRandomSeed); >>>> auto value_array = rand.ArrayOf(int32(), 2 * num_rows, 0.6); >>>> std::shared_ptr<Array> array = rand.List(*value_array, num_rows + 1, 1); >>>> std::shared_ptr<ChunkedArray> chunked_array = >>>> std::make_shared<ChunkedArray>(array); >>>> std::shared_ptr<Table> table = Table::Make(table_schema, {chunked_array}); >>>> AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize * 5); >>>> } >>>> 
>>>> Here AssertTableWriteReadEqual is a function I use to test that >>>> from_orc(to_orc(table_in)) == expected_table_out. The function did not >>>> have issues before. >>>> >>>> void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table, >>>> const std::shared_ptr<Table>& >>>> expected_output_table, >>>> const int64_t max_size = >>>> kDefaultSmallMemStreamSize) { >>>> std::shared_ptr<io::BufferOutputStream> buffer_output_stream = >>>> io::BufferOutputStream::Create(max_size).ValueOrDie(); >>>> std::unique_ptr<adapters::orc::ORCFileWriter> writer = >>>> adapters::orc::ORCFileWriter::Open(*buffer_output_stream).ValueOrDie(); >>>> ARROW_EXPECT_OK(writer->Write(*input_table)); >>>> ARROW_EXPECT_OK(writer->Close()); >>>> std::shared_ptr<Buffer> buffer = >>>> buffer_output_stream->Finish().ValueOrDie(); >>>> std::shared_ptr<io::RandomAccessFile> in_stream(new >>>> io::BufferReader(buffer)); >>>> std::unique_ptr<adapters::orc::ORCFileReader> reader; >>>> ARROW_EXPECT_OK( >>>> adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), >>>> &reader)); >>>> std::shared_ptr<Table> actual_output_table; >>>> ARROW_EXPECT_OK(reader->Read(&actual_output_table)); >>>> AssertTablesEqual(*actual_output_table, *expected_output_table, false, >>>> false); >>>> } >>>> >>>> I strongly suspect that this is related to the null bitmaps. What do you >>>> guys think? >>>> >>>> Ying >>>> >> >>