This is an automated email from the ASF dual-hosted git repository. caiconghui pushed a commit to branch orc-2.1 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
commit da3247fa04ee8f17117c9b96239c953a87a23892 Author: Qi Chen <[email protected]> AuthorDate: Wed May 7 14:13:15 2025 +0800 [Fix] Fixed an issue where a top-level struct column that has a present stream fails when it is accessed repeatedly during deferred materialization. When using an older version of pyorc (e.g., pyorc-0.3.0), if there are null values in the data, a present stream will be generated for the top-level struct column. However, this behavior does not occur in newer versions of pyorc (e.g., pyorc-0.10.0) or in ORC files generated by tools like Hive or Spark. Therefore, the present stream generated by the older version causes the ORC file to be read twice during late materialization, resulting in an error 'bad read in next buffer' during the second read. The current solution is to avoid reading the present stream if it belongs to the top-level struct column. --- c++/src/ColumnReader.cc | 23 +++++++++++++---------- c++/src/ColumnReader.hh | 5 +++-- c++/src/Reader.cc | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index 59b976f36ce..875ce81a9de 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -46,15 +46,17 @@ namespace orc { } } - ColumnReader::ColumnReader(const Type& _type, StripeStreams& stripe) + ColumnReader::ColumnReader(const Type& _type, StripeStreams& stripe, bool readPresentStream) : type(_type), columnId(type.getColumnId()), memoryPool(stripe.getMemoryPool()), metrics(stripe.getReaderMetrics()) { - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); - if (stream.get()) { - notNullDecoder = createBooleanRleDecoder(std::move(stream), metrics); + if (readPresentStream) { + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); + if (stream.get()) { + notNullDecoder = createBooleanRleDecoder(std::move(stream), metrics); + } } } @@ -1109,7 +1111,8 @@ namespace 
orc { std::vector<std::unique_ptr<ColumnReader>> children; public: - StructColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false); + StructColumnReader(const Type& type, StripeStreams& stipe, bool useTightNumericVector = false, + bool isTopLevel = false); uint64_t skip(uint64_t numValues, const ReadPhase& readPhase) override; @@ -1133,8 +1136,8 @@ namespace orc { }; StructColumnReader::StructColumnReader(const Type& type, StripeStreams& stripe, - bool useTightNumericVector) - : ColumnReader(type, stripe) { + bool useTightNumericVector, bool isTopLevel) + : ColumnReader(type, stripe, !isTopLevel) { // count the number of selected sub-columns const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) { @@ -2309,7 +2312,7 @@ namespace orc { * Create a reader for the given stripe. */ std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe, - bool useTightNumericVector) { + bool useTightNumericVector, bool isTopLevel) { switch (static_cast<int64_t>(type.getKind())) { case SHORT: { if (useTightNumericVector) { @@ -2363,7 +2366,7 @@ namespace orc { return std::make_unique<UnionColumnReader>(type, stripe, useTightNumericVector); case STRUCT: - return std::make_unique<StructColumnReader>(type, stripe, useTightNumericVector); + return std::make_unique<StructColumnReader>(type, stripe, useTightNumericVector, isTopLevel); case FLOAT: { if (useTightNumericVector) { diff --git a/c++/src/ColumnReader.hh b/c++/src/ColumnReader.hh index 8c0e36bd353..b22bbc9617c 100644 --- a/c++/src/ColumnReader.hh +++ b/c++/src/ColumnReader.hh @@ -129,7 +129,7 @@ namespace orc { } public: - ColumnReader(const Type& type, StripeStreams& stipe); + ColumnReader(const Type& type, StripeStreams& stipe, bool readPresentStream = true); virtual ~ColumnReader(); @@ -188,7 +188,8 @@ namespace orc { * Create a reader for the given stripe. 
*/ std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe, - bool useTightNumericVector = false); + bool useTightNumericVector = false, + bool isTopLevel = false); void loadStringDicts(ColumnReader* columnReader, const std::unordered_map<uint64_t, std::string>& columnIdToNameMap, diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 88f1a50305d..619bea0385d 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -1256,7 +1256,7 @@ namespace orc { StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, currentStripeFooter, currentStripeInfo.offset(), *contents->stream, writerTimezone, readerTimezone); - reader = buildReader(*contents->schema, stripeStreams, useTightNumericVector); + reader = buildReader(*contents->schema, stripeStreams, useTightNumericVector, true); if (stringDictFilter != nullptr) { std::list<std::string> dictFilterColumnNames; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
