This is an automated email from the ASF dual-hosted git repository. william pushed a commit to branch branch-2.2 in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.2 by this push: new b88dbf560 ORC-1950: [C++] Make sure dictionary is sorted before flushed into ORC file to follow ORC specs b88dbf560 is described below commit b88dbf56028a7003cac5a59443fe5308bfa04191 Author: taiyang-li <654010...@qq.com> AuthorDate: Wed Jul 23 23:44:48 2025 -0700 ORC-1950: [C++] Make sure dictionary is sorted before flushed into ORC file to follow ORC specs ### What changes were proposed in this pull request? Make sure dictionary is sorted before flushed into ORC file to follow ORC specs. The [issue](https://github.com/apache/orc/pull/2321#discussion_r2219569602) was brought by https://github.com/apache/orc/pull/2336. ### Why are the changes needed? ### How was this patch tested? ### Was this patch authored or co-authored using generative AI tooling? Closes #2337 from taiyang-li/make_dict_sorted. Authored-by: taiyang-li <654010...@qq.com> Signed-off-by: William Hyun <will...@apache.org> (cherry picked from commit a4ff1c8c4d1729cda4583ef55da807442b611749) Signed-off-by: William Hyun <will...@apache.org> --- CMakeLists.txt | 4 + c++/src/CMakeLists.txt | 7 +- c++/src/ColumnWriter.cc | 111 ++---------------------- c++/src/Dictionary.cc | 99 ++++++++++++++++++++++ c++/src/Dictionary.hh | 104 +++++++++++++++++++++++ c++/src/meson.build | 4 + c++/test/CMakeLists.txt | 1 - c++/test/TestDictionaryEncoding.cc | 144 +++++++++++++++++++++++--------- c++/test/meson.build | 1 - cmake_modules/ThirdpartyToolchain.cmake | 65 +++++++------- subprojects/sparsehash-c11.wrap | 2 +- 11 files changed, 364 insertions(+), 178 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc4ee9adb..f15c812fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,10 @@ option (BUILD_LIBHDFSPP "Include LIBHDFSPP library in the build process" OFF) +option (BUILD_SPARSEHASH + "Include sparsehash library in the build process" + OFF) + option(BUILD_CPP_TESTS "Build the googletest unit tests" ON) diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index b8a168307..ae93e67d6 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -170,6 +170,7 @@ set(SOURCE_FILES Compression.cc ConvertColumnReader.cc CpuInfoUtil.cc + Dictionary.cc Exceptions.cc Geospatial.cc Int128.cc @@ -212,8 +213,8 @@ target_link_libraries (orc $<BUILD_INTERFACE:orc::snappy> $<BUILD_INTERFACE:orc::lz4> $<BUILD_INTERFACE:orc::zstd> - $<BUILD_INTERFACE:orc::sparsehash> $<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}> + $<BUILD_INTERFACE:${SPARSEHASH_LIBRARIES}> ) target_include_directories (orc @@ -232,6 +233,10 @@ if (BUILD_LIBHDFSPP) target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP) endif (BUILD_LIBHDFSPP) +if (BUILD_SPARSEHASH) + target_compile_definitions(orc PUBLIC -DBUILD_SPARSEHASH) +endif (BUILD_SPARSEHASH) + if (BUILD_CPP_ENABLE_METRICS) message(STATUS "Enable the metrics collection") target_compile_definitions(orc PUBLIC ENABLE_METRICS=1) diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc index 915277ef4..b9aac1a12 100644 --- a/c++/src/ColumnWriter.cc +++ b/c++/src/ColumnWriter.cc @@ -24,13 +24,12 @@ #include <memory> #include "ByteRLE.hh" #include "ColumnWriter.hh" +#include "Dictionary.hh" #include "RLE.hh" #include "Statistics.hh" #include "Timezone.hh" #include "Utils.hh" -#include <sparsehash/dense_hash_map> - namespace orc { StreamsFactory::~StreamsFactory() { // PASS @@ -927,104 +926,6 @@ namespace orc { ColumnWriter::finishStreams(); dataStream_->finishStream(); } - - /** - * Implementation of increasing sorted string dictionary - */ - class SortedStringDictionary { - public: - struct DictEntry { - DictEntry(const char* str, size_t len) : data(std::make_unique<std::string>(str, len)) {} - - std::unique_ptr<std::string> data; - }; - - SortedStringDictionary() : totalLength_(0) { - /// Need to set empty key otherwise dense_hash_map will not work correctly - keyToIndex_.set_empty_key(std::string_view{}); - } - - // insert a new string into dictionary, return its insertion order - size_t insert(const char* str, size_t len); - - // write dictionary data & length to output buffer - void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const; - - // get dict entries in insertion order - const std::vector<DictEntry>& getEntriesInInsertionOrder() const; - - // return count of entries - size_t size() const; - - // return total length of strings in the dictioanry - uint64_t length() const; - - void clear(); - - private: - // store dictionary entries in insertion order - mutable std::vector<DictEntry> flatDict_; - - // map from string to its insertion order index - google::dense_hash_map<std::string_view, size_t> keyToIndex_; - uint64_t totalLength_; - - // use friend class here to avoid being bothered by const function calls - friend class StringColumnWriter; - friend class CharColumnWriter; - friend class VarCharColumnWriter; - // store indexes of insertion order in the dictionary for not-null rows - std::vector<int64_t> idxInDictBuffer_; - }; - - // insert a new string into dictionary, return its insertion order - size_t SortedStringDictionary::insert(const char* str, size_t len) { - size_t index = flatDict_.size(); - - auto it = keyToIndex_.find(std::string_view{str, len}); - if (it != keyToIndex_.end()) { - return it->second; - } else { - flatDict_.emplace_back(str, len); - totalLength_ += len; - - const auto& lastEntry = flatDict_.back(); - keyToIndex_.emplace(std::string_view{lastEntry.data->data(), lastEntry.data->size()}, index); - return index; - } - } - - // write dictionary data & length to output buffer - void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream, - RleEncoder* lengthEncoder) const { - for (const auto& entry : flatDict_) { - dataStream->write(entry.data->data(), entry.data->size()); - lengthEncoder->write(static_cast<int64_t>(entry.data->size())); - } - } - - // get dict entries in insertion order - const std::vector<SortedStringDictionary::DictEntry>& - SortedStringDictionary::getEntriesInInsertionOrder() const { - return flatDict_; - } - - // return count of entries - size_t SortedStringDictionary::size() const { - return flatDict_.size(); - } - - // return total length of strings in the dictioanry - uint64_t SortedStringDictionary::length() const { - return totalLength_; - } - - void SortedStringDictionary::clear() { - totalLength_ = 0; - keyToIndex_.clear(); - flatDict_.clear(); - } - class StringColumnWriter : public ColumnWriter { public: StringColumnWriter(const Type& type, const StreamsFactory& factory, @@ -1324,6 +1225,9 @@ namespace orc { // flush dictionary data & length streams dictionary.flush(dictStream.get(), dictLengthEncoder.get()); + // convert index from insertion order to dictionary order + dictionary.reorder(dictionary.idxInDictBuffer_); + // write data sequences int64_t* data = dictionary.idxInDictBuffer_.data(); if (enableIndex) { @@ -1367,14 +1271,15 @@ namespace orc { } // get dictionary entries in insertion order - const auto& entries = dictionary.getEntriesInInsertionOrder(); + std::vector<const SortedStringDictionary::DictEntry*> entries; + dictionary.getEntriesInInsertionOrder(entries); // store each length of the data into a vector for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) { // write one row data in direct encoding const auto& dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])]; - directDataStream->write(dictEntry.data->data(), dictEntry.data->size()); - directLengthEncoder->write(static_cast<int64_t>(dictEntry.data->size())); + directDataStream->write(dictEntry->data->data(), dictEntry->data->size()); + directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size())); } deleteDictStreams(); diff --git a/c++/src/Dictionary.cc b/c++/src/Dictionary.cc new file mode 100644 index 000000000..9eb60bb5b --- /dev/null +++ b/c++/src/Dictionary.cc @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Dictionary.hh" + +namespace orc { + + // insert a new string into dictionary, return its insertion order + size_t SortedStringDictionary::insert(const char* str, size_t len) { + size_t index = flatDict_.size(); + + auto it = keyToIndex_.find(std::string_view{str, len}); + if (it != keyToIndex_.end()) { + return it->second; + } else { + flatDict_.emplace_back(str, len, index); + totalLength_ += len; + + const auto& lastEntry = flatDict_.back().entry; + keyToIndex_.emplace(std::string_view{lastEntry.data->data(), lastEntry.data->size()}, index); + return index; + } + } + + // write dictionary data & length to output buffer + void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream, + RleEncoder* lengthEncoder) const { + std::sort(flatDict_.begin(), flatDict_.end(), LessThan()); + + for (const auto& entryWithIndex : flatDict_) { + dataStream->write(entryWithIndex.entry.data->data(), entryWithIndex.entry.data->size()); + lengthEncoder->write(static_cast<int64_t>(entryWithIndex.entry.data->size())); + } + } + + /** + * Reorder input index buffer from insertion order to dictionary order + * + * We require this function because string values are buffered by indexes + * in their insertion order. Until the entire dictionary is complete can + * we get their sorted indexes in the dictionary in that ORC specification + * demands dictionary should be ordered. Therefore this function transforms + * the indexes from insertion order to dictionary value order for final + * output. + */ + void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const { + // iterate the dictionary to get mapping from insertion order to value order + std::vector<size_t> mapping(flatDict_.size()); + for (size_t i = 0; i < flatDict_.size(); ++i) { + mapping[flatDict_[i].index] = i; + } + + // do the transformation + for (size_t i = 0; i != idxBuffer.size(); ++i) { + idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]); + } + } + + // get dict entries in insertion order + void SortedStringDictionary::getEntriesInInsertionOrder( + std::vector<const DictEntry*>& entries) const { + /// flatDict_ is sorted in insertion order before [[SortedStringDictionary::flush]] is invoked. + entries.resize(flatDict_.size()); + for (size_t i = 0; i < flatDict_.size(); ++i) { + entries[i] = &(flatDict_[i].entry); + } + } + + // return count of entries + size_t SortedStringDictionary::size() const { + return flatDict_.size(); + } + + // return total length of strings in the dictioanry + uint64_t SortedStringDictionary::length() const { + return totalLength_; + } + + void SortedStringDictionary::clear() { + totalLength_ = 0; + keyToIndex_.clear(); + flatDict_.clear(); + } +} // namespace orc \ No newline at end of file diff --git a/c++/src/Dictionary.hh b/c++/src/Dictionary.hh new file mode 100644 index 000000000..dca15b115 --- /dev/null +++ b/c++/src/Dictionary.hh @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstddef> +#include <memory> +#include <string> + +#ifdef BUILD_SPARSEHASH +#include <sparsehash/dense_hash_map> +#else +#include <unordered_map> +#endif + +#include "RLE.hh" + +namespace orc { + /** + * Implementation of increasing sorted string dictionary + */ + class SortedStringDictionary { + public: + struct DictEntry { + DictEntry(const char* str, size_t len) : data(std::make_unique<std::string>(str, len)) {} + + std::unique_ptr<std::string> data; + }; + + struct DictEntryWithIndex { + DictEntryWithIndex(const char* str, size_t len, size_t index) + : entry(str, len), index(index) {} + + DictEntry entry; + size_t index; + }; + + SortedStringDictionary() : totalLength_(0) { +#ifdef BUILD_SPARSEHASH + /// Need to set empty key otherwise dense_hash_map will not work correctly + keyToIndex_.set_empty_key(std::string_view{}); +#endif + } + + // insert a new string into dictionary, return its insertion order + size_t insert(const char* str, size_t len); + + // write dictionary data & length to output buffer + void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const; + + // reorder input index buffer from insertion order to dictionary order + void reorder(std::vector<int64_t>& idxBuffer) const; + + // get dict entries in insertion order + void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const; + + // return count of entries + size_t size() const; + + // return total length of strings in the dictioanry + uint64_t length() const; + + void clear(); + + private: + struct LessThan { + bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) { + return *l.entry.data < *r.entry.data; // use std::string's operator< + } + }; + // store dictionary entries in insertion order + mutable std::vector<DictEntryWithIndex> flatDict_; + +#ifdef BUILD_SPARSEHASH + // map from string to its insertion order index + google::dense_hash_map<std::string_view, size_t> keyToIndex_; +#else + std::unordered_map<std::string_view, size_t> keyToIndex_; +#endif + + uint64_t totalLength_; + + // use friend class here to avoid being bothered by const function calls + friend class StringColumnWriter; + friend class CharColumnWriter; + friend class VarCharColumnWriter; + // store indexes of insertion order in the dictionary for not-null rows + std::vector<int64_t> idxInDictBuffer_; + }; + +} // namespace orc diff --git a/c++/src/meson.build b/c++/src/meson.build index 44a98500f..885df0072 100644 --- a/c++/src/meson.build +++ b/c++/src/meson.build @@ -150,6 +150,7 @@ source_files += files( 'Compression.cc', 'ConvertColumnReader.cc', 'CpuInfoUtil.cc', + 'Dictionary.cc', 'Exceptions.cc', 'Geospatial.cc', 'Int128.cc', @@ -180,6 +181,9 @@ threads_dep = dependency('threads') orc_lib = library( 'orc', sources: source_files, + cpp_args: [ + '-DBUILD_SPARSEHASH' + ], dependencies: [ orc_format_proto_dep, protobuf_dep, diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index b0ee48f38..3261fedde 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -77,7 +77,6 @@ target_link_libraries (orc-test orc::zlib orc::gtest orc::gmock - orc::sparsehash orc-test-include ) diff --git a/c++/test/TestDictionaryEncoding.cc b/c++/test/TestDictionaryEncoding.cc index 40c1b1a60..c13ceb646 100644 --- a/c++/test/TestDictionaryEncoding.cc +++ b/c++/test/TestDictionaryEncoding.cc @@ -35,6 +35,11 @@ namespace orc { const double DICT_THRESHOLD = 0.2; // make sure dictionary is used const double FALLBACK_THRESHOLD = 0.0; // make sure fallback happens + static bool doubleEquals(double a, double b) { + const double EPSILON = 1e-9; + return std::fabs(a - b) < EPSILON; + } + static std::unique_ptr<Reader> createReader(MemoryPool* memoryPool, std::unique_ptr<InputStream> stream) { ReaderOptions options; @@ -42,12 +47,39 @@ namespace orc { return createReader(std::move(stream), options); } - static std::unique_ptr<RowReader> createRowReader(Reader* reader) { + static void checkDictionaryEncoding(StringVectorBatch* batch) { + EXPECT_TRUE(batch->isEncoded); + + const auto* encoded_batch = dynamic_cast<EncodedStringVectorBatch*>(batch); + EXPECT_TRUE(encoded_batch != nullptr); + + const auto& dictionary = encoded_batch->dictionary; + EXPECT_TRUE(dictionary != nullptr); + + // Check if the dictionary is sorted + std::string prev; + for (size_t i = 0; i < dictionary->dictionaryOffset.size() - 1; ++i) { + char* begin = nullptr; + int64_t length = 0; + dictionary->getValueByIndex(i, begin, length); + + std::string curr = std::string(begin, static_cast<size_t>(length)); + if (i) { + EXPECT_GT(curr, prev); + } + + prev = std::move(curr); + } + } + + static std::unique_ptr<RowReader> createRowReader(Reader* reader, + bool enableEncodedBlock = false) { RowReaderOptions rowReaderOpts; + rowReaderOpts.setEnableLazyDecoding(enableEncodedBlock); return reader->createRowReader(rowReaderOpts); } - void testStringDictionary(bool enableIndex, double threshold) { + void testStringDictionary(bool enableIndex, double threshold, bool enableEncodedBlock = false) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:string>")); @@ -87,16 +119,21 @@ namespace orc { std::unique_ptr<InputStream> inStream( new MemoryInputStream(memStream.getData(), memStream.getLength())); std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream)); - std::unique_ptr<RowReader> rowReader = createRowReader(reader.get()); + std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), enableEncodedBlock); EXPECT_EQ(rowCount, reader->getNumberOfRows()); batch = rowReader->createRowBatch(rowCount); EXPECT_EQ(true, rowReader->next(*batch)); EXPECT_EQ(rowCount, batch->numElements); + structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) { + checkDictionaryEncoding(strBatch); + strBatch->decodeDictionary(); + } + for (uint64_t i = 0; i < rowCount; ++i) { - structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); - strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); std::string str(strBatch->data[i], static_cast<size_t>(strBatch->length[i])); EXPECT_EQ(i % dictionarySize, static_cast<uint64_t>(atoi(str.c_str()))); } @@ -104,7 +141,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - void testVarcharDictionary(bool enableIndex, double threshold) { + void testVarcharDictionary(bool enableIndex, double threshold, bool enableEncodedBlock = false) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:varchar(2)>")); @@ -144,17 +181,21 @@ namespace orc { std::unique_ptr<InputStream> inStream( new MemoryInputStream(memStream.getData(), memStream.getLength())); std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream)); - std::unique_ptr<RowReader> rowReader = createRowReader(reader.get()); + std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), enableEncodedBlock); EXPECT_EQ(rowCount, reader->getNumberOfRows()); batch = rowReader->createRowBatch(rowCount); EXPECT_EQ(true, rowReader->next(*batch)); EXPECT_EQ(rowCount, batch->numElements); - for (uint64_t i = 0; i < rowCount; ++i) { - structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); - varcharBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + varcharBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) { + checkDictionaryEncoding(varcharBatch); + varcharBatch->decodeDictionary(); + } + for (uint64_t i = 0; i < rowCount; ++i) { std::ostringstream os; os << (i % dictionarySize); EXPECT_FALSE(varcharBatch->length[i] > 2); @@ -166,7 +207,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - void testCharDictionary(bool enableIndex, double threshold) { + void testCharDictionary(bool enableIndex, double threshold, bool enableEncodedBlock = false) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:char(3)>")); @@ -204,17 +245,21 @@ namespace orc { std::unique_ptr<InputStream> inStream( new MemoryInputStream(memStream.getData(), memStream.getLength())); std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream)); - std::unique_ptr<RowReader> rowReader = createRowReader(reader.get()); + std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), enableEncodedBlock); EXPECT_EQ(rowCount, reader->getNumberOfRows()); batch = rowReader->createRowBatch(rowCount); EXPECT_EQ(true, rowReader->next(*batch)); EXPECT_EQ(rowCount, batch->numElements); - for (uint64_t i = 0; i < rowCount; ++i) { - structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); - charBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + charBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) { + checkDictionaryEncoding(charBatch); + charBatch->decodeDictionary(); + } + for (uint64_t i = 0; i < rowCount; ++i) { EXPECT_EQ(3, charBatch->length[i]); std::string charsRead(charBatch->data[i], static_cast<size_t>(charBatch->length[i])); @@ -230,7 +275,8 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - void testStringDictionaryWithNull(double threshold, bool enableIndex) { + void testStringDictionaryWithNull(double threshold, bool enableIndex, + bool enableEncodedBlock = false) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:string>")); @@ -277,17 +323,21 @@ namespace orc { std::unique_ptr<InputStream> inStream( new MemoryInputStream(memStream.getData(), memStream.getLength())); std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream)); - std::unique_ptr<RowReader> rowReader = createRowReader(reader.get()); + std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(), enableEncodedBlock); EXPECT_EQ(rowCount, reader->getNumberOfRows()); batch = rowReader->createRowBatch(rowCount); EXPECT_EQ(true, rowReader->next(*batch)); EXPECT_EQ(rowCount, batch->numElements); - for (uint64_t i = 0; i < rowCount; ++i) { - structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); - strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) { + checkDictionaryEncoding(strBatch); + strBatch->decodeDictionary(); + } + for (uint64_t i = 0; i < rowCount; ++i) { if (i % 2 == 0) { EXPECT_FALSE(strBatch->notNull[i]); } else { @@ -357,9 +407,10 @@ namespace orc { for (uint64_t stripe = 0; stripe != stripeCount; ++stripe) { EXPECT_EQ(true, rowReader->next(*batch)); + structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); + for (uint64_t i = 0; i < rowCount; ++i) { - structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); - strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]); std::string str(strBatch->data[i], static_cast<size_t>(strBatch->length[i])); EXPECT_EQ(i % dictionarySize, static_cast<uint64_t>(atoi(str.c_str()))); } @@ -368,7 +419,6 @@ namespace orc { // test seeking to check positions batch = rowReader->createRowBatch(1); - for (uint64_t stripe = 0; stripe != stripeCount; ++stripe) { for (uint64_t i = 0; i < rowCount; i += 10000 / 2) { rowReader->seekToRow(stripe * rowCount + i); @@ -385,45 +435,61 @@ namespace orc { // test dictionary encoding with index disabled // the decision of using dictionary if made at the end of 1st stripe TEST(DictionaryEncoding, writeStringDictionaryEncodingWithoutIndex) { - testStringDictionary(false, DICT_THRESHOLD); - testStringDictionary(false, FALLBACK_THRESHOLD); + for (auto enableEncodedBlock : {false, true}) { + testStringDictionary(false, DICT_THRESHOLD, enableEncodedBlock); + testStringDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock); + } } // test dictionary encoding with index enabled // the decision of using dictionary if made at the end of 1st row group TEST(DictionaryEncoding, writeStringDictionaryEncodingWithIndex) { - testStringDictionary(true, DICT_THRESHOLD); - testStringDictionary(true, FALLBACK_THRESHOLD); + for (auto enableEncodedBlock : {false, true}) { + testStringDictionary(true, DICT_THRESHOLD, enableEncodedBlock); + testStringDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock); + } } TEST(DictionaryEncoding, writeVarcharDictionaryEncodingWithoutIndex) { - testVarcharDictionary(false, DICT_THRESHOLD); - testVarcharDictionary(false, FALLBACK_THRESHOLD); + for (auto enableEncodedBlock : {false, true}) { + testVarcharDictionary(false, DICT_THRESHOLD, enableEncodedBlock); + testVarcharDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock); + } } TEST(DictionaryEncoding, writeVarcharDictionaryEncodingWithIndex) { - testVarcharDictionary(true, DICT_THRESHOLD); - testVarcharDictionary(true, FALLBACK_THRESHOLD); + for (auto enableEncodedBlock : {false, true}) { + testVarcharDictionary(true, DICT_THRESHOLD, enableEncodedBlock); + testVarcharDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock); + } } TEST(DictionaryEncoding, writeCharDictionaryEncodingWithoutIndex) { - testCharDictionary(false, DICT_THRESHOLD); - testCharDictionary(false, FALLBACK_THRESHOLD); + for (auto enableEncodedBlock : {false, true}) { + testCharDictionary(false, DICT_THRESHOLD, enableEncodedBlock); + testCharDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock); + } } TEST(DictionaryEncoding, writeCharDictionaryEncodingWithIndex) { - testCharDictionary(true, DICT_THRESHOLD); - testCharDictionary(true, FALLBACK_THRESHOLD); + for (auto enableEncodedBlock : {false, true}) { + testCharDictionary(true, DICT_THRESHOLD, enableEncodedBlock); + testCharDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock); + } } TEST(DictionaryEncoding, stringDictionaryWithNullWithIndex) { - testStringDictionaryWithNull(DICT_THRESHOLD, true); - testStringDictionaryWithNull(FALLBACK_THRESHOLD, true); + for (auto enableEncodedBlock : {false, true}) { + testStringDictionaryWithNull(DICT_THRESHOLD, true, enableEncodedBlock); + testStringDictionaryWithNull(FALLBACK_THRESHOLD, true, enableEncodedBlock); + } } TEST(DictionaryEncoding, stringDictionaryWithNullWithoutIndex) { - testStringDictionaryWithNull(DICT_THRESHOLD, false); - testStringDictionaryWithNull(FALLBACK_THRESHOLD, false); + for (auto enableEncodedBlock : {false, true}) { + testStringDictionaryWithNull(DICT_THRESHOLD, false, enableEncodedBlock); + testStringDictionaryWithNull(FALLBACK_THRESHOLD, false, enableEncodedBlock); + } } TEST(DictionaryEncoding, multipleStripesWithIndex) { diff --git a/c++/test/meson.build b/c++/test/meson.build index 75dcbb094..a8d30a6b9 100644 --- a/c++/test/meson.build +++ b/c++/test/meson.build @@ -72,7 +72,6 @@ orc_test = executable( zlib_dep, gtest_dep, gmock_dep, - sparsehash_c11_dep, ], ) diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index c494710ba..c77d3f1f5 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -583,46 +583,47 @@ endif () # ---------------------------------------------------------------------- # SPARSEHASH +if(BUILD_SPARSEHASH) + set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install") + set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google") + set(SPARSEHASH_CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_LIBDIR=lib + -DCMAKE_POLICY_VERSION_MINIMUM=3.5 + ) + if (BUILD_POSITION_INDEPENDENT_LIB) + set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS} -DCMAKE_POSITION_INDEPENDENT_CODE=ON) + endif () -set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install") -set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google") -set(SPARSEHASH_CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME} - -DBUILD_SHARED_LIBS=OFF - -DCMAKE_INSTALL_LIBDIR=lib - -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -) -if (BUILD_POSITION_INDEPENDENT_LIB) - set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS} -DCMAKE_POSITION_INDEPENDENT_CODE=ON) -endif () - -if (CMAKE_VERSION VERSION_GREATER "3.7") - set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}) - else() - set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND "${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS} - "${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/") -endif() + if (CMAKE_VERSION VERSION_GREATER "3.7") + set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}) + else() + set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND "${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS} + "${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/") + endif() -ExternalProject_Add(sparsehash_ep - URL "https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz" - ${SPARSEHASH_CONFIGURE} - ${THIRDPARTY_LOG_OPTIONS}) + ExternalProject_Add(sparsehash_ep + URL "https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz" + ${SPARSEHASH_CONFIGURE} + ${THIRDPARTY_LOG_OPTIONS}) -# sparsehash-c11 is header-only, create interface library -add_library(orc_sparsehash INTERFACE) -target_include_directories(orc_sparsehash INTERFACE - $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}> - $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) -add_dependencies(orc_sparsehash sparsehash_ep) + # sparsehash-c11 is header-only, create interface library + add_library(orc_sparsehash INTERFACE) + target_include_directories(orc_sparsehash INTERFACE + $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) + add_dependencies(orc_sparsehash sparsehash_ep) -list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash") -list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$<INSTALL_INTERFACE:orc::vendored_sparsehash>") + list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash") + list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$<INSTALL_INTERFACE:orc::vendored_sparsehash>") -add_library (orc::sparsehash ALIAS orc_sparsehash) + add_library (orc::sparsehash ALIAS orc_sparsehash) + set (SPARSEHASH_LIBRARIES orc::sparsehash) +endif() # ---------------------------------------------------------------------- # LIBHDFSPP - if(BUILD_LIBHDFSPP) set (BUILD_LIBHDFSPP FALSE) if(ORC_CXX_HAS_THREAD_LOCAL) diff --git a/subprojects/sparsehash-c11.wrap b/subprojects/sparsehash-c11.wrap index 4177861ce..de3d18748 100644 --- a/subprojects/sparsehash-c11.wrap +++ b/subprojects/sparsehash-c11.wrap @@ -27,4 +27,4 @@ source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/spa wrapdb_version = 2.11.1-1 [provide] -sparsehash-c11 = sparsehash_c11_dep \ No newline at end of file +sparsehash-c11 = sparsehash_c11_dep