This is an automated email from the ASF dual-hosted git repository.
william pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new a4ff1c8c4 ORC-1950: [C++] Make sure dictionary is sorted before
flushed into ORC file to follow ORC specs
a4ff1c8c4 is described below
commit a4ff1c8c4d1729cda4583ef55da807442b611749
Author: taiyang-li <[email protected]>
AuthorDate: Wed Jul 23 23:44:48 2025 -0700
ORC-1950: [C++] Make sure dictionary is sorted before flushed into ORC file
to follow ORC specs
### What changes were proposed in this pull request?
Make sure dictionary is sorted before flushed into ORC file to follow ORC
specs. The
[issue](https://github.com/apache/orc/pull/2321#discussion_r2219569602) was
brought by https://github.com/apache/orc/pull/2336.
### Why are the changes needed?
### How was this patch tested?
### Was this patch authored or co-authored using generative AI tooling?
Closes #2337 from taiyang-li/make_dict_sorted.
Authored-by: taiyang-li <[email protected]>
Signed-off-by: William Hyun <[email protected]>
---
CMakeLists.txt | 4 +
c++/src/CMakeLists.txt | 7 +-
c++/src/ColumnWriter.cc | 111 ++----------------------
c++/src/Dictionary.cc | 99 ++++++++++++++++++++++
c++/src/Dictionary.hh | 104 +++++++++++++++++++++++
c++/src/meson.build | 4 +
c++/test/CMakeLists.txt | 1 -
c++/test/TestDictionaryEncoding.cc | 144 +++++++++++++++++++++++---------
c++/test/meson.build | 1 -
cmake_modules/ThirdpartyToolchain.cmake | 65 +++++++-------
subprojects/sparsehash-c11.wrap | 2 +-
11 files changed, 364 insertions(+), 178 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d036aa8e..3454e4ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,10 @@ option (BUILD_LIBHDFSPP
"Include LIBHDFSPP library in the build process"
OFF)
+option (BUILD_SPARSEHASH
+ "Include sparsehash library in the build process"
+ OFF)
+
option(BUILD_CPP_TESTS
"Build the googletest unit tests"
ON)
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index b8a168307..ae93e67d6 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -170,6 +170,7 @@ set(SOURCE_FILES
Compression.cc
ConvertColumnReader.cc
CpuInfoUtil.cc
+ Dictionary.cc
Exceptions.cc
Geospatial.cc
Int128.cc
@@ -212,8 +213,8 @@ target_link_libraries (orc
$<BUILD_INTERFACE:orc::snappy>
$<BUILD_INTERFACE:orc::lz4>
$<BUILD_INTERFACE:orc::zstd>
- $<BUILD_INTERFACE:orc::sparsehash>
$<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}>
+ $<BUILD_INTERFACE:${SPARSEHASH_LIBRARIES}>
)
target_include_directories (orc
@@ -232,6 +233,10 @@ if (BUILD_LIBHDFSPP)
target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP)
endif (BUILD_LIBHDFSPP)
+if (BUILD_SPARSEHASH)
+ target_compile_definitions(orc PUBLIC -DBUILD_SPARSEHASH)
+endif (BUILD_SPARSEHASH)
+
if (BUILD_CPP_ENABLE_METRICS)
message(STATUS "Enable the metrics collection")
target_compile_definitions(orc PUBLIC ENABLE_METRICS=1)
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 915277ef4..b9aac1a12 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -24,13 +24,12 @@
#include <memory>
#include "ByteRLE.hh"
#include "ColumnWriter.hh"
+#include "Dictionary.hh"
#include "RLE.hh"
#include "Statistics.hh"
#include "Timezone.hh"
#include "Utils.hh"
-#include <sparsehash/dense_hash_map>
-
namespace orc {
StreamsFactory::~StreamsFactory() {
// PASS
@@ -927,104 +926,6 @@ namespace orc {
ColumnWriter::finishStreams();
dataStream_->finishStream();
}
-
- /**
- * Implementation of increasing sorted string dictionary
- */
- class SortedStringDictionary {
- public:
- struct DictEntry {
- DictEntry(const char* str, size_t len) :
data(std::make_unique<std::string>(str, len)) {}
-
- std::unique_ptr<std::string> data;
- };
-
- SortedStringDictionary() : totalLength_(0) {
- /// Need to set empty key otherwise dense_hash_map will not work
correctly
- keyToIndex_.set_empty_key(std::string_view{});
- }
-
- // insert a new string into dictionary, return its insertion order
- size_t insert(const char* str, size_t len);
-
- // write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream* dataStream, RleEncoder*
lengthEncoder) const;
-
- // get dict entries in insertion order
- const std::vector<DictEntry>& getEntriesInInsertionOrder() const;
-
- // return count of entries
- size_t size() const;
-
- // return total length of strings in the dictioanry
- uint64_t length() const;
-
- void clear();
-
- private:
- // store dictionary entries in insertion order
- mutable std::vector<DictEntry> flatDict_;
-
- // map from string to its insertion order index
- google::dense_hash_map<std::string_view, size_t> keyToIndex_;
- uint64_t totalLength_;
-
- // use friend class here to avoid being bothered by const function calls
- friend class StringColumnWriter;
- friend class CharColumnWriter;
- friend class VarCharColumnWriter;
- // store indexes of insertion order in the dictionary for not-null rows
- std::vector<int64_t> idxInDictBuffer_;
- };
-
- // insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char* str, size_t len) {
- size_t index = flatDict_.size();
-
- auto it = keyToIndex_.find(std::string_view{str, len});
- if (it != keyToIndex_.end()) {
- return it->second;
- } else {
- flatDict_.emplace_back(str, len);
- totalLength_ += len;
-
- const auto& lastEntry = flatDict_.back();
- keyToIndex_.emplace(std::string_view{lastEntry.data->data(),
lastEntry.data->size()}, index);
- return index;
- }
- }
-
- // write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
- RleEncoder* lengthEncoder) const {
- for (const auto& entry : flatDict_) {
- dataStream->write(entry.data->data(), entry.data->size());
- lengthEncoder->write(static_cast<int64_t>(entry.data->size()));
- }
- }
-
- // get dict entries in insertion order
- const std::vector<SortedStringDictionary::DictEntry>&
- SortedStringDictionary::getEntriesInInsertionOrder() const {
- return flatDict_;
- }
-
- // return count of entries
- size_t SortedStringDictionary::size() const {
- return flatDict_.size();
- }
-
- // return total length of strings in the dictioanry
- uint64_t SortedStringDictionary::length() const {
- return totalLength_;
- }
-
- void SortedStringDictionary::clear() {
- totalLength_ = 0;
- keyToIndex_.clear();
- flatDict_.clear();
- }
-
class StringColumnWriter : public ColumnWriter {
public:
StringColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -1324,6 +1225,9 @@ namespace orc {
// flush dictionary data & length streams
dictionary.flush(dictStream.get(), dictLengthEncoder.get());
+ // convert index from insertion order to dictionary order
+ dictionary.reorder(dictionary.idxInDictBuffer_);
+
// write data sequences
int64_t* data = dictionary.idxInDictBuffer_.data();
if (enableIndex) {
@@ -1367,14 +1271,15 @@ namespace orc {
}
// get dictionary entries in insertion order
- const auto& entries = dictionary.getEntriesInInsertionOrder();
+ std::vector<const SortedStringDictionary::DictEntry*> entries;
+ dictionary.getEntriesInInsertionOrder(entries);
// store each length of the data into a vector
for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
const auto& dictEntry =
entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
- directDataStream->write(dictEntry.data->data(), dictEntry.data->size());
- directLengthEncoder->write(static_cast<int64_t>(dictEntry.data->size()));
+ directDataStream->write(dictEntry->data->data(),
dictEntry->data->size());
+
directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size()));
}
deleteDictStreams();
diff --git a/c++/src/Dictionary.cc b/c++/src/Dictionary.cc
new file mode 100644
index 000000000..9eb60bb5b
--- /dev/null
+++ b/c++/src/Dictionary.cc
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dictionary.hh"
+
+namespace orc {
+
+ // insert a new string into dictionary, return its insertion order
+ size_t SortedStringDictionary::insert(const char* str, size_t len) {
+ size_t index = flatDict_.size();
+
+ auto it = keyToIndex_.find(std::string_view{str, len});
+ if (it != keyToIndex_.end()) {
+ return it->second;
+ } else {
+ flatDict_.emplace_back(str, len, index);
+ totalLength_ += len;
+
+ const auto& lastEntry = flatDict_.back().entry;
+ keyToIndex_.emplace(std::string_view{lastEntry.data->data(),
lastEntry.data->size()}, index);
+ return index;
+ }
+ }
+
+ // write dictionary data & length to output buffer
+ void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+ RleEncoder* lengthEncoder) const {
+ std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
+
+ for (const auto& entryWithIndex : flatDict_) {
+ dataStream->write(entryWithIndex.entry.data->data(),
entryWithIndex.entry.data->size());
+
lengthEncoder->write(static_cast<int64_t>(entryWithIndex.entry.data->size()));
+ }
+ }
+
+ /**
+ * Reorder input index buffer from insertion order to dictionary order
+ *
+ * We require this function because string values are buffered by indexes
+ * in their insertion order. Only once the entire dictionary is complete
+ * can we obtain their sorted indexes in the dictionary, because the ORC
+ * specification demands that the dictionary be ordered. Therefore this
+ * function transforms the indexes from insertion order to dictionary
+ * value order for the final output.
+ */
+ void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
+ // iterate the dictionary to get mapping from insertion order to value
order
+ std::vector<size_t> mapping(flatDict_.size());
+ for (size_t i = 0; i < flatDict_.size(); ++i) {
+ mapping[flatDict_[i].index] = i;
+ }
+
+ // do the transformation
+ for (size_t i = 0; i != idxBuffer.size(); ++i) {
+ idxBuffer[i] =
static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
+ }
+ }
+
+ // get dict entries in insertion order
+ void SortedStringDictionary::getEntriesInInsertionOrder(
+ std::vector<const DictEntry*>& entries) const {
+ /// flatDict_ remains in insertion order until
[[SortedStringDictionary::flush]] is invoked, which sorts it.
+ entries.resize(flatDict_.size());
+ for (size_t i = 0; i < flatDict_.size(); ++i) {
+ entries[i] = &(flatDict_[i].entry);
+ }
+ }
+
+ // return count of entries
+ size_t SortedStringDictionary::size() const {
+ return flatDict_.size();
+ }
+
+ // return total length of strings in the dictionary
+ uint64_t SortedStringDictionary::length() const {
+ return totalLength_;
+ }
+
+ void SortedStringDictionary::clear() {
+ totalLength_ = 0;
+ keyToIndex_.clear();
+ flatDict_.clear();
+ }
+} // namespace orc
\ No newline at end of file
diff --git a/c++/src/Dictionary.hh b/c++/src/Dictionary.hh
new file mode 100644
index 000000000..dca15b115
--- /dev/null
+++ b/c++/src/Dictionary.hh
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <memory>
+#include <string>
+
+#ifdef BUILD_SPARSEHASH
+#include <sparsehash/dense_hash_map>
+#else
+#include <unordered_map>
+#endif
+
+#include "RLE.hh"
+
+namespace orc {
+ /**
+ * Implementation of increasing sorted string dictionary
+ */
+ class SortedStringDictionary {
+ public:
+ struct DictEntry {
+ DictEntry(const char* str, size_t len) :
data(std::make_unique<std::string>(str, len)) {}
+
+ std::unique_ptr<std::string> data;
+ };
+
+ struct DictEntryWithIndex {
+ DictEntryWithIndex(const char* str, size_t len, size_t index)
+ : entry(str, len), index(index) {}
+
+ DictEntry entry;
+ size_t index;
+ };
+
+ SortedStringDictionary() : totalLength_(0) {
+#ifdef BUILD_SPARSEHASH
+ /// Need to set empty key otherwise dense_hash_map will not work
correctly
+ keyToIndex_.set_empty_key(std::string_view{});
+#endif
+ }
+
+ // insert a new string into dictionary, return its insertion order
+ size_t insert(const char* str, size_t len);
+
+ // write dictionary data & length to output buffer
+ void flush(AppendOnlyBufferedStream* dataStream, RleEncoder*
lengthEncoder) const;
+
+ // reorder input index buffer from insertion order to dictionary order
+ void reorder(std::vector<int64_t>& idxBuffer) const;
+
+ // get dict entries in insertion order
+ void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+
+ // return count of entries
+ size_t size() const;
+
+ // return total length of strings in the dictionary
+ uint64_t length() const;
+
+ void clear();
+
+ private:
+ struct LessThan {
+ bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex&
r) {
+ return *l.entry.data < *r.entry.data; // use std::string's operator<
+ }
+ };
+ // store dictionary entries in insertion order
+ mutable std::vector<DictEntryWithIndex> flatDict_;
+
+#ifdef BUILD_SPARSEHASH
+ // map from string to its insertion order index
+ google::dense_hash_map<std::string_view, size_t> keyToIndex_;
+#else
+ std::unordered_map<std::string_view, size_t> keyToIndex_;
+#endif
+
+ uint64_t totalLength_;
+
+ // use friend class here to avoid being bothered by const function calls
+ friend class StringColumnWriter;
+ friend class CharColumnWriter;
+ friend class VarCharColumnWriter;
+ // store indexes of insertion order in the dictionary for not-null rows
+ std::vector<int64_t> idxInDictBuffer_;
+ };
+
+} // namespace orc
diff --git a/c++/src/meson.build b/c++/src/meson.build
index 44a98500f..885df0072 100644
--- a/c++/src/meson.build
+++ b/c++/src/meson.build
@@ -150,6 +150,7 @@ source_files += files(
'Compression.cc',
'ConvertColumnReader.cc',
'CpuInfoUtil.cc',
+ 'Dictionary.cc',
'Exceptions.cc',
'Geospatial.cc',
'Int128.cc',
@@ -180,6 +181,9 @@ threads_dep = dependency('threads')
orc_lib = library(
'orc',
sources: source_files,
+ cpp_args: [
+ '-DBUILD_SPARSEHASH'
+ ],
dependencies: [
orc_format_proto_dep,
protobuf_dep,
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt
index b0ee48f38..3261fedde 100644
--- a/c++/test/CMakeLists.txt
+++ b/c++/test/CMakeLists.txt
@@ -77,7 +77,6 @@ target_link_libraries (orc-test
orc::zlib
orc::gtest
orc::gmock
- orc::sparsehash
orc-test-include
)
diff --git a/c++/test/TestDictionaryEncoding.cc
b/c++/test/TestDictionaryEncoding.cc
index 40c1b1a60..c13ceb646 100644
--- a/c++/test/TestDictionaryEncoding.cc
+++ b/c++/test/TestDictionaryEncoding.cc
@@ -35,6 +35,11 @@ namespace orc {
const double DICT_THRESHOLD = 0.2; // make sure dictionary is used
const double FALLBACK_THRESHOLD = 0.0; // make sure fallback happens
+ static bool doubleEquals(double a, double b) {
+ const double EPSILON = 1e-9;
+ return std::fabs(a - b) < EPSILON;
+ }
+
static std::unique_ptr<Reader> createReader(MemoryPool* memoryPool,
std::unique_ptr<InputStream>
stream) {
ReaderOptions options;
@@ -42,12 +47,39 @@ namespace orc {
return createReader(std::move(stream), options);
}
- static std::unique_ptr<RowReader> createRowReader(Reader* reader) {
+ static void checkDictionaryEncoding(StringVectorBatch* batch) {
+ EXPECT_TRUE(batch->isEncoded);
+
+ const auto* encoded_batch = dynamic_cast<EncodedStringVectorBatch*>(batch);
+ EXPECT_TRUE(encoded_batch != nullptr);
+
+ const auto& dictionary = encoded_batch->dictionary;
+ EXPECT_TRUE(dictionary != nullptr);
+
+ // Check if the dictionary is sorted
+ std::string prev;
+ for (size_t i = 0; i < dictionary->dictionaryOffset.size() - 1; ++i) {
+ char* begin = nullptr;
+ int64_t length = 0;
+ dictionary->getValueByIndex(i, begin, length);
+
+ std::string curr = std::string(begin, static_cast<size_t>(length));
+ if (i) {
+ EXPECT_GT(curr, prev);
+ }
+
+ prev = std::move(curr);
+ }
+ }
+
+ static std::unique_ptr<RowReader> createRowReader(Reader* reader,
+ bool enableEncodedBlock =
false) {
RowReaderOptions rowReaderOpts;
+ rowReaderOpts.setEnableLazyDecoding(enableEncodedBlock);
return reader->createRowReader(rowReaderOpts);
}
- void testStringDictionary(bool enableIndex, double threshold) {
+ void testStringDictionary(bool enableIndex, double threshold, bool
enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:string>"));
@@ -87,16 +119,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(strBatch);
+ strBatch->decodeDictionary();
+ }
+
for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
std::string str(strBatch->data[i],
static_cast<size_t>(strBatch->length[i]));
EXPECT_EQ(i % dictionarySize, static_cast<uint64_t>(atoi(str.c_str())));
}
@@ -104,7 +141,7 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
- void testVarcharDictionary(bool enableIndex, double threshold) {
+ void testVarcharDictionary(bool enableIndex, double threshold, bool
enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:varchar(2)>"));
@@ -144,17 +181,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
- for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- varcharBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ varcharBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(varcharBatch);
+ varcharBatch->decodeDictionary();
+ }
+ for (uint64_t i = 0; i < rowCount; ++i) {
std::ostringstream os;
os << (i % dictionarySize);
EXPECT_FALSE(varcharBatch->length[i] > 2);
@@ -166,7 +207,7 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
- void testCharDictionary(bool enableIndex, double threshold) {
+ void testCharDictionary(bool enableIndex, double threshold, bool
enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:char(3)>"));
@@ -204,17 +245,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
- for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- charBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ charBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(charBatch);
+ charBatch->decodeDictionary();
+ }
+ for (uint64_t i = 0; i < rowCount; ++i) {
EXPECT_EQ(3, charBatch->length[i]);
std::string charsRead(charBatch->data[i],
static_cast<size_t>(charBatch->length[i]));
@@ -230,7 +275,8 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
- void testStringDictionaryWithNull(double threshold, bool enableIndex) {
+ void testStringDictionaryWithNull(double threshold, bool enableIndex,
+ bool enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:string>"));
@@ -277,17 +323,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
- for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(strBatch);
+ strBatch->decodeDictionary();
+ }
+ for (uint64_t i = 0; i < rowCount; ++i) {
if (i % 2 == 0) {
EXPECT_FALSE(strBatch->notNull[i]);
} else {
@@ -357,9 +407,10 @@ namespace orc {
for (uint64_t stripe = 0; stripe != stripeCount; ++stripe) {
EXPECT_EQ(true, rowReader->next(*batch));
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+
for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
std::string str(strBatch->data[i],
static_cast<size_t>(strBatch->length[i]));
EXPECT_EQ(i % dictionarySize,
static_cast<uint64_t>(atoi(str.c_str())));
}
@@ -368,7 +419,6 @@ namespace orc {
// test seeking to check positions
batch = rowReader->createRowBatch(1);
-
for (uint64_t stripe = 0; stripe != stripeCount; ++stripe) {
for (uint64_t i = 0; i < rowCount; i += 10000 / 2) {
rowReader->seekToRow(stripe * rowCount + i);
@@ -385,45 +435,61 @@ namespace orc {
// test dictionary encoding with index disabled
// the decision of using dictionary if made at the end of 1st stripe
TEST(DictionaryEncoding, writeStringDictionaryEncodingWithoutIndex) {
- testStringDictionary(false, DICT_THRESHOLD);
- testStringDictionary(false, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionary(false, DICT_THRESHOLD, enableEncodedBlock);
+ testStringDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
// test dictionary encoding with index enabled
// the decision of using dictionary if made at the end of 1st row group
TEST(DictionaryEncoding, writeStringDictionaryEncodingWithIndex) {
- testStringDictionary(true, DICT_THRESHOLD);
- testStringDictionary(true, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionary(true, DICT_THRESHOLD, enableEncodedBlock);
+ testStringDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeVarcharDictionaryEncodingWithoutIndex) {
- testVarcharDictionary(false, DICT_THRESHOLD);
- testVarcharDictionary(false, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testVarcharDictionary(false, DICT_THRESHOLD, enableEncodedBlock);
+ testVarcharDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeVarcharDictionaryEncodingWithIndex) {
- testVarcharDictionary(true, DICT_THRESHOLD);
- testVarcharDictionary(true, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testVarcharDictionary(true, DICT_THRESHOLD, enableEncodedBlock);
+ testVarcharDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeCharDictionaryEncodingWithoutIndex) {
- testCharDictionary(false, DICT_THRESHOLD);
- testCharDictionary(false, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testCharDictionary(false, DICT_THRESHOLD, enableEncodedBlock);
+ testCharDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeCharDictionaryEncodingWithIndex) {
- testCharDictionary(true, DICT_THRESHOLD);
- testCharDictionary(true, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testCharDictionary(true, DICT_THRESHOLD, enableEncodedBlock);
+ testCharDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, stringDictionaryWithNullWithIndex) {
- testStringDictionaryWithNull(DICT_THRESHOLD, true);
- testStringDictionaryWithNull(FALLBACK_THRESHOLD, true);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionaryWithNull(DICT_THRESHOLD, true, enableEncodedBlock);
+ testStringDictionaryWithNull(FALLBACK_THRESHOLD, true,
enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, stringDictionaryWithNullWithoutIndex) {
- testStringDictionaryWithNull(DICT_THRESHOLD, false);
- testStringDictionaryWithNull(FALLBACK_THRESHOLD, false);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionaryWithNull(DICT_THRESHOLD, false, enableEncodedBlock);
+ testStringDictionaryWithNull(FALLBACK_THRESHOLD, false,
enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, multipleStripesWithIndex) {
diff --git a/c++/test/meson.build b/c++/test/meson.build
index 75dcbb094..a8d30a6b9 100644
--- a/c++/test/meson.build
+++ b/c++/test/meson.build
@@ -72,7 +72,6 @@ orc_test = executable(
zlib_dep,
gtest_dep,
gmock_dep,
- sparsehash_c11_dep,
],
)
diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
index c494710ba..c77d3f1f5 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -583,46 +583,47 @@ endif ()
# ----------------------------------------------------------------------
# SPARSEHASH
+if(BUILD_SPARSEHASH)
+ set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install")
+ set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google")
+ set(SPARSEHASH_CMAKE_ARGS
+ -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME}
+ -DBUILD_SHARED_LIBS=OFF
+ -DCMAKE_INSTALL_LIBDIR=lib
+ -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+ )
+ if (BUILD_POSITION_INDEPENDENT_LIB)
+ set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON)
+ endif ()
-set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install")
-set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google")
-set(SPARSEHASH_CMAKE_ARGS
- -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME}
- -DBUILD_SHARED_LIBS=OFF
- -DCMAKE_INSTALL_LIBDIR=lib
- -DCMAKE_POLICY_VERSION_MINIMUM=3.5
-)
-if (BUILD_POSITION_INDEPENDENT_LIB)
- set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON)
-endif ()
-
-if (CMAKE_VERSION VERSION_GREATER "3.7")
- set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS
${SPARSEHASH_CMAKE_ARGS})
- else()
- set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND
"${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS}
-
"${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/")
-endif()
+ if (CMAKE_VERSION VERSION_GREATER "3.7")
+ set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS
${SPARSEHASH_CMAKE_ARGS})
+ else()
+ set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND
"${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS}
+
"${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/")
+ endif()
-ExternalProject_Add(sparsehash_ep
- URL
"https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz"
- ${SPARSEHASH_CONFIGURE}
- ${THIRDPARTY_LOG_OPTIONS})
+ ExternalProject_Add(sparsehash_ep
+ URL
"https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz"
+ ${SPARSEHASH_CONFIGURE}
+ ${THIRDPARTY_LOG_OPTIONS})
-# sparsehash-c11 is header-only, create interface library
-add_library(orc_sparsehash INTERFACE)
-target_include_directories(orc_sparsehash INTERFACE
- $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}>
- $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-add_dependencies(orc_sparsehash sparsehash_ep)
+ # sparsehash-c11 is header-only, create interface library
+ add_library(orc_sparsehash INTERFACE)
+ target_include_directories(orc_sparsehash INTERFACE
+ $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}>
+ $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+ add_dependencies(orc_sparsehash sparsehash_ep)
-list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash")
-list (APPEND ORC_INSTALL_INTERFACE_TARGETS
"$<INSTALL_INTERFACE:orc::vendored_sparsehash>")
+ list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash")
+ list (APPEND ORC_INSTALL_INTERFACE_TARGETS
"$<INSTALL_INTERFACE:orc::vendored_sparsehash>")
-add_library (orc::sparsehash ALIAS orc_sparsehash)
+ add_library (orc::sparsehash ALIAS orc_sparsehash)
+ set (SPARSEHASH_LIBRARIES orc::sparsehash)
+endif()
# ----------------------------------------------------------------------
# LIBHDFSPP
-
if(BUILD_LIBHDFSPP)
set (BUILD_LIBHDFSPP FALSE)
if(ORC_CXX_HAS_THREAD_LOCAL)
diff --git a/subprojects/sparsehash-c11.wrap b/subprojects/sparsehash-c11.wrap
index 4177861ce..de3d18748 100644
--- a/subprojects/sparsehash-c11.wrap
+++ b/subprojects/sparsehash-c11.wrap
@@ -27,4 +27,4 @@ source_fallback_url =
https://github.com/mesonbuild/wrapdb/releases/download/spa
wrapdb_version = 2.11.1-1
[provide]
-sparsehash-c11 = sparsehash_c11_dep
\ No newline at end of file
+sparsehash-c11 = sparsehash_c11_dep