This is an automated email from the ASF dual-hosted git repository.
william pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new a4ff1c8c4 ORC-1950: [C++] Make sure dictionary is sorted before
flushed into ORC file to follow ORC specs
a4ff1c8c4 is described below
commit a4ff1c8c4d1729cda4583ef55da807442b611749
Author: taiyang-li <[email protected]>
AuthorDate: Wed Jul 23 23:44:48 2025 -0700
ORC-1950: [C++] Make sure dictionary is sorted before flushed into ORC file
to follow ORC specs
### What changes were proposed in this pull request?
Make sure dictionary is sorted before flushed into ORC file to follow ORC
specs. The
[issue](https://github.com/apache/orc/pull/2321#discussion_r2219569602) was
brought by https://github.com/apache/orc/pull/2336.
### Why are the changes needed?
### How was this patch tested?
### Was this patch authored or co-authored using generative AI tooling?
Closes #2337 from taiyang-li/make_dict_sorted.
Authored-by: taiyang-li <[email protected]>
Signed-off-by: William Hyun <[email protected]>
---
CMakeLists.txt | 4 +
c++/src/CMakeLists.txt | 7 +-
c++/src/ColumnWriter.cc | 111 ++----------------------
c++/src/Dictionary.cc | 99 ++++++++++++++++++++++
c++/src/Dictionary.hh | 104 +++++++++++++++++++++++
c++/src/meson.build | 4 +
c++/test/CMakeLists.txt | 1 -
c++/test/TestDictionaryEncoding.cc | 144 +++++++++++++++++++++++---------
c++/test/meson.build | 1 -
cmake_modules/ThirdpartyToolchain.cmake | 65 +++++++-------
subprojects/sparsehash-c11.wrap | 2 +-
11 files changed, 364 insertions(+), 178 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9d036aa8e..3454e4ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,10 @@ option (BUILD_LIBHDFSPP
"Include LIBHDFSPP library in the build process"
OFF)
+option (BUILD_SPARSEHASH
+ "Include sparsehash library in the build process"
+ OFF)
+
option(BUILD_CPP_TESTS
"Build the googletest unit tests"
ON)
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index b8a168307..ae93e67d6 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -170,6 +170,7 @@ set(SOURCE_FILES
Compression.cc
ConvertColumnReader.cc
CpuInfoUtil.cc
+ Dictionary.cc
Exceptions.cc
Geospatial.cc
Int128.cc
@@ -212,8 +213,8 @@ target_link_libraries (orc
$<BUILD_INTERFACE:orc::snappy>
$<BUILD_INTERFACE:orc::lz4>
$<BUILD_INTERFACE:orc::zstd>
- $<BUILD_INTERFACE:orc::sparsehash>
$<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}>
+ $<BUILD_INTERFACE:${SPARSEHASH_LIBRARIES}>
)
target_include_directories (orc
@@ -232,6 +233,10 @@ if (BUILD_LIBHDFSPP)
target_compile_definitions(orc PUBLIC -DBUILD_LIBHDFSPP)
endif (BUILD_LIBHDFSPP)
+if (BUILD_SPARSEHASH)
+ target_compile_definitions(orc PUBLIC -DBUILD_SPARSEHASH)
+endif (BUILD_SPARSEHASH)
+
if (BUILD_CPP_ENABLE_METRICS)
message(STATUS "Enable the metrics collection")
target_compile_definitions(orc PUBLIC ENABLE_METRICS=1)
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 915277ef4..b9aac1a12 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -24,13 +24,12 @@
#include <memory>
#include "ByteRLE.hh"
#include "ColumnWriter.hh"
+#include "Dictionary.hh"
#include "RLE.hh"
#include "Statistics.hh"
#include "Timezone.hh"
#include "Utils.hh"
-#include <sparsehash/dense_hash_map>
-
namespace orc {
StreamsFactory::~StreamsFactory() {
// PASS
@@ -927,104 +926,6 @@ namespace orc {
ColumnWriter::finishStreams();
dataStream_->finishStream();
}
-
- /**
- * Implementation of increasing sorted string dictionary
- */
- class SortedStringDictionary {
- public:
- struct DictEntry {
- DictEntry(const char* str, size_t len) :
data(std::make_unique<std::string>(str, len)) {}
-
- std::unique_ptr<std::string> data;
- };
-
- SortedStringDictionary() : totalLength_(0) {
- /// Need to set empty key otherwise dense_hash_map will not work
correctly
- keyToIndex_.set_empty_key(std::string_view{});
- }
-
- // insert a new string into dictionary, return its insertion order
- size_t insert(const char* str, size_t len);
-
- // write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream* dataStream, RleEncoder*
lengthEncoder) const;
-
- // get dict entries in insertion order
- const std::vector<DictEntry>& getEntriesInInsertionOrder() const;
-
- // return count of entries
- size_t size() const;
-
- // return total length of strings in the dictioanry
- uint64_t length() const;
-
- void clear();
-
- private:
- // store dictionary entries in insertion order
- mutable std::vector<DictEntry> flatDict_;
-
- // map from string to its insertion order index
- google::dense_hash_map<std::string_view, size_t> keyToIndex_;
- uint64_t totalLength_;
-
- // use friend class here to avoid being bothered by const function calls
- friend class StringColumnWriter;
- friend class CharColumnWriter;
- friend class VarCharColumnWriter;
- // store indexes of insertion order in the dictionary for not-null rows
- std::vector<int64_t> idxInDictBuffer_;
- };
-
- // insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char* str, size_t len) {
- size_t index = flatDict_.size();
-
- auto it = keyToIndex_.find(std::string_view{str, len});
- if (it != keyToIndex_.end()) {
- return it->second;
- } else {
- flatDict_.emplace_back(str, len);
- totalLength_ += len;
-
- const auto& lastEntry = flatDict_.back();
- keyToIndex_.emplace(std::string_view{lastEntry.data->data(),
lastEntry.data->size()}, index);
- return index;
- }
- }
-
- // write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
- RleEncoder* lengthEncoder) const {
- for (const auto& entry : flatDict_) {
- dataStream->write(entry.data->data(), entry.data->size());
- lengthEncoder->write(static_cast<int64_t>(entry.data->size()));
- }
- }
-
- // get dict entries in insertion order
- const std::vector<SortedStringDictionary::DictEntry>&
- SortedStringDictionary::getEntriesInInsertionOrder() const {
- return flatDict_;
- }
-
- // return count of entries
- size_t SortedStringDictionary::size() const {
- return flatDict_.size();
- }
-
- // return total length of strings in the dictioanry
- uint64_t SortedStringDictionary::length() const {
- return totalLength_;
- }
-
- void SortedStringDictionary::clear() {
- totalLength_ = 0;
- keyToIndex_.clear();
- flatDict_.clear();
- }
-
class StringColumnWriter : public ColumnWriter {
public:
StringColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -1324,6 +1225,9 @@ namespace orc {
// flush dictionary data & length streams
dictionary.flush(dictStream.get(), dictLengthEncoder.get());
+ // convert index from insertion order to dictionary order
+ dictionary.reorder(dictionary.idxInDictBuffer_);
+
// write data sequences
int64_t* data = dictionary.idxInDictBuffer_.data();
if (enableIndex) {
@@ -1367,14 +1271,15 @@ namespace orc {
}
// get dictionary entries in insertion order
- const auto& entries = dictionary.getEntriesInInsertionOrder();
+ std::vector<const SortedStringDictionary::DictEntry*> entries;
+ dictionary.getEntriesInInsertionOrder(entries);
// store each length of the data into a vector
for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
const auto& dictEntry =
entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
- directDataStream->write(dictEntry.data->data(), dictEntry.data->size());
- directLengthEncoder->write(static_cast<int64_t>(dictEntry.data->size()));
+ directDataStream->write(dictEntry->data->data(),
dictEntry->data->size());
+
directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size()));
}
deleteDictStreams();
diff --git a/c++/src/Dictionary.cc b/c++/src/Dictionary.cc
new file mode 100644
index 000000000..9eb60bb5b
--- /dev/null
+++ b/c++/src/Dictionary.cc
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dictionary.hh"
+
+namespace orc {
+
+ // insert a new string into dictionary, return its insertion order
+ size_t SortedStringDictionary::insert(const char* str, size_t len) {
+ size_t index = flatDict_.size();
+
+ auto it = keyToIndex_.find(std::string_view{str, len});
+ if (it != keyToIndex_.end()) {
+ return it->second;
+ } else {
+ flatDict_.emplace_back(str, len, index);
+ totalLength_ += len;
+
+ const auto& lastEntry = flatDict_.back().entry;
+ keyToIndex_.emplace(std::string_view{lastEntry.data->data(),
lastEntry.data->size()}, index);
+ return index;
+ }
+ }
+
+ // write dictionary data & length to output buffer
+ void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+ RleEncoder* lengthEncoder) const {
+ std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
+
+ for (const auto& entryWithIndex : flatDict_) {
+ dataStream->write(entryWithIndex.entry.data->data(),
entryWithIndex.entry.data->size());
+
lengthEncoder->write(static_cast<int64_t>(entryWithIndex.entry.data->size()));
+ }
+ }
+
+ /**
+ * Reorder input index buffer from insertion order to dictionary order
+ *
+ * We require this function because string values are buffered by indexes
+ * in their insertion order. Only once the entire dictionary is complete
+ * can we obtain their sorted indexes in the dictionary, because the ORC
+ * specification demands that the dictionary be ordered. Therefore this
+ * function transforms the indexes from insertion order to dictionary
+ * value order for the final output.
+ */
+ void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
+ // iterate the dictionary to get mapping from insertion order to value
order
+ std::vector<size_t> mapping(flatDict_.size());
+ for (size_t i = 0; i < flatDict_.size(); ++i) {
+ mapping[flatDict_[i].index] = i;
+ }
+
+ // do the transformation
+ for (size_t i = 0; i != idxBuffer.size(); ++i) {
+ idxBuffer[i] =
static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
+ }
+ }
+
+ // get dict entries in insertion order
+ void SortedStringDictionary::getEntriesInInsertionOrder(
+ std::vector<const DictEntry*>& entries) const {
+ /// flatDict_ remains in insertion order until
[[SortedStringDictionary::flush]] is invoked, which sorts it.
+ entries.resize(flatDict_.size());
+ for (size_t i = 0; i < flatDict_.size(); ++i) {
+ entries[i] = &(flatDict_[i].entry);
+ }
+ }
+
+ // return count of entries
+ size_t SortedStringDictionary::size() const {
+ return flatDict_.size();
+ }
+
+ // return total length of strings in the dictionary
+ uint64_t SortedStringDictionary::length() const {
+ return totalLength_;
+ }
+
+ void SortedStringDictionary::clear() {
+ totalLength_ = 0;
+ keyToIndex_.clear();
+ flatDict_.clear();
+ }
+} // namespace orc
\ No newline at end of file
diff --git a/c++/src/Dictionary.hh b/c++/src/Dictionary.hh
new file mode 100644
index 000000000..dca15b115
--- /dev/null
+++ b/c++/src/Dictionary.hh
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <memory>
+#include <string>
+
+#ifdef BUILD_SPARSEHASH
+#include <sparsehash/dense_hash_map>
+#else
+#include <unordered_map>
+#endif
+
+#include "RLE.hh"
+
+namespace orc {
+ /**
+ * Implementation of increasing sorted string dictionary
+ */
+ class SortedStringDictionary {
+ public:
+ struct DictEntry {
+ DictEntry(const char* str, size_t len) :
data(std::make_unique<std::string>(str, len)) {}
+
+ std::unique_ptr<std::string> data;
+ };
+
+ struct DictEntryWithIndex {
+ DictEntryWithIndex(const char* str, size_t len, size_t index)
+ : entry(str, len), index(index) {}
+
+ DictEntry entry;
+ size_t index;
+ };
+
+ SortedStringDictionary() : totalLength_(0) {
+#ifdef BUILD_SPARSEHASH
+ /// Need to set empty key otherwise dense_hash_map will not work
correctly
+ keyToIndex_.set_empty_key(std::string_view{});
+#endif
+ }
+
+ // insert a new string into dictionary, return its insertion order
+ size_t insert(const char* str, size_t len);
+
+ // write dictionary data & length to output buffer
+ void flush(AppendOnlyBufferedStream* dataStream, RleEncoder*
lengthEncoder) const;
+
+ // reorder input index buffer from insertion order to dictionary order
+ void reorder(std::vector<int64_t>& idxBuffer) const;
+
+ // get dict entries in insertion order
+ void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+
+ // return count of entries
+ size_t size() const;
+
+ // return total length of strings in the dictionary
+ uint64_t length() const;
+
+ void clear();
+
+ private:
+ struct LessThan {
+ bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex&
r) {
+ return *l.entry.data < *r.entry.data; // use std::string's operator<
+ }
+ };
+ // store dictionary entries in insertion order
+ mutable std::vector<DictEntryWithIndex> flatDict_;
+
+#ifdef BUILD_SPARSEHASH
+ // map from string to its insertion order index
+ google::dense_hash_map<std::string_view, size_t> keyToIndex_;
+#else
+ std::unordered_map<std::string_view, size_t> keyToIndex_;
+#endif
+
+ uint64_t totalLength_;
+
+ // use friend class here to avoid being bothered by const function calls
+ friend class StringColumnWriter;
+ friend class CharColumnWriter;
+ friend class VarCharColumnWriter;
+ // store indexes of insertion order in the dictionary for not-null rows
+ std::vector<int64_t> idxInDictBuffer_;
+ };
+
+} // namespace orc
diff --git a/c++/src/meson.build b/c++/src/meson.build
index 44a98500f..885df0072 100644
--- a/c++/src/meson.build
+++ b/c++/src/meson.build
@@ -150,6 +150,7 @@ source_files += files(
'Compression.cc',
'ConvertColumnReader.cc',
'CpuInfoUtil.cc',
+ 'Dictionary.cc',
'Exceptions.cc',
'Geospatial.cc',
'Int128.cc',
@@ -180,6 +181,9 @@ threads_dep = dependency('threads')
orc_lib = library(
'orc',
sources: source_files,
+ cpp_args: [
+ '-DBUILD_SPARSEHASH'
+ ],
dependencies: [
orc_format_proto_dep,
protobuf_dep,
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt
index b0ee48f38..3261fedde 100644
--- a/c++/test/CMakeLists.txt
+++ b/c++/test/CMakeLists.txt
@@ -77,7 +77,6 @@ target_link_libraries (orc-test
orc::zlib
orc::gtest
orc::gmock
- orc::sparsehash
orc-test-include
)
diff --git a/c++/test/TestDictionaryEncoding.cc
b/c++/test/TestDictionaryEncoding.cc
index 40c1b1a60..c13ceb646 100644
--- a/c++/test/TestDictionaryEncoding.cc
+++ b/c++/test/TestDictionaryEncoding.cc
@@ -35,6 +35,11 @@ namespace orc {
const double DICT_THRESHOLD = 0.2; // make sure dictionary is used
const double FALLBACK_THRESHOLD = 0.0; // make sure fallback happens
+ static bool doubleEquals(double a, double b) {
+ const double EPSILON = 1e-9;
+ return std::fabs(a - b) < EPSILON;
+ }
+
static std::unique_ptr<Reader> createReader(MemoryPool* memoryPool,
std::unique_ptr<InputStream>
stream) {
ReaderOptions options;
@@ -42,12 +47,39 @@ namespace orc {
return createReader(std::move(stream), options);
}
- static std::unique_ptr<RowReader> createRowReader(Reader* reader) {
+ static void checkDictionaryEncoding(StringVectorBatch* batch) {
+ EXPECT_TRUE(batch->isEncoded);
+
+ const auto* encoded_batch = dynamic_cast<EncodedStringVectorBatch*>(batch);
+ EXPECT_TRUE(encoded_batch != nullptr);
+
+ const auto& dictionary = encoded_batch->dictionary;
+ EXPECT_TRUE(dictionary != nullptr);
+
+ // Check if the dictionary is sorted
+ std::string prev;
+ for (size_t i = 0; i < dictionary->dictionaryOffset.size() - 1; ++i) {
+ char* begin = nullptr;
+ int64_t length = 0;
+ dictionary->getValueByIndex(i, begin, length);
+
+ std::string curr = std::string(begin, static_cast<size_t>(length));
+ if (i) {
+ EXPECT_GT(curr, prev);
+ }
+
+ prev = std::move(curr);
+ }
+ }
+
+ static std::unique_ptr<RowReader> createRowReader(Reader* reader,
+ bool enableEncodedBlock =
false) {
RowReaderOptions rowReaderOpts;
+ rowReaderOpts.setEnableLazyDecoding(enableEncodedBlock);
return reader->createRowReader(rowReaderOpts);
}
- void testStringDictionary(bool enableIndex, double threshold) {
+ void testStringDictionary(bool enableIndex, double threshold, bool
enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:string>"));
@@ -87,16 +119,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(strBatch);
+ strBatch->decodeDictionary();
+ }
+
for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
std::string str(strBatch->data[i],
static_cast<size_t>(strBatch->length[i]));
EXPECT_EQ(i % dictionarySize, static_cast<uint64_t>(atoi(str.c_str())));
}
@@ -104,7 +141,7 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
- void testVarcharDictionary(bool enableIndex, double threshold) {
+ void testVarcharDictionary(bool enableIndex, double threshold, bool
enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:varchar(2)>"));
@@ -144,17 +181,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
- for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- varcharBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ varcharBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(varcharBatch);
+ varcharBatch->decodeDictionary();
+ }
+ for (uint64_t i = 0; i < rowCount; ++i) {
std::ostringstream os;
os << (i % dictionarySize);
EXPECT_FALSE(varcharBatch->length[i] > 2);
@@ -166,7 +207,7 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
- void testCharDictionary(bool enableIndex, double threshold) {
+ void testCharDictionary(bool enableIndex, double threshold, bool
enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:char(3)>"));
@@ -204,17 +245,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
- for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- charBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ charBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(charBatch);
+ charBatch->decodeDictionary();
+ }
+ for (uint64_t i = 0; i < rowCount; ++i) {
EXPECT_EQ(3, charBatch->length[i]);
std::string charsRead(charBatch->data[i],
static_cast<size_t>(charBatch->length[i]));
@@ -230,7 +275,8 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
- void testStringDictionaryWithNull(double threshold, bool enableIndex) {
+ void testStringDictionaryWithNull(double threshold, bool enableIndex,
+ bool enableEncodedBlock = false) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
std::unique_ptr<Type>
type(Type::buildTypeFromString("struct<col1:string>"));
@@ -277,17 +323,21 @@ namespace orc {
std::unique_ptr<InputStream> inStream(
new MemoryInputStream(memStream.getData(), memStream.getLength()));
std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
- std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get(),
enableEncodedBlock);
EXPECT_EQ(rowCount, reader->getNumberOfRows());
batch = rowReader->createRowBatch(rowCount);
EXPECT_EQ(true, rowReader->next(*batch));
EXPECT_EQ(rowCount, batch->numElements);
- for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+ if (doubleEquals(threshold, DICT_THRESHOLD) && enableEncodedBlock) {
+ checkDictionaryEncoding(strBatch);
+ strBatch->decodeDictionary();
+ }
+ for (uint64_t i = 0; i < rowCount; ++i) {
if (i % 2 == 0) {
EXPECT_FALSE(strBatch->notNull[i]);
} else {
@@ -357,9 +407,10 @@ namespace orc {
for (uint64_t stripe = 0; stripe != stripeCount; ++stripe) {
EXPECT_EQ(true, rowReader->next(*batch));
+ structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+ strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+
for (uint64_t i = 0; i < rowCount; ++i) {
- structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
- strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
std::string str(strBatch->data[i],
static_cast<size_t>(strBatch->length[i]));
EXPECT_EQ(i % dictionarySize,
static_cast<uint64_t>(atoi(str.c_str())));
}
@@ -368,7 +419,6 @@ namespace orc {
// test seeking to check positions
batch = rowReader->createRowBatch(1);
-
for (uint64_t stripe = 0; stripe != stripeCount; ++stripe) {
for (uint64_t i = 0; i < rowCount; i += 10000 / 2) {
rowReader->seekToRow(stripe * rowCount + i);
@@ -385,45 +435,61 @@ namespace orc {
// test dictionary encoding with index disabled
// the decision of using dictionary if made at the end of 1st stripe
TEST(DictionaryEncoding, writeStringDictionaryEncodingWithoutIndex) {
- testStringDictionary(false, DICT_THRESHOLD);
- testStringDictionary(false, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionary(false, DICT_THRESHOLD, enableEncodedBlock);
+ testStringDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
// test dictionary encoding with index enabled
// the decision of using dictionary if made at the end of 1st row group
TEST(DictionaryEncoding, writeStringDictionaryEncodingWithIndex) {
- testStringDictionary(true, DICT_THRESHOLD);
- testStringDictionary(true, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionary(true, DICT_THRESHOLD, enableEncodedBlock);
+ testStringDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeVarcharDictionaryEncodingWithoutIndex) {
- testVarcharDictionary(false, DICT_THRESHOLD);
- testVarcharDictionary(false, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testVarcharDictionary(false, DICT_THRESHOLD, enableEncodedBlock);
+ testVarcharDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeVarcharDictionaryEncodingWithIndex) {
- testVarcharDictionary(true, DICT_THRESHOLD);
- testVarcharDictionary(true, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testVarcharDictionary(true, DICT_THRESHOLD, enableEncodedBlock);
+ testVarcharDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeCharDictionaryEncodingWithoutIndex) {
- testCharDictionary(false, DICT_THRESHOLD);
- testCharDictionary(false, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testCharDictionary(false, DICT_THRESHOLD, enableEncodedBlock);
+ testCharDictionary(false, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, writeCharDictionaryEncodingWithIndex) {
- testCharDictionary(true, DICT_THRESHOLD);
- testCharDictionary(true, FALLBACK_THRESHOLD);
+ for (auto enableEncodedBlock : {false, true}) {
+ testCharDictionary(true, DICT_THRESHOLD, enableEncodedBlock);
+ testCharDictionary(true, FALLBACK_THRESHOLD, enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, stringDictionaryWithNullWithIndex) {
- testStringDictionaryWithNull(DICT_THRESHOLD, true);
- testStringDictionaryWithNull(FALLBACK_THRESHOLD, true);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionaryWithNull(DICT_THRESHOLD, true, enableEncodedBlock);
+ testStringDictionaryWithNull(FALLBACK_THRESHOLD, true,
enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, stringDictionaryWithNullWithoutIndex) {
- testStringDictionaryWithNull(DICT_THRESHOLD, false);
- testStringDictionaryWithNull(FALLBACK_THRESHOLD, false);
+ for (auto enableEncodedBlock : {false, true}) {
+ testStringDictionaryWithNull(DICT_THRESHOLD, false, enableEncodedBlock);
+ testStringDictionaryWithNull(FALLBACK_THRESHOLD, false,
enableEncodedBlock);
+ }
}
TEST(DictionaryEncoding, multipleStripesWithIndex) {
diff --git a/c++/test/meson.build b/c++/test/meson.build
index 75dcbb094..a8d30a6b9 100644
--- a/c++/test/meson.build
+++ b/c++/test/meson.build
@@ -72,7 +72,6 @@ orc_test = executable(
zlib_dep,
gtest_dep,
gmock_dep,
- sparsehash_c11_dep,
],
)
diff --git a/cmake_modules/ThirdpartyToolchain.cmake
b/cmake_modules/ThirdpartyToolchain.cmake
index c494710ba..c77d3f1f5 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -583,46 +583,47 @@ endif ()
# ----------------------------------------------------------------------
# SPARSEHASH
+if(BUILD_SPARSEHASH)
+ set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install")
+ set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google")
+ set(SPARSEHASH_CMAKE_ARGS
+ -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME}
+ -DBUILD_SHARED_LIBS=OFF
+ -DCMAKE_INSTALL_LIBDIR=lib
+ -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+ )
+ if (BUILD_POSITION_INDEPENDENT_LIB)
+ set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON)
+ endif ()
-set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install")
-set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google")
-set(SPARSEHASH_CMAKE_ARGS
- -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME}
- -DBUILD_SHARED_LIBS=OFF
- -DCMAKE_INSTALL_LIBDIR=lib
- -DCMAKE_POLICY_VERSION_MINIMUM=3.5
-)
-if (BUILD_POSITION_INDEPENDENT_LIB)
- set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON)
-endif ()
-
-if (CMAKE_VERSION VERSION_GREATER "3.7")
- set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS
${SPARSEHASH_CMAKE_ARGS})
- else()
- set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND
"${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS}
-
"${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/")
-endif()
+ if (CMAKE_VERSION VERSION_GREATER "3.7")
+ set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS
${SPARSEHASH_CMAKE_ARGS})
+ else()
+ set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND
"${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS}
+
"${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/")
+ endif()
-ExternalProject_Add(sparsehash_ep
- URL
"https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz"
- ${SPARSEHASH_CONFIGURE}
- ${THIRDPARTY_LOG_OPTIONS})
+ ExternalProject_Add(sparsehash_ep
+ URL
"https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz"
+ ${SPARSEHASH_CONFIGURE}
+ ${THIRDPARTY_LOG_OPTIONS})
-# sparsehash-c11 is header-only, create interface library
-add_library(orc_sparsehash INTERFACE)
-target_include_directories(orc_sparsehash INTERFACE
- $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}>
- $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-add_dependencies(orc_sparsehash sparsehash_ep)
+ # sparsehash-c11 is header-only, create interface library
+ add_library(orc_sparsehash INTERFACE)
+ target_include_directories(orc_sparsehash INTERFACE
+ $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}>
+ $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+ add_dependencies(orc_sparsehash sparsehash_ep)
-list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash")
-list (APPEND ORC_INSTALL_INTERFACE_TARGETS
"$<INSTALL_INTERFACE:orc::vendored_sparsehash>")
+ list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash")
+ list (APPEND ORC_INSTALL_INTERFACE_TARGETS
"$<INSTALL_INTERFACE:orc::vendored_sparsehash>")
-add_library (orc::sparsehash ALIAS orc_sparsehash)
+ add_library (orc::sparsehash ALIAS orc_sparsehash)
+ set (SPARSEHASH_LIBRARIES orc::sparsehash)
+endif()
# ----------------------------------------------------------------------
# LIBHDFSPP
-
if(BUILD_LIBHDFSPP)
set (BUILD_LIBHDFSPP FALSE)
if(ORC_CXX_HAS_THREAD_LOCAL)
diff --git a/subprojects/sparsehash-c11.wrap b/subprojects/sparsehash-c11.wrap
index 4177861ce..de3d18748 100644
--- a/subprojects/sparsehash-c11.wrap
+++ b/subprojects/sparsehash-c11.wrap
@@ -27,4 +27,4 @@ source_fallback_url =
https://github.com/mesonbuild/wrapdb/releases/download/spa
wrapdb_version = 2.11.1-1
[provide]
-sparsehash-c11 = sparsehash_c11_dep
\ No newline at end of file
+sparsehash-c11 = sparsehash_c11_dep