This is an automated email from the ASF dual-hosted git repository.

pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a329d88476 GH-36503: [C++] Make DictionaryArray::dictionary() 
thread-safe (#48905)
a329d88476 is described below

commit a329d88476ffdacaefc0e08b2c57f3bfa7a6af68
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu May 14 00:57:39 2026 +0900

    GH-36503: [C++] Make DictionaryArray::dictionary() thread-safe (#48905)
    
    ### Rationale for this change
    
    The `DictionaryArray::dictionary()` method has a data race during lazy 
initialization. When multiple threads concurrently call `dictionary()`, the 
check then initialize pattern allows multiple threads to pass the check 
simultaneously, creating multiple `Array` objects instead of one shared cached 
instance.
    
    There were a bit of discussion in 
https://github.com/apache/arrow/pull/36418#discussion_r1248319000 but I just 
propose the standard approach with the keep it simple, stupid spirit.
    
    ### What changes are included in this PR?
    
    This PR makes `DictionaryArray::dictionary` thread-safe by `std::once_flag` 
and `std::call_once`.
    
    ### Are these changes tested?
    
    Manually tested with the example below:
    
    ```c++
    #include <arrow/api.h>
    #include <thread>
    #include <vector>
    
    int main() {
      arrow::StringBuilder dict_builder;
      dict_builder.Append("foo").Append("bar").Append("baz");
      auto dict = dict_builder.Finish().ValueOrDie();
      arrow::Int32Builder indices_builder;
      indices_builder.AppendValues({0, 1, 2, 0, 1, 2});
      auto indices = indices_builder.Finish().ValueOrDie();
    
      auto dict_array_result = arrow::DictionaryArray::FromArrays(indices, 
dict);
      auto dict_array = std::static_pointer_cast<arrow::DictionaryArray>(
          dict_array_result.ValueOrDie());
    
      // Spawn 20 threads that concurrently access dictionary
      std::vector<std::thread> threads;
      for (int i = 0; i < 20; ++i) {
        threads.emplace_back([&dict_array, i]() {
          const auto& dictionary = dict_array->dictionary();
          results[i] = dictionary.get();
        });
      }
    
      // Verify all threads got the same cached dictionary
      for (int i = 1; i < 20; ++i) {
        if (results[i] != results[0]) {
          return 1;
        }
      }
    }
    ```
    
    ### Are there any user-facing changes?
    
    No, just a fix for a potential race condition.
    
    * GitHub Issue: #36503
    
    Authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/array/array_dict.cc | 6 ++----
 cpp/src/arrow/array/array_dict.h  | 5 ++++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/array/array_dict.cc 
b/cpp/src/arrow/array/array_dict.cc
index 2e54e6ec49..6b3511721d 100644
--- a/cpp/src/arrow/array/array_dict.cc
+++ b/cpp/src/arrow/array/array_dict.cc
@@ -108,10 +108,8 @@ DictionaryArray::DictionaryArray(const 
std::shared_ptr<DataType>& type,
 }
 
 const std::shared_ptr<Array>& DictionaryArray::dictionary() const {
-  if (!dictionary_) {
-    // TODO(GH-36503) this isn't thread safe
-    dictionary_ = MakeArray(data_->dictionary);
-  }
+  std::call_once(dictionary_init_flag_,
+                 [this]() { dictionary_ = MakeArray(data_->dictionary); });
   return dictionary_;
 }
 
diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h
index bf376b51f8..5844585734 100644
--- a/cpp/src/arrow/array/array_dict.h
+++ b/cpp/src/arrow/array/array_dict.h
@@ -19,6 +19,7 @@
 
 #include <cstdint>
 #include <memory>
+#include <mutex>
 
 #include "arrow/array/array_base.h"
 #include "arrow/array/data.h"
@@ -118,8 +119,10 @@ class ARROW_EXPORT DictionaryArray : public Array {
   const DictionaryType* dict_type_;
   std::shared_ptr<Array> indices_;
 
-  // Lazily initialized when invoking dictionary()
+  // Lazily initialized when invoking dictionary().
+  // Thread-safe initialization is ensured by dictionary_init_flag_.
   mutable std::shared_ptr<Array> dictionary_;
+  mutable std::once_flag dictionary_init_flag_;
 };
 
 /// \brief Helper class for incremental dictionary unification

Reply via email to