This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a1f569c433 GH-49689: [R][C++] Parquets do not support list-columns of
ordered factors (ordered dictionaries) (#49937)
a1f569c433 is described below
commit a1f569c433ecdeca4a32aff20ae0ce868d73fb1a
Author: Nic Crane <[email protected]>
AuthorDate: Thu Jun 11 09:26:56 2026 +0100
GH-49689: [R][C++] Parquets do not support list-columns of ordered factors
(ordered dictionaries) (#49937)
### Rationale for this change
Ordered dictionaries inside nested types lose their `ordered` flag during
construction, because `DictionaryBuilder` doesn't track it.
### What changes are included in this PR?
Store the `ordered` flag in `DictionaryBuilderBase` and pass it through
when reconstructing the `DictionaryType` in `type()` and `FinishInternal()`.
Remove the R-side workaround that was patching this for top-level columns
only.
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
### AI usage
Written by Claude, reviewed by me and Codex (via
[roborev](https://github.com/roborev-dev/roborev)). I had Claude create the
tests first, to make sure they failed as expected, then had Claude make the
fixes and checked the tests passed. I questioned the approach as I went.
* GitHub Issue: #49689
Lead-authored-by: Nic Crane <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
cpp/src/arrow/array/array_dict_test.cc | 76 ++++++++++++++++++++++++++++
cpp/src/arrow/array/builder_dict.h | 90 +++++++++++++++++++++++-----------
cpp/src/arrow/builder.cc | 26 +++++++---
r/src/r_to_arrow.cpp | 8 ---
r/tests/testthat/test-Array.R | 21 ++++++++
5 files changed, 176 insertions(+), 45 deletions(-)
diff --git a/cpp/src/arrow/array/array_dict_test.cc
b/cpp/src/arrow/array/array_dict_test.cc
index 22d6d1fc5a..5f4335bcbc 100644
--- a/cpp/src/arrow/array/array_dict_test.cc
+++ b/cpp/src/arrow/array/array_dict_test.cc
@@ -1761,4 +1761,80 @@ TEST(TestDictionaryUnifier, TableZeroColumns) {
AssertTablesEqual(*table, *unified);
}
+// GH-49689: Ordered dictionary tests
+
+TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) {
+ for (bool ordered : {true, false}) {
+ ARROW_SCOPED_TRACE("ordered = ", ordered);
+ auto dict_type = dictionary(int8(), utf8(), ordered);
+ ASSERT_OK_AND_ASSIGN(auto boxed_builder, MakeBuilder(dict_type));
+
+ auto builder_type = boxed_builder->type();
+ ASSERT_EQ(checked_cast<const DictionaryType&>(*builder_type).ordered(),
ordered);
+ }
+}
+
+TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) {
+ for (bool ordered : {true, false}) {
+ ARROW_SCOPED_TRACE("ordered = ", ordered);
+ auto dict_type = dictionary(int8(), utf8(), ordered);
+ ASSERT_OK_AND_ASSIGN(auto boxed_builder, MakeBuilder(dict_type));
+ auto& builder =
checked_cast<DictionaryBuilder<StringType>&>(*boxed_builder);
+
+ ASSERT_OK(builder.Append("a"));
+ ASSERT_OK(builder.Append("b"));
+ ASSERT_OK(builder.Append("a"));
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ const auto& result_type = checked_cast<const
DictionaryType&>(*result->type());
+ ASSERT_EQ(result_type.ordered(), ordered);
+
+ auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])");
+ auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]");
+ DictionaryArray expected(dict_type, ex_indices, ex_dict);
+ AssertArraysEqual(expected, *result);
+ }
+}
+
+TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) {
+ for (bool ordered : {true, false}) {
+ ARROW_SCOPED_TRACE("ordered = ", ordered);
+ auto dict_type = dictionary(int8(), utf8(), ordered);
+ auto list_type = list(field("item", dict_type));
+
+ ASSERT_OK_AND_ASSIGN(auto boxed_builder, MakeBuilder(list_type));
+ auto& list_builder = checked_cast<ListBuilder&>(*boxed_builder);
+ auto& dict_builder =
+
checked_cast<DictionaryBuilder<StringType>&>(*list_builder.value_builder());
+
+ ASSERT_OK(list_builder.Append());
+ ASSERT_OK(dict_builder.Append("a"));
+ ASSERT_OK(dict_builder.Append("b"));
+ ASSERT_OK(list_builder.Append());
+ ASSERT_OK(dict_builder.Append("a"));
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(list_builder.Finish(&result));
+
+ const auto& result_list_type = checked_cast<const
ListType&>(*result->type());
+ const auto& result_dict_type =
+ checked_cast<const DictionaryType&>(*result_list_type.value_type());
+ ASSERT_EQ(result_dict_type.ordered(), ordered);
+ }
+}
+
+TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) {
+ for (bool ordered : {true, false}) {
+ ARROW_SCOPED_TRACE("ordered = ", ordered);
+ auto dict_type = dictionary(int8(), utf8(), ordered);
+ ASSERT_OK_AND_ASSIGN(auto builder,
+ MakeDictionaryBuilder(dict_type,
/*dictionary=*/nullptr));
+
+ auto builder_type = builder->type();
+ ASSERT_EQ(checked_cast<const DictionaryType&>(*builder_type).ordered(),
ordered);
+ }
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array/builder_dict.h
b/cpp/src/arrow/array/builder_dict.h
index 116c82049e..269cdeee64 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -154,26 +154,28 @@ class DictionaryBuilderBase : public ArrayBuilder {
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment, bool
ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(start_int_size, pool, alignment),
- value_type_(value_type) {}
+ value_type_(value_type),
+ ordered_(ordered) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_t<!is_fixed_size_binary_type<T1>::value, const
std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool, alignment),
- value_type_(value_type) {}
+ value_type_(value_type),
+ ordered_(ordered) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
@@ -181,13 +183,14 @@ class DictionaryBuilderBase : public ArrayBuilder {
enable_if_t<!is_fixed_size_binary_type<T1>::value, const
std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(index_type, pool, alignment),
- value_type_(value_type) {}
+ value_type_(value_type),
+ ordered_(ordered) {}
template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
@@ -196,38 +199,41 @@ class DictionaryBuilderBase : public ArrayBuilder {
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment, bool
ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(start_int_size, pool, alignment),
- value_type_(value_type) {}
+ value_type_(value_type),
+ ordered_(ordered) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(pool, alignment),
- value_type_(value_type) {}
+ value_type_(value_type),
+ ordered_(ordered) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(index_type, pool, alignment),
- value_type_(value_type) {}
+ value_type_(value_type),
+ ordered_(ordered) {}
template <typename T1 = T>
explicit DictionaryBuilderBase(
@@ -237,13 +243,15 @@ class DictionaryBuilderBase : public ArrayBuilder {
// This constructor doesn't check for errors. Use InsertMemoValues instead.
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool(),
- int64_t alignment = kDefaultBufferAlignment)
+ int64_t alignment = kDefaultBufferAlignment,
+ bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool, alignment),
- value_type_(dictionary->type()) {}
+ value_type_(dictionary->type()),
+ ordered_(ordered) {}
~DictionaryBuilderBase() override = default;
@@ -490,7 +498,7 @@ class DictionaryBuilderBase : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return
FinishTyped(out); }
std::shared_ptr<DataType> type() const override {
- return ::arrow::dictionary(indices_builder_.type(), value_type_);
+ return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_);
}
protected:
@@ -561,6 +569,7 @@ class DictionaryBuilderBase : public ArrayBuilder {
BuilderType indices_builder_;
std::shared_ptr<DataType> value_type_;
+ bool ordered_ = false;
};
template <typename BuilderType>
@@ -571,31 +580,53 @@ class DictionaryBuilderBase<BuilderType, NullType> :
public ArrayBuilder {
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
start_int_size,
const std::shared_ptr<DataType>& value_type,
- MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+ MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
+ : ArrayBuilder(pool, alignment),
+ indices_builder_(start_int_size, pool, alignment),
+ ordered_(ordered) {}
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
- MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), indices_builder_(pool) {}
+ MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment,
+ bool ordered = false)
+ : ArrayBuilder(pool, alignment),
+ indices_builder_(pool, alignment),
+ ordered_(ordered) {}
explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
const std::shared_ptr<DataType>& value_type,
- MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), indices_builder_(index_type, pool) {}
+ MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment,
+ bool ordered = false)
+ : ArrayBuilder(pool, alignment),
+ indices_builder_(index_type, pool, alignment),
+ ordered_(ordered) {}
template <typename B = BuilderType>
explicit DictionaryBuilderBase(
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
start_int_size,
- MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+ MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
+ : ArrayBuilder(pool, alignment),
+ indices_builder_(start_int_size, pool, alignment),
+ ordered_(ordered) {}
- explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), indices_builder_(pool) {}
+ explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment,
+ bool ordered = false)
+ : ArrayBuilder(pool, alignment),
+ indices_builder_(pool, alignment),
+ ordered_(ordered) {}
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
- MemoryPool* pool = default_memory_pool())
- : ArrayBuilder(pool), indices_builder_(pool) {}
+ MemoryPool* pool = default_memory_pool(),
+ int64_t alignment = kDefaultBufferAlignment,
+ bool ordered = false)
+ : ArrayBuilder(pool, alignment),
+ indices_builder_(pool, alignment),
+ ordered_(ordered) {}
/// \brief Append a scalar null value
Status AppendNull() final {
@@ -647,7 +678,7 @@ class DictionaryBuilderBase<BuilderType, NullType> : public
ArrayBuilder {
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
- (*out)->type = dictionary((*out)->type, null());
+ (*out)->type = dictionary((*out)->type, null(), ordered_);
(*out)->dictionary = NullArray(0).data();
return Status::OK();
}
@@ -659,11 +690,12 @@ class DictionaryBuilderBase<BuilderType, NullType> :
public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return
FinishTyped(out); }
std::shared_ptr<DataType> type() const override {
- return ::arrow::dictionary(indices_builder_.type(), null());
+ return ::arrow::dictionary(indices_builder_.type(), null(), ordered_);
}
protected:
BuilderType indices_builder_;
+ bool ordered_ = false;
};
} // namespace internal
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 1a6a718c15..2190f1ce04 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -168,17 +168,21 @@ struct DictionaryBuilderCase {
template <typename ValueType>
Status CreateFor() {
using AdaptiveBuilderType = DictionaryBuilder<ValueType>;
+ using ExactBuilderType =
+ internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>;
if (dictionary != nullptr) {
- out->reset(new AdaptiveBuilderType(dictionary, pool));
+ out->reset(
+ new AdaptiveBuilderType(dictionary, pool, kDefaultBufferAlignment,
ordered));
} else if (exact_index_type) {
if (!is_integer(index_type->id())) {
return Status::TypeError("MakeBuilder: invalid index type ",
*index_type);
}
- out->reset(new internal::DictionaryBuilderBase<TypeErasedIntBuilder,
ValueType>(
- index_type, value_type, pool));
+ out->reset(new ExactBuilderType(index_type, value_type, pool,
+ kDefaultBufferAlignment, ordered));
} else {
auto start_int_size = index_type->byte_width();
- out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool));
+ out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool,
+ kDefaultBufferAlignment, ordered));
}
return Status::OK();
}
@@ -188,8 +192,9 @@ struct DictionaryBuilderCase {
MemoryPool* pool;
const std::shared_ptr<DataType>& index_type;
const std::shared_ptr<DataType>& value_type;
- const std::shared_ptr<Array>& dictionary;
+ std::shared_ptr<Array> dictionary;
bool exact_index_type;
+ bool ordered;
std::unique_ptr<ArrayBuilder>* out;
};
@@ -206,6 +211,7 @@ struct MakeBuilderImpl {
dict_type.value_type(),
/*dictionary=*/nullptr,
exact_index_type,
+ dict_type.ordered(),
&out};
return visitor.Make();
}
@@ -332,9 +338,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const
std::shared_ptr<DataType>&
const std::shared_ptr<Array>& dictionary,
std::unique_ptr<ArrayBuilder>* out) {
const auto& dict_type = static_cast<const DictionaryType&>(*type);
- DictionaryBuilderCase visitor = {
- pool, dict_type.index_type(), dict_type.value_type(),
- dictionary, /*exact_index_type=*/false, out};
+ DictionaryBuilderCase visitor = {pool,
+ dict_type.index_type(),
+ dict_type.value_type(),
+ dictionary,
+ /*exact_index_type=*/false,
+ dict_type.ordered(),
+ out};
return visitor.Make();
}
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index 4312572a5e..cbe404aba9 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -996,14 +996,6 @@ class RDictionaryConverter<ValueType,
enable_if_has_string_view<ValueType>>
Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() override {
ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish());
- auto result_type = checked_cast<DictionaryType*>(result->type().get());
- if (this->dict_type_->ordered() && !result_type->ordered()) {
- // TODO: we should not have to do that, there is probably something wrong
- // in the DictionaryBuilder code
- result->data()->type =
- arrow::dictionary(result_type->index_type(),
result_type->value_type(), true);
- }
-
return std::make_shared<ChunkedArray>(
std::make_shared<DictionaryArray>(result->data()));
}
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index 8520160d12..9fcc3e6b86 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -643,6 +643,13 @@ test_that("arrow_array() handles vector -> list arrays
(ARROW-7662)", {
expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))),
list_of(dictionary(int8(), utf8())))
expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))),
list_of(dictionary(int8(), utf8())))
+ # ordered factor (GH-49689)
+ expect_array_roundtrip(
+ list(ordered(c("b", "a"), levels = c("a", "b"))),
+ list_of(dictionary(int8(), utf8(), ordered = TRUE))
+ )
+ expect_array_roundtrip(list(ordered(NA, levels = c("a", "b"))),
list_of(dictionary(int8(), utf8(), ordered = TRUE)))
+
# struct
expect_array_roundtrip(
list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d =
logical(0))),
@@ -742,6 +749,13 @@ test_that("arrow_array() handles vector -> large list
arrays", {
as = large_list_of(dictionary(int8(), utf8()))
)
+ # ordered factor (GH-49689)
+ expect_array_roundtrip(
+ list(ordered(c("b", "a"), levels = c("a", "b"))),
+ large_list_of(dictionary(int8(), utf8(), ordered = TRUE)),
+ as = large_list_of(dictionary(int8(), utf8(), ordered = TRUE))
+ )
+
# struct
expect_array_roundtrip(
list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d =
logical(0))),
@@ -809,6 +823,13 @@ test_that("arrow_array() handles vector -> fixed size list
arrays", {
as = fixed_size_list_of(dictionary(int8(), utf8()), 2L)
)
+ # ordered factor (GH-49689)
+ expect_array_roundtrip(
+ list(ordered(c("b", "a"), levels = c("a", "b"))),
+ fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L),
+ as = fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L)
+ )
+
# struct
expect_array_roundtrip(
list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),