Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions cpp/src/arrow/array/array_dict_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1761,4 +1761,84 @@ TEST(TestDictionaryUnifier, TableZeroColumns) {
AssertTablesEqual(*table, *unified);
}

// GH-49689: Ordered dictionary tests

TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) {
for (bool ordered : {true, false}) {
ARROW_SCOPED_TRACE("ordered = ", ordered);
auto dict_type = dictionary(int8(), utf8(), ordered);
std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));

auto builder_type = boxed_builder->type();
ASSERT_EQ(checked_cast<const DictionaryType&>(*builder_type).ordered(), ordered);
}
}

TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) {
for (bool ordered : {true, false}) {
ARROW_SCOPED_TRACE("ordered = ", ordered);
auto dict_type = dictionary(int8(), utf8(), ordered);
std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
auto& builder = checked_cast<DictionaryBuilder<StringType>&>(*boxed_builder);

ASSERT_OK(builder.Append("a"));
ASSERT_OK(builder.Append("b"));
ASSERT_OK(builder.Append("a"));

std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));

const auto& result_type = checked_cast<const DictionaryType&>(*result->type());
ASSERT_EQ(result_type.ordered(), ordered);

auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])");
auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]");
DictionaryArray expected(dict_type, ex_indices, ex_dict);
AssertArraysEqual(expected, *result);
}
}

TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) {
for (bool ordered : {true, false}) {
ARROW_SCOPED_TRACE("ordered = ", ordered);
auto dict_type = dictionary(int8(), utf8(), ordered);
auto list_type = list(field("item", dict_type));

std::unique_ptr<ArrayBuilder> boxed_builder;
ASSERT_OK(MakeBuilder(default_memory_pool(), list_type, &boxed_builder));
auto& list_builder = checked_cast<ListBuilder&>(*boxed_builder);
auto& dict_builder =
checked_cast<DictionaryBuilder<StringType>&>(*list_builder.value_builder());

ASSERT_OK(list_builder.Append());
ASSERT_OK(dict_builder.Append("a"));
ASSERT_OK(dict_builder.Append("b"));
ASSERT_OK(list_builder.Append());
ASSERT_OK(dict_builder.Append("a"));

std::shared_ptr<Array> result;
ASSERT_OK(list_builder.Finish(&result));

const auto& result_list_type = checked_cast<const ListType&>(*result->type());
const auto& result_dict_type =
checked_cast<const DictionaryType&>(*result_list_type.value_type());
ASSERT_EQ(result_dict_type.ordered(), ordered);
}
}

TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) {
for (bool ordered : {true, false}) {
ARROW_SCOPED_TRACE("ordered = ", ordered);
auto dict_type = dictionary(int8(), utf8(), ordered);
std::unique_ptr<ArrayBuilder> builder;
ASSERT_OK(MakeDictionaryBuilder(default_memory_pool(), dict_type,
/*dictionary=*/nullptr, &builder));

auto builder_type = builder->type();
ASSERT_EQ(checked_cast<const DictionaryType&>(*builder_type).ordered(), ordered);
}
}

Comment on lines +1764 to +1843
Copy link
Copy Markdown
Member Author

@thisisnic thisisnic May 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all AI-generated. I roughly understood what it does but am not confident in it, other than having checked it looks relatively idiomatic, and all of these tests except UnorderedTypeStaysUnordered failed before the PR and passed after.

} // namespace arrow
90 changes: 61 additions & 29 deletions cpp/src/arrow/array/builder_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,40 +154,43 @@ class DictionaryBuilderBase : public ArrayBuilder {
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(start_int_size, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(ordered) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(ordered) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(index_type, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(ordered) {}

template <typename B = BuilderType, typename T1 = T>
DictionaryBuilderBase(uint8_t start_int_size,
Expand All @@ -196,38 +199,41 @@ class DictionaryBuilderBase : public ArrayBuilder {
const std::shared_ptr<DataType>&>
value_type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(start_int_size, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(ordered) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(ordered) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
const std::shared_ptr<DataType>& index_type,
enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*value_type).byte_width()),
indices_builder_(index_type, pool, alignment),
value_type_(value_type) {}
value_type_(value_type),
ordered_(ordered) {}

template <typename T1 = T>
explicit DictionaryBuilderBase(
Expand All @@ -237,13 +243,15 @@ class DictionaryBuilderBase : public ArrayBuilder {
// This constructor doesn't check for errors. Use InsertMemoValues instead.
explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment)
int64_t alignment = kDefaultBufferAlignment,
bool ordered = false)
: ArrayBuilder(pool, alignment),
memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
delta_offset_(0),
byte_width_(-1),
indices_builder_(pool, alignment),
value_type_(dictionary->type()) {}
value_type_(dictionary->type()),
ordered_(ordered) {}

~DictionaryBuilderBase() override = default;

Expand Down Expand Up @@ -490,7 +498,7 @@ class DictionaryBuilderBase : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

std::shared_ptr<DataType> type() const override {
return ::arrow::dictionary(indices_builder_.type(), value_type_);
return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_);
}

protected:
Expand Down Expand Up @@ -561,6 +569,7 @@ class DictionaryBuilderBase : public ArrayBuilder {

BuilderType indices_builder_;
std::shared_ptr<DataType> value_type_;
bool ordered_ = false;
};

template <typename BuilderType>
Expand All @@ -571,31 +580,53 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
start_int_size,
const std::shared_ptr<DataType>& value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
indices_builder_(start_int_size, pool, alignment),
ordered_(ordered) {}

explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(pool) {}
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment,
bool ordered = false)
: ArrayBuilder(pool, alignment),
indices_builder_(pool, alignment),
ordered_(ordered) {}

explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
const std::shared_ptr<DataType>& value_type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(index_type, pool) {}
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment,
bool ordered = false)
: ArrayBuilder(pool, alignment),
indices_builder_(index_type, pool, alignment),
ordered_(ordered) {}

template <typename B = BuilderType>
explicit DictionaryBuilderBase(
enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
start_int_size,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment, bool ordered = false)
: ArrayBuilder(pool, alignment),
indices_builder_(start_int_size, pool, alignment),
ordered_(ordered) {}

explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(pool) {}
explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment,
bool ordered = false)
: ArrayBuilder(pool, alignment),
indices_builder_(pool, alignment),
ordered_(ordered) {}

explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(pool), indices_builder_(pool) {}
MemoryPool* pool = default_memory_pool(),
int64_t alignment = kDefaultBufferAlignment,
bool ordered = false)
: ArrayBuilder(pool, alignment),
indices_builder_(pool, alignment),
ordered_(ordered) {}

/// \brief Append a scalar null value
Status AppendNull() final {
Expand Down Expand Up @@ -647,7 +678,7 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {

Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
(*out)->type = dictionary((*out)->type, null());
(*out)->type = dictionary((*out)->type, null(), ordered_);
(*out)->dictionary = NullArray(0).data();
return Status::OK();
}
Expand All @@ -659,11 +690,12 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

std::shared_ptr<DataType> type() const override {
return ::arrow::dictionary(indices_builder_.type(), null());
return ::arrow::dictionary(indices_builder_.type(), null(), ordered_);
}

protected:
BuilderType indices_builder_;
bool ordered_ = false;
};

} // namespace internal
Expand Down
26 changes: 18 additions & 8 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -168,17 +168,21 @@ struct DictionaryBuilderCase {
template <typename ValueType>
Status CreateFor() {
using AdaptiveBuilderType = DictionaryBuilder<ValueType>;
using ExactBuilderType =
internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>;
if (dictionary != nullptr) {
out->reset(new AdaptiveBuilderType(dictionary, pool));
out->reset(
new AdaptiveBuilderType(dictionary, pool, kDefaultBufferAlignment, ordered));
} else if (exact_index_type) {
if (!is_integer(index_type->id())) {
return Status::TypeError("MakeBuilder: invalid index type ", *index_type);
}
out->reset(new internal::DictionaryBuilderBase<TypeErasedIntBuilder, ValueType>(
index_type, value_type, pool));
out->reset(new ExactBuilderType(index_type, value_type, pool,
kDefaultBufferAlignment, ordered));
} else {
auto start_int_size = index_type->byte_width();
out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool));
out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool,
kDefaultBufferAlignment, ordered));
}
return Status::OK();
}
Expand All @@ -188,8 +192,9 @@ struct DictionaryBuilderCase {
MemoryPool* pool;
const std::shared_ptr<DataType>& index_type;
const std::shared_ptr<DataType>& value_type;
const std::shared_ptr<Array>& dictionary;
std::shared_ptr<Array> dictionary;
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was updated as without it, we ended up with a dangling reference when nullptr is passed in, like when we're creating a builder from a type and don't have a dictionary array to pass in.

bool exact_index_type;
bool ordered;
std::unique_ptr<ArrayBuilder>* out;
};

Expand All @@ -206,6 +211,7 @@ struct MakeBuilderImpl {
dict_type.value_type(),
/*dictionary=*/nullptr,
exact_index_type,
dict_type.ordered(),
&out};
return visitor.Make();
}
Expand Down Expand Up @@ -332,9 +338,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
const std::shared_ptr<Array>& dictionary,
std::unique_ptr<ArrayBuilder>* out) {
const auto& dict_type = static_cast<const DictionaryType&>(*type);
DictionaryBuilderCase visitor = {
pool, dict_type.index_type(), dict_type.value_type(),
dictionary, /*exact_index_type=*/false, out};
DictionaryBuilderCase visitor = {pool,
dict_type.index_type(),
dict_type.value_type(),
dictionary,
/*exact_index_type=*/false,
dict_type.ordered(),
out};
return visitor.Make();
}

Expand Down
8 changes: 0 additions & 8 deletions r/src/r_to_arrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -996,14 +996,6 @@ class RDictionaryConverter<ValueType, enable_if_has_string_view<ValueType>>
Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() override {
ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish());

auto result_type = checked_cast<DictionaryType*>(result->type().get());
if (this->dict_type_->ordered() && !result_type->ordered()) {
// TODO: we should not have to do that, there is probably something wrong
// in the DictionaryBuilder code
result->data()->type =
arrow::dictionary(result_type->index_type(), result_type->value_type(), true);
}

return std::make_shared<ChunkedArray>(
std::make_shared<DictionaryArray>(result->data()));
}
Expand Down
Loading
Loading