diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index 22d6d1fc5ae9..5b1af6f8cb3f 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1761,4 +1761,84 @@ TEST(TestDictionaryUnifier, TableZeroColumns) { AssertTablesEqual(*table, *unified); } +// GH-49689: Ordered dictionary tests + +TEST(TestDictionaryBuilderOrdered, TypePreservesOrderedFlag) { + for (bool ordered : {true, false}) { + ARROW_SCOPED_TRACE("ordered = ", ordered); + auto dict_type = dictionary(int8(), utf8(), ordered); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + + auto builder_type = boxed_builder->type(); + ASSERT_EQ(checked_cast(*builder_type).ordered(), ordered); + } +} + +TEST(TestDictionaryBuilderOrdered, FinishPreservesOrderedFlag) { + for (bool ordered : {true, false}) { + ARROW_SCOPED_TRACE("ordered = ", ordered); + auto dict_type = dictionary(int8(), utf8(), ordered); + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + auto& builder = checked_cast&>(*boxed_builder); + + ASSERT_OK(builder.Append("a")); + ASSERT_OK(builder.Append("b")); + ASSERT_OK(builder.Append("a")); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + + const auto& result_type = checked_cast(*result->type()); + ASSERT_EQ(result_type.ordered(), ordered); + + auto ex_dict = ArrayFromJSON(utf8(), R"(["a", "b"])"); + auto ex_indices = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dict_type, ex_indices, ex_dict); + AssertArraysEqual(expected, *result); + } +} + +TEST(TestDictionaryBuilderOrdered, ListOfOrderedDictionary) { + for (bool ordered : {true, false}) { + ARROW_SCOPED_TRACE("ordered = ", ordered); + auto dict_type = dictionary(int8(), utf8(), ordered); + auto list_type = list(field("item", dict_type)); + + std::unique_ptr boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), list_type, &boxed_builder)); + auto& list_builder = checked_cast(*boxed_builder); + auto& dict_builder = + checked_cast&>(*list_builder.value_builder()); + + ASSERT_OK(list_builder.Append()); + ASSERT_OK(dict_builder.Append("a")); + ASSERT_OK(dict_builder.Append("b")); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(dict_builder.Append("a")); + + std::shared_ptr result; + ASSERT_OK(list_builder.Finish(&result)); + + const auto& result_list_type = checked_cast(*result->type()); + const auto& result_dict_type = + checked_cast(*result_list_type.value_type()); + ASSERT_EQ(result_dict_type.ordered(), ordered); + } +} + +TEST(TestDictionaryBuilderOrdered, MakeDictionaryBuilderPreservesOrdered) { + for (bool ordered : {true, false}) { + ARROW_SCOPED_TRACE("ordered = ", ordered); + auto dict_type = dictionary(int8(), utf8(), ordered); + std::unique_ptr builder; + ASSERT_OK(MakeDictionaryBuilder(default_memory_pool(), dict_type, + /*dictionary=*/nullptr, &builder)); + + auto builder_type = builder->type(); + ASSERT_EQ(checked_cast(*builder_type).ordered(), ordered); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 116c82049eea..269cdeee646d 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -154,26 +154,28 @@ class DictionaryBuilderBase : public ArrayBuilder { const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), indices_builder_(start_int_size, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(ordered) {} template explicit DictionaryBuilderBase( enable_if_t::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), indices_builder_(pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(ordered) {} template explicit DictionaryBuilderBase( @@ -181,13 +183,14 @@ class DictionaryBuilderBase : public ArrayBuilder { enable_if_t::value, const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(-1), indices_builder_(index_type, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(ordered) {} template DictionaryBuilderBase(uint8_t start_int_size, @@ -196,38 +199,41 @@ class DictionaryBuilderBase : public ArrayBuilder { const std::shared_ptr&> value_type, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(start_int_size, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(ordered) {} template explicit DictionaryBuilderBase( enable_if_fixed_size_binary&> value_type, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(ordered) {} template explicit DictionaryBuilderBase( const std::shared_ptr& index_type, enable_if_fixed_size_binary&> value_type, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, value_type)), delta_offset_(0), byte_width_(static_cast(*value_type).byte_width()), indices_builder_(index_type, pool, alignment), - value_type_(value_type) {} + value_type_(value_type), + ordered_(ordered) {} template explicit DictionaryBuilderBase( @@ -237,13 +243,15 @@ class DictionaryBuilderBase : public ArrayBuilder { // This constructor doesn't check for errors. Use InsertMemoValues instead. explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, MemoryPool* pool = default_memory_pool(), - int64_t alignment = kDefaultBufferAlignment) + int64_t alignment = kDefaultBufferAlignment, + bool ordered = false) : ArrayBuilder(pool, alignment), memo_table_(new internal::DictionaryMemoTable(pool, dictionary)), delta_offset_(0), byte_width_(-1), indices_builder_(pool, alignment), - value_type_(dictionary->type()) {} + value_type_(dictionary->type()), + ordered_(ordered) {} ~DictionaryBuilderBase() override = default; @@ -490,7 +498,7 @@ class DictionaryBuilderBase : public ArrayBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } std::shared_ptr type() const override { - return ::arrow::dictionary(indices_builder_.type(), value_type_); + return ::arrow::dictionary(indices_builder_.type(), value_type_, ordered_); } protected: @@ -561,6 +569,7 @@ class DictionaryBuilderBase : public ArrayBuilder { BuilderType indices_builder_; std::shared_ptr value_type_; + bool ordered_ = false; }; template @@ -571,31 +580,53 @@ class DictionaryBuilderBase : public ArrayBuilder { enable_if_t::value, uint8_t> start_int_size, const std::shared_ptr& value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) + : ArrayBuilder(pool, alignment), + indices_builder_(start_int_size, pool, alignment), + ordered_(ordered) {} explicit DictionaryBuilderBase(const std::shared_ptr& value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), indices_builder_(pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment, + bool ordered = false) + : ArrayBuilder(pool, alignment), + indices_builder_(pool, alignment), + ordered_(ordered) {} explicit DictionaryBuilderBase(const std::shared_ptr& index_type, const std::shared_ptr& value_type, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), indices_builder_(index_type, pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment, + bool ordered = false) + : ArrayBuilder(pool, alignment), + indices_builder_(index_type, pool, alignment), + ordered_(ordered) {} template explicit DictionaryBuilderBase( enable_if_t::value, uint8_t> start_int_size, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment, bool ordered = false) + : ArrayBuilder(pool, alignment), + indices_builder_(start_int_size, pool, alignment), + ordered_(ordered) {} - explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), indices_builder_(pool) {} + explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment, + bool ordered = false) + : ArrayBuilder(pool, alignment), + indices_builder_(pool, alignment), + ordered_(ordered) {} explicit DictionaryBuilderBase(const std::shared_ptr& dictionary, - MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(pool), indices_builder_(pool) {} + MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment, + bool ordered = false) + : ArrayBuilder(pool, alignment), + indices_builder_(pool, alignment), + ordered_(ordered) {} /// \brief Append a scalar null value Status AppendNull() final { @@ -647,7 +678,7 @@ class DictionaryBuilderBase : public ArrayBuilder { Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); - (*out)->type = dictionary((*out)->type, null()); + (*out)->type = dictionary((*out)->type, null(), ordered_); (*out)->dictionary = NullArray(0).data(); return Status::OK(); } @@ -659,11 +690,12 @@ class DictionaryBuilderBase : public ArrayBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } std::shared_ptr type() const override { - return ::arrow::dictionary(indices_builder_.type(), null()); + return ::arrow::dictionary(indices_builder_.type(), null(), ordered_); } protected: BuilderType indices_builder_; + bool ordered_ = false; }; } // namespace internal diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1a6a718c15e1..2190f1ce04a9 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -168,17 +168,21 @@ struct DictionaryBuilderCase { template Status CreateFor() { using AdaptiveBuilderType = DictionaryBuilder; + using ExactBuilderType = + internal::DictionaryBuilderBase; if (dictionary != nullptr) { - out->reset(new AdaptiveBuilderType(dictionary, pool)); + out->reset( + new AdaptiveBuilderType(dictionary, pool, kDefaultBufferAlignment, ordered)); } else if (exact_index_type) { if (!is_integer(index_type->id())) { return Status::TypeError("MakeBuilder: invalid index type ", *index_type); } - out->reset(new internal::DictionaryBuilderBase( - index_type, value_type, pool)); + out->reset(new ExactBuilderType(index_type, value_type, pool, + kDefaultBufferAlignment, ordered)); } else { auto start_int_size = index_type->byte_width(); - out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool)); + out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool, + kDefaultBufferAlignment, ordered)); } return Status::OK(); } @@ -188,8 +192,9 @@ struct DictionaryBuilderCase { MemoryPool* pool; const std::shared_ptr& index_type; const std::shared_ptr& value_type; - const std::shared_ptr& dictionary; + std::shared_ptr dictionary; bool exact_index_type; + bool ordered; std::unique_ptr* out; }; @@ -206,6 +211,7 @@ struct MakeBuilderImpl { dict_type.value_type(), /*dictionary=*/nullptr, exact_index_type, + dict_type.ordered(), &out}; return visitor.Make(); } @@ -332,9 +338,13 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr& const std::shared_ptr& dictionary, std::unique_ptr* out) { const auto& dict_type = static_cast(*type); - DictionaryBuilderCase visitor = { - pool, dict_type.index_type(), dict_type.value_type(), - dictionary, /*exact_index_type=*/false, out}; + DictionaryBuilderCase visitor = {pool, + dict_type.index_type(), + dict_type.value_type(), + dictionary, + /*exact_index_type=*/false, + dict_type.ordered(), + out}; return visitor.Make(); } diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 45d68043af5a..e521b0e8db32 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -996,14 +996,6 @@ class RDictionaryConverter> Result> ToChunkedArray() override { ARROW_ASSIGN_OR_RAISE(auto result, this->builder_->Finish()); - auto result_type = checked_cast(result->type().get()); - if (this->dict_type_->ordered() && !result_type->ordered()) { - // TODO: we should not have to do that, there is probably something wrong - // in the DictionaryBuilder code - result->data()->type = - arrow::dictionary(result_type->index_type(), result_type->value_type(), true); - } - return std::make_shared( std::make_shared(result->data())); } diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 8520160d1255..9fcc3e6b868d 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -643,6 +643,13 @@ test_that("arrow_array() handles vector -> list arrays (ARROW-7662)", { expect_array_roundtrip(list(factor(c("b", "a"), levels = c("a", "b"))), list_of(dictionary(int8(), utf8()))) expect_array_roundtrip(list(factor(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8()))) + # ordered factor (GH-49689) + expect_array_roundtrip( + list(ordered(c("b", "a"), levels = c("a", "b"))), + list_of(dictionary(int8(), utf8(), ordered = TRUE)) + ) + expect_array_roundtrip(list(ordered(NA, levels = c("a", "b"))), list_of(dictionary(int8(), utf8(), ordered = TRUE))) + # struct expect_array_roundtrip( list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), @@ -742,6 +749,13 @@ test_that("arrow_array() handles vector -> large list arrays", { as = large_list_of(dictionary(int8(), utf8())) ) + # ordered factor (GH-49689) + expect_array_roundtrip( + list(ordered(c("b", "a"), levels = c("a", "b"))), + large_list_of(dictionary(int8(), utf8(), ordered = TRUE)), + as = large_list_of(dictionary(int8(), utf8(), ordered = TRUE)) + ) + # struct expect_array_roundtrip( list(tibble::tibble(a = integer(0), b = integer(0), c = character(0), d = logical(0))), @@ -809,6 +823,13 @@ test_that("arrow_array() handles vector -> fixed size list arrays", { as = fixed_size_list_of(dictionary(int8(), utf8()), 2L) ) + # ordered factor (GH-49689) + expect_array_roundtrip( + list(ordered(c("b", "a"), levels = c("a", "b"))), + fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L), + as = fixed_size_list_of(dictionary(int8(), utf8(), ordered = TRUE), 2L) + ) + # struct expect_array_roundtrip( list(tibble::tibble(a = 1L, b = 1L, c = "", d = TRUE)),