This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 981775ca feat: Encode dictionary schemas (#882)
981775ca is described below
commit 981775cad8542dee661aec0a9c0441bb2458f8be
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue May 12 21:46:38 2026 -0500
feat: Encode dictionary schemas (#882)
This PR adds support in the IPC writer for dictionary schemas, verifying
support by removing skips and adding cases for all the valid dictionary
index types in the parameterized test cases.
---
python/meson.build | 1 +
src/nanoarrow/ipc/decoder_test.cc | 65 +++++++++--------
src/nanoarrow/ipc/encoder.c | 144 +++++++++++++++++++++++++++++++-------
3 files changed, 153 insertions(+), 57 deletions(-)
diff --git a/python/meson.build b/python/meson.build
index 8a646400..a7e57c98 100644
--- a/python/meson.build
+++ b/python/meson.build
@@ -33,6 +33,7 @@ project(
'arrow-nanoarrow:ipc_with_zstd=enabled',
'arrow-nanoarrow:device=enabled',
'arrow-nanoarrow:namespace=PythonPkg',
+ 'arrow-nanoarrow:tests=disabled',
'zstd:bin_programs=false',
],
)
diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc
index 48ed0af1..e7feaf00 100644
--- a/src/nanoarrow/ipc/decoder_test.cc
+++ b/src/nanoarrow/ipc/decoder_test.cc
@@ -1211,17 +1211,6 @@ std::string ArrowSchemaToString(const struct ArrowSchema* schema) {
#if defined(NANOARROW_BUILD_TESTS_WITH_ARROW)
TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowTypeRoundtrip) {
- if (GetParam()->id() == arrow::Type::DICTIONARY) {
- GTEST_SKIP() << "Dictionary array encode is not yet supported";
- }
-
- if (GetParam()->id() == arrow::Type::EXTENSION &&
- std::static_pointer_cast<arrow::ExtensionType>(GetParam())->storage_type()->id() ==
- arrow::Type::DICTIONARY) {
- GTEST_SKIP()
- << "nanoarrow encoder cannot yet encode extension types with dictionary storage";
- }
-
nanoarrow::UniqueSchema schema;
ASSERT_TRUE(
arrow::ExportSchema(arrow::Schema({arrow::field("", GetParam())}),
schema.get())
@@ -1326,11 +1315,21 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) {
ASSERT_TRUE(maybe_batch.ok());
EXPECT_EQ(maybe_batch.ValueUnsafe()->ToString(), empty->ToString());
- // Arrow C++ MakeEmpty() loses the ordered=1 flag for dictionary types.
+ // Arrow C++ MakeEmpty() loses the ordered=1 flag and unsigned index types for
+ // dictionary types.
// https://github.com/apache/arrow/issues/49674
- // So for ordered dictionaries, we only check ToString() equality for empty batches.
- if (data_type->id() != arrow::Type::DICTIONARY ||
- !std::static_pointer_cast<arrow::DictionaryType>(data_type)->ordered()) {
+ // So for ordered dictionaries and unsigned index types, we only check ToString()
+ // equality for empty batches.
+ bool skip_equals_check = false;
+ if (data_type->id() == arrow::Type::DICTIONARY) {
+ auto dict_type = std::static_pointer_cast<arrow::DictionaryType>(data_type);
+ auto index_id = dict_type->index_type()->id();
+ bool is_unsigned = index_id == arrow::Type::UINT8 ||
+ index_id == arrow::Type::UINT16 ||
+ index_id == arrow::Type::UINT32 || index_id == arrow::Type::UINT64;
+ skip_equals_check = dict_type->ordered() || is_unsigned;
+ }
+ if (!skip_equals_check) {
EXPECT_TRUE(maybe_batch.ValueUnsafe()->Equals(*empty)) <<
empty->ToString();
}
@@ -1530,7 +1529,15 @@ INSTANTIATE_TEST_SUITE_P(
arrow::list(arrow::field("some_custom_name", arrow::int32(),
arrow::KeyValueMetadata::Make({"key1"},
{"value1"}))),
// Dictionary encoding
+ arrow::dictionary(arrow::int8(), arrow::utf8()),
+ arrow::dictionary(arrow::int16(), arrow::utf8()),
arrow::dictionary(arrow::int32(), arrow::utf8()),
+ arrow::dictionary(arrow::int64(), arrow::utf8()),
+ arrow::dictionary(arrow::uint8(), arrow::utf8()),
+ arrow::dictionary(arrow::uint16(), arrow::utf8()),
+ arrow::dictionary(arrow::uint32(), arrow::utf8()),
+ arrow::dictionary(arrow::uint64(), arrow::utf8()),
+ // Ordered dictionary encoding
arrow::dictionary(arrow::int32(), arrow::utf8(), true),
// Extension type
arrow::extension::uuid(),
@@ -1577,12 +1584,6 @@ TEST_P(ArrowSchemaParameterizedTestFixture, NanoarrowIpcArrowSchemaRoundtrip) {
}
TEST_P(ArrowSchemaParameterizedTestFixture,
NanoarrowIpcNanoarrowSchemaRoundtrip) {
- for (const auto& field : GetParam()->fields()) {
- if (field->type()->id() == arrow::Type::DICTIONARY) {
- GTEST_SKIP() << "nanoarrow cannot yet encode arrays with dictionaries";
- }
- }
-
const std::shared_ptr<arrow::Schema>& arrow_schema = GetParam();
nanoarrow::UniqueSchema schema;
@@ -1620,33 +1621,31 @@ TEST_P(ArrowSchemaParameterizedTestFixture, NanoarrowIpcNanoarrowSchemaRoundtrip
}
TEST_P(ArrowSchemaParameterizedTestFixture,
NanoarrowIpcNanoarrowFooterRoundtrip) {
- for (const auto& field : GetParam()->fields()) {
- if (field->type()->id() == arrow::Type::DICTIONARY) {
- GTEST_SKIP() << "nanoarrow cannot yet encode arrays with dictionaries";
- }
- }
-
using namespace nanoarrow::literals;
const std::shared_ptr<arrow::Schema>& arrow_schema = GetParam();
nanoarrow::ipc::UniqueFooter footer;
ASSERT_TRUE(arrow::ExportSchema(*arrow_schema, &footer->schema).ok());
+ ArrowIpcDictionaryEncodingsInit(&footer->dictionaries);
+ ASSERT_EQ(
+ ArrowIpcDictionaryEncodingsAppendSchema(&footer->dictionaries,
&footer->schema),
+ NANOARROW_OK);
struct ArrowIpcFileBlock dummy_block = {1, 2, 3};
- EXPECT_EQ(
+ ASSERT_EQ(
ArrowBufferAppend(&footer->record_batch_blocks, &dummy_block,
sizeof(dummy_block)),
NANOARROW_OK);
nanoarrow::ipc::UniqueEncoder encoder;
- EXPECT_EQ(ArrowIpcEncoderInit(encoder.get()), NANOARROW_OK);
+ ASSERT_EQ(ArrowIpcEncoderInit(encoder.get()), NANOARROW_OK);
struct ArrowError error;
- EXPECT_EQ(ArrowIpcEncoderEncodeFooter(encoder.get(), footer.get(), &error),
+ ASSERT_EQ(ArrowIpcEncoderEncodeFooter(encoder.get(), footer.get(), &error),
NANOARROW_OK)
<< error.message;
nanoarrow::UniqueBuffer buffer;
- EXPECT_EQ(
+ ASSERT_EQ(
ArrowIpcEncoderFinalizeBuffer(encoder.get(), /*encapsulate=*/false,
buffer.get()),
NANOARROW_OK);
@@ -1654,9 +1653,9 @@ TEST_P(ArrowSchemaParameterizedTestFixture, NanoarrowIpcNanoarrowFooterRoundtrip
uint32_t footer_size_le = bswap32(static_cast<uint32_t>(buffer->size_bytes));
EXPECT_EQ(ArrowBufferAppendInt32(buffer.get(), footer_size_le),
NANOARROW_OK);
#else
- EXPECT_EQ(ArrowBufferAppendInt32(buffer.get(), buffer->size_bytes),
NANOARROW_OK);
+ ASSERT_EQ(ArrowBufferAppendInt32(buffer.get(), buffer->size_bytes),
NANOARROW_OK);
#endif
- EXPECT_EQ(ArrowBufferAppendStringView(buffer.get(), "ARROW1"_asv),
NANOARROW_OK);
+ ASSERT_EQ(ArrowBufferAppendStringView(buffer.get(), "ARROW1"_asv),
NANOARROW_OK);
struct ArrowBufferView buffer_view;
buffer_view.data.data = buffer->data;
diff --git a/src/nanoarrow/ipc/encoder.c b/src/nanoarrow/ipc/encoder.c
index 27c9da95..d959bfa9 100644
--- a/src/nanoarrow/ipc/encoder.c
+++ b/src/nanoarrow/ipc/encoder.c
@@ -45,6 +45,7 @@ struct ArrowIpcEncoderPrivate {
struct ArrowBuffer buffers;
struct ArrowBuffer nodes;
int encoding_footer;
+ struct ArrowIpcDictionaryEncodings dictionary_encodings;
};
ArrowErrorCode ArrowIpcEncoderInit(struct ArrowIpcEncoder* encoder) {
@@ -63,6 +64,7 @@ ArrowErrorCode ArrowIpcEncoderInit(struct ArrowIpcEncoder* encoder) {
private->encoding_footer = 0;
ArrowBufferInit(&private->buffers);
ArrowBufferInit(&private->nodes);
+ ArrowIpcDictionaryEncodingsInit(&private->dictionary_encodings);
return NANOARROW_OK;
}
@@ -74,6 +76,7 @@ void ArrowIpcEncoderReset(struct ArrowIpcEncoder* encoder) {
flatcc_builder_clear(&private->builder);
ArrowBufferReset(&private->nodes);
ArrowBufferReset(&private->buffers);
+ ArrowIpcDictionaryEncodingsReset(&private->dictionary_encodings);
ArrowFree(private);
}
memset(encoder, 0, sizeof(struct ArrowIpcEncoder));
@@ -337,9 +340,10 @@ static ArrowErrorCode ArrowIpcEncodeFieldType(flatcc_builder_t* builder,
}
}
-static ArrowErrorCode ArrowIpcEncodeField(flatcc_builder_t* builder,
- const struct ArrowSchema* schema,
- struct ArrowError* error);
+static ArrowErrorCode ArrowIpcEncodeField(
+ flatcc_builder_t* builder, const struct ArrowSchema* schema,
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowError* error);
static ArrowErrorCode ArrowIpcEncodeMetadata(flatcc_builder_t* builder,
const struct ArrowSchema* schema,
@@ -364,36 +368,109 @@ static ArrowErrorCode ArrowIpcEncodeMetadata(flatcc_builder_t* builder,
return NANOARROW_OK;
}
-static ArrowErrorCode ArrowIpcEncodeFields(flatcc_builder_t* builder,
- const struct ArrowSchema* schema,
- int (*push_start)(flatcc_builder_t*),
- ns(Field_ref_t) *
- (*push_end)(flatcc_builder_t*),
- struct ArrowError* error) {
+static ArrowErrorCode ArrowIpcEncodeFields(
+ flatcc_builder_t* builder, const struct ArrowSchema* schema,
+ int (*push_start)(flatcc_builder_t*),
+ ns(Field_ref_t) * (*push_end)(flatcc_builder_t*),
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowError* error) {
for (int i = 0; i < schema->n_children; i++) {
FLATCC_RETURN_UNLESS_0_NO_NS(push_start(builder), error);
- NANOARROW_RETURN_NOT_OK(ArrowIpcEncodeField(builder, schema->children[i],
error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcEncodeField(builder, schema->children[i],
dictionary_encodings, error));
FLATCC_RETURN_IF_NULL(push_end(builder), error);
}
return NANOARROW_OK;
}
-static ArrowErrorCode ArrowIpcEncodeField(flatcc_builder_t* builder,
- const struct ArrowSchema* schema,
- struct ArrowError* error) {
+static ArrowErrorCode ArrowIpcEncodeField(
+ flatcc_builder_t* builder, const struct ArrowSchema* schema,
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowError* error) {
FLATCC_RETURN_UNLESS_0(Field_name_create_str(builder, schema->name), error);
FLATCC_RETURN_UNLESS_0(
Field_nullable_add(builder, (schema->flags & ARROW_FLAG_NULLABLE) != 0),
error);
struct ArrowSchemaView schema_view;
NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error));
+
+ if (schema_view.type == NANOARROW_TYPE_DICTIONARY) {
+ const struct ArrowIpcDictionaryEncoding* encoding =
+ ArrowIpcDictionaryEncodingsFind(dictionary_encodings, schema);
+
+ // We just computed these dictionary ids, so we should be able to resolve them
+ if (encoding == NULL) {
+ ArrowErrorSet(error, "Unexpected missing dictionary encoding for field");
+ return EINVAL;
+ }
+
+ // Determine the index type's bitWidth and is_signed from the storage_type
+ int32_t index_bitwidth;
+ flatbuffers_bool_t index_is_signed;
+ switch (schema_view.storage_type) {
+ case NANOARROW_TYPE_INT8:
+ index_bitwidth = 8;
+ index_is_signed = 1;
+ break;
+ case NANOARROW_TYPE_UINT8:
+ index_bitwidth = 8;
+ index_is_signed = 0;
+ break;
+ case NANOARROW_TYPE_INT16:
+ index_bitwidth = 16;
+ index_is_signed = 1;
+ break;
+ case NANOARROW_TYPE_UINT16:
+ index_bitwidth = 16;
+ index_is_signed = 0;
+ break;
+ case NANOARROW_TYPE_INT32:
+ index_bitwidth = 32;
+ index_is_signed = 1;
+ break;
+ case NANOARROW_TYPE_UINT32:
+ index_bitwidth = 32;
+ index_is_signed = 0;
+ break;
+ case NANOARROW_TYPE_INT64:
+ index_bitwidth = 64;
+ index_is_signed = 1;
+ break;
+ case NANOARROW_TYPE_UINT64:
+ index_bitwidth = 64;
+ index_is_signed = 0;
+ break;
+ default:
+ ArrowErrorSet(error, "Invalid dictionary index type: %s",
+ ArrowTypeString(schema_view.storage_type));
+ return EINVAL;
+ }
+
+ // Create the Int type for the index type
+ ns(Int_ref_t) index_type_ref =
+ ns(Int_create(builder, index_bitwidth, index_is_signed));
+ FLATCC_RETURN_IF_NULL(index_type_ref, error);
+
+ // Create the DictionaryEncoding with id, indexType, isOrdered, and dictionaryKind
+ flatbuffers_bool_t is_ordered = (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED) != 0;
+ ns(DictionaryEncoding_ref_t) dict_encoding_ref =
+ ns(DictionaryEncoding_create(builder, encoding->id, index_type_ref,
is_ordered,
+ ns(DictionaryKind_DenseArray)));
+ FLATCC_RETURN_IF_NULL(dict_encoding_ref, error);
+
+ // Add the dictionary encoding to the field
+ FLATCC_RETURN_UNLESS_0(Field_dictionary_add(builder, dict_encoding_ref),
error);
+
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view,
schema->dictionary, error));
+ }
+
NANOARROW_RETURN_NOT_OK(ArrowIpcEncodeFieldType(builder, &schema_view,
error));
if (schema->n_children != 0) {
FLATCC_RETURN_UNLESS_0(Field_children_start(builder), error);
- NANOARROW_RETURN_NOT_OK(ArrowIpcEncodeFields(builder, schema,
-
&ns(Field_children_push_start),
- &ns(Field_children_push_end),
error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcEncodeFields(builder, schema, &ns(Field_children_push_start),
+ &ns(Field_children_push_end),
dictionary_encodings, error));
FLATCC_RETURN_UNLESS_0(Field_children_end(builder), error);
}
@@ -407,9 +484,10 @@ static ArrowErrorCode ArrowIpcEncodeField(flatcc_builder_t* builder,
return NANOARROW_OK;
}
-static ArrowErrorCode ArrowIpcEncodeSchema(flatcc_builder_t* builder,
- const struct ArrowSchema* schema,
- struct ArrowError* error) {
+static ArrowErrorCode ArrowIpcEncodeSchema(
+ flatcc_builder_t* builder, const struct ArrowSchema* schema,
+ const struct ArrowIpcDictionaryEncodings* dictionary_encodings,
+ struct ArrowError* error) {
NANOARROW_DCHECK(schema->release != NULL);
if (strcmp(schema->format, "+s") != 0) {
@@ -427,9 +505,9 @@ static ArrowErrorCode ArrowIpcEncodeSchema(flatcc_builder_t* builder,
}
FLATCC_RETURN_UNLESS_0(Schema_fields_start(builder), error);
- NANOARROW_RETURN_NOT_OK(ArrowIpcEncodeFields(builder, schema,
- &ns(Schema_fields_push_start),
- &ns(Schema_fields_push_end),
error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcEncodeFields(builder, schema, &ns(Schema_fields_push_start),
+ &ns(Schema_fields_push_end), dictionary_encodings,
error));
FLATCC_RETURN_UNLESS_0(Schema_fields_end(builder), error);
FLATCC_RETURN_UNLESS_0(Schema_custom_metadata_start(builder), error);
@@ -461,7 +539,19 @@ ArrowErrorCode ArrowIpcEncoderEncodeSchema(struct ArrowIpcEncoder* encoder,
FLATCC_RETURN_UNLESS_0(Message_version_add(builder, ns(MetadataVersion_V5)),
error);
FLATCC_RETURN_UNLESS_0(Message_header_Schema_start(builder), error);
- NANOARROW_RETURN_NOT_OK(ArrowIpcEncodeSchema(builder, schema, error));
+
+ // Look for any fields of the schema that should be dictionary encoded
+ if (private->dictionary_encodings.encodings.size_bytes > 0) {
+ ArrowIpcDictionaryEncodingsReset(&private->dictionary_encodings);
+ ArrowIpcDictionaryEncodingsInit(&private->dictionary_encodings);
+ }
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ ArrowIpcDictionaryEncodingsAppendSchema(&private->dictionary_encodings,
schema),
+ error);
+
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcEncodeSchema(builder, schema, &private->dictionary_encodings,
error));
+
FLATCC_RETURN_UNLESS_0(Message_header_Schema_end(builder), error);
FLATCC_RETURN_UNLESS_0(Message_bodyLength_add(builder, 0), error);
@@ -538,6 +628,11 @@ static ArrowErrorCode ArrowIpcEncoderEncodeRecordBatchImpl(
return ENOTSUP;
}
+ if (array_view->dictionary != NULL) {
+ ArrowErrorSet(error, "Cannot encode dictionary arrays");
+ return ENOTSUP;
+ }
+
for (int64_t c = 0; c < array_view->n_children; ++c) {
const struct ArrowArrayView* child = array_view->children[c];
@@ -657,7 +752,8 @@ ArrowErrorCode ArrowIpcEncoderEncodeFooter(struct ArrowIpcEncoder* encoder,
FLATCC_RETURN_UNLESS_0(Footer_version_add(builder, ns(MetadataVersion_V5)),
error);
FLATCC_RETURN_UNLESS_0(Footer_schema_start(builder), error);
- NANOARROW_RETURN_NOT_OK(ArrowIpcEncodeSchema(builder, &footer->schema,
error));
+ NANOARROW_RETURN_NOT_OK(
+ ArrowIpcEncodeSchema(builder, &footer->schema, &footer->dictionaries,
error));
FLATCC_RETURN_UNLESS_0(Footer_schema_end(builder), error);
const struct ArrowIpcFileBlock* blocks =