This is an automated email from the ASF dual-hosted git repository.
Gabriel39 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ac877e5cc4f [fix](iceberg) Write binary columns with proper Arrow
types (#64949)
ac877e5cc4f is described below
commit ac877e5cc4f27de76753794a4948d55bd425bf0a
Author: Gabriel <[email protected]>
AuthorDate: Tue Jun 30 12:01:06 2026 +0800
[fix](iceberg) Write binary columns with proper Arrow types (#64949)
Doris used the Iceberg schema to build the Arrow schema for the Parquet
writer, but mapped Iceberg BINARY/FIXED/UUID to Arrow utf8. As a result,
Doris wrote binary Iceberg columns with string semantics, which could break
Spark/Iceberg vectorized reads with "Unsupported type: binary".
Map Iceberg BINARY to Arrow binary, FIXED(n) to fixed-size binary(n),
and UUID to fixed-size binary(16). Add FixedType length access for
schema conversion.
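Also allow STRING and VARBINARY columns to be written into Arrow
binary/fixed-size binary builders so existing Doris-side mappings can
still produce the correct Iceberg Parquet schema. Fixed-size binary
writes validate the byte width before appending; CHAR values shorter than
the width are zero-padded, and non-CHAR string values of 32 or 36
characters written to a 16-byte column are parsed as UUID text.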
Add ArrowSchemaUtil test coverage for the STRING/BINARY/FIXED/UUID mapping.
---
.../data_type_serde/data_type_string_serde.cpp | 118 +++++++++++++++++++
.../data_type_serde/data_type_varbinary_serde.cpp | 35 ++++++
be/src/format/arrow/arrow_block_convertor.cpp | 131 ++++++++++++++++++++-
be/src/format/table/iceberg/arrow_schema_util.cpp | 19 ++-
be/src/format/table/iceberg/arrow_schema_util.h | 1 +
be/src/format/table/iceberg/types.h | 2 +
.../data_type_serde/data_type_serde_arrow_test.cpp | 108 +++++++++++++++++
.../data_type_serde_varbinary_test.cpp | 18 +++
.../table/iceberg/arrow_schema_util_test.cpp | 28 +++++
9 files changed, 453 insertions(+), 7 deletions(-)
diff --git a/be/src/core/data_type_serde/data_type_string_serde.cpp
b/be/src/core/data_type_serde/data_type_string_serde.cpp
index ff340de9dcf..dc7667fefca 100644
--- a/be/src/core/data_type_serde/data_type_string_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_string_serde.cpp
@@ -17,6 +17,9 @@
#include "core/data_type_serde/data_type_string_serde.h"
+#include <array>
+#include <cstring>
+
#include "core/column/column_string.h"
#include "core/data_type/define_primitive_type.h"
#include "util/jsonb_document_cast.h"
@@ -25,6 +28,88 @@
namespace doris {
+namespace {
+
+int hex_value(char c) {
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ }
+ if (c >= 'a' && c <= 'f') {
+ return c - 'a' + 10;
+ }
+ if (c >= 'A' && c <= 'F') {
+ return c - 'A' + 10;
+ }
+ return -1;
+}
+
+Status parse_uuid_to_bytes(StringRef uuid, std::array<uint8_t, 16>* bytes) {
+ if (uuid.size != 32 && uuid.size != 36) {
+ return Status::InvalidArgument("Invalid UUID string length: {}",
uuid.size);
+ }
+
+ int hex_count = 0;
+ int high_nibble = -1;
+ int byte_index = 0;
+ for (size_t i = 0; i < uuid.size; ++i) {
+ char c = uuid.data[i];
+ if (uuid.size == 36 && (i == 8 || i == 13 || i == 18 || i == 23)) {
+ if (c != '-') {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+ continue;
+ }
+ if (c == '-') {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+
+ int value = hex_value(c);
+ if (value < 0) {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+ if (hex_count % 2 == 0) {
+ high_nibble = value;
+ } else {
+ (*bytes)[byte_index++] = static_cast<uint8_t>((high_nibble << 4) |
value);
+ }
+ ++hex_count;
+ }
+
+ if (hex_count != 32 || byte_index != 16) {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+ return Status::OK();
+}
+
+Status append_fixed_size_binary(arrow::FixedSizeBinaryBuilder& builder, const
IColumn& column,
+ StringRef string_ref, int byte_width, bool
pad_char_value,
+ bool convert_uuid_string) {
+ if (convert_uuid_string && byte_width == 16 &&
+ (string_ref.size == 32 || string_ref.size == 36)) {
+ std::array<uint8_t, 16> bytes;
+ RETURN_IF_ERROR(parse_uuid_to_bytes(string_ref, &bytes));
+ return checkArrowStatus(builder.Append(bytes.data()), column, builder);
+ }
+
+ if (string_ref.size == byte_width) {
+ return checkArrowStatus(builder.Append(reinterpret_cast<const
uint8_t*>(string_ref.data)),
+ column, builder);
+ }
+
+ if (pad_char_value && string_ref.size < byte_width) {
+ std::string padded_value(byte_width, '\0');
+ std::memcpy(padded_value.data(), string_ref.data, string_ref.size);
+ return checkArrowStatus(
+ builder.Append(reinterpret_cast<const
uint8_t*>(padded_value.data())), column,
+ builder);
+ }
+
+ return Status::InvalidArgument("Fixed size binary column expects {} bytes,
got {}", byte_width,
+ string_ref.size);
+}
+
+} // namespace
+
template <typename ColumnType>
Status DataTypeStringSerDeBase<ColumnType>::serialize_column_to_json(const
IColumn& column,
int64_t
start_idx,
@@ -243,9 +328,42 @@ Status
DataTypeStringSerDeBase<ColumnType>::write_column_to_arrow(
if (array_builder->type()->id() == arrow::Type::LARGE_STRING) {
auto& builder =
assert_cast<arrow::LargeStringBuilder&>(*array_builder);
return write_column_to_arrow_impl(column, null_map, builder, start,
end);
+ } else if (array_builder->type()->id() == arrow::Type::LARGE_BINARY) {
+ auto& builder =
assert_cast<arrow::LargeBinaryBuilder&>(*array_builder);
+ const auto& string_column = assert_cast<const ColumnType&>(column);
+ for (size_t string_i = start; string_i < end; ++string_i) {
+ if (null_map && (*null_map)[string_i]) {
+ RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column,
builder));
+ continue;
+ }
+ auto string_ref = string_column.get_data_at(string_i);
+ RETURN_IF_ERROR(checkArrowStatus(
+ builder.Append(reinterpret_cast<const
uint8_t*>(string_ref.data),
+ cast_set<int64_t, size_t,
false>(string_ref.size)),
+ column, builder));
+ }
+ return Status::OK();
} else if (array_builder->type()->id() == arrow::Type::STRING) {
auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
return write_column_to_arrow_impl(column, null_map, builder, start,
end);
+ } else if (array_builder->type()->id() == arrow::Type::BINARY) {
+ auto& builder = assert_cast<arrow::BinaryBuilder&>(*array_builder);
+ return write_column_to_arrow_impl(column, null_map, builder, start,
end);
+ } else if (array_builder->type()->id() == arrow::Type::FIXED_SIZE_BINARY) {
+ auto& builder =
assert_cast<arrow::FixedSizeBinaryBuilder&>(*array_builder);
+ const int byte_width =
+ static_cast<const
arrow::FixedSizeBinaryType&>(*array_builder->type()).byte_width();
+ const auto& string_column = assert_cast<const ColumnType&>(column);
+ for (size_t string_i = start; string_i < end; ++string_i) {
+ if (null_map && (*null_map)[string_i]) {
+ RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column,
builder));
+ continue;
+ }
+ auto string_ref = string_column.get_data_at(string_i);
+ RETURN_IF_ERROR(append_fixed_size_binary(builder, column,
string_ref, byte_width,
+ _type == TYPE_CHAR, _type
!= TYPE_CHAR));
+ }
+ return Status::OK();
} else {
return Status::InvalidArgument("Unsupported arrow type for string
column: {}",
array_builder->type()->name());
diff --git a/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
b/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
index 2b1069d3bbb..6aca595e84d 100644
--- a/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_varbinary_serde.cpp
@@ -70,9 +70,44 @@ Status DataTypeVarbinarySerDe::write_column_to_arrow(const
IColumn& column, cons
if (array_builder->type()->id() == arrow::Type::BINARY) {
auto& builder = assert_cast<arrow::BinaryBuilder&>(*array_builder);
return lambda_function(builder);
+ } else if (array_builder->type()->id() == arrow::Type::LARGE_BINARY) {
+ auto& builder =
assert_cast<arrow::LargeBinaryBuilder&>(*array_builder);
+ const auto& varbinary_column_data = assert_cast<const
ColumnVarbinary&>(column).get_data();
+ for (size_t i = start; i < end; ++i) {
+ if (null_map && (*null_map)[i]) {
+ RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column,
builder));
+ continue;
+ }
+ const auto& string_view = varbinary_column_data[i];
+ RETURN_IF_ERROR(checkArrowStatus(
+ builder.Append(reinterpret_cast<const
uint8_t*>(string_view.data()),
+ cast_set<int64_t, size_t,
false>(string_view.size())),
+ column, builder));
+ }
+ return Status::OK();
} else if (array_builder->type()->id() == arrow::Type::STRING) {
auto& builder = assert_cast<arrow::StringBuilder&>(*array_builder);
return lambda_function(builder);
+ } else if (array_builder->type()->id() == arrow::Type::FIXED_SIZE_BINARY) {
+ auto& builder =
assert_cast<arrow::FixedSizeBinaryBuilder&>(*array_builder);
+ const int byte_width =
+ static_cast<const
arrow::FixedSizeBinaryType&>(*array_builder->type()).byte_width();
+ const auto& varbinary_column_data = assert_cast<const
ColumnVarbinary&>(column).get_data();
+ for (size_t i = start; i < end; ++i) {
+ if (null_map && (*null_map)[i]) {
+ RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column,
builder));
+ continue;
+ }
+ const auto& string_view = varbinary_column_data[i];
+ if (string_view.size() != byte_width) {
+ return Status::InvalidArgument("Fixed size binary column
expects {} bytes, got {}",
+ byte_width, string_view.size());
+ }
+ RETURN_IF_ERROR(checkArrowStatus(
+ builder.Append(reinterpret_cast<const
uint8_t*>(string_view.data())), column,
+ builder));
+ }
+ return Status::OK();
} else {
return Status::InvalidArgument("Unsupported arrow type for varbinary
column: {}",
array_builder->type()->name());
diff --git a/be/src/format/arrow/arrow_block_convertor.cpp
b/be/src/format/arrow/arrow_block_convertor.cpp
index 91593898ac5..d6a4735c73a 100644
--- a/be/src/format/arrow/arrow_block_convertor.cpp
+++ b/be/src/format/arrow/arrow_block_convertor.cpp
@@ -26,11 +26,14 @@
#include <arrow/status.h>
#include <arrow/type.h>
#include <arrow/util/decimal.h>
+#include <arrow/util/key_value_metadata.h>
#include <arrow/visit_type_inline.h>
#include <arrow/visitor.h>
#include <cctz/time_zone.h>
#include <glog/logging.h>
+#include <array>
+#include <cstring>
#include <ctime>
#include <memory>
#include <utility>
@@ -39,6 +42,8 @@
#include "common/status.h"
#include "core/block/column_with_type_and_name.h"
#include "core/column/column.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_array.h"
#include "core/data_type/data_type_nullable.h"
@@ -52,6 +57,115 @@ class Array;
namespace doris {
+namespace {
+
+constexpr const char* ICEBERG_ORIGINAL_TYPE_KEY = "originalType";
+constexpr const char* ICEBERG_UUID_TYPE_VALUE = "uuid";
+
+bool is_iceberg_uuid_field(const std::shared_ptr<arrow::Field>& field) {
+ if (field == nullptr || !field->HasMetadata()) {
+ return false;
+ }
+ const auto result = field->metadata()->Get(ICEBERG_ORIGINAL_TYPE_KEY);
+ return result.ok() && result.ValueUnsafe() == ICEBERG_UUID_TYPE_VALUE;
+}
+
+int hex_value(char c) {
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ }
+ if (c >= 'a' && c <= 'f') {
+ return c - 'a' + 10;
+ }
+ if (c >= 'A' && c <= 'F') {
+ return c - 'A' + 10;
+ }
+ return -1;
+}
+
+Status parse_uuid_to_bytes(StringRef uuid, std::array<uint8_t, 16>* bytes) {
+ if (uuid.size == 16) {
+ std::memcpy(bytes->data(), uuid.data, bytes->size());
+ return Status::OK();
+ }
+ if (uuid.size != 32 && uuid.size != 36) {
+ return Status::InvalidArgument("Invalid UUID string length: {}",
uuid.size);
+ }
+
+ int hex_count = 0;
+ int high_nibble = -1;
+ int byte_index = 0;
+ for (size_t i = 0; i < uuid.size; ++i) {
+ char c = uuid.data[i];
+ if (uuid.size == 36 && (i == 8 || i == 13 || i == 18 || i == 23)) {
+ if (c != '-') {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+ continue;
+ }
+ if (c == '-') {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+
+ int value = hex_value(c);
+ if (value < 0) {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+ if (hex_count % 2 == 0) {
+ high_nibble = value;
+ } else {
+ (*bytes)[byte_index++] = static_cast<uint8_t>((high_nibble << 4) |
value);
+ }
+ ++hex_count;
+ }
+
+ if (hex_count != 32 || byte_index != 16) {
+ return Status::InvalidArgument("Invalid UUID string format");
+ }
+ return Status::OK();
+}
+
+Status write_iceberg_uuid_string_column_to_arrow(const IColumn& column, const
DataTypePtr& type,
+ arrow::ArrayBuilder*
array_builder, int64_t start,
+ int64_t end) {
+ if (array_builder->type()->id() != arrow::Type::FIXED_SIZE_BINARY) {
+ return Status::InvalidArgument("Iceberg UUID must be written to fixed
size binary");
+ }
+ const int byte_width =
+ static_cast<const
arrow::FixedSizeBinaryType&>(*array_builder->type()).byte_width();
+ if (byte_width != 16) {
+ return Status::InvalidArgument("Iceberg UUID expects 16 bytes, got
{}", byte_width);
+ }
+
+ auto& builder =
assert_cast<arrow::FixedSizeBinaryBuilder&>(*array_builder);
+ const IColumn* data_column = &column;
+ const NullMap* null_map = nullptr;
+ if (type->is_nullable()) {
+ const auto& nullable_column = assert_cast<const
ColumnNullable&>(column);
+ data_column = &nullable_column.get_nested_column();
+ null_map = &nullable_column.get_null_map_data();
+ }
+ if (!data_column->is_column_string()) {
+ return Status::InvalidArgument(
+ "Iceberg UUID string conversion expects string column, got {}",
+ data_column->get_name());
+ }
+
+ const auto& string_column = assert_cast<const ColumnString&>(*data_column);
+ for (size_t row = start; row < end; ++row) {
+ if (null_map != nullptr && (*null_map)[row]) {
+ RETURN_IF_ERROR(checkArrowStatus(builder.AppendNull(), column,
builder));
+ continue;
+ }
+ std::array<uint8_t, 16> bytes;
+ RETURN_IF_ERROR(parse_uuid_to_bytes(string_column.get_data_at(row),
&bytes));
+ RETURN_IF_ERROR(checkArrowStatus(builder.Append(bytes.data()), column,
builder));
+ }
+ return Status::OK();
+}
+
+} // namespace
+
Status
FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBatch>*
out) {
int num_fields = _schema->num_fields();
if (_block.columns() != num_fields) {
@@ -80,8 +194,11 @@ Status
FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBat
_cur_type = _block.get_by_position(idx).type;
auto column = _cur_col->convert_to_full_column_if_const();
auto arrow_type = _schema->field(idx)->type();
- if (arrow_type->name() == "utf8" && column->byte_size() >=
MAX_ARROW_UTF8) {
+ if (arrow_type->id() == arrow::Type::STRING && column->byte_size() >=
MAX_ARROW_UTF8) {
arrow_type = arrow::large_utf8();
+ } else if (arrow_type->id() == arrow::Type::BINARY &&
+ column->byte_size() >= MAX_ARROW_UTF8) {
+ arrow_type = arrow::large_binary();
}
std::unique_ptr<arrow::ArrayBuilder> builder;
auto arrow_st = arrow::MakeBuilder(_pool, arrow_type, &builder);
@@ -90,9 +207,15 @@ Status
FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBat
}
_cur_builder = builder.get();
try {
- RETURN_IF_ERROR(_cur_type->get_serde()->write_column_to_arrow(
- *column, nullptr, _cur_builder, _cur_start, _cur_start +
_cur_rows,
- _timezone_obj));
+ if (is_iceberg_uuid_field(_schema->field(idx)) &&
+
is_string_type(remove_nullable(_cur_type)->get_primitive_type())) {
+ RETURN_IF_ERROR(write_iceberg_uuid_string_column_to_arrow(
+ *column, _cur_type, _cur_builder, _cur_start,
_cur_start + _cur_rows));
+ } else {
+ RETURN_IF_ERROR(_cur_type->get_serde()->write_column_to_arrow(
+ *column, nullptr, _cur_builder, _cur_start, _cur_start
+ _cur_rows,
+ _timezone_obj));
+ }
} catch (std::exception& e) {
return Status::InternalError(
"Fail to convert block data to arrow data, type: {}, name:
{}, error: {}",
diff --git a/be/src/format/table/iceberg/arrow_schema_util.cpp
b/be/src/format/table/iceberg/arrow_schema_util.cpp
index e0bf830dfc8..e7186e6a486 100644
--- a/be/src/format/table/iceberg/arrow_schema_util.cpp
+++ b/be/src/format/table/iceberg/arrow_schema_util.cpp
@@ -25,6 +25,7 @@ namespace doris::iceberg {
const char* ArrowSchemaUtil::PARQUET_FIELD_ID = "PARQUET:field_id";
const char* ArrowSchemaUtil::ORIGINAL_TYPE = "originalType";
const char* ArrowSchemaUtil::MAP_TYPE_VALUE = "mapType";
+const char* ArrowSchemaUtil::UUID_TYPE_VALUE = "uuid";
Status ArrowSchemaUtil::convert(const Schema* schema, const std::string&
timezone,
std::vector<std::shared_ptr<arrow::Field>>&
fields) {
@@ -75,13 +76,25 @@ Status ArrowSchemaUtil::convert_to(const
iceberg::NestedField& field,
break;
}
- case iceberg::TypeID::BINARY:
case iceberg::TypeID::STRING:
- case iceberg::TypeID::UUID:
- case iceberg::TypeID::FIXED:
arrow_type = arrow::utf8();
break;
+ case iceberg::TypeID::BINARY:
+ arrow_type = arrow::binary();
+ break;
+
+ case iceberg::TypeID::UUID:
+ metadata[ORIGINAL_TYPE] = UUID_TYPE_VALUE;
+ arrow_type = arrow::fixed_size_binary(16);
+ break;
+
+ case iceberg::TypeID::FIXED: {
+ iceberg::FixedType* fixed_type =
static_cast<iceberg::FixedType*>(field.field_type());
+ arrow_type = arrow::fixed_size_binary(fixed_type->get_length());
+ break;
+ }
+
case iceberg::TypeID::DECIMAL: {
auto* dt = dynamic_cast<DecimalType*>(field.field_type());
arrow_type = arrow::decimal(dt->get_precision(), dt->get_scale());
diff --git a/be/src/format/table/iceberg/arrow_schema_util.h
b/be/src/format/table/iceberg/arrow_schema_util.h
index 5b2e368fd14..ce13f543b74 100644
--- a/be/src/format/table/iceberg/arrow_schema_util.h
+++ b/be/src/format/table/iceberg/arrow_schema_util.h
@@ -32,6 +32,7 @@ private:
static const char* PARQUET_FIELD_ID;
static const char* ORIGINAL_TYPE;
static const char* MAP_TYPE_VALUE;
+ static const char* UUID_TYPE_VALUE;
static Status convert_to(const iceberg::NestedField& field,
std::shared_ptr<arrow::Field>* arrow_field,
diff --git a/be/src/format/table/iceberg/types.h
b/be/src/format/table/iceberg/types.h
index 53c54e238fa..1594003fccd 100644
--- a/be/src/format/table/iceberg/types.h
+++ b/be/src/format/table/iceberg/types.h
@@ -289,6 +289,8 @@ public:
return ss.str();
}
+ int get_length() const { return length; }
+
private:
int length;
};
diff --git a/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
b/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
index 63a3819cd35..7b20c2f82d0 100644
--- a/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
+++ b/be/test/core/data_type_serde/data_type_serde_arrow_test.cpp
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+#include <arrow/array/array_binary.h>
+#include <arrow/array/array_nested.h>
#include <arrow/array/builder_base.h>
#include <arrow/array/builder_binary.h>
#include <arrow/array/builder_decimal.h>
@@ -25,6 +27,7 @@
#include <arrow/type.h>
#include <arrow/type_fwd.h>
#include <arrow/util/decimal.h>
+#include <arrow/util/key_value_metadata.h>
#include <arrow/visit_type_inline.h>
#include <arrow/visitor.h>
#include <gen_cpp/Descriptors_types.h>
@@ -35,6 +38,7 @@
#include <cmath>
#include <cstdint>
+#include <cstring>
#include <iostream>
#include <memory>
#include <string>
@@ -519,6 +523,110 @@ TEST(DataTypeSerDeArrowTest, BigStringSerDeTest) {
CommonDataTypeSerdeTest::compare_two_blocks(block, assert_block);
}
+TEST(DataTypeSerDeArrowTest, IcebergUuidStringToFixedSizeBinary) {
+ auto block = std::make_shared<Block>();
+ auto strcol = ColumnString::create();
+ strcol->insert_data("550e8400-e29b-41d4-a716-446655440000", 36);
+ strcol->insert_data("00112233445566778899aabbccddeeff", 32);
+ DataTypePtr data_type(std::make_shared<DataTypeString>());
+ block->insert(ColumnWithTypeAndName(strcol->get_ptr(), data_type,
"uuid_col"));
+
+ auto metadata = arrow::KeyValueMetadata::Make({"originalType"}, {"uuid"});
+ auto schema =
+ arrow::schema({arrow::field("uuid_col",
arrow::fixed_size_binary(16), true, metadata)});
+
+ std::shared_ptr<arrow::RecordBatch> record_batch;
+ cctz::time_zone default_timezone;
+ Status status = convert_to_arrow_batch(*block, schema,
arrow::default_memory_pool(),
+ &record_batch, default_timezone);
+ ASSERT_TRUE(status.ok()) << status;
+ ASSERT_NE(nullptr, record_batch);
+ ASSERT_EQ(2, record_batch->num_rows());
+
+ auto uuid_array =
+
std::static_pointer_cast<arrow::FixedSizeBinaryArray>(record_batch->column(0));
+ ASSERT_EQ(16, uuid_array->byte_width());
+
+ const uint8_t expected0[] = {0x55, 0x0e, 0x84, 0x00, 0xe2, 0x9b, 0x41,
0xd4,
+ 0xa7, 0x16, 0x44, 0x66, 0x55, 0x44, 0x00,
0x00};
+ const uint8_t expected1[] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee,
0xff};
+ EXPECT_EQ(0, std::memcmp(uuid_array->GetValue(0), expected0,
sizeof(expected0)));
+ EXPECT_EQ(0, std::memcmp(uuid_array->GetValue(1), expected1,
sizeof(expected1)));
+}
+
+TEST(DataTypeSerDeArrowTest, NestedIcebergUuidStringToFixedSizeBinary) {
+ auto block = std::make_shared<Block>();
+ DataTypePtr data_type = std::make_shared<DataTypeStruct>(
+ std::vector<DataTypePtr> {std::make_shared<DataTypeString>()});
+ auto struct_column = data_type->create_column();
+
+ Struct row;
+
row.push_back(Field::create_field<TYPE_STRING>("550e8400-e29b-41d4-a716-446655440000"));
+ struct_column->insert(Field::create_field<TYPE_STRUCT>(row));
+ block->insert(ColumnWithTypeAndName(struct_column->get_ptr(), data_type,
"uuid_struct"));
+
+ auto metadata = arrow::KeyValueMetadata::Make({"originalType"}, {"uuid"});
+ auto schema = arrow::schema({arrow::field(
+ "uuid_struct",
+ arrow::struct_({arrow::field("id", arrow::fixed_size_binary(16),
true, metadata)}),
+ true)});
+
+ std::shared_ptr<arrow::RecordBatch> record_batch;
+ cctz::time_zone default_timezone;
+ Status status = convert_to_arrow_batch(*block, schema,
arrow::default_memory_pool(),
+ &record_batch, default_timezone);
+ ASSERT_TRUE(status.ok()) << status;
+
+ auto struct_array =
std::static_pointer_cast<arrow::StructArray>(record_batch->column(0));
+ auto uuid_array =
std::static_pointer_cast<arrow::FixedSizeBinaryArray>(struct_array->field(0));
+ const uint8_t expected[] = {0x55, 0x0e, 0x84, 0x00, 0xe2, 0x9b, 0x41, 0xd4,
+ 0xa7, 0x16, 0x44, 0x66, 0x55, 0x44, 0x00,
0x00};
+ EXPECT_EQ(0, std::memcmp(uuid_array->GetValue(0), expected,
sizeof(expected)));
+}
+
+TEST(DataTypeSerDeArrowTest, CharToFixedSizeBinaryPadsZeros) {
+ auto block = std::make_shared<Block>();
+ auto strcol = ColumnString::create();
+ strcol->insert_data("ab", 2);
+ DataTypePtr data_type(std::make_shared<DataTypeString>(4, TYPE_CHAR));
+ block->insert(ColumnWithTypeAndName(strcol->get_ptr(), data_type,
"fixed_col"));
+
+ auto schema = arrow::schema({arrow::field("fixed_col",
arrow::fixed_size_binary(4), true)});
+
+ std::shared_ptr<arrow::RecordBatch> record_batch;
+ cctz::time_zone default_timezone;
+ Status status = convert_to_arrow_batch(*block, schema,
arrow::default_memory_pool(),
+ &record_batch, default_timezone);
+ ASSERT_TRUE(status.ok()) << status;
+
+ auto fixed_array =
+
std::static_pointer_cast<arrow::FixedSizeBinaryArray>(record_batch->column(0));
+ const char expected[] = {'a', 'b', '\0', '\0'};
+ EXPECT_EQ(0, std::memcmp(fixed_array->GetValue(0), expected,
sizeof(expected)));
+}
+
+TEST(DataTypeSerDeArrowTest, StringToLargeBinary) {
+ auto block = std::make_shared<Block>();
+ auto strcol = ColumnString::create();
+ strcol->insert_data("binary-value", 12);
+ DataTypePtr data_type(std::make_shared<DataTypeString>());
+ block->insert(ColumnWithTypeAndName(strcol->get_ptr(), data_type,
"bin_col"));
+
+ auto schema = arrow::schema({arrow::field("bin_col",
arrow::large_binary(), true)});
+
+ std::shared_ptr<arrow::RecordBatch> record_batch;
+ cctz::time_zone default_timezone;
+ Status status = convert_to_arrow_batch(*block, schema,
arrow::default_memory_pool(),
+ &record_batch, default_timezone);
+ ASSERT_TRUE(status.ok()) << status;
+
+ auto binary_array =
std::static_pointer_cast<arrow::LargeBinaryArray>(record_batch->column(0));
+ ASSERT_EQ(12, binary_array->value_length(0));
+ const uint8_t* raw = binary_array->value_data()->data() +
binary_array->value_offset(0);
+ EXPECT_EQ(0, std::memcmp(raw, "binary-value", 12));
+}
+
TEST(DataTypeSerDeArrowTest, BlockConverterTest) {
std::vector<PrimitiveType> cols = {
TYPE_INT, TYPE_INT, TYPE_STRING, TYPE_DECIMAL128I,
TYPE_BOOLEAN,
diff --git a/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
b/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
index c078de14bab..6016bb4a37a 100644
--- a/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
+++ b/be/test/core/data_type_serde/data_type_serde_varbinary_test.cpp
@@ -281,6 +281,24 @@ TEST_F(DataTypeVarbinarySerDeTest,
ArrowBinaryAndStringWithNullsAndInvalidType)
}
}
+ // LargeBinaryBuilder path (no nulls)
+ {
+ auto builder = std::make_shared<arrow::LargeBinaryBuilder>();
+ auto st = serde.write_column_to_arrow(*col, nullptr, builder.get(), 0,
vals.size(), tz);
+ EXPECT_TRUE(st.ok()) << st.to_string();
+ std::shared_ptr<arrow::Array> arr;
+ ASSERT_TRUE(builder->Finish(&arr).ok());
+ auto* bin = dynamic_cast<arrow::LargeBinaryArray*>(arr.get());
+ ASSERT_NE(bin, nullptr);
+ ASSERT_EQ(bin->length(), static_cast<int>(vals.size()));
+ for (int i = 0; i < bin->length(); ++i) {
+ ASSERT_FALSE(bin->IsNull(i));
+ ASSERT_EQ(bin->value_length(i),
static_cast<int64_t>(vals[i].size()));
+ const uint8_t* raw = bin->value_data()->data() +
bin->value_offset(i);
+ EXPECT_EQ(memcmp(raw, vals[i].data(), vals[i].size()), 0);
+ }
+ }
+
// Unsupported builder type
{
arrow::Int32Builder ib;
diff --git a/be/test/format/table/iceberg/arrow_schema_util_test.cpp
b/be/test/format/table/iceberg/arrow_schema_util_test.cpp
index 2d897d2cec6..3379ed27257 100644
--- a/be/test/format/table/iceberg/arrow_schema_util_test.cpp
+++ b/be/test/format/table/iceberg/arrow_schema_util_test.cpp
@@ -224,6 +224,34 @@ TEST(ArrowSchemaUtilTest, test_list_field) {
EXPECT_EQ("32",
arrow_list->value_field()->metadata()->Get(pfid).ValueUnsafe());
}
+TEST(ArrowSchemaUtilTest, test_binary_field_types) {
+ std::vector<NestedField> nested_fields;
+ nested_fields.reserve(4);
+ nested_fields.emplace_back(false, 1, "str_col",
std::make_unique<StringType>(), std::nullopt);
+ nested_fields.emplace_back(false, 2, "bin_col",
std::make_unique<BinaryType>(), std::nullopt);
+ nested_fields.emplace_back(false, 3, "fixed_col",
std::make_unique<FixedType>(4), std::nullopt);
+ nested_fields.emplace_back(false, 4, "uuid_col",
std::make_unique<UUIDType>(), std::nullopt);
+
+ Schema schema(1, std::move(nested_fields));
+
+ std::vector<std::shared_ptr<arrow::Field>> fields;
+ Status st = ArrowSchemaUtil::convert(&schema, "utc", fields);
+ ASSERT_TRUE(st.ok()) << st;
+ ASSERT_EQ(4, fields.size());
+
+ EXPECT_EQ(arrow::Type::STRING, fields[0]->type()->id());
+ EXPECT_EQ(arrow::Type::BINARY, fields[1]->type()->id());
+
+ ASSERT_EQ(arrow::Type::FIXED_SIZE_BINARY, fields[2]->type()->id());
+ auto fixed_type =
std::static_pointer_cast<arrow::FixedSizeBinaryType>(fields[2]->type());
+ EXPECT_EQ(4, fixed_type->byte_width());
+
+ ASSERT_EQ(arrow::Type::FIXED_SIZE_BINARY, fields[3]->type()->id());
+ auto uuid_type =
std::static_pointer_cast<arrow::FixedSizeBinaryType>(fields[3]->type());
+ EXPECT_EQ(16, uuid_type->byte_width());
+ EXPECT_EQ("uuid",
fields[3]->metadata()->Get("originalType").ValueUnsafe());
+}
+
TEST(ArrowSchemaUtilTest, test_parquet_filed_id) {
std::string test_dir = "ut_dir/test_parquet_filed_id";
Status st;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]