This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 35492968ea9 [refactor](variant) reimplement function insert field (#51307)
35492968ea9 is described below
commit 35492968ea9d725e25951041e0283d10b3119596
Author: Sun Chenyang <[email protected]>
AuthorDate: Wed Aug 13 11:28:21 2025 +0800
[refactor](variant) reimplement function insert field (#51307)
reimplement function insert field
---
be/src/vec/columns/column_variant.cpp | 76 ++++-----
be/test/vec/columns/column_variant_test.cpp | 234 +++++++++++++++++++++++++++-
2 files changed, 267 insertions(+), 43 deletions(-)
diff --git a/be/src/vec/columns/column_variant.cpp b/be/src/vec/columns/column_variant.cpp
index d94f02de0bc..54068ac5c78 100644
--- a/be/src/vec/columns/column_variant.cpp
+++ b/be/src/vec/columns/column_variant.cpp
@@ -45,6 +45,7 @@
#include "common/status.h"
#include "exprs/json_functions.h"
#include "olap/olap_common.h"
+#include "runtime/define_primitive_type.h"
#include "runtime/jsonb_value.h"
#include "runtime/primitive_type.h"
#include "util/defer_op.h"
@@ -206,60 +207,51 @@ void ColumnVariant::Subcolumn::add_new_column_part(DataTypePtr type) {
}
void ColumnVariant::Subcolumn::insert(Field field, FieldInfo info) {
- auto base_type = info.scalar_type_id;
- if (base_type == PrimitiveType::INVALID_TYPE && info.num_dimensions == 0) {
+ if (field.is_null()) {
insert_default();
return;
}
- ++num_rows;
- auto column_dim = least_common_type.get_dimensions();
- auto value_dim = info.num_dimensions;
- if (least_common_type.get_base()->get_primitive_type() == INVALID_TYPE) {
- column_dim = value_dim;
- }
- if (base_type == PrimitiveType::INVALID_TYPE) {
- value_dim = column_dim;
- }
- bool type_changed = false;
- if (value_dim != column_dim || info.num_dimensions >= 2) {
- // Deduce to JSONB
- VLOG_DEBUG << fmt::format(
- "Dimension of types mismatched between inserted value and column, "
- "expected:{}, but meet:{} for type:{}",
- column_dim, value_dim, least_common_type.get()->get_name());
- base_type = MOST_COMMON_TYPE_ID;
- value_dim = 0;
- type_changed = true;
- }
- // Currently we support specify predefined schema for other types include decimal, datetime ...etc
- // so we should set specified info to create correct types, and those predefined types are static and
- // no conflict, so we can set them directly.
- auto base_data_type =
- create_array_of_type(base_type, value_dim, is_nullable, info.precision, info.scale);
+ auto from_type_id = info.scalar_type_id;
+ auto from_dim = info.num_dimensions;
+ auto least_common_type_id = least_common_type.get_base_type_id();
+ auto least_common_type_dim = least_common_type.get_dimensions();
+ bool type_changed = info.need_convert;
if (data.empty()) {
- add_new_column_part(base_data_type);
- } else if ((least_common_type.get_base_type_id() != base_type &&
- base_type != PrimitiveType::INVALID_TYPE) ||
- type_changed) {
- if (schema_util::is_conversion_required_between_integers(
- base_type, least_common_type.get_base_type_id())) {
- DataTypePtr least_type;
- get_least_supertype_jsonb(DataTypes {base_data_type, least_common_type.get()},
- &least_type);
- if (!least_type->equals(*base_data_type)) {
+ if (from_dim > 1) {
+ add_new_column_part(create_array_of_type(PrimitiveType::TYPE_JSONB, 0, is_nullable));
+ type_changed = true;
+ } else {
+ add_new_column_part(create_array_of_type(from_type_id, from_dim, is_nullable));
+ }
+ } else {
+ if (least_common_type_dim != from_dim) {
+ add_new_column_part(create_array_of_type(PrimitiveType::TYPE_JSONB, 0, is_nullable));
+ if (from_type_id != PrimitiveType::TYPE_JSONB || from_dim != 0) {
type_changed = true;
}
- add_new_column_part(least_type);
+ } else {
+ if (least_common_type_id != from_type_id &&
+ schema_util::is_conversion_required_between_integers(from_type_id,
+ least_common_type_id)) {
+ type_changed = true;
+ DataTypePtr new_least_common_base_type;
+ get_least_supertype_jsonb(PrimitiveTypeSet {from_type_id, least_common_type_id},
+ &new_least_common_base_type);
+ if (new_least_common_base_type->get_primitive_type() != least_common_type_id) {
+ add_new_column_part(
+ create_array_of_type(new_least_common_base_type->get_primitive_type(),
+ least_common_type_dim, is_nullable));
+ }
+ }
}
}
- // 1. type changed means encounter different type, we need to convert it to the least common type
- // 2. need_convert means the type is not the same as the least common type, we need to convert it
- if (type_changed || info.need_convert) {
+
+ if (type_changed) {
Field new_field;
convert_field_to_type(field, *least_common_type.get(), &new_field);
field = new_field;
}
-
+ ++num_rows;
data.back()->insert(field);
}
diff --git a/be/test/vec/columns/column_variant_test.cpp b/be/test/vec/columns/column_variant_test.cpp
index 9d686d574ea..8bb2ce4ce73 100644
--- a/be/test/vec/columns/column_variant_test.cpp
+++ b/be/test/vec/columns/column_variant_test.cpp
@@ -3567,4 +3567,236 @@ TEST_F(ColumnVariantTest, compatibility_deserialize_and_verify) {
std::cout << "Successfully verified deserialized data integrity!" << std::endl;
}
-}
\ No newline at end of file
+
+TEST_F(ColumnVariantTest, subcolumn_insert_range_from_test) {
+ ColumnVariant::Subcolumn subcolumn(0, true /* is_nullable */, false /* is_root */);
+ Field int_field = Field::create_field<TYPE_INT>(200000);
+ Field string_field = Field::create_field<TYPE_STRING>("hello");
+
+ Array array_int(2);
+ array_int[0] = int_field;
+ array_int[1] = int_field;
+ Field array_int_field = Field::create_field<TYPE_ARRAY>(array_int);
+ ColumnVariant::Subcolumn subcolumn2(0, true /* is_nullable */, false /* is_root */);
+ subcolumn2.insert(array_int_field);
+ subcolumn2.finalize();
+
+ Array array_tiny_int(2);
+ Field tiny_int = Field::create_field<TYPE_TINYINT>(100);
+ array_tiny_int[0] = tiny_int;
+ array_tiny_int[1] = tiny_int;
+ Field array_tiny_int_field = Field::create_field<TYPE_ARRAY>(array_tiny_int);
+ ColumnVariant::Subcolumn subcolumn1(0, true /* is_nullable */, false /* is_root */);
+ subcolumn1.insert(array_tiny_int_field);
+ subcolumn1.finalize();
+
+ Array array_string(2);
+ array_string[0] = string_field;
+ array_string[1] = string_field;
+ Field array_string_field = Field::create_field<TYPE_ARRAY>(array_string);
+ ColumnVariant::Subcolumn subcolumn3(0, true /* is_nullable */, false /* is_root */);
+ subcolumn3.insert(array_string_field);
+ subcolumn3.finalize();
+
+ subcolumn.insert_range_from(subcolumn1, 0, 1);
+ subcolumn.insert_range_from(subcolumn2, 0, 1);
+ subcolumn.insert_range_from(subcolumn3, 0, 1);
+ subcolumn.finalize();
+ EXPECT_EQ(subcolumn.data.size(), 1);
+ std::cout << subcolumn.get_least_common_type()->get_name() << std::endl;
+ EXPECT_EQ(subcolumn.get_least_common_type()->get_primitive_type(), PrimitiveType::TYPE_ARRAY);
+}
+
+TEST_F(ColumnVariantTest, subcolumn_insert_test) {
+ ColumnVariant::Subcolumn subcolumn(0, true /* is_nullable */, false /* is_root */);
+ Field int_field = Field::create_field<TYPE_INT>(200000);
+ Field string_field = Field::create_field<TYPE_STRING>("hello");
+ Array array_int(2);
+ array_int[0] = int_field;
+ array_int[1] = int_field;
+ Field array_int_field = Field::create_field<TYPE_ARRAY>(array_int);
+
+ Array array_int2(2);
+ Field tiny_int = Field::create_field<TYPE_TINYINT>(100);
+ array_int2[0] = tiny_int;
+ array_int2[1] = tiny_int;
+ Field array_int2_field = Field::create_field<TYPE_ARRAY>(array_int2);
+
+ Array array_string(2);
+ array_string[0] = string_field;
+ array_string[1] = string_field;
+ Field array_string_field = Field::create_field<TYPE_ARRAY>(array_string);
+
+ subcolumn.insert(array_int2_field);
+ subcolumn.insert(array_int_field);
+ subcolumn.insert(array_string_field);
+ subcolumn.finalize();
+ EXPECT_EQ(subcolumn.data.size(), 1);
+ EXPECT_EQ(subcolumn.get_least_common_type()->get_primitive_type(), PrimitiveType::TYPE_ARRAY);
+
+ subcolumn.insert(string_field);
+ subcolumn.insert(int_field);
+ EXPECT_EQ(subcolumn.data.size(), 2);
+ EXPECT_EQ(remove_nullable(subcolumn.get_least_common_type())->get_primitive_type(),
+ PrimitiveType::TYPE_JSONB);
+}
+
+TEST_F(ColumnVariantTest, subcolumn_insert_test_advanced) {
+ std::vector<Field> fields;
+
+ fields.emplace_back(Field::create_field<TYPE_NULL>(Null()));
+
+ fields.emplace_back(Field::create_field<TYPE_BOOLEAN>(true));
+
+ fields.emplace_back(Field::create_field<TYPE_BIGINT>(922337203685477588));
+
+ fields.emplace_back(Field::create_field<TYPE_LARGEINT>(922337203685477588));
+
+ fields.emplace_back(Field::create_field<TYPE_DOUBLE>(-3.14159265359));
+
+ fields.emplace_back(Field::create_field<TYPE_STRING>("hello world"));
+
+ Array arr_boolean(2);
+ arr_boolean[0] = Field::create_field<TYPE_BOOLEAN>(true);
+ arr_boolean[1] = Field::create_field<TYPE_BOOLEAN>(false);
+ Field arr_boolean_field = Field::create_field<TYPE_ARRAY>(arr_boolean);
+ fields.emplace_back(arr_boolean_field);
+
+ Array arr_int64(2);
+ arr_int64[0] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+ arr_int64[1] = Field::create_field<TYPE_BIGINT>(2232323223232323232);
+ Field arr_int64_field = Field::create_field<TYPE_ARRAY>(arr_int64);
+ fields.emplace_back(arr_int64_field);
+
+ Array arr_double(2);
+ arr_double[0] = Field::create_field<TYPE_DOUBLE>(1.1);
+ arr_double[1] = Field::create_field<TYPE_DOUBLE>(2.2);
+ Field arr_double_field = Field::create_field<TYPE_ARRAY>(arr_double);
+ fields.emplace_back(arr_double_field);
+
+ Array arr_string(2);
+ arr_string[0] = Field::create_field<TYPE_STRING>("one");
+ arr_string[1] = Field::create_field<TYPE_STRING>("two");
+ Field arr_string_field = Field::create_field<TYPE_ARRAY>(arr_string);
+ fields.emplace_back(arr_string_field);
+
+ Array arr_jsonb(5);
+ arr_jsonb[0] = Field::create_field<TYPE_STRING>("one");
+ arr_jsonb[1] = Field::create_field<TYPE_DOUBLE>(1.1);
+ arr_jsonb[2] = Field::create_field<TYPE_BOOLEAN>(true);
+ arr_jsonb[3] = Field::create_field<TYPE_LARGEINT>(1232323232323232323);
+ arr_jsonb[4] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+ Field arr_jsonb_field = Field::create_field<TYPE_ARRAY>(arr_jsonb);
+ fields.emplace_back(arr_jsonb_field);
+
+ std::random_device rd;
+ std::mt19937 g(rd());
+
+ for (int i = 0; i < (1 << fields.size()); i++) {
+ std::shuffle(fields.begin(), fields.end(), g);
+ auto subcolumn = ColumnVariant::Subcolumn(0, true, false);
+
+ for (const auto& field : fields) {
+ subcolumn.insert(field);
+ }
+
+ subcolumn.finalize();
+ EXPECT_EQ(subcolumn.data.size(), 1);
+ // std::cout << "least common type: " << subcolumn.get_least_common_type()->get_name() << std::endl;
+ EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), PrimitiveType::TYPE_JSONB);
+
+ for (const auto& field : fields) {
+ subcolumn.insert(field);
+ }
+ EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), PrimitiveType::TYPE_JSONB);
+
+ if (i % 1000 == 0) {
+ std::cout << "insert count " << i << std::endl;
+ }
+ }
+}
+
+TEST_F(ColumnVariantTest, subcolumn_insert_range_from_test_advanced) {
+ std::vector<Field> fields;
+
+ fields.emplace_back(Field::create_field<TYPE_NULL>(Null()));
+
+ fields.emplace_back(Field::create_field<TYPE_BOOLEAN>(true));
+
+ fields.emplace_back(Field::create_field<TYPE_BIGINT>(922337203685477588));
+
+ fields.emplace_back(Field::create_field<TYPE_LARGEINT>(922337203685477588));
+
+ fields.emplace_back(Field::create_field<TYPE_DOUBLE>(-3.14159265359));
+
+ fields.emplace_back(Field::create_field<TYPE_STRING>("hello world"));
+
+ Array arr_boolean(2);
+ arr_boolean[0] = Field::create_field<TYPE_BOOLEAN>(true);
+ arr_boolean[1] = Field::create_field<TYPE_BOOLEAN>(false);
+ Field arr_boolean_field = Field::create_field<TYPE_ARRAY>(arr_boolean);
+ fields.emplace_back(arr_boolean_field);
+
+ Array arr_int64(2);
+ arr_int64[0] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+ arr_int64[1] = Field::create_field<TYPE_BIGINT>(2232323223232323232);
+ Field arr_int64_field = Field::create_field<TYPE_ARRAY>(arr_int64);
+ fields.emplace_back(arr_int64_field);
+
+ Array arr_largeint(2);
+ arr_largeint[0] = Field::create_field<TYPE_LARGEINT>(1232323232323232323);
+ arr_largeint[1] = Field::create_field<TYPE_LARGEINT>(2232323223232323232);
+ Field arr_largeint_field = Field::create_field<TYPE_ARRAY>(arr_largeint);
+ fields.emplace_back(arr_largeint_field);
+
+ Array arr_double(2);
+ arr_double[0] = Field::create_field<TYPE_DOUBLE>(1.1);
+ arr_double[1] = Field::create_field<TYPE_DOUBLE>(2.2);
+ Field arr_double_field = Field::create_field<TYPE_ARRAY>(arr_double);
+ fields.emplace_back(arr_double_field);
+
+ Array arr_string(2);
+ arr_string[0] = Field::create_field<TYPE_STRING>("one");
+ arr_string[1] = Field::create_field<TYPE_STRING>("two");
+ Field arr_string_field = Field::create_field<TYPE_ARRAY>(arr_string);
+ fields.emplace_back(arr_string_field);
+
+ Array arr_jsonb(5);
+ arr_jsonb[0] = Field::create_field<TYPE_STRING>("one");
+ arr_jsonb[1] = Field::create_field<TYPE_DOUBLE>(1.1);
+ arr_jsonb[2] = Field::create_field<TYPE_BOOLEAN>(true);
+ arr_jsonb[3] = Field::create_field<TYPE_LARGEINT>(1232323232323232323);
+ arr_jsonb[4] = Field::create_field<TYPE_BIGINT>(1232323232323232323);
+ Field arr_jsonb_field = Field::create_field<TYPE_ARRAY>(arr_jsonb);
+ fields.emplace_back(arr_jsonb_field);
+
+ std::random_device rd;
+ std::mt19937 g(rd());
+
+ for (int i = 0; i < (1 << fields.size()); i++) {
+ std::shuffle(fields.begin(), fields.end(), g);
+ auto subcolumn = ColumnVariant::Subcolumn(0, true, false);
+
+ for (const auto& field : fields) {
+ auto subcolumn_tmp = ColumnVariant::Subcolumn(0, true, false);
+ subcolumn_tmp.insert(field);
+ subcolumn.insert_range_from(subcolumn_tmp, 0, 1);
+ }
+
+ subcolumn.finalize();
+ EXPECT_EQ(subcolumn.data.size(), 1);
+ // std::cout << "least common type: " << subcolumn.get_least_common_type()->get_name() << std::endl;
+ EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), PrimitiveType::TYPE_JSONB);
+
+ for (const auto& field : fields) {
+ subcolumn.insert(field);
+ }
+ EXPECT_EQ(subcolumn.least_common_type.get_base_type_id(), PrimitiveType::TYPE_JSONB);
+
+ if (i % 1000 == 0) {
+ std::cout << "insert count " << i << std::endl;
+ }
+ }
+}
+
+} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]