github-actions[bot] commented on code in PR #24554:
URL: https://github.com/apache/doris/pull/24554#discussion_r1384708293
##########
be/src/vec/columns/column_object.cpp:
##########
@@ -926,10 +1320,85 @@
num_rows = target_num_rows;
}
+void ColumnObject::create_root() {
+ auto type = is_nullable ? make_nullable(std::make_shared<MostCommonType>())
+ : std::make_shared<MostCommonType>();
+ add_sub_column({}, type->create_column(), type);
+}
+
+void ColumnObject::create_root(const DataTypePtr& type, MutableColumnPtr&&
column) {
+ if (num_rows == 0) {
+ num_rows = column->size();
+ }
+ add_sub_column({}, std::move(column), type);
+}
+
+bool ColumnObject::is_null_root() const {
Review Comment:
warning: method 'is_null_root' can be made static
[readability-convert-member-functions-to-static]
be/src/vec/columns/column_object.h:264:
```diff
- bool is_null_root() const;
+ static bool is_null_root() ;
```
```suggestion
bool ColumnObject::is_null_root() {
```
##########
be/src/vec/columns/column_object.cpp:
##########
@@ -845,28 +905,318 @@ bool ColumnObject::is_finalized() const {
[](const auto& entry) { return
entry->data.is_finalized(); });
}
-void ColumnObject::finalize() {
+static bool check_if_valid_column_name(const PathInData& path) {
+ static const std::regex
COLUMN_NAME_REGEX("^[_a-zA-Z@0-9][.a-zA-Z0-9_+-/><?@#$%^&*]{0,255}$");
+ return std::regex_match(path.get_path(), COLUMN_NAME_REGEX);
+}
+
+void ColumnObject::Subcolumn::wrapp_array_nullable() {
+ // Wrap array with nullable, treat empty array as null to elimate conflict
at present
+ auto& result_column = get_finalized_column_ptr();
+ if (result_column->is_column_array() && !result_column->is_nullable()) {
+ auto new_null_map = ColumnUInt8::create();
+ new_null_map->reserve(result_column->size());
+ auto& null_map_data = new_null_map->get_data();
+ auto array = static_cast<const ColumnArray*>(result_column.get());
+ for (size_t i = 0; i < array->size(); ++i) {
+ null_map_data.push_back(array->is_default_at(i));
+ }
+ result_column = ColumnNullable::create(std::move(result_column),
std::move(new_null_map));
+ data_types[0] = make_nullable(data_types[0]);
+ least_common_type = LeastCommonType {data_types[0]};
+ }
+}
+
+rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const
PathInData& path,
+ int idx = 0) {
+ if (idx >= path.get_parts().size()) {
+ return &json;
+ }
+
+ std::string_view current_key = path.get_parts()[idx].key;
+ if (!json.IsObject()) {
+ return nullptr;
+ }
+ rapidjson::Value name(current_key.data(), current_key.size());
+ auto it = json.FindMember(name);
+ if (it == json.MemberEnd()) {
+ return nullptr;
+ }
+ rapidjson::Value& current = it->value;
+ // if (idx == path.get_parts().size() - 1) {
+ // return &current;
+ // }
+ return find_leaf_node_by_path(current, path, idx + 1);
+}
+
+void find_and_set_leave_value(const IColumn* column, const PathInData& path,
+ const DataTypeSerDeSPtr& type, rapidjson::Value&
root,
+ rapidjson::Document::AllocatorType& allocator,
int row) {
+ const auto* nullable = assert_cast<const ColumnNullable*>(column);
+ if (nullable->is_null_at(row)) {
+ return;
+ }
+ // TODO could cache the result of leaf nodes with it's path info
+ rapidjson::Value* target = find_leaf_node_by_path(root, path);
+ if (UNLIKELY(!target)) {
+ rapidjson::StringBuffer buffer;
+ rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+ root.Accept(writer);
+ LOG(FATAL) << "could not find path " << path.get_path()
+ << ", root: " << std::string(buffer.GetString(),
buffer.GetSize());
+ }
+ type->write_one_cell_to_json(*column, *target, allocator, row);
+}
+
+// compact null values
+// {"a" : {"b" : "d" {"n" : null}, "e" : null}, "c" : 10 }
+// after compact -> {"a" : {"c"} : 10}
+void compact_null_values(rapidjson::Value& json,
rapidjson::Document::AllocatorType& allocator) {
+ if (!json.IsObject() || json.IsNull()) {
+ return;
+ }
+
+ rapidjson::Value::MemberIterator it = json.MemberBegin();
+ while (it != json.MemberEnd()) {
+ rapidjson::Value& value = it->value;
+ if (value.IsNull()) {
+ it = json.EraseMember(it);
+ continue;
+ }
+ compact_null_values(value, allocator);
+ if (value.IsObject() && value.ObjectEmpty()) {
+ it = json.EraseMember(it);
+ continue;
+ }
+ ++it;
+ }
+}
+
+// Construct rapidjson value from Subcolumns
+void get_json_by_column_tree(rapidjson::Value& root,
rapidjson::Document::AllocatorType& allocator,
+ const ColumnObject::Subcolumns::Node* node_root) {
+ if (node_root == nullptr || node_root->children.empty()) {
+ root.SetNull();
+ return;
+ }
+ root.SetObject();
+ for (auto it = node_root->children.begin(); it !=
node_root->children.end(); ++it) {
+ auto child = it->get_second();
+ rapidjson::Value value(rapidjson::kObjectType);
+ get_json_by_column_tree(value, allocator, child.get());
+ root.AddMember(rapidjson::StringRef(it->get_first().data,
it->get_first().size), value,
+ allocator);
+ }
+}
+
+bool ColumnObject::serialize_one_row_to_string(int row, std::string* output)
const {
+ if (!is_finalized()) {
+ const_cast<ColumnObject*>(this)->finalize();
+ }
+ rapidjson::StringBuffer buf;
+ if (is_scalar_variant()) {
+ auto type = get_root_type();
+ *output = type->to_string(*get_root(), row);
+ return true;
+ }
+ bool res = serialize_one_row_to_json_format(row, &buf, nullptr);
+ if (res) {
+ // TODO avoid copy
+ *output = std::string(buf.GetString(), buf.GetSize());
+ }
+ return res;
+}
+
+bool ColumnObject::serialize_one_row_to_string(int row, BufferWritable&
output) const {
+ if (!is_finalized()) {
+ const_cast<ColumnObject*>(this)->finalize();
+ }
+ if (is_scalar_variant()) {
+ auto type = get_root_type();
+ type->to_string(*get_root(), row, output);
+ return true;
+ }
+ rapidjson::StringBuffer buf;
+ bool res = serialize_one_row_to_json_format(row, &buf, nullptr);
+ if (res) {
+ output.write(buf.GetString(), buf.GetLength());
+ }
+ return res;
+}
+
+bool ColumnObject::serialize_one_row_to_json_format(int row,
rapidjson::StringBuffer* output,
+ bool* is_null) const {
+ CHECK(is_finalized());
+ if (subcolumns.empty()) {
+ if (is_null != nullptr) {
+ *is_null = true;
+ } else {
+ rapidjson::Value root(rapidjson::kNullType);
+ rapidjson::Writer<rapidjson::StringBuffer> writer(*output);
+ return root.Accept(writer);
+ }
+ return true;
+ }
+ CHECK(size() > row);
+ rapidjson::StringBuffer buffer;
+ rapidjson::Value root(rapidjson::kNullType);
+ if (doc_structure == nullptr) {
+ doc_structure = std::make_shared<rapidjson::Document>();
+ rapidjson::Document::AllocatorType& allocator =
doc_structure->GetAllocator();
+ get_json_by_column_tree(*doc_structure, allocator,
subcolumns.get_root());
+ }
+ if (!doc_structure->IsNull()) {
+ root.CopyFrom(*doc_structure, doc_structure->GetAllocator());
+ }
+#ifndef NDEBUG
+ VLOG_DEBUG << "dump structure " <<
JsonFunctions::print_json_value(*doc_structure);
+#endif
+ for (const auto& subcolumn : subcolumns) {
+ find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(),
subcolumn->path,
+
subcolumn->data.get_least_common_type_serde(), root,
+ doc_structure->GetAllocator(), row);
+ }
+ compact_null_values(root, doc_structure->GetAllocator());
+ if (root.IsNull() && is_null != nullptr) {
+ // Fast path
+ *is_null = true;
+ } else {
+ output->Clear();
+ rapidjson::Writer<rapidjson::StringBuffer> writer(*output);
+ return root.Accept(writer);
+ }
+ return true;
+}
+
+void ColumnObject::merge_sparse_to_root_column() {
Review Comment:
warning: method 'merge_sparse_to_root_column' can be made const
[readability-make-member-function-const]
be/src/vec/columns/column_object.h:252:
```diff
- void merge_sparse_to_root_column();
+ void merge_sparse_to_root_column() const;
```
```suggestion
void ColumnObject::merge_sparse_to_root_column() const {
```
##########
be/src/vec/columns/column_object.cpp:
##########
@@ -845,28 +905,318 @@
[](const auto& entry) { return
entry->data.is_finalized(); });
}
-void ColumnObject::finalize() {
+static bool check_if_valid_column_name(const PathInData& path) {
+ static const std::regex
COLUMN_NAME_REGEX("^[_a-zA-Z@0-9][.a-zA-Z0-9_+-/><?@#$%^&*]{0,255}$");
+ return std::regex_match(path.get_path(), COLUMN_NAME_REGEX);
+}
+
+void ColumnObject::Subcolumn::wrapp_array_nullable() {
+ // Wrap array with nullable, treat empty array as null to elimate conflict
at present
+ auto& result_column = get_finalized_column_ptr();
+ if (result_column->is_column_array() && !result_column->is_nullable()) {
+ auto new_null_map = ColumnUInt8::create();
+ new_null_map->reserve(result_column->size());
+ auto& null_map_data = new_null_map->get_data();
+ auto array = static_cast<const ColumnArray*>(result_column.get());
+ for (size_t i = 0; i < array->size(); ++i) {
+ null_map_data.push_back(array->is_default_at(i));
+ }
+ result_column = ColumnNullable::create(std::move(result_column),
std::move(new_null_map));
+ data_types[0] = make_nullable(data_types[0]);
+ least_common_type = LeastCommonType {data_types[0]};
+ }
+}
+
+rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const
PathInData& path,
+ int idx = 0) {
+ if (idx >= path.get_parts().size()) {
+ return &json;
+ }
+
+ std::string_view current_key = path.get_parts()[idx].key;
+ if (!json.IsObject()) {
+ return nullptr;
+ }
+ rapidjson::Value name(current_key.data(), current_key.size());
+ auto it = json.FindMember(name);
+ if (it == json.MemberEnd()) {
+ return nullptr;
+ }
+ rapidjson::Value& current = it->value;
+ // if (idx == path.get_parts().size() - 1) {
+ // return &current;
+ // }
+ return find_leaf_node_by_path(current, path, idx + 1);
+}
+
+void find_and_set_leave_value(const IColumn* column, const PathInData& path,
+ const DataTypeSerDeSPtr& type, rapidjson::Value&
root,
+ rapidjson::Document::AllocatorType& allocator,
int row) {
+ const auto* nullable = assert_cast<const ColumnNullable*>(column);
+ if (nullable->is_null_at(row)) {
+ return;
+ }
+ // TODO could cache the result of leaf nodes with it's path info
+ rapidjson::Value* target = find_leaf_node_by_path(root, path);
+ if (UNLIKELY(!target)) {
+ rapidjson::StringBuffer buffer;
+ rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+ root.Accept(writer);
+ LOG(FATAL) << "could not find path " << path.get_path()
+ << ", root: " << std::string(buffer.GetString(),
buffer.GetSize());
+ }
+ type->write_one_cell_to_json(*column, *target, allocator, row);
+}
+
+// compact null values
+// {"a" : {"b" : "d" {"n" : null}, "e" : null}, "c" : 10 }
+// after compact -> {"a" : {"c"} : 10}
+void compact_null_values(rapidjson::Value& json,
rapidjson::Document::AllocatorType& allocator) {
+ if (!json.IsObject() || json.IsNull()) {
+ return;
+ }
+
+ rapidjson::Value::MemberIterator it = json.MemberBegin();
+ while (it != json.MemberEnd()) {
+ rapidjson::Value& value = it->value;
+ if (value.IsNull()) {
+ it = json.EraseMember(it);
+ continue;
+ }
+ compact_null_values(value, allocator);
+ if (value.IsObject() && value.ObjectEmpty()) {
+ it = json.EraseMember(it);
+ continue;
+ }
+ ++it;
+ }
+}
+
+// Construct rapidjson value from Subcolumns
+void get_json_by_column_tree(rapidjson::Value& root,
rapidjson::Document::AllocatorType& allocator,
+ const ColumnObject::Subcolumns::Node* node_root) {
+ if (node_root == nullptr || node_root->children.empty()) {
+ root.SetNull();
+ return;
+ }
+ root.SetObject();
+ for (auto it = node_root->children.begin(); it !=
node_root->children.end(); ++it) {
+ auto child = it->get_second();
+ rapidjson::Value value(rapidjson::kObjectType);
+ get_json_by_column_tree(value, allocator, child.get());
+ root.AddMember(rapidjson::StringRef(it->get_first().data,
it->get_first().size), value,
+ allocator);
+ }
+}
+
+bool ColumnObject::serialize_one_row_to_string(int row, std::string* output)
const {
+ if (!is_finalized()) {
+ const_cast<ColumnObject*>(this)->finalize();
+ }
+ rapidjson::StringBuffer buf;
+ if (is_scalar_variant()) {
+ auto type = get_root_type();
+ *output = type->to_string(*get_root(), row);
+ return true;
+ }
+ bool res = serialize_one_row_to_json_format(row, &buf, nullptr);
+ if (res) {
+ // TODO avoid copy
+ *output = std::string(buf.GetString(), buf.GetSize());
+ }
+ return res;
+}
+
+bool ColumnObject::serialize_one_row_to_string(int row, BufferWritable&
output) const {
+ if (!is_finalized()) {
+ const_cast<ColumnObject*>(this)->finalize();
+ }
+ if (is_scalar_variant()) {
+ auto type = get_root_type();
+ type->to_string(*get_root(), row, output);
+ return true;
+ }
+ rapidjson::StringBuffer buf;
+ bool res = serialize_one_row_to_json_format(row, &buf, nullptr);
+ if (res) {
+ output.write(buf.GetString(), buf.GetLength());
+ }
+ return res;
+}
+
+bool ColumnObject::serialize_one_row_to_json_format(int row,
rapidjson::StringBuffer* output,
+ bool* is_null) const {
+ CHECK(is_finalized());
+ if (subcolumns.empty()) {
+ if (is_null != nullptr) {
+ *is_null = true;
+ } else {
+ rapidjson::Value root(rapidjson::kNullType);
+ rapidjson::Writer<rapidjson::StringBuffer> writer(*output);
+ return root.Accept(writer);
+ }
+ return true;
+ }
+ CHECK(size() > row);
+ rapidjson::StringBuffer buffer;
+ rapidjson::Value root(rapidjson::kNullType);
+ if (doc_structure == nullptr) {
+ doc_structure = std::make_shared<rapidjson::Document>();
+ rapidjson::Document::AllocatorType& allocator =
doc_structure->GetAllocator();
+ get_json_by_column_tree(*doc_structure, allocator,
subcolumns.get_root());
+ }
+ if (!doc_structure->IsNull()) {
+ root.CopyFrom(*doc_structure, doc_structure->GetAllocator());
+ }
+#ifndef NDEBUG
+ VLOG_DEBUG << "dump structure " <<
JsonFunctions::print_json_value(*doc_structure);
+#endif
+ for (const auto& subcolumn : subcolumns) {
+ find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(),
subcolumn->path,
+
subcolumn->data.get_least_common_type_serde(), root,
+ doc_structure->GetAllocator(), row);
+ }
+ compact_null_values(root, doc_structure->GetAllocator());
+ if (root.IsNull() && is_null != nullptr) {
+ // Fast path
+ *is_null = true;
+ } else {
+ output->Clear();
+ rapidjson::Writer<rapidjson::StringBuffer> writer(*output);
+ return root.Accept(writer);
+ }
+ return true;
+}
+
+void ColumnObject::merge_sparse_to_root_column() {
+ CHECK(is_finalized());
+ if (sparse_columns.empty()) {
+ return;
+ }
+ ColumnPtr src =
subcolumns.get_mutable_root()->data.get_finalized_column_ptr();
+ MutableColumnPtr mresult = src->clone_empty();
+ const ColumnNullable* src_null = assert_cast<const
ColumnNullable*>(src.get());
+ const ColumnString* src_column_ptr =
+ assert_cast<const ColumnString*>(&src_null->get_nested_column());
+ rapidjson::StringBuffer buffer;
+ doc_structure = std::make_shared<rapidjson::Document>();
+ rapidjson::Document::AllocatorType& allocator =
doc_structure->GetAllocator();
+ get_json_by_column_tree(*doc_structure, allocator,
sparse_columns.get_root());
+
+#ifndef NDEBUG
+ VLOG_DEBUG << "dump structure " <<
JsonFunctions::print_json_value(*doc_structure);
+#endif
+
+ ColumnNullable* result_column_nullable =
+ assert_cast<ColumnNullable*>(mresult->assume_mutable().get());
+ ColumnString* result_column_ptr =
+
assert_cast<ColumnString*>(&result_column_nullable->get_nested_column());
+ result_column_nullable->reserve(num_rows);
+ // parse each row to jsonb
+ for (size_t i = 0; i < num_rows; ++i) {
+ // root is not null, store original value, eg. the root is scalar type
like '[1]'
+ if (!src_null->empty() && !src_null->is_null_at(i)) {
+ result_column_ptr->insert_data(src_column_ptr->get_data_at(i).data,
+
src_column_ptr->get_data_at(i).size);
+ result_column_nullable->get_null_map_data().push_back(0);
+ continue;
+ }
+
+ // parse and encode sparse columns
+ buffer.Clear();
+ rapidjson::Value root(rapidjson::kNullType);
+ if (!doc_structure->IsNull()) {
+ root.CopyFrom(*doc_structure, doc_structure->GetAllocator());
+ }
+ size_t null_count = 0;
+ for (const auto& subcolumn : sparse_columns) {
+ auto& column = subcolumn->data.get_finalized_column_ptr();
+ if (assert_cast<const ColumnNullable&>(*column).is_null_at(i)) {
+ ++null_count;
+ continue;
+ }
+ find_and_set_leave_value(column, subcolumn->path,
+
subcolumn->data.get_least_common_type_serde(), root,
+ doc_structure->GetAllocator(), i);
+ }
+
+ // all null values, store null to sparse root
+ if (null_count == sparse_columns.size()) {
+ result_column_ptr->insert_default();
+ result_column_nullable->get_null_map_data().push_back(1);
+ continue;
+ }
+
+ // encode sparse columns into jsonb format
+ compact_null_values(root, doc_structure->GetAllocator());
+ // parse as jsonb value and put back to rootnode
+ // TODO, we could convert to jsonb directly from rapidjson::Value for
better performance, instead of parsing
+ JsonbParser parser;
+ rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+ root.Accept(writer);
+ bool res = parser.parse(buffer.GetString(), buffer.GetSize());
+ CHECK(res) << "buffer:" << std::string(buffer.GetString(),
buffer.GetSize())
+ << ", row_num:" << i;
+
result_column_ptr->insert_data(parser.getWriter().getOutput()->getBuffer(),
+
parser.getWriter().getOutput()->getSize());
+ result_column_nullable->get_null_map_data().push_back(0);
+ }
+
+ // assign merged column
+ subcolumns.get_mutable_root()->data.get_finalized_column_ptr() =
mresult->get_ptr();
+}
+
+void ColumnObject::finalize(bool ignore_sparse) {
Review Comment:
warning: method 'finalize' can be made const
[readability-make-member-function-const]
be/src/vec/columns/column_object.h:326:
```diff
- void finalize(bool ignore_sparse);
+ void finalize(bool ignore_sparse) const;
```
```suggestion
void ColumnObject::finalize(bool ignore_sparse) const {
```
##########
be/src/vec/columns/column_object.cpp:
##########
@@ -845,28 +905,318 @@
[](const auto& entry) { return
entry->data.is_finalized(); });
}
-void ColumnObject::finalize() {
+static bool check_if_valid_column_name(const PathInData& path) {
+ static const std::regex
COLUMN_NAME_REGEX("^[_a-zA-Z@0-9][.a-zA-Z0-9_+-/><?@#$%^&*]{0,255}$");
+ return std::regex_match(path.get_path(), COLUMN_NAME_REGEX);
+}
+
+void ColumnObject::Subcolumn::wrapp_array_nullable() {
+ // Wrap array with nullable, treat empty array as null to elimate conflict
at present
+ auto& result_column = get_finalized_column_ptr();
+ if (result_column->is_column_array() && !result_column->is_nullable()) {
+ auto new_null_map = ColumnUInt8::create();
+ new_null_map->reserve(result_column->size());
+ auto& null_map_data = new_null_map->get_data();
+ auto array = static_cast<const ColumnArray*>(result_column.get());
+ for (size_t i = 0; i < array->size(); ++i) {
+ null_map_data.push_back(array->is_default_at(i));
+ }
+ result_column = ColumnNullable::create(std::move(result_column),
std::move(new_null_map));
+ data_types[0] = make_nullable(data_types[0]);
+ least_common_type = LeastCommonType {data_types[0]};
+ }
+}
+
+rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const
PathInData& path,
+ int idx = 0) {
+ if (idx >= path.get_parts().size()) {
+ return &json;
+ }
+
+ std::string_view current_key = path.get_parts()[idx].key;
+ if (!json.IsObject()) {
+ return nullptr;
+ }
+ rapidjson::Value name(current_key.data(), current_key.size());
+ auto it = json.FindMember(name);
+ if (it == json.MemberEnd()) {
+ return nullptr;
+ }
+ rapidjson::Value& current = it->value;
+ // if (idx == path.get_parts().size() - 1) {
+ // return &current;
+ // }
+ return find_leaf_node_by_path(current, path, idx + 1);
+}
+
+void find_and_set_leave_value(const IColumn* column, const PathInData& path,
+ const DataTypeSerDeSPtr& type, rapidjson::Value&
root,
+ rapidjson::Document::AllocatorType& allocator,
int row) {
+ const auto* nullable = assert_cast<const ColumnNullable*>(column);
+ if (nullable->is_null_at(row)) {
+ return;
+ }
+ // TODO could cache the result of leaf nodes with it's path info
+ rapidjson::Value* target = find_leaf_node_by_path(root, path);
+ if (UNLIKELY(!target)) {
+ rapidjson::StringBuffer buffer;
+ rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+ root.Accept(writer);
+ LOG(FATAL) << "could not find path " << path.get_path()
+ << ", root: " << std::string(buffer.GetString(),
buffer.GetSize());
+ }
+ type->write_one_cell_to_json(*column, *target, allocator, row);
+}
+
+// compact null values
+// {"a" : {"b" : "d" {"n" : null}, "e" : null}, "c" : 10 }
+// after compact -> {"a" : {"c"} : 10}
+void compact_null_values(rapidjson::Value& json,
rapidjson::Document::AllocatorType& allocator) {
+ if (!json.IsObject() || json.IsNull()) {
+ return;
+ }
+
+ rapidjson::Value::MemberIterator it = json.MemberBegin();
+ while (it != json.MemberEnd()) {
+ rapidjson::Value& value = it->value;
+ if (value.IsNull()) {
+ it = json.EraseMember(it);
+ continue;
+ }
+ compact_null_values(value, allocator);
+ if (value.IsObject() && value.ObjectEmpty()) {
+ it = json.EraseMember(it);
+ continue;
+ }
+ ++it;
+ }
+}
+
+// Construct rapidjson value from Subcolumns
+void get_json_by_column_tree(rapidjson::Value& root,
rapidjson::Document::AllocatorType& allocator,
+ const ColumnObject::Subcolumns::Node* node_root) {
+ if (node_root == nullptr || node_root->children.empty()) {
+ root.SetNull();
+ return;
+ }
+ root.SetObject();
+ for (auto it = node_root->children.begin(); it !=
node_root->children.end(); ++it) {
+ auto child = it->get_second();
+ rapidjson::Value value(rapidjson::kObjectType);
+ get_json_by_column_tree(value, allocator, child.get());
+ root.AddMember(rapidjson::StringRef(it->get_first().data,
it->get_first().size), value,
+ allocator);
+ }
+}
+
+bool ColumnObject::serialize_one_row_to_string(int row, std::string* output)
const {
+ if (!is_finalized()) {
+ const_cast<ColumnObject*>(this)->finalize();
+ }
+ rapidjson::StringBuffer buf;
+ if (is_scalar_variant()) {
+ auto type = get_root_type();
+ *output = type->to_string(*get_root(), row);
+ return true;
+ }
+ bool res = serialize_one_row_to_json_format(row, &buf, nullptr);
+ if (res) {
+ // TODO avoid copy
+ *output = std::string(buf.GetString(), buf.GetSize());
+ }
+ return res;
+}
+
+bool ColumnObject::serialize_one_row_to_string(int row, BufferWritable&
output) const {
+ if (!is_finalized()) {
+ const_cast<ColumnObject*>(this)->finalize();
+ }
+ if (is_scalar_variant()) {
+ auto type = get_root_type();
+ type->to_string(*get_root(), row, output);
+ return true;
+ }
+ rapidjson::StringBuffer buf;
+ bool res = serialize_one_row_to_json_format(row, &buf, nullptr);
+ if (res) {
+ output.write(buf.GetString(), buf.GetLength());
+ }
+ return res;
+}
+
+bool ColumnObject::serialize_one_row_to_json_format(int row,
rapidjson::StringBuffer* output,
+ bool* is_null) const {
+ CHECK(is_finalized());
+ if (subcolumns.empty()) {
+ if (is_null != nullptr) {
+ *is_null = true;
+ } else {
+ rapidjson::Value root(rapidjson::kNullType);
+ rapidjson::Writer<rapidjson::StringBuffer> writer(*output);
+ return root.Accept(writer);
+ }
+ return true;
+ }
+ CHECK(size() > row);
+ rapidjson::StringBuffer buffer;
+ rapidjson::Value root(rapidjson::kNullType);
+ if (doc_structure == nullptr) {
+ doc_structure = std::make_shared<rapidjson::Document>();
+ rapidjson::Document::AllocatorType& allocator =
doc_structure->GetAllocator();
+ get_json_by_column_tree(*doc_structure, allocator,
subcolumns.get_root());
+ }
+ if (!doc_structure->IsNull()) {
+ root.CopyFrom(*doc_structure, doc_structure->GetAllocator());
+ }
+#ifndef NDEBUG
+ VLOG_DEBUG << "dump structure " <<
JsonFunctions::print_json_value(*doc_structure);
+#endif
+ for (const auto& subcolumn : subcolumns) {
+ find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(),
subcolumn->path,
+
subcolumn->data.get_least_common_type_serde(), root,
+ doc_structure->GetAllocator(), row);
+ }
+ compact_null_values(root, doc_structure->GetAllocator());
+ if (root.IsNull() && is_null != nullptr) {
+ // Fast path
+ *is_null = true;
+ } else {
+ output->Clear();
+ rapidjson::Writer<rapidjson::StringBuffer> writer(*output);
+ return root.Accept(writer);
+ }
+ return true;
+}
+
+void ColumnObject::merge_sparse_to_root_column() {
+ CHECK(is_finalized());
+ if (sparse_columns.empty()) {
+ return;
+ }
+ ColumnPtr src =
subcolumns.get_mutable_root()->data.get_finalized_column_ptr();
+ MutableColumnPtr mresult = src->clone_empty();
+ const ColumnNullable* src_null = assert_cast<const
ColumnNullable*>(src.get());
+ const ColumnString* src_column_ptr =
+ assert_cast<const ColumnString*>(&src_null->get_nested_column());
+ rapidjson::StringBuffer buffer;
+ doc_structure = std::make_shared<rapidjson::Document>();
+ rapidjson::Document::AllocatorType& allocator =
doc_structure->GetAllocator();
+ get_json_by_column_tree(*doc_structure, allocator,
sparse_columns.get_root());
+
+#ifndef NDEBUG
+ VLOG_DEBUG << "dump structure " <<
JsonFunctions::print_json_value(*doc_structure);
+#endif
+
+ ColumnNullable* result_column_nullable =
+ assert_cast<ColumnNullable*>(mresult->assume_mutable().get());
+ ColumnString* result_column_ptr =
+
assert_cast<ColumnString*>(&result_column_nullable->get_nested_column());
+ result_column_nullable->reserve(num_rows);
+ // parse each row to jsonb
+ for (size_t i = 0; i < num_rows; ++i) {
+ // root is not null, store original value, eg. the root is scalar type
like '[1]'
+ if (!src_null->empty() && !src_null->is_null_at(i)) {
+ result_column_ptr->insert_data(src_column_ptr->get_data_at(i).data,
+
src_column_ptr->get_data_at(i).size);
+ result_column_nullable->get_null_map_data().push_back(0);
+ continue;
+ }
+
+ // parse and encode sparse columns
+ buffer.Clear();
+ rapidjson::Value root(rapidjson::kNullType);
+ if (!doc_structure->IsNull()) {
+ root.CopyFrom(*doc_structure, doc_structure->GetAllocator());
+ }
+ size_t null_count = 0;
+ for (const auto& subcolumn : sparse_columns) {
+ auto& column = subcolumn->data.get_finalized_column_ptr();
+ if (assert_cast<const ColumnNullable&>(*column).is_null_at(i)) {
+ ++null_count;
+ continue;
+ }
+ find_and_set_leave_value(column, subcolumn->path,
+
subcolumn->data.get_least_common_type_serde(), root,
+ doc_structure->GetAllocator(), i);
+ }
+
+ // all null values, store null to sparse root
+ if (null_count == sparse_columns.size()) {
+ result_column_ptr->insert_default();
+ result_column_nullable->get_null_map_data().push_back(1);
+ continue;
+ }
+
+ // encode sparse columns into jsonb format
+ compact_null_values(root, doc_structure->GetAllocator());
+ // parse as jsonb value and put back to rootnode
+ // TODO, we could convert to jsonb directly from rapidjson::Value for
better performance, instead of parsing
+ JsonbParser parser;
+ rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+ root.Accept(writer);
+ bool res = parser.parse(buffer.GetString(), buffer.GetSize());
+ CHECK(res) << "buffer:" << std::string(buffer.GetString(),
buffer.GetSize())
+ << ", row_num:" << i;
+
result_column_ptr->insert_data(parser.getWriter().getOutput()->getBuffer(),
+
parser.getWriter().getOutput()->getSize());
+ result_column_nullable->get_null_map_data().push_back(0);
+ }
+
+ // assign merged column
+ subcolumns.get_mutable_root()->data.get_finalized_column_ptr() =
mresult->get_ptr();
+}
+
+void ColumnObject::finalize(bool ignore_sparse) {
Subcolumns new_subcolumns;
+ // finalize root first
+ if (!ignore_sparse || !is_null_root()) {
+ new_subcolumns.create_root(subcolumns.get_root()->data);
+ new_subcolumns.get_mutable_root()->data.finalize();
+ }
for (auto&& entry : subcolumns) {
const auto& least_common_type = entry->data.get_least_common_type();
- /// Do not add subcolumns, which consists only from NULLs.
- if (is_nothing(getBaseTypeOfArray(least_common_type))) {
+ /// Do not add subcolumns, which consists only from NULLs
+ if (is_nothing(get_base_type_of_array(least_common_type))) {
continue;
}
- if (!entry->data.data.empty()) {
- entry->data.finalize();
- new_subcolumns.add(entry->path, entry->data);
+ entry->data.finalize();
+ entry->data.wrapp_array_nullable();
+
+ if (entry->data.is_root) {
+ continue;
+ }
+
+ // Check and spilit sparse subcolumns
+ if (!ignore_sparse && (entry->data.check_if_sparse_column(num_rows) ||
+ !check_if_valid_column_name(entry->path))) {
+ // TODO seperate ambiguous path
+ sparse_columns.add(entry->path, entry->data);
+ continue;
}
+
+ new_subcolumns.add(entry->path, entry->data);
}
- /// If all subcolumns were skipped add a dummy subcolumn,
- /// because Tuple type must have at least one element.
- // if (new_subcolumns.empty()) {
- // new_subcolumns.add(
- // PathInData {COLUMN_NAME_DUMMY},
- // Subcolumn
{static_cast<MutableColumnPtr&&>(ColumnUInt8::create(old_size, 0)),
- // is_nullable});
- // }
std::swap(subcolumns, new_subcolumns);
+ doc_structure = nullptr;
+}
+
+void ColumnObject::finalize() {
+ finalize(true);
+}
+
+void ColumnObject::ensure_root_node_type(const DataTypePtr&
expected_root_type) {
Review Comment:
warning: method 'ensure_root_node_type' can be made static
[readability-convert-member-functions-to-static]
be/src/vec/columns/column_object.h:255:
```diff
- void ensure_root_node_type(const DataTypePtr& type);
+ static void ensure_root_node_type(const DataTypePtr& type);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]