This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f563b2eb382 [Improve](variant) Keep first duplicate Variant JSON path
(#63082)
f563b2eb382 is described below
commit f563b2eb382a6a42c359d977707f7731d8e0d0a0
Author: lihangyu <[email protected]>
AuthorDate: Tue May 12 10:50:38 2026 +0800
[Improve](variant) Keep first duplicate Variant JSON path (#63082)
Add BE config variant_enable_duplicate_json_path_check (default: false). When
enabled, Variant parsing keeps the first value seen for a duplicated JSON path
and ignores later ones (e.g. {"a.b":1,"a":{"b":2}} keeps 1 for path a.b).
---
be/src/common/config.cpp | 1 +
be/src/common/config.h | 2 +
.../data_type_serde/data_type_variant_serde.cpp | 6 +-
be/src/exec/common/variant_util.cpp | 27 ++-
be/src/util/json/json_parser.cpp | 54 ++++--
be/src/util/json/json_parser.h | 5 +
be/test/storage/segment/variant_util_test.cpp | 206 +++++++++++++++++++++
.../data/variant_p0/duplicate_json_path.json | 7 +
.../suites/variant_p0/duplicate_json_path.groovy | 106 +++++++++++
9 files changed, 395 insertions(+), 19 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 8b9158d8f67..b1e7949d8c1 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1164,6 +1164,7 @@ DEFINE_mBool(variant_use_cloud_schema_dict_cache, "true");
DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048");
DEFINE_mInt32(variant_max_json_key_length, "255");
DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false");
+DEFINE_mBool(variant_enable_duplicate_json_path_check, "false");
DEFINE_mBool(enable_vertical_compact_variant_subcolumns, "true");
DEFINE_mBool(enable_variant_doc_sparse_write_subcolumns, "true");
// Maximum depth of nested arrays to track with NestedGroup
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 930e9f67fb6..2814f0539ff 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1425,6 +1425,8 @@
DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column);
DECLARE_mInt32(variant_max_json_key_length);
// Treat invalid json format str as string, instead of throwing exception if
false
DECLARE_mBool(variant_throw_exeception_on_invalid_json);
+// Enable duplicate path check when parsing json into variant subcolumns/jsonb.
+DECLARE_mBool(variant_enable_duplicate_json_path_check);
// Enable vertical compact subcolumns of variant column
DECLARE_mBool(enable_vertical_compact_variant_subcolumns);
DECLARE_mBool(enable_variant_doc_sparse_write_subcolumns);
diff --git a/be/src/core/data_type_serde/data_type_variant_serde.cpp
b/be/src/core/data_type_serde/data_type_variant_serde.cpp
index 31538f9fcc5..93ec3edfdfb 100644
--- a/be/src/core/data_type_serde/data_type_variant_serde.cpp
+++ b/be/src/core/data_type_serde/data_type_variant_serde.cpp
@@ -21,6 +21,7 @@
#include <string>
#include "common/cast_set.h"
+#include "common/config.h"
#include "common/exception.h"
#include "common/status.h"
#include "core/assert_cast.h"
@@ -107,10 +108,11 @@ Status
DataTypeVariantSerDe::serialize_one_cell_to_json(const IColumn& column, i
Status DataTypeVariantSerDe::deserialize_one_cell_from_json(IColumn& column,
Slice& slice,
const
FormatOptions& options) const {
- ParseConfig config;
+ ParseConfig parse_config;
+ parse_config.check_duplicate_json_path =
config::variant_enable_duplicate_json_path_check;
StringRef json_ref(slice.data, slice.size);
RETURN_IF_CATCH_EXCEPTION(
- variant_util::parse_json_to_variant(column, json_ref, nullptr,
config));
+ variant_util::parse_json_to_variant(column, json_ref, nullptr,
parse_config));
return Status::OK();
}
diff --git a/be/src/exec/common/variant_util.cpp
b/be/src/exec/common/variant_util.cpp
index af10e0d4ff2..2f2ce53d8ea 100644
--- a/be/src/exec/common/variant_util.cpp
+++ b/be/src/exec/common/variant_util.cpp
@@ -1928,16 +1928,26 @@ void parse_json_to_variant_impl(IColumn& column, const
char* src, size_t length,
}
};
+ auto is_plain_path = [](const PathInData& path) {
+ for (const auto& part : path.get_parts()) {
+ if (part.is_nested || part.anonymous_array_level != 0) {
+ return false;
+ }
+ }
+ return true;
+ };
+
auto get_or_create_subcolumn = [&](const PathInData& path, size_t
index_hint,
const FieldInfo& field_info) ->
ColumnVariant::Subcolumn* {
- if (column_variant.get_subcolumn(path, index_hint) == nullptr) {
+ auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
+ if (subcolumn == nullptr) {
if (path.has_nested_part()) {
column_variant.add_nested_subcolumn(path, field_info,
old_num_rows);
} else {
column_variant.add_sub_column(path, old_num_rows);
}
+ subcolumn = column_variant.get_subcolumn(path, index_hint);
}
- auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
if (!subcolumn) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to
find sub column {}",
path.get_path());
@@ -1945,6 +1955,13 @@ void parse_json_to_variant_impl(IColumn& column, const
char* src, size_t length,
return subcolumn;
};
+ auto normalize_plain_path = [&](const PathInData& path) {
+ if (!config.check_duplicate_json_path || path.empty() ||
!is_plain_path(path)) {
+ return path;
+ }
+ return PathInData(path.get_path());
+ };
+
auto insert_into_subcolumn = [&](size_t i,
bool check_size_mismatch) ->
ColumnVariant::Subcolumn* {
FieldInfo field_info;
@@ -1952,12 +1969,13 @@ void parse_json_to_variant_impl(IColumn& column, const
char* src, size_t length,
if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
return nullptr;
}
- auto* subcolumn = get_or_create_subcolumn(paths[i], i, field_info);
+ auto path = normalize_plain_path(paths[i]);
+ auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
flush_defaults(subcolumn);
if (check_size_mismatch && subcolumn->size() != old_num_rows) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"subcolumn {} size missmatched, may
contains duplicated entry",
- paths[i].get_path());
+ path.get_path());
}
subcolumn->insert(std::move(values[i]), std::move(field_info));
return subcolumn;
@@ -2221,6 +2239,7 @@ Status parse_and_materialize_variant_columns(Block&
block, const TabletSchema& t
// Deprecated legacy flatten-nested switch. Distinct from
variant_enable_nested_group.
configs[i].deprecated_enable_flatten_nested =
tablet_schema.deprecated_variant_flatten_nested();
+ configs[i].check_duplicate_json_path =
config::variant_enable_duplicate_json_path_check;
const auto& column = tablet_schema.column(variant_schema_pos[i]);
if (!column.is_variant_type()) {
return Status::InternalError("column is not variant type, column
name: {}",
diff --git a/be/src/util/json/json_parser.cpp b/be/src/util/json/json_parser.cpp
index 2a7c401b9d8..3df723c3849 100644
--- a/be/src/util/json/json_parser.cpp
+++ b/be/src/util/json/json_parser.cpp
@@ -26,6 +26,7 @@
#include <algorithm>
#include <cassert>
#include <string_view>
+#include <vector>
#include "common/cast_set.h"
// IWYU pragma: keep
@@ -46,6 +47,7 @@ std::optional<ParseResult>
JSONDataParser<ParserImpl>::parse(const char* begin,
// deprecated_enable_flatten_nested controls nested path traversal
// NestedGroup expansion is now handled at storage layer
context.deprecated_enable_flatten_nested =
config.deprecated_enable_flatten_nested;
+ context.check_duplicate_json_path = config.check_duplicate_json_path;
context.is_top_array = document.isArray();
traverse(document, context);
ParseResult result;
@@ -72,25 +74,39 @@ void JSONDataParser<ParserImpl>::traverse(const Element&
element, ParseContext&
// Parse nested arrays to JsonbField
JsonbWriter writer;
traverseArrayAsJsonb(element.getArray(), writer);
- ctx.paths.push_back(ctx.builder.get_parts());
- ctx.values.push_back(Field::create_field<TYPE_JSONB>(
- JsonbField(writer.getOutput()->getBuffer(),
writer.getOutput()->getSize())));
+ appendValueIfNotDuplicate(
+ ctx, ctx.builder.get_parts(),
+
Field::create_field<TYPE_JSONB>(JsonbField(writer.getOutput()->getBuffer(),
+
writer.getOutput()->getSize())));
} else {
traverseArray(element.getArray(), ctx);
}
// we should set has_nested_in_flatten to false when traverse array
finished for next array otherwise it will be true for next array
ctx.has_nested_in_flatten = false;
} else {
- ctx.paths.push_back(ctx.builder.get_parts());
- ctx.values.push_back(getValueAsField(element));
+ appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
getValueAsField(element));
}
}
+
+template <typename ParserImpl>
+void JSONDataParser<ParserImpl>::appendValueIfNotDuplicate(ParseContext& ctx,
+ const
PathInData::Parts& path,
+ Field&& value) {
+ if (ctx.check_duplicate_json_path) {
+ PathInData path_in_data(path);
+ if (!ctx.visited_path_names.emplace(path_in_data.get_path()).second) {
+ return;
+ }
+ }
+ ctx.paths.push_back(path);
+ ctx.values.push_back(std::move(value));
+}
+
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object,
ParseContext& ctx) {
ctx.paths.reserve(ctx.paths.size() + object.size());
ctx.values.reserve(ctx.values.size() + object.size());
- for (auto it = object.begin(); it != object.end(); ++it) {
- const auto& [key, value] = *it;
+ auto check_key_length = [](const auto& key) {
const size_t max_key_length =
cast_set<size_t>(config::variant_max_json_key_length);
if (key.size() > max_key_length) {
throw doris::Exception(
@@ -98,9 +114,17 @@ void JSONDataParser<ParserImpl>::traverseObject(const
JSONObject& object, ParseC
fmt::format("Key length exceeds maximum allowed size of {}
bytes.",
max_key_length));
}
+ };
+ auto traverse_object_member = [&](const auto& key, const auto& value) {
+ check_key_length(key);
ctx.builder.append(key, false);
traverse(value, ctx);
ctx.builder.pop_back();
+ };
+
+ for (auto it = object.begin(); it != object.end(); ++it) {
+ const auto& [key, value] = *it;
+ traverse_object_member(key, value);
}
}
@@ -176,6 +200,7 @@ void JSONDataParser<ParserImpl>::traverseArray(const
JSONArray& array, ParseCont
ParseArrayContext array_ctx;
array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
array_ctx.is_top_array = ctx.is_top_array;
+ array_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
array_ctx.total_size = array.size();
for (auto it = array.begin(); it != array.end(); ++it) {
traverseArrayElement(*it, array_ctx);
@@ -183,16 +208,17 @@ void JSONDataParser<ParserImpl>::traverseArray(const
JSONArray& array, ParseCont
}
auto&& arrays_by_path = array_ctx.arrays_by_path;
if (arrays_by_path.empty()) {
- ctx.paths.push_back(ctx.builder.get_parts());
- ctx.values.push_back(Field::create_field<TYPE_ARRAY>(Array()));
+ appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
+ Field::create_field<TYPE_ARRAY>(Array()));
} else {
ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
for (auto it = arrays_by_path.begin(); it != arrays_by_path.end();
++it) {
auto&& [path, path_array] = it->second;
/// Merge prefix path and path of array element.
- ctx.paths.push_back(ctx.builder.append(path, true).get_parts());
-
ctx.values.push_back(Field::create_field<TYPE_ARRAY>(std::move(path_array)));
+ ctx.builder.append(path, true);
+ appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
+
Field::create_field<TYPE_ARRAY>(std::move(path_array)));
ctx.builder.pop_back(path.size());
}
}
@@ -204,10 +230,12 @@ void
JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
ParseContext element_ctx;
element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
element_ctx.is_top_array = ctx.is_top_array;
+ element_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
traverse(element, element_ctx);
- auto& [_, paths, values, deprecated_flatten_nested, __, is_top_array] =
element_ctx;
+ auto& paths = element_ctx.paths;
+ auto& values = element_ctx.values;
- if (element_ctx.has_nested_in_flatten && is_top_array) {
+ if (element_ctx.has_nested_in_flatten && element_ctx.is_top_array) {
checkAmbiguousStructure(ctx, paths);
}
diff --git a/be/src/util/json/json_parser.h b/be/src/util/json/json_parser.h
index 4b49259588b..c4a165e8995 100644
--- a/be/src/util/json/json_parser.h
+++ b/be/src/util/json/json_parser.h
@@ -101,6 +101,7 @@ void writeValueAsJsonb(const Element& element, JsonbWriter&
writer) {
struct ParseConfig {
bool deprecated_enable_flatten_nested = false;
+ bool check_duplicate_json_path = false;
enum class ParseTo {
OnlySubcolumns = 0,
OnlyDocValueColumn = 1,
@@ -127,7 +128,9 @@ private:
PathInDataBuilder builder;
std::vector<PathInData::Parts> paths;
std::vector<Field> values;
+ phmap::flat_hash_set<std::string> visited_path_names;
bool deprecated_enable_flatten_nested = false;
+ bool check_duplicate_json_path = false;
bool has_nested_in_flatten = false;
bool is_top_array = false;
};
@@ -141,10 +144,12 @@ private:
KeyToSizes nested_sizes_by_key;
bool has_nested_in_flatten = false;
bool is_top_array = false;
+ bool check_duplicate_json_path = false;
};
void traverse(const Element& element, ParseContext& ctx);
void traverseObject(const JSONObject& object, ParseContext& ctx);
void traverseArray(const JSONArray& array, ParseContext& ctx);
+ void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts&
path, Field&& value);
void traverseArrayElement(const Element& element, ParseArrayContext& ctx);
void checkAmbiguousStructure(const ParseArrayContext& ctx,
const std::vector<PathInData::Parts>& paths);
diff --git a/be/test/storage/segment/variant_util_test.cpp
b/be/test/storage/segment/variant_util_test.cpp
index 597623c1018..902bf9c843b 100644
--- a/be/test/storage/segment/variant_util_test.cpp
+++ b/be/test/storage/segment/variant_util_test.cpp
@@ -23,11 +23,13 @@
#include <string_view>
#include <vector>
+#include "common/config.h"
#include "core/block/block.h"
#include "core/column/column_string.h"
#include "core/column/column_variant.h"
#include "core/data_type/data_type_variant.h"
#include "core/field.h"
+#include "core/value/jsonb_value.h"
#include "exec/common/variant_util.h"
#include "gtest/gtest.h"
#include "storage/tablet/tablet_schema.h"
@@ -42,6 +44,20 @@ static ColumnString::MutablePtr _make_json_column(const
std::vector<std::string_
return col;
}
+class ScopedDuplicateJsonPathCheck {
+public:
+ explicit ScopedDuplicateJsonPathCheck(bool value)
+ : _old_value(config::variant_enable_duplicate_json_path_check) {
+ config::variant_enable_duplicate_json_path_check = value;
+ }
+ ~ScopedDuplicateJsonPathCheck() {
+ config::variant_enable_duplicate_json_path_check = _old_value;
+ }
+
+private:
+ bool _old_value;
+};
+
TEST(VariantUtilTest, ParseDocValueToSubcolumns_FillsDefaultsAndValues) {
const std::vector<std::string_view> jsons = {
R"({"a":1,"b":"x"})", //
@@ -225,6 +241,196 @@ TEST(VariantUtilTest,
ParseOnlyDocValueColumn_SerializesMixedTypes) {
EXPECT_EQ(f.field.get<TYPE_STRING>(), "y");
}
+TEST(VariantUtilTest, ParseDuplicateJsonPathsKeepsFirstValue) {
+ ScopedDuplicateJsonPathCheck check_guard(true);
+ const std::vector<std::string_view> jsons = {
+ R"({"a":42,"a":{"b":42}})", R"({"a":123,"a":"123"})",
R"({"a.b":1,"a":{"b":2}})",
+ R"({"a":{"b":3},"a.b":4})", R"({"a":{"b":5},"a":{"c":6}})",
+ };
+
+ auto variant = ColumnVariant::create(0, false);
+ auto json_col = _make_json_column(jsons);
+
+ ParseConfig cfg;
+ cfg.deprecated_enable_flatten_nested = false;
+ cfg.check_duplicate_json_path = true;
+ cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns;
+ parse_json_to_variant(*variant, *json_col, cfg);
+ ASSERT_TRUE(variant->sanitize().ok());
+
+ const auto* sub_a = variant->get_subcolumn(PathInData("a"));
+ const auto* sub_ab = variant->get_subcolumn(PathInData("a.b"));
+ const auto* sub_ac = variant->get_subcolumn(PathInData("a.c"));
+ ASSERT_NE(sub_a, nullptr);
+ ASSERT_NE(sub_ab, nullptr);
+ ASSERT_NE(sub_ac, nullptr);
+
+ FieldWithDataType f;
+ sub_a->get(0, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+ sub_a->get(1, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 123);
+
+ sub_ab->get(0, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+ sub_ab->get(1, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_NULL);
+ sub_ab->get(2, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 1);
+ sub_ab->get(3, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 3);
+ sub_ab->get(4, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 5);
+
+ sub_ac->get(4, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 6);
+}
+
+TEST(VariantUtilTest, ParseDuplicateJsonPathsKeepsFirstArrayOrScalarValue) {
+ ScopedDuplicateJsonPathCheck check_guard(true);
+ const std::vector<std::string_view> jsons = {
+ R"({"a":[1],"a":2})",
+ R"({"a":2,"a":[1]})",
+ };
+
+ auto variant = ColumnVariant::create(0, false);
+ auto json_col = _make_json_column(jsons);
+
+ ParseConfig cfg;
+ cfg.deprecated_enable_flatten_nested = false;
+ cfg.check_duplicate_json_path = true;
+ cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns;
+ parse_json_to_variant(*variant, *json_col, cfg);
+ ASSERT_TRUE(variant->sanitize().ok());
+
+ const auto* sub_a = variant->get_subcolumn(PathInData("a"));
+ ASSERT_NE(sub_a, nullptr);
+
+ FieldWithDataType f;
+ sub_a->get(0, f);
+ ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+ const auto& first = f.field.get<TYPE_JSONB>();
+ EXPECT_EQ(JsonbToJson::jsonb_to_json_string(first.get_value(),
first.get_size()), "[1]");
+
+ sub_a->get(1, f);
+ ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+ const auto& second = f.field.get<TYPE_JSONB>();
+ EXPECT_EQ(JsonbToJson::jsonb_to_json_string(second.get_value(),
second.get_size()), "2");
+}
+
+TEST(VariantUtilTest, ParseDuplicateJsonPathsInDocModeKeepsFirstValue) {
+ ScopedDuplicateJsonPathCheck check_guard(true);
+ const std::vector<std::string_view> jsons = {
+ R"({"a":42,"a":{"b":42}})", R"({"a":123,"a":"123"})",
R"({"a.b":1,"a":{"b":2}})",
+ R"({"a":{"b":3},"a.b":4})", R"({"a":{"b":5},"a":{"c":6}})",
+ };
+
+ auto variant = ColumnVariant::create(0, true);
+ auto json_col = _make_json_column(jsons);
+
+ ParseConfig cfg;
+ cfg.deprecated_enable_flatten_nested = false;
+ cfg.check_duplicate_json_path = true;
+ cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn;
+ parse_json_to_variant(*variant, *json_col, cfg);
+ ASSERT_TRUE(variant->sanitize().ok());
+
+ auto subcolumns = materialize_docs_to_subcolumns_map(*variant);
+ ASSERT_TRUE(subcolumns.contains("a"));
+ ASSERT_TRUE(subcolumns.contains("a.b"));
+ ASSERT_TRUE(subcolumns.contains("a.c"));
+
+ auto& sub_a = subcolumns.at("a");
+ auto& sub_ab = subcolumns.at("a.b");
+ auto& sub_ac = subcolumns.at("a.c");
+ sub_a.finalize();
+ sub_ab.finalize();
+ sub_ac.finalize();
+
+ FieldWithDataType f;
+ sub_a.get(0, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+ sub_a.get(1, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 123);
+
+ sub_ab.get(0, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 42);
+ sub_ab.get(1, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_NULL);
+ sub_ab.get(2, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 1);
+ sub_ab.get(3, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 3);
+ sub_ab.get(4, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 5);
+
+ sub_ac.get(4, f);
+ EXPECT_EQ(f.field.get_type(), PrimitiveType::TYPE_BIGINT);
+ EXPECT_EQ(f.field.get<TYPE_BIGINT>(), 6);
+}
+
+TEST(VariantUtilTest,
ParseDuplicateJsonPathsInDocModeKeepsFirstArrayOrScalarValue) {
+ ScopedDuplicateJsonPathCheck check_guard(true);
+ const std::vector<std::string_view> jsons = {
+ R"({"a":[1],"a":2})",
+ R"({"a":2,"a":[1]})",
+ };
+
+ auto variant = ColumnVariant::create(0, true);
+ auto json_col = _make_json_column(jsons);
+
+ ParseConfig cfg;
+ cfg.deprecated_enable_flatten_nested = false;
+ cfg.check_duplicate_json_path = true;
+ cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn;
+ parse_json_to_variant(*variant, *json_col, cfg);
+ ASSERT_TRUE(variant->sanitize().ok());
+
+ auto subcolumns = materialize_docs_to_subcolumns_map(*variant);
+ ASSERT_TRUE(subcolumns.contains("a"));
+
+ auto& sub_a = subcolumns.at("a");
+ sub_a.finalize();
+
+ FieldWithDataType f;
+ sub_a.get(0, f);
+ ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+ const auto& first = f.field.get<TYPE_JSONB>();
+ EXPECT_EQ(JsonbToJson::jsonb_to_json_string(first.get_value(),
first.get_size()), "[1]");
+
+ sub_a.get(1, f);
+ ASSERT_EQ(f.field.get_type(), PrimitiveType::TYPE_JSONB);
+ const auto& second = f.field.get<TYPE_JSONB>();
+ EXPECT_EQ(JsonbToJson::jsonb_to_json_string(second.get_value(),
second.get_size()), "2");
+}
+
+TEST(VariantUtilTest, ParseDuplicateJsonPathsCheckDisabledByDefault) {
+ ScopedDuplicateJsonPathCheck check_guard(false);
+ const std::vector<std::string_view> jsons = {
+ R"({"a":123,"a":"123"})",
+ };
+
+ auto variant = ColumnVariant::create(0, false);
+ auto json_col = _make_json_column(jsons);
+
+ ParseConfig cfg;
+ cfg.deprecated_enable_flatten_nested = false;
+ EXPECT_THROW(parse_json_to_variant(*variant, *json_col, cfg), Exception);
+}
+
TEST(VariantUtilTest, ParseVariantColumns_ScalarJsonStringToSubcolumns) {
TabletSchemaPB schema_pb;
schema_pb.set_keys_type(KeysType::DUP_KEYS);
diff --git a/regression-test/data/variant_p0/duplicate_json_path.json
b/regression-test/data/variant_p0/duplicate_json_path.json
new file mode 100644
index 00000000000..e065c9b2331
--- /dev/null
+++ b/regression-test/data/variant_p0/duplicate_json_path.json
@@ -0,0 +1,7 @@
+{"k":8,"v":{"a":42,"a":{"b":42}}}
+{"k":9,"v":{"a":123,"a":"123"}}
+{"k":10,"v":{"a.b":8,"a":{"b":9}}}
+{"k":11,"v":{"a":{"b":10},"a.b":11}}
+{"k":12,"v":{"a":{"b":11},"a":{"c":12}}}
+{"k":13,"v":{"a":[13],"a":14}}
+{"k":14,"v":{"a":14,"a":[13]}}
diff --git a/regression-test/suites/variant_p0/duplicate_json_path.groovy
b/regression-test/suites/variant_p0/duplicate_json_path.groovy
new file mode 100644
index 00000000000..0c6802f461e
--- /dev/null
+++ b/regression-test/suites/variant_p0/duplicate_json_path.groovy
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("duplicate_json_path", "p0") {
+ def customBeConfig = [
+ variant_enable_duplicate_json_path_check: true
+ ]
+ setBeConfigTemporary(customBeConfig) {
+ sql "DROP TABLE IF EXISTS duplicate_json_path"
+ sql """
+ CREATE TABLE duplicate_json_path (
+ k int,
+ v variant
+ )
+ DUPLICATE KEY(k)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ PROPERTIES (
+ "replication_num" = "1",
+ "group_commit_interval_ms" = "2000",
+ "disable_auto_compaction" = "true"
+ );
+ """
+
+ sql """insert into duplicate_json_path values (1,
'{"a":42,"a":{"b":42}}')"""
+ sql """insert into duplicate_json_path values (2, '{"a" : 123, "a" :
"123"}')"""
+ sql """insert into duplicate_json_path values (3,
'{"a.b":1,"a":{"b":2}}')"""
+ sql """insert into duplicate_json_path values (4,
'{"a":{"b":3},"a.b":4}')"""
+ sql """insert into duplicate_json_path values (5,
'{"a":{"b":5},"a":{"c":6}}')"""
+ sql """insert into duplicate_json_path values (6, '{"a":[1],"a":2}')"""
+ sql """insert into duplicate_json_path values (7, '{"a":2,"a":[1]}')"""
+
+ streamLoad {
+ table "duplicate_json_path"
+ set 'read_json_by_line', 'true'
+ set 'format', 'json'
+ set 'group_commit', 'async_mode'
+ unset 'label'
+ file 'duplicate_json_path.json'
+ time 10000
+
+ check { result, exception, startTime, endTime ->
+ if (exception != null) {
+ throw exception
+ }
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ assertEquals(7, json.NumberTotalRows)
+ assertEquals(7, json.NumberLoadedRows)
+ }
+ }
+
+ for (int i = 0; i < 30; i++) {
+ def count = sql "select count(*) from duplicate_json_path"
+ if (count[0][0] == 14) {
+ break
+ }
+ sleep(1000)
+ }
+ def totalRows = sql "select count(*) from duplicate_json_path"
+ assertEquals(14, totalRows[0][0])
+
+ // When duplicate path check is enabled, duplicate Variant paths keep
the first value.
+ def expectedResult = [
+ [1, "{\"b\":42}", "42", null],
+ [2, "123", null, null],
+ [3, "{\"b\":1}", "1", null],
+ [4, "{\"b\":3}", "3", null],
+ [5, "{\"b\":5,\"c\":6}", "5", "6"],
+ [6, "[1]", null, null],
+ [7, "2", null, null],
+ [8, "{\"b\":42}", "42", null],
+ [9, "123", null, null],
+ [10, "{\"b\":8}", "8", null],
+ [11, "{\"b\":10}", "10", null],
+ [12, "{\"b\":11,\"c\":12}", "11", "12"],
+ [13, "[13]", null, null],
+ [14, "14", null, null]
+ ]
+
+ def queryResult = {
+ sql """
+ select k, cast(v['a'] as string), cast(v['a']['b'] as string),
cast(v['a']['c'] as string)
+ from duplicate_json_path
+ order by k
+ """
+ }
+ assertEquals(expectedResult, queryResult())
+
+ trigger_and_wait_compaction("duplicate_json_path", "full")
+ assertEquals(expectedResult, queryResult())
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]