This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 5b14858c37b [branch-4.0](variant) normalize legacy single-part dot-key
subcolumn paths on read (#62409) (#62771)
5b14858c37b is described below
commit 5b14858c37b2336f9ab5e8be9ff7b650e6f8ffb4
Author: Chenyang Sun <[email protected]>
AuthorDate: Sun May 10 11:49:30 2026 +0800
[branch-4.0](variant) normalize legacy single-part dot-key subcolumn paths
on read (#62409) (#62771)
pick from master #62409
---
.../segment_v2/variant/variant_column_reader.cpp | 7 +
.../variant_column_writer_reader_test.cpp | 152 +++++++++++++++++++++
2 files changed, 159 insertions(+)
diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
index 19f122059ff..553e915c32a 100644
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
@@ -959,6 +959,13 @@ Status VariantColumnReader::init(const
ColumnReaderOptions& opts, ColumnMetaAcce
_statistics->subcolumns_non_null_size.emplace(relative_path.get_path(),
column_pb.none_null_size());
}
+ // 3.1.2 may store a flat JSON key like {"a.b": 1} as a single
PathInData part.
+ // New compaction schema and query path expect a dot-split multi-part
shape.
+ // Rebuild via the string constructor when the path has neither typed
+ // nor nested metadata, so the tree matches the new shape.
+ if (!relative_path.get_is_typed() && !relative_path.has_nested_part())
{
+ relative_path = vectorized::PathInData(relative_path.get_path());
+ }
_subcolumns_meta_info->add(
relative_path,
SubcolumnMeta {
diff --git
a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
index 3b0f8743871..19fc967136c 100644
--- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -200,6 +200,158 @@ void check_sparse_column_meta(const ColumnMetaPB&
column_meta, auto& path_with_s
base_path.rfind("__DORIS_VARIANT_SPARSE__.b", 0) == 0);
}
+// Regression test for legacy flat-dot-key compatibility.
+//
+// Old versions (e.g. cloud-4.1.2 with variant_max_subcolumns_count=0) stored
+// a flat JSON key like {"a.b": 1} as a single PathInData part "a.b" in the
+// segment's ColumnPathInfo protobuf. New master compaction schema builds
+// query paths by splitting on dots (3+ parts including root), which does not
+// match the 1-part tree node and causes silent data loss during compaction.
+//
+// This test writes a normal variant segment via the writer, then *mutates*
+// the resulting footer to turn a subcolumn's `column_path_info` into the
+// legacy 1-part form, then calls `VariantColumnReader::init()` and verifies
+// that the normalization inside init() rebuilds a multi-level tree that can
+// be queried via both `get_subcolumn_meta_by_path` and prefix-path lookup.
+TEST_F(VariantColumnWriterReaderTest, test_legacy_flat_dot_key_reader_init) {
+ // 1. create tablet_schema with a variant column that has nested subcolumns
+ TabletSchemaPB schema_pb;
+ schema_pb.set_keys_type(KeysType::DUP_KEYS);
+ construct_column(schema_pb.add_column(), 1, "VARIANT", "V1",
/*max_subcolumns=*/10);
+ _tablet_schema = std::make_shared<TabletSchema>();
+ _tablet_schema->init_from_pb(schema_pb);
+
+ // 2. create tablet
+ TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+ _tablet_schema->set_external_segment_meta_used_default(false);
+ tablet_meta->_tablet_id = 20000;
+ _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta,
_data_dir.get());
+ EXPECT_TRUE(_tablet->init().ok());
+
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+
+ // 3. create file_writer
+ io::FileWriterPtr file_writer;
+ auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
+ auto st = io::global_local_filesystem()->create_file(file_path,
&file_writer);
+ EXPECT_TRUE(st.ok()) << st.msg();
+
+ // 4. create column_writer
+ SegmentFooterPB footer;
+ ColumnWriterOptions opts;
+ opts.meta = footer.add_columns();
+ opts.compression_type = CompressionTypePB::LZ4;
+ opts.file_writer = file_writer.get();
+ opts.footer = &footer;
+ RowsetWriterContext rowset_ctx;
+ rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
+ opts.rowset_ctx = &rowset_ctx;
+ opts.rowset_ctx->tablet_schema = _tablet_schema;
+ TabletColumn column = _tablet_schema->column(0);
+ _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);
+
+ std::unique_ptr<ColumnWriter> writer;
+ EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(),
&writer).ok());
+ EXPECT_TRUE(writer->init().ok());
+
+ // 5. write nested json so the writer naturally creates a subcolumn "a.b"
+ // with a 2-part path ["a", "b"].
+ std::vector<std::string> jsons;
+ const int kNumRows = 8;
+ for (int i = 0; i < kNumRows; ++i) {
+ jsons.push_back(R"({"a": {"b": "v)" + std::to_string(i) + R"("}})");
+ }
+ {
+ auto block = _tablet_schema->create_block();
+ auto variant_col = ColumnVariant::create(
+ _tablet_schema->column(0).variant_max_subcolumns_count(),
false);
+ auto json_col = ColumnString::create();
+ for (const auto& json : jsons) {
+ json_col->insert_data(json.data(), json.size());
+ }
+ ParseConfig config;
+ parse_json_to_variant(*variant_col, *json_col, config);
+ auto columns = block.mutate_columns();
+ columns[0] = std::move(variant_col);
+ block.set_columns(std::move(columns));
+
+ auto converter = std::make_unique<OlapBlockDataConvertor>();
+ converter->add_column_data_convertor(_tablet_schema->column(0));
+ converter->set_source_content(&block, 0, jsons.size());
+ auto [status, accessor] = converter->convert_column_data(0);
+ ASSERT_TRUE(status.ok()) << status.msg();
+ ASSERT_TRUE(
+ writer->append(accessor->get_nullmap(), accessor->get_data(),
jsons.size()).ok());
+ }
+ EXPECT_TRUE(writer->finish().ok());
+ EXPECT_TRUE(writer->write_data().ok());
+ EXPECT_TRUE(writer->write_ordinal_index().ok());
+ EXPECT_TRUE(writer->write_zone_map().ok());
+ EXPECT_TRUE(file_writer->close().ok());
+ footer.set_num_rows(kNumRows);
+
+ // 6. Locate the "v1.a.b" subcolumn in the footer and mutate its
+ // column_path_info into the legacy 1-part form: pb.path = "v1.a.b" but
+ // path_part_infos = [{"v1"}, {"a.b"}]. This is exactly what cloud-4.1.2
+ // wrote for JSON key {"a.b": ...}.
+ int target_idx = -1;
+ for (int i = 1; i < footer.columns_size(); ++i) { // index 0 is the root meta added in step 4
+ const auto& col_meta = footer.columns(i);
+ if (!col_meta.has_column_path_info()) {
+ continue;
+ }
+ if (col_meta.column_path_info().path() == "v1.a.b") {
+ target_idx = i;
+ break;
+ }
+ }
+ ASSERT_GT(target_idx, 0) << "failed to locate subcolumn V1.a.b in footer";
+
+ auto* target_path_info =
footer.mutable_columns(target_idx)->mutable_column_path_info();
+ target_path_info->clear_path_part_infos(); // drop the writer-produced parts before adding the legacy pair
+ auto* root_part = target_path_info->add_path_part_infos();
+ root_part->set_key("v1");
+ root_part->set_is_nested(false);
+ root_part->set_anonymous_array_level(0);
+ auto* legacy_part = target_path_info->add_path_part_infos();
+ legacy_part->set_key("a.b"); // single legacy part containing a dot
+ legacy_part->set_is_nested(false);
+ legacy_part->set_anonymous_array_level(0);
+ target_path_info->set_has_nested(false);
+
+ // 7. Now initialize a fresh VariantColumnReader with the mutated footer.
+ // The init() path calls _subcolumns_meta_info->add() for each subcolumn;
+ // our fix normalizes the legacy 1-part relative path "a.b" into a
+ // 2-part path ["a", "b"] so the tree has root -> "a" -> "b".
+ io::FileReaderSPtr file_reader;
+ st = io::global_local_filesystem()->open_file(file_path, &file_reader);
+ ASSERT_TRUE(st.ok()) << st.msg();
+
+ std::shared_ptr<segment_v2::ColumnReader> column_reader;
+ st = create_variant_root_reader(footer, file_reader, _tablet_schema,
&column_reader);
+ ASSERT_TRUE(st.ok()) << st.msg();
+ auto* variant_reader =
assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
+ ASSERT_NE(variant_reader, nullptr);
+
+ // 8. Verify that queries against the normalized tree succeed.
+ // - Leaf lookup "a.b" (PathInData splits into 2 parts) should hit.
+ // - Intermediate lookup "a" should return the TUPLE parent, which
+ // has exactly one child "b".
+ const auto* leaf_node =
variant_reader->get_subcolumn_meta_by_path(PathInData("a.b"));
+ ASSERT_NE(leaf_node, nullptr)
+ << "normalized tree should be able to find leaf 'a.b' via
multi-part query";
+ EXPECT_TRUE(leaf_node->is_scalar());
+ EXPECT_GE(leaf_node->data.footer_ordinal, 0);
+
+ const auto* subtree = variant_reader->get_subcolumns_meta_info();
+ ASSERT_NE(subtree, nullptr);
+ const auto* intermediate = subtree->find_exact(PathInData("a"));
+ ASSERT_NE(intermediate, nullptr)
+ << "normalized tree should expose intermediate node 'a' as a
TUPLE";
+ EXPECT_FALSE(intermediate->is_scalar());
+ EXPECT_EQ(intermediate->children.size(), 1U);
+}
+
TEST_F(VariantColumnWriterReaderTest, test_statics) {
VariantStatisticsPB stats_pb;
auto* subcolumns_stats = stats_pb.mutable_sparse_column_non_null_size();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]