github-actions[bot] commented on code in PR #61882:
URL: https://github.com/apache/doris/pull/61882#discussion_r3013245179
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -2148,6 +2148,175 @@ std::pair<MetaServiceCode, std::string>
handle_snapshot_intervals(const std::str
return std::make_pair(MetaServiceCode::OK, "");
}
+static std::pair<MetaServiceCode, std::string> get_instance_info(Transaction*
txn,
+ const
std::string& instance_id,
+
InstanceInfoPB* instance) {
+ std::string instance_val;
+ TxnErrorCode err = txn->get(instance_key({instance_id}), &instance_val);
+ if (err != TxnErrorCode::TXN_OK) {
+ return {cast_as<ErrCategory::READ>(err),
+ fmt::format("failed to get instance, instance_id={}, err={}",
instance_id, err)};
+ }
+ if (!instance->ParseFromString(instance_val)) {
+ return {MetaServiceCode::PROTOBUF_PARSE_ERR,
+ fmt::format("failed to parse instance proto, instance_id={}",
instance_id)};
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+// Traverses the predecessor chain of the given instance, populating
`predecessors` with the
+// instance info of each predecessor.
+static std::pair<MetaServiceCode, std::string> get_predecessor_instances(
+ Transaction* txn, const InstanceInfoPB& instance,
+ std::vector<InstanceInfoPB>* predecessors) {
+ std::string instance_id = instance.original_instance_id();
+ while (!instance_id.empty() && instance_id != instance.instance_id()) {
+ InstanceInfoPB current_instance;
+ auto [code, msg] = get_instance_info(txn, instance_id,
¤t_instance);
+ if (code != MetaServiceCode::OK) {
+ return {code, std::move(msg)};
+ }
+ instance_id = current_instance.successor_instance_id();
+ predecessors->emplace_back(std::move(current_instance));
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+static std::pair<MetaServiceCode, std::string> ensure_all_snapshot_recycled(
+ Transaction* txn, const std::string& instance_id,
+
std::optional<std::reference_wrapper<std::unordered_set<std::string_view>>>
+ exclude_snapshot_set) {
+ MetaReader meta_reader(instance_id);
+ std::vector<std::pair<SnapshotPB, Versionstamp>> snapshots;
+ TxnErrorCode err = meta_reader.get_snapshots(txn, &snapshots);
+ if (err != TxnErrorCode::TXN_OK) {
+ std::string msg = fmt::format("failed to get snapshots, err={}", err);
+ LOG(WARNING) << msg << " instance_id=" << instance_id;
+ return {cast_as<ErrCategory::READ>(err), std::move(msg)};
+ }
+ for (auto& [snapshot, versionstamp] : snapshots) {
+ std::string snapshot_id =
SnapshotManager::serialize_snapshot_id(versionstamp);
+ if (exclude_snapshot_set.has_value() &&
+ exclude_snapshot_set.value().get().count(snapshot_id) > 0) {
+ continue;
+ }
+ if (snapshot.status() != SnapshotStatus::SNAPSHOT_RECYCLED) {
+ std::string msg = fmt::format(
+ "failed to drop instance, instance has snapshots,
snapshot_id={}", snapshot_id);
+ LOG(WARNING) << msg << " instance_id=" << instance_id
+ << " snapshot=" << proto_to_json(snapshot);
+ return {MetaServiceCode::INVALID_ARGUMENT, std::move(msg)};
+ }
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+static std::pair<MetaServiceCode, std::string> drop_single_instance(const
std::string& instance_id,
+
Transaction* txn,
+
InstanceInfoPB* instance) {
+ auto [code, msg] = ensure_all_snapshot_recycled(txn, instance_id,
std::nullopt);
+ if (code != MetaServiceCode::OK) {
+ return {code, std::move(msg)};
+ }
+
+ instance->set_status(InstanceInfoPB::DELETED);
+
instance->set_mtime(duration_cast<seconds>(system_clock::now().time_since_epoch()).count());
+
+ std::string serialized = instance->SerializeAsString();
+ if (serialized.empty()) {
+ std::string msg = "failed to serialize";
+ LOG(ERROR) << msg;
+ return {MetaServiceCode::PROTOBUF_SERIALIZE_ERR, std::move(msg)};
+ }
+ LOG(INFO) << "drop instance_id=" << instance_id << " json=" <<
proto_to_json(*instance);
+
+ if (instance->has_source_instance_id() &&
instance->has_source_snapshot_id() &&
+ !instance->source_instance_id().empty() &&
!instance->source_snapshot_id().empty()) {
+ Versionstamp snapshot_versionstamp;
+ if
(!SnapshotManager::parse_snapshot_versionstamp(instance->source_snapshot_id(),
+
&snapshot_versionstamp)) {
+ std::string msg = "failed to parse snapshot_id to versionstamp,
snapshot_id=" +
+ instance->source_snapshot_id();
+ LOG(WARNING) << msg;
+ return {MetaServiceCode::INVALID_ARGUMENT, std::move(msg)};
+ }
+ std::string snapshot_ref_key = versioned::snapshot_reference_key(
+ {instance->source_instance_id(), snapshot_versionstamp,
instance_id});
+ txn->remove(snapshot_ref_key);
+ }
+ return {MetaServiceCode::OK, std::move(serialized)};
+}
+
+// Handles the instance-chain drop: all instances from the original up to and
including the tail
+// are marked DELETED atomically within the same transaction.
+static std::pair<MetaServiceCode, std::string> drop_instance_chain(
+ const std::string& tail_instance_id, Transaction* txn, InstanceInfoPB*
tail_instance) {
+ std::vector<InstanceInfoPB> predecessors;
+ if (auto [code, err] = get_predecessor_instances(txn, *tail_instance,
&predecessors);
+ code != MetaServiceCode::OK) {
+ LOG(WARNING) << "drop instance chain: " << err;
+ return {code, std::move(err)};
+ }
+
+ std::unordered_set<std::string_view> exclude_snapshot_set;
+ if (auto [code, err] = ensure_all_snapshot_recycled(txn, tail_instance_id,
std::nullopt);
+ code != MetaServiceCode::OK) {
+ return {code, std::move(err)};
+ }
+ exclude_snapshot_set.insert(tail_instance->source_snapshot_id());
Review Comment:
**[Low - Fragility] `std::unordered_set<std::string_view>` is
lifetime-fragile.** The `string_view`s in `exclude_snapshot_set` point into the
protobuf `source_snapshot_id()` fields of `tail_instance` and `predecessors`
vector elements. This is currently safe because `set_status()` and
`set_mtime()` (lines 152-153, 165-166) do not invalidate unrelated string
fields. However, this is a fragile pattern — any future modification that
clears or modifies `source_snapshot_id` on these objects before the set is
consumed would cause use-after-free.
Consider using `std::unordered_set<std::string>` instead for robustness, or
add a comment documenting why the `string_view` approach is safe here.
##########
cloud/src/snapshot/snapshot_manager.cpp:
##########
@@ -262,4 +264,88 @@ int
SnapshotManager::compact_snapshot_chains(InstanceChainCompactor* compactor)
return -1;
}
+static std::pair<MetaServiceCode, std::string> get_instance(Transaction* txn,
+ std::string_view
instance_id,
+ InstanceInfoPB*
instance_info) {
+ InstanceKeyInfo instance_key_info {instance_id};
+ std::string key = instance_key(instance_key_info);
+ std::string val;
+ TxnErrorCode err = txn->get(key, &val);
+ if (err != TxnErrorCode::TXN_OK) {
+ if (err == TxnErrorCode::TXN_KEY_NOT_FOUND) {
+ return {MetaServiceCode::INVALID_ARGUMENT,
+ fmt::format("instance not found, instance_id={}",
instance_id)};
+ } else {
+ return {cast_as<ErrCategory::READ>(err),
+ fmt::format("failed to get instance, instance_id={},
err={}", instance_id,
+ err)};
Review Comment:
**[Medium] Code duplication.** `get_instance()` (defined here as a
file-local static function) and `get_instance_info()` in
`meta_service_resource.cpp` serve the same purpose but with slightly different
error handling (this one distinguishes `TXN_KEY_NOT_FOUND`, the other does
not). Consider extracting a shared utility to avoid the duplication and ensure
consistent error handling.
##########
cloud/src/snapshot/snapshot_manager.cpp:
##########
@@ -262,4 +264,88 @@ int
SnapshotManager::compact_snapshot_chains(InstanceChainCompactor* compactor)
return -1;
}
+static std::pair<MetaServiceCode, std::string> get_instance(Transaction* txn,
+ std::string_view
instance_id,
+ InstanceInfoPB*
instance_info) {
+ InstanceKeyInfo instance_key_info {instance_id};
+ std::string key = instance_key(instance_key_info);
+ std::string val;
+ TxnErrorCode err = txn->get(key, &val);
+ if (err != TxnErrorCode::TXN_OK) {
+ if (err == TxnErrorCode::TXN_KEY_NOT_FOUND) {
+ return {MetaServiceCode::INVALID_ARGUMENT,
+ fmt::format("instance not found, instance_id={}",
instance_id)};
+ } else {
+ return {cast_as<ErrCategory::READ>(err),
+ fmt::format("failed to get instance, instance_id={},
err={}", instance_id,
+ err)};
+ }
+ }
+
+ if (!instance_info->ParseFromString(val)) {
+ return {MetaServiceCode::INVALID_ARGUMENT, "failed to parse instance
info"};
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+std::pair<MetaServiceCode, std::string> SnapshotManager::get_all_snapshots(
+ Transaction* txn, std::string_view instance_id, std::string_view
required_snapshot_id,
+ std::vector<std::pair<SnapshotPB, Versionstamp>>* snapshots) {
+ Versionstamp required_snapshot_versionstamp;
+ if (!required_snapshot_id.empty()) {
+ if (!parse_snapshot_versionstamp(required_snapshot_id,
&required_snapshot_versionstamp)) {
+ return {MetaServiceCode::INVALID_ARGUMENT, "invalid snapshot_id
format"};
+ }
+ }
+
+ InstanceInfoPB instance_info;
+ auto [code, error_msg] = get_instance(txn, instance_id, &instance_info);
+ if (code != MetaServiceCode::OK) {
+ return {code, error_msg};
+ }
+ std::string current_instance_id = instance_id;
+ if (instance_info.has_original_instance_id() &&
!instance_info.original_instance_id().empty()) {
+ // the earliest instance_id for rollback
+ current_instance_id = instance_info.original_instance_id();
+ }
+
+ do {
+ MetaReader meta_reader(current_instance_id);
+ if (required_snapshot_id.empty()) {
+ TxnErrorCode err = meta_reader.get_snapshots(txn, snapshots);
+ if (err != TxnErrorCode::TXN_OK) {
+ return {cast_as<ErrCategory::READ>(err), "failed to get
snapshots"};
+ }
+ } else {
+ SnapshotPB snapshot_pb;
+ TxnErrorCode err =
+ meta_reader.get_snapshot(txn,
required_snapshot_versionstamp, &snapshot_pb);
+ if (err == TxnErrorCode::TXN_OK) {
+ snapshots->emplace_back(snapshot_pb,
required_snapshot_versionstamp);
+ return {MetaServiceCode::OK, ""};
+ } else if (err != TxnErrorCode::TXN_KEY_NOT_FOUND) {
+ return {cast_as<ErrCategory::READ>(err), "failed to get
snapshot"};
+ }
+ }
+ if (current_instance_id == instance_id) {
+ break;
+ }
+ auto [code, error_msg] = get_instance(txn, current_instance_id,
&instance_info);
Review Comment:
**[Medium] Missing cycle protection in chain traversal.** The
`do-while(true)` loop at line 312 follows `successor_instance_id` links without
any cycle detection or iteration bound. If the instance chain metadata is
corrupted and forms a cycle that doesn't include `instance_id`, this loop will
run indefinitely.
Per the project's coding standards (AGENTS.md: "upon discovering errors or
unexpected situations, report errors or crash—never allow the process to
continue"), consider adding a `DORIS_CHECK` with a visited-set:
```cpp
std::unordered_set<std::string> visited;
// inside loop, after updating current_instance_id:
DORIS_CHECK(!visited.count(current_instance_id))
<< "cycle detected in instance chain at " << current_instance_id;
visited.insert(current_instance_id);
```
The same concern applies to `get_predecessor_instances` in
`meta_service_resource.cpp`.
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -2148,6 +2148,175 @@ std::pair<MetaServiceCode, std::string>
handle_snapshot_intervals(const std::str
return std::make_pair(MetaServiceCode::OK, "");
}
+static std::pair<MetaServiceCode, std::string> get_instance_info(Transaction*
txn,
+ const
std::string& instance_id,
+
InstanceInfoPB* instance) {
+ std::string instance_val;
+ TxnErrorCode err = txn->get(instance_key({instance_id}), &instance_val);
+ if (err != TxnErrorCode::TXN_OK) {
+ return {cast_as<ErrCategory::READ>(err),
+ fmt::format("failed to get instance, instance_id={}, err={}",
instance_id, err)};
+ }
+ if (!instance->ParseFromString(instance_val)) {
+ return {MetaServiceCode::PROTOBUF_PARSE_ERR,
+ fmt::format("failed to parse instance proto, instance_id={}",
instance_id)};
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+// Traverses the predecessor chain of the given instance, populating
`predecessors` with the
+// instance info of each predecessor.
+static std::pair<MetaServiceCode, std::string> get_predecessor_instances(
+ Transaction* txn, const InstanceInfoPB& instance,
+ std::vector<InstanceInfoPB>* predecessors) {
+ std::string instance_id = instance.original_instance_id();
+ while (!instance_id.empty() && instance_id != instance.instance_id()) {
+ InstanceInfoPB current_instance;
+ auto [code, msg] = get_instance_info(txn, instance_id,
¤t_instance);
+ if (code != MetaServiceCode::OK) {
+ return {code, std::move(msg)};
+ }
+ instance_id = current_instance.successor_instance_id();
+ predecessors->emplace_back(std::move(current_instance));
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+static std::pair<MetaServiceCode, std::string> ensure_all_snapshot_recycled(
+ Transaction* txn, const std::string& instance_id,
+
std::optional<std::reference_wrapper<std::unordered_set<std::string_view>>>
+ exclude_snapshot_set) {
+ MetaReader meta_reader(instance_id);
+ std::vector<std::pair<SnapshotPB, Versionstamp>> snapshots;
+ TxnErrorCode err = meta_reader.get_snapshots(txn, &snapshots);
+ if (err != TxnErrorCode::TXN_OK) {
+ std::string msg = fmt::format("failed to get snapshots, err={}", err);
+ LOG(WARNING) << msg << " instance_id=" << instance_id;
+ return {cast_as<ErrCategory::READ>(err), std::move(msg)};
+ }
+ for (auto& [snapshot, versionstamp] : snapshots) {
+ std::string snapshot_id =
SnapshotManager::serialize_snapshot_id(versionstamp);
+ if (exclude_snapshot_set.has_value() &&
+ exclude_snapshot_set.value().get().count(snapshot_id) > 0) {
+ continue;
+ }
+ if (snapshot.status() != SnapshotStatus::SNAPSHOT_RECYCLED) {
+ std::string msg = fmt::format(
+ "failed to drop instance, instance has snapshots,
snapshot_id={}", snapshot_id);
+ LOG(WARNING) << msg << " instance_id=" << instance_id
+ << " snapshot=" << proto_to_json(snapshot);
+ return {MetaServiceCode::INVALID_ARGUMENT, std::move(msg)};
+ }
+ }
+ return {MetaServiceCode::OK, ""};
+}
+
+static std::pair<MetaServiceCode, std::string> drop_single_instance(const
std::string& instance_id,
+
Transaction* txn,
+
InstanceInfoPB* instance) {
+ auto [code, msg] = ensure_all_snapshot_recycled(txn, instance_id,
std::nullopt);
+ if (code != MetaServiceCode::OK) {
+ return {code, std::move(msg)};
+ }
+
+ instance->set_status(InstanceInfoPB::DELETED);
+
instance->set_mtime(duration_cast<seconds>(system_clock::now().time_since_epoch()).count());
+
+ std::string serialized = instance->SerializeAsString();
+ if (serialized.empty()) {
+ std::string msg = "failed to serialize";
+ LOG(ERROR) << msg;
+ return {MetaServiceCode::PROTOBUF_SERIALIZE_ERR, std::move(msg)};
+ }
+ LOG(INFO) << "drop instance_id=" << instance_id << " json=" <<
proto_to_json(*instance);
+
+ if (instance->has_source_instance_id() &&
instance->has_source_snapshot_id() &&
+ !instance->source_instance_id().empty() &&
!instance->source_snapshot_id().empty()) {
+ Versionstamp snapshot_versionstamp;
+ if
(!SnapshotManager::parse_snapshot_versionstamp(instance->source_snapshot_id(),
+
&snapshot_versionstamp)) {
+ std::string msg = "failed to parse snapshot_id to versionstamp,
snapshot_id=" +
+ instance->source_snapshot_id();
+ LOG(WARNING) << msg;
+ return {MetaServiceCode::INVALID_ARGUMENT, std::move(msg)};
+ }
+ std::string snapshot_ref_key = versioned::snapshot_reference_key(
+ {instance->source_instance_id(), snapshot_versionstamp,
instance_id});
+ txn->remove(snapshot_ref_key);
+ }
+ return {MetaServiceCode::OK, std::move(serialized)};
+}
+
+// Handles the instance-chain drop: all instances from the original up to and
including the tail
+// are marked DELETED atomically within the same transaction.
+static std::pair<MetaServiceCode, std::string> drop_instance_chain(
+ const std::string& tail_instance_id, Transaction* txn, InstanceInfoPB*
tail_instance) {
+ std::vector<InstanceInfoPB> predecessors;
+ if (auto [code, err] = get_predecessor_instances(txn, *tail_instance,
&predecessors);
+ code != MetaServiceCode::OK) {
+ LOG(WARNING) << "drop instance chain: " << err;
+ return {code, std::move(err)};
+ }
+
+ std::unordered_set<std::string_view> exclude_snapshot_set;
+ if (auto [code, err] = ensure_all_snapshot_recycled(txn, tail_instance_id,
std::nullopt);
+ code != MetaServiceCode::OK) {
+ return {code, std::move(err)};
+ }
+ exclude_snapshot_set.insert(tail_instance->source_snapshot_id());
+
+ auto remove_snapshot_reference_key = [&](const InstanceInfoPB& instance) {
+ Versionstamp vs;
+ if
(SnapshotManager::parse_snapshot_versionstamp(instance.source_snapshot_id(),
&vs)) {
+ txn->remove(versioned::snapshot_reference_key(
Review Comment:
**[Medium] Missing validation before insert.**
`tail_instance->source_snapshot_id()` is inserted into `exclude_snapshot_set`
without checking if the field is set and non-empty. Although in practice a
chain-tail instance should always have this set, per project coding standards,
this assumption should be enforced with a `DORIS_CHECK`:
```cpp
DORIS_CHECK(tail_instance->has_source_snapshot_id() &&
!tail_instance->source_snapshot_id().empty())
<< "chain tail instance must have source_snapshot_id, instance_id="
<< tail_instance_id;
exclude_snapshot_set.insert(tail_instance->source_snapshot_id());
```
If this assumption doesn't hold, the empty string would be inserted
silently, which is benign but sloppy.
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -2148,6 +2148,175 @@ std::pair<MetaServiceCode, std::string>
handle_snapshot_intervals(const std::str
return std::make_pair(MetaServiceCode::OK, "");
}
+static std::pair<MetaServiceCode, std::string> get_instance_info(Transaction*
txn,
+ const
std::string& instance_id,
+
InstanceInfoPB* instance) {
+ std::string instance_val;
+ TxnErrorCode err = txn->get(instance_key({instance_id}), &instance_val);
+ if (err != TxnErrorCode::TXN_OK) {
+ return {cast_as<ErrCategory::READ>(err),
+ fmt::format("failed to get instance, instance_id={}, err={}",
instance_id, err)};
+ }
+ if (!instance->ParseFromString(instance_val)) {
+ return {MetaServiceCode::PROTOBUF_PARSE_ERR,
Review Comment:
**[Low] `get_instance_info` does not distinguish `TXN_KEY_NOT_FOUND`.**
Unlike the `get_instance` function in `snapshot_manager.cpp`, this function
uses `cast_as<ErrCategory::READ>(err)` for all non-OK errors. If a predecessor
instance key is missing (e.g., due to data corruption), the error message will
include the error code but the `MetaServiceCode` won't distinguish 'not found'
from other read errors. Consider handling `TXN_KEY_NOT_FOUND` explicitly for
better diagnostics.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]