github-actions[bot] commented on code in PR #63217:
URL: https://github.com/apache/doris/pull/63217#discussion_r3235874966
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -325,6 +325,96 @@ static int find_cascade_instances(TxnKv* txn_kv, const
std::string& root_instanc
return 0;
}
+static int find_storage_vault_position_by_id(const InstanceInfoPB& instance,
+ std::string_view vault_id) {
+ auto id_itr =
+ std::find(instance.resource_ids().begin(),
instance.resource_ids().end(), vault_id);
+ if (id_itr == instance.resource_ids().end()) {
+ return -1;
+ }
+ return static_cast<int>(id_itr - instance.resource_ids().begin());
+}
+
+static int find_storage_vault_id_by_name(const InstanceInfoPB& instance,
+ std::string_view vault_name,
std::string* vault_id) {
+ auto name_itr = std::find_if(
+ instance.storage_vault_names().begin(),
instance.storage_vault_names().end(),
+ [&](const auto& current_name) { return current_name == vault_name;
});
+ if (name_itr == instance.storage_vault_names().end()) {
+ return -1;
+ }
+ int pos = static_cast<int>(name_itr -
instance.storage_vault_names().begin());
+ *vault_id = instance.resource_ids().Get(pos);
+ return 0;
+}
+
+static int alter_instance_obj_store_info_by_id(InstanceInfoPB& instance,
+ std::string_view target_obj_id,
std::string_view ak,
+ std::string_view sk,
std::string_view role_arn,
+ std::string_view external_id,
+ const EncryptionInfoPB&
encryption_info,
+ MetaServiceCode& code,
std::string& msg) {
+ auto& obj_info =
const_cast<std::decay_t<decltype(instance.obj_info())>&>(instance.obj_info());
+ for (auto& it : obj_info) {
+ if (it.id() != target_obj_id) {
+ continue;
+ }
+
+ if (role_arn.empty()) {
+ if (it.ak() == ak && it.sk() == sk) {
+ code = MetaServiceCode::OK;
+ msg = "ak/sk not changed";
+ return 1;
+ }
+ it.clear_role_arn();
+ it.clear_external_id();
+ it.clear_cred_provider_type();
+
+ it.set_ak(std::string(ak));
+ it.set_sk(std::string(sk));
+ it.mutable_encryption_info()->CopyFrom(encryption_info);
+ } else {
+ if (!ak.empty() || !sk.empty()) {
+ code = MetaServiceCode::INVALID_ARGUMENT;
+ msg = "invaild argument, both set ak/sk and role_arn is not
allowed";
+ LOG(INFO) << msg;
+ return -1;
+ }
+
+ if (it.provider() != ObjectStoreInfoPB::S3) {
+ code = MetaServiceCode::INVALID_ARGUMENT;
+ msg = "role_arn is only supported for s3 provider";
+ LOG(INFO) << msg << " provider=" << it.provider();
+ return -1;
+ }
+
+ if (it.role_arn() == role_arn && it.external_id() == external_id) {
+ code = MetaServiceCode::OK;
+ msg = "ak/sk not changed";
+ return 1;
+ }
+ it.clear_ak();
+ it.clear_sk();
+ it.clear_encryption_info();
+
+ it.set_role_arn(std::string(role_arn));
+ it.set_external_id(std::string(external_id));
+ it.set_cred_provider_type(CredProviderTypePB::INSTANCE_PROFILE);
+ }
Review Comment:
This drops the requested credential provider type for `ALTER_OBJ_INFO` with
`role_arn`. The old code used `get_cred_provider_type(request->obj())` both
when checking for no-op and when storing the new value, so callers could set a
non-default provider. After this refactor every role_arn update is persisted as
`INSTANCE_PROFILE`, and a request that only changes `cred_provider_type` is
treated as unchanged by the earlier `role_arn/external_id` check. Please pass
the requested provider type into this helper and include it in the no-op
comparison.
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -1494,6 +1644,115 @@ void
MetaServiceImpl::alter_storage_vault(google::protobuf::RpcController* contr
code = cast_as<ErrCategory::COMMIT>(err);
msg = fmt::format("failed to commit kv txn, err={}", err);
LOG(WARNING) << msg;
+ return;
+ }
+
+ async_notify_refresh_instance(txn_kv_, instance_id, true);
+
+ if (!supports_cascade) {
+ return;
+ }
+
+ if (!instance.has_snapshot_switch_status() ||
+ instance.snapshot_switch_status() == SNAPSHOT_SWITCH_DISABLED) {
+ LOG(INFO) << "snapshot disabled for instance_id=" << instance_id
+ << ", skip cascade updating derived instances after
alter_storage_vault";
+ return;
+ }
+
+ std::vector<std::string> cascade_instance_ids;
+ if (find_cascade_instances(txn_kv_.get(), instance_id,
&cascade_instance_ids) != 0) {
+ LOG(WARNING) << "failed to find derived instances for storage vault
cascade, instance_id="
+ << instance_id;
+ return;
+ }
+
+ for (const auto& cascade_id : cascade_instance_ids) {
+ std::unique_ptr<Transaction> cascade_txn;
+ TxnErrorCode cascade_err = txn_kv_->create_txn(&cascade_txn);
+ if (cascade_err != TxnErrorCode::TXN_OK) {
+ code = cast_as<ErrCategory::CREATE>(cascade_err);
+ msg = fmt::format(
+ "failed to create txn for derived storage vault update,
instance_id={}, "
+ "err={}",
Review Comment:
This failure is returned after the root storage-vault transaction has
already committed above. If any derived-instance transaction
creation/read/commit fails, the client receives an error even though the parent
vault was durably changed. A retry is not idempotent: for a rename the parent
no longer has `request->vault().name()` so the retry fails before cascading,
and for credential-only changes the parent may be considered already updated
while children remain stale. Meta-service RPCs must be retry-safe for
retryable/maybe-committed KV errors; either make the cascade idempotently
resumable or do not report the post-root best-effort cascade failure as the RPC
failure.
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -1727,6 +1941,105 @@ void
MetaServiceImpl::alter_obj_store_info(google::protobuf::RpcController* cont
code = cast_as<ErrCategory::COMMIT>(err);
msg = fmt::format("failed to commit kv txn, err={}", err);
LOG(WARNING) << msg;
+ return;
+ }
+
+ async_notify_refresh_instance(txn_kv_, instance_id, true);
+
+ if (!supports_cascade) {
+ return;
+ }
+
+ if (!instance.has_snapshot_switch_status() ||
+ instance.snapshot_switch_status() == SNAPSHOT_SWITCH_DISABLED) {
+ LOG(INFO) << "snapshot disabled for instance_id=" << instance_id
+ << ", skip cascade updating derived instances after
alter_obj_store_info";
+ return;
+ }
+
+ std::vector<std::string> cascade_instance_ids;
+ if (find_cascade_instances(txn_kv_.get(), instance_id,
&cascade_instance_ids) != 0) {
+ LOG(WARNING) << "failed to find derived instances for obj store
cascade, instance_id="
+ << instance_id;
+ return;
+ }
+
+ for (const auto& cascade_id : cascade_instance_ids) {
+ std::unique_ptr<Transaction> cascade_txn;
+ TxnErrorCode cascade_err = txn_kv_->create_txn(&cascade_txn);
+ if (cascade_err != TxnErrorCode::TXN_OK) {
+ code = cast_as<ErrCategory::CREATE>(cascade_err);
+ msg = fmt::format(
+ "failed to create txn for derived obj store update,
instance_id={}, err={}",
+ cascade_id, cascade_err);
Review Comment:
Same partial-commit problem here for `alter_obj_store_info`: the root
instance has already committed before this cascade loop, but a derived-instance
failure overwrites the RPC status with an error. Retrying cannot reliably
repair it because the root update may now be a no-op and returns before
reaching the cascade path, leaving derived instances stale while the client
sees failures. Please make this path idempotently resumable or keep post-root
cascade failures from changing the committed root operation result.
##########
cloud/src/meta-service/meta_service_resource.cpp:
##########
@@ -1849,9 +2162,11 @@ void
MetaServiceImpl::update_ak_sk(google::protobuf::RpcController* controller,
std::unique_ptr<Transaction> cascade_txn;
TxnErrorCode cascade_err = txn_kv_->create_txn(&cascade_txn);
if (cascade_err != TxnErrorCode::TXN_OK) {
- LOG(WARNING) << "failed to create txn for derived instance,
instance_id=" << cascade_id
- << " err=" << cascade_err;
- continue;
+ code = cast_as<ErrCategory::CREATE>(cascade_err);
+ msg = fmt::format("failed to create txn for derived instance,
instance_id={}, err={}",
+ cascade_id, cascade_err);
Review Comment:
Changing the existing best-effort `continue` to `return` makes
`update_ak_sk` report failure after the root instance has already committed. On
retry, `update_instance_ak_sk` can reject the root as `ak sk eq original`, so
the request cannot complete the remaining derived instances and the client
observes a failure for a partially committed operation. This violates the
meta-service retry/idempotency contract; please either preserve best-effort
behavior or make the cascade retry path able to resume after the root update is
already applied.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]