This is an automated email from the ASF dual-hosted git repository.
wangdan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
The following commit(s) were added to refs/heads/master by this push:
new f5e1b8130 fix(duplication): failed to start duplication after source
replica servers exit due to missing duplicating status in .app-info (#1608)
f5e1b8130 is described below
commit f5e1b81302c4fe85566f7c7872ac2007d0f16d91
Author: ninsmiracle <[email protected]>
AuthorDate: Wed Apr 3 11:11:42 2024 +0800
fix(duplication): failed to start duplication after source replica servers
exit due to missing duplicating status in .app-info (#1608)
https://github.com/apache/incubator-pegasus/issues/1595
Currently, after a duplication was added, `duplicating` status would be
only updated
for in-memory state. However, once some replica servers exited and were
started again,
`duplicating` status would be lost since it had not been persisted into
`.app-info` before.
Consequently, plog files would be removed by GC, which lead to failed
assertion for decree
in `replica_duplicator::verify_start_decree()`.
To resolve this problem, `duplicating` should be updated for both in-memory
and on-disk
states. Then, `duplicating` status would be loaded correctly from
`.app-info` file at the loading
stage of replicas.
---
src/replica/duplication/duplication_sync_timer.cpp | 29 +++++++++++++++++-----
.../duplication/replica_duplicator_manager.h | 2 +-
src/replica/replica.h | 2 ++
src/replica/replica_config.cpp | 19 ++++++++++++++
4 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/src/replica/duplication/duplication_sync_timer.cpp
b/src/replica/duplication/duplication_sync_timer.cpp
index 58d4e2ab7..d7254c939 100644
--- a/src/replica/duplication/duplication_sync_timer.cpp
+++ b/src/replica/duplication/duplication_sync_timer.cpp
@@ -116,19 +116,37 @@ void
duplication_sync_timer::on_duplication_sync_reply(error_code err,
void duplication_sync_timer::update_duplication_map(
const std::map<int32_t, std::map<int32_t, duplication_entry>> &dup_map)
{
- for (const replica_ptr &r : _stub->get_all_replicas()) {
+ for (replica_ptr &r : _stub->get_all_replicas()) {
auto dup_mgr = r->get_duplication_manager();
if (!dup_mgr) {
continue;
}
const auto &it = dup_map.find(r->get_gpid().get_app_id());
- if (it == dup_map.end()) {
- // no duplication is assigned to this app
+
+ // TODO(wangdan): at meta server side, an app is considered duplicating
+ // as long as any duplication of this app has valid status(i.e.
+ // duplication_info::is_invalid_status() returned false, see
+ // meta_duplication_service::refresh_duplicating_no_lock()). And
duplications
+ // in duplication_sync_response returned by meta server could also be
+ // considered duplicating according to
meta_duplication_service::duplication_sync().
+ // Thus we could update `duplicating` in both memory and
file(.app-info).
+ //
+ // However, most of properties of an app(struct `app_info`) are
written to .app-info
+ // file in replica::on_config_sync(), such as max_replica_count; on
the other hand,
+ // in-memory `duplicating` is also updated in
replica::on_config_proposal(). Thus we'd
+ // better think about a unique entrance to update `duplicating`(in
both memory and disk),
+ // rather than update them at anywhere.
+ const auto duplicating = it != dup_map.end();
+ r->update_app_duplication_status(duplicating);
+
+ if (!duplicating) {
+ // No duplication is assigned to this app.
dup_mgr->update_duplication_map({});
- } else {
- dup_mgr->update_duplication_map(it->second);
+ continue;
}
+
+ dup_mgr->update_duplication_map(it->second);
}
}
@@ -198,6 +216,5 @@ duplication_sync_timer::get_dup_states(int app_id, /*out*/
bool *app_found)
}
return result;
}
-
} // namespace replication
} // namespace dsn
diff --git a/src/replica/duplication/replica_duplicator_manager.h
b/src/replica/duplication/replica_duplicator_manager.h
index 55fc92313..51bcbd1e1 100644
--- a/src/replica/duplication/replica_duplicator_manager.h
+++ b/src/replica/duplication/replica_duplicator_manager.h
@@ -53,7 +53,7 @@ public:
// - the app is not assigned with duplication (dup_map.empty())
void update_duplication_map(const std::map<int32_t, duplication_entry>
&new_dup_map)
{
- if (_replica->status() != partition_status::PS_PRIMARY ||
new_dup_map.empty()) {
+ if (new_dup_map.empty() || _replica->status() !=
partition_status::PS_PRIMARY) {
remove_all_duplications();
return;
}
diff --git a/src/replica/replica.h b/src/replica/replica.h
index 8b6fe6b16..ab9e451a4 100644
--- a/src/replica/replica.h
+++ b/src/replica/replica.h
@@ -251,6 +251,8 @@ public:
_is_duplication_plog_checking.store(checking);
}
+ void update_app_duplication_status(bool duplicating);
+
//
// Backup
//
diff --git a/src/replica/replica_config.cpp b/src/replica/replica_config.cpp
index 580d82bbc..386a03e72 100644
--- a/src/replica/replica_config.cpp
+++ b/src/replica/replica_config.cpp
@@ -1176,5 +1176,24 @@ error_code replica::update_init_info_ballot_and_decree()
return _app->update_init_info_ballot_and_decree(this);
}
+void replica::update_app_duplication_status(bool duplicating)
+{
+ if (duplicating == _app_info.duplicating) {
+ return;
+ }
+
+ auto old_duplicating = _app_info.duplicating;
+ _app_info.__set_duplicating(duplicating);
+
+ CHECK_EQ_PREFIX_MSG(store_app_info(_app_info),
+ ERR_OK,
+ "store_app_info for duplicating failed: app_name={}, "
+ "app_id={}, old_duplicating={}, new_duplicating={}",
+ _app_info.app_name,
+ _app_info.app_id,
+ old_duplicating,
+ _app_info.duplicating);
+}
+
} // namespace replication
} // namespace dsn
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]