On 6/11/21 5:42 PM, anton.iva...@cambridgegreys.com wrote: > From: Anton Ivanov <anton.iva...@cambridgegreys.com> > > Set a soft time limit of "raft election timer"/2 on ovsdb > processing. > > This improves behaviour in large heavily loaded clusters. > While it cannot fully eliminate spurious raft elections > under heavy load, it significantly decreases their number. > > Processing is (to the extent possible) restarted where it > stopped on the previous iteration to ensure that sessions > towards the tail of the session list are not starved. > > Signed-off-by: Anton Ivanov <anton.iva...@cambridgegreys.com> > ---
Hey, Anton. Thanks for the patch! This is not a complete review, but a few things that I noticed. See inline. Best regards, Ilya Maximets. > ovsdb/jsonrpc-server.c | 80 +++++++++++++++++++++++++++++++++++++++--- > ovsdb/jsonrpc-server.h | 2 +- > ovsdb/ovsdb-server.c | 15 +++++++- > ovsdb/raft.c | 5 +++ > ovsdb/raft.h | 3 ++ > ovsdb/storage.c | 8 +++++ > ovsdb/storage.h | 2 ++ > 7 files changed, 109 insertions(+), 6 deletions(-) > > diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c > index 4e2dfc3d7..84e0f69b5 100644 > --- a/ovsdb/jsonrpc-server.c > +++ b/ovsdb/jsonrpc-server.c > @@ -60,7 +60,8 @@ static struct ovsdb_jsonrpc_session > *ovsdb_jsonrpc_session_create( > struct ovsdb_jsonrpc_remote *, struct jsonrpc_session *, bool); > static void ovsdb_jsonrpc_session_preremove_db(struct ovsdb_jsonrpc_remote *, > struct ovsdb *); > -static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *); > +static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *, > + uint64_t limit); > static void ovsdb_jsonrpc_session_wait_all(struct ovsdb_jsonrpc_remote *); > static void ovsdb_jsonrpc_session_get_memory_usage_all( > const struct ovsdb_jsonrpc_remote *, struct simap *usage); > @@ -128,6 +129,8 @@ struct ovsdb_jsonrpc_server { > bool read_only; /* This server is does not accept any > transactions that can modify the database. > */ > struct shash remotes; /* Contains "struct ovsdb_jsonrpc_remote *"s. > */ > + struct ovsdb_jsonrpc_remote *skip_to; > + bool yield_immediately; 'yield' doesn't seem to be a right word here. Maybe 'wake_up' or something similar? Also, both fields above needs a comment. OTOH, do we really need this filed? I mean, if we didn't process some session, shouldn't next session_wait() wake us up? > }; > > /* A configured remote. This is either a passive stream listener plus a list > @@ -137,6 +140,7 @@ struct ovsdb_jsonrpc_remote { > struct ovsdb_jsonrpc_server *server; > struct pstream *listener; /* Listener, if passive. */ > struct ovs_list sessions; /* List of "struct ovsdb_jsonrpc_session"s. > */ > + struct ovsdb_jsonrpc_session *skip_to; > uint8_t dscp; > bool read_only; > char *role; > @@ -159,6 +163,8 @@ ovsdb_jsonrpc_server_create(bool read_only) > ovsdb_server_init(&server->up); > shash_init(&server->remotes); > server->read_only = read_only; > + server->yield_immediately = false; > + server->skip_to = NULL; > return server; > } > > @@ -279,6 +285,7 @@ ovsdb_jsonrpc_server_add_remote(struct > ovsdb_jsonrpc_server *svr, > remote->dscp = options->dscp; > remote->read_only = options->read_only; > remote->role = nullable_xstrdup(options->role); > + remote->skip_to = NULL; > shash_add(&svr->remotes, name, remote); > > if (!listener) { > @@ -378,12 +385,26 @@ ovsdb_jsonrpc_server_set_read_only(struct > ovsdb_jsonrpc_server *svr, > } > > void > -ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr) > +ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *svr, uint64_t limit) > { > struct shash_node *node; > + uint64_t elapsed = 0, start_time = 0; > + > + if (limit) { > + start_time = time_now(); Why this function uses time_now() while others are using time_msec() ? time_now() returns seconds while 'limit' is in milliseconds. > + } > + > + svr->yield_immediately = false; > > SHASH_FOR_EACH (node, &svr->remotes) { > struct ovsdb_jsonrpc_remote *remote = node->data; > + if (svr->skip_to) { > + if (remote != svr->skip_to) { > + continue; What if 'skip_to' is already removed from the list? We will, probably, never process any remotes again. Also, we didn't process first N remotes here and we're not setting 'yield_immediately'. This is inconsistent, at least. But, yes, it's unclear if 'yield_immediately' needed at all. > + } else { > + svr->skip_to = NULL; > + } > + } > > if (remote->listener) { > struct stream *stream; > @@ -403,7 +424,16 @@ ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server > *svr) > } > } > > - ovsdb_jsonrpc_session_run_all(remote); > + ovsdb_jsonrpc_session_run_all(remote, limit - elapsed); > + > + if (limit) { > + elapsed = start_time - time_now(); 'start_time' is always less than time_now(), so 'elapsed' is either a zero or a huge unsigned integer. So, limit here will never work. > + if (elapsed >= limit) { > + svr->yield_immediately = true; > + svr->skip_to = remote; > + break; > + } > + } > } > } > > @@ -412,6 +442,12 @@ ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server > *svr) > { > struct shash_node *node; > > + if (svr->yield_immediately) { > + poll_immediate_wake(); > + svr->yield_immediately = false; > + return; > + } > + > SHASH_FOR_EACH (node, &svr->remotes) { > struct ovsdb_jsonrpc_remote *remote = node->data; > > @@ -583,15 +619,51 @@ ovsdb_jsonrpc_session_set_options(struct > ovsdb_jsonrpc_session *session, > } > > static void > -ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote) > +ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote, > + uint64_t limit) > { > struct ovsdb_jsonrpc_session *s, *next; > + uint64_t start_time; > + > + if (limit) { > + start_time = time_msec(); > + } > > LIST_FOR_EACH_SAFE (s, next, node, &remote->sessions) { > + if ((remote->skip_to) && (s != remote->skip_to)) { > + /* processing was interrupted, we skip to the point > + * where we had to interrupt it > + * we cannot use the _CONTINUE macro as it is not safe > + * if the list has been changed in the meantime. > + */ > + continue; Same comment here. If 'skip_to' is already removed from the list, we will never process any session again. > + } > + > + /* set next as skip point if we need to restart processing */ > + remote->skip_to = next; > + > int error = ovsdb_jsonrpc_session_run(s); > + > if (error) { > ovsdb_jsonrpc_session_close(s); > } > + > + if (limit) { > + if (time_msec() - start_time > limit) { > + /* we bail leaving skip_to set. Next processing iteration > + * will skip everything up to skip_to > + */ > + break; > + } else { > + /* We are within time constraints, zero > + * the session we need to skip to in order to > + * restart processing. > + */ > + remote->skip_to = NULL; > + } > + } else { > + remote->skip_to = NULL; > + } > } > } > > diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h > index e0653aa39..218152e9d 100644 > --- a/ovsdb/jsonrpc-server.h > +++ b/ovsdb/jsonrpc-server.h > @@ -67,7 +67,7 @@ void ovsdb_jsonrpc_server_free_remote_status( > void ovsdb_jsonrpc_server_reconnect(struct ovsdb_jsonrpc_server *, bool > force, > char *comment); > > -void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *); > +void ovsdb_jsonrpc_server_run(struct ovsdb_jsonrpc_server *, uint64_t limit); > void ovsdb_jsonrpc_server_wait(struct ovsdb_jsonrpc_server *); > > void ovsdb_jsonrpc_server_set_read_only(struct ovsdb_jsonrpc_server *, > diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c > index b09232c65..97bfa05b2 100644 > --- a/ovsdb/ovsdb-server.c > +++ b/ovsdb/ovsdb-server.c > @@ -184,6 +184,7 @@ main_loop(struct server_config *config, > char *remotes_error, *ssl_error; > struct shash_node *node; > long long int status_timer = LLONG_MIN; > + uint64_t limit = 0; I think, it might save a few lines of code if initialized with UINT64_MAX. > > *exiting = false; > ssl_error = NULL; > @@ -215,7 +216,7 @@ main_loop(struct server_config *config, > reconfigure_remotes(jsonrpc, all_dbs, remotes), > &remotes_error); > report_error_if_changed(reconfigure_ssl(all_dbs), &ssl_error); > - ovsdb_jsonrpc_server_run(jsonrpc); > + ovsdb_jsonrpc_server_run(jsonrpc, limit); > > if (*is_backup) { > replication_run(); > @@ -228,8 +229,20 @@ main_loop(struct server_config *config, > struct shash_node *next; > SHASH_FOR_EACH_SAFE (node, next, all_dbs) { > struct db *db = node->data; > + uint64_t db_limit = 0; > + > ovsdb_txn_history_run(db->db); > ovsdb_storage_run(db->db->storage); > + > + db_limit = max_processing_time(db->db->storage); > + if (db_limit) { > + if (!limit) { > + limit = db_limit; > + } else { > + limit = MIN(db_limit, limit); > + } > + } It doesn't sound right that we're calculating a limit here inside the loop. If we have more than one database, we need to calculate a minimum for all databases and use it later, otherwise the database with higher election timer might take more time than a limit for another database. Also, limit is global and never reset. If election timer will be changed to higher value, limit will stay low. > + > read_db(config, db); > /* Run triggers after storage_run and read_db to make sure new > raft > * updates are utilized in current iteration. */ > diff --git a/ovsdb/raft.c b/ovsdb/raft.c > index e06c1f1ab..c473e5d32 100644 > --- a/ovsdb/raft.c > +++ b/ovsdb/raft.c > @@ -407,6 +407,11 @@ raft_make_address_passive(const char *address_) > } > } > > +uint64_t raft_election_timer_value(const struct raft *raft) > +{ > + return raft->election_timer; > +} > + > static struct raft * > raft_alloc(void) > { > diff --git a/ovsdb/raft.h b/ovsdb/raft.h > index 3545c41c2..1d270ec0c 100644 > --- a/ovsdb/raft.h > +++ b/ovsdb/raft.h > @@ -188,4 +188,7 @@ void raft_take_leadership(struct raft *); > void raft_transfer_leadership(struct raft *, const char *reason); > > const struct uuid *raft_current_eid(const struct raft *); > + > +uint64_t raft_election_timer_value(const struct raft *); > + > #endif /* lib/raft.h */ > diff --git a/ovsdb/storage.c b/ovsdb/storage.c > index 40415fcf6..a2142851a 100644 > --- a/ovsdb/storage.c > +++ b/ovsdb/storage.c > @@ -640,3 +640,11 @@ ovsdb_storage_peek_last_eid(struct ovsdb_storage > *storage) > } > return raft_current_eid(storage->raft); > } > + > +uint64_t max_processing_time(struct ovsdb_storage *storage) Name of the function should start from a new line since it's not just a prototype. > +{ > + if (!storage->raft) { > + return UINT64_MAX; > + } > + return raft_election_timer_value(storage->raft) / 2; > +} > diff --git a/ovsdb/storage.h b/ovsdb/storage.h > index 02b6e7e6c..9e195bbe8 100644 > --- a/ovsdb/storage.h > +++ b/ovsdb/storage.h > @@ -97,4 +97,6 @@ struct ovsdb_schema *ovsdb_storage_read_schema(struct > ovsdb_storage *); > > const struct uuid *ovsdb_storage_peek_last_eid(struct ovsdb_storage *); > > +uint64_t max_processing_time(struct ovsdb_storage *); > + > #endif /* ovsdb/storage.h */ > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev