This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git
The following commit(s) were added to refs/heads/main by this push:
new c0a967365 Add jitter for scanner plugins
c0a967365 is described below
commit c0a967365e059faf6869bdda2da6ccd215155e56
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Tue Mar 24 01:00:48 2026 -0400
Add jitter for scanner plugins
Add some jitter (10% by default) to scanner plugin periods. This should help
avoid a "thundering herd" effect on the cluster if plugins are set up to start
at the same time on all the cluster nodes.
Jitter is configurable per-plugin. Config format is `$num_percent` or
`$num_$timeunit`. The `$num_percent` format configures the maximum jitter value
as a percentage of the period. For example, 10% of `repeat = 24_hours` would be
2.4 hours. The `$num_$timeunit` format is the same as the period format itself,
so it can take values like `5_min`, `2_hours`, etc.
---
rel/overlay/etc/default.ini | 5 ++
src/couch_scanner/src/couch_scanner_plugin.erl | 3 +-
src/couch_scanner/src/couch_scanner_util.erl | 69 +++++++++++++++++++++++---
src/docs/src/config/scanner.rst | 16 +++++-
4 files changed, 84 insertions(+), 9 deletions(-)
diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index 558542fed..bb017fb44 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -1127,6 +1127,11 @@ url = {{nouveau_url}}
; * $num_$timeunit: 1000_sec, 30_min, 8_hours, 24_hour, 2_days, 3_weeks,
1_month
; * $weekday: mon, monday, Thu, thursdays
;repeat = restart
+;
+; How much jitter to apply to the period. Possible formats are:
+; * $num_percent: percent of period value
+; * $num_$timeunit: 1000_sec, 30_min, 8_hours, 24_hour, 2_days, 3_weeks, 1_month
+;jitter = 10_percent
;[$plugin.skips_dbs]
; Skip over databases if their names contain any of the strings in this
section.
diff --git a/src/couch_scanner/src/couch_scanner_plugin.erl
b/src/couch_scanner/src/couch_scanner_plugin.erl
index b31ed949f..cbb42bd90 100644
--- a/src/couch_scanner/src/couch_scanner_plugin.erl
+++ b/src/couch_scanner/src/couch_scanner_plugin.erl
@@ -788,8 +788,9 @@ cfg_ddoc_batch_size() ->
schedule_time(Mod, LastSec, NowSec) ->
After = cfg(Mod, "after", "restart"),
Repeat = cfg(Mod, "repeat", "restart"),
+ Jitter = cfg(Mod, "jitter", "10_percent"),
Restart = couch_scanner_util:restart_tsec(),
- couch_scanner_util:schedule_time(NowSec, LastSec, Restart, After, Repeat).
+ couch_scanner_util:schedule_time(NowSec, LastSec, Restart, After, Repeat,
Jitter).
tsec() ->
erlang:system_time(second).
diff --git a/src/couch_scanner/src/couch_scanner_util.erl
b/src/couch_scanner/src/couch_scanner_util.erl
index a6bafe973..b414a195f 100644
--- a/src/couch_scanner/src/couch_scanner_util.erl
+++ b/src/couch_scanner/src/couch_scanner_util.erl
@@ -17,7 +17,7 @@
log/5,
ejson_map/1,
restart_tsec/0,
- schedule_time/5,
+ schedule_time/6,
load_regexes/1,
compile_regexes/1,
match_regexes/2,
@@ -60,7 +60,7 @@ consistent_hash_nodes(Item) ->
Nodes = mem3_util:live_nodes(),
hd(mem3_util:rotate_list(Item, Nodes)) =:= node().
-schedule_time(Now, Last, Restart, AfterCfg, RepeatCfg) when
+schedule_time(Now, Last, Restart, AfterCfg, RepeatCfg, JitterCfg) when
is_integer(Now), is_integer(Restart), is_integer(Last)
->
RepeatPeriod = repeat_period(Now, Last, parse_repeat(RepeatCfg)),
@@ -77,14 +77,47 @@ schedule_time(Now, Last, Restart, AfterCfg, RepeatCfg) when
{After, undefined} when is_integer(After), Last < After ->
% Run once, haven't run yet, schedule to run
max(Now, After);
- {undefined, Period} ->
+ {undefined, Period} when is_integer(Period) ->
% No after time, just period. Either need to wait
% since last time it ran, or is actually ready to run
- max(Now, Last + Period);
- {After, Period} ->
+ Jitter = rand:uniform(jitter(JitterCfg, Period)),
+ max(Now, Last + Period + Jitter);
+ {After, Period} when is_integer(After), is_integer(Period) ->
% Both after time set and a period. Wait for whichever
% takes the longest
- lists:max([Now, After, Last + Period])
+ Jitter = rand:uniform(jitter(JitterCfg, Period)),
+ lists:max([Now, After, Last + Period + Jitter])
+ end.
+
+% Parse jitter configuration as number of seconds.
+%
+% JitterCfg formats can be:
+% N_percent : where N is a value 0-100 and then it returns N% of Period
+% N_Unit : where N is a number and Unit is any unit (parse_period_unit/1 can parse)
+%
+% Result will always be in the range of [1, Period] seconds.
+%
+jitter(JitterCfg, Period) when is_integer(Period), Period > 0 ->
+ try string:split(JitterCfg, "_") of
+ [PctStr, "percent"] ->
+ try list_to_integer(PctStr) of
+ Pct ->
+ Val = round(Period * Pct / 100),
+ max(1, min(Period, Val))
+ catch
+ _:_ ->
+ 1
+ end;
+ [_, _] ->
+ case parse_non_weekday_period(JitterCfg) of
+ undefined -> 1;
+ Val when is_integer(Val), Val > 0 -> min(Period, Val)
+ end;
+ _ ->
+ 1
+ catch
+ _:_ ->
+ 1
end.
load_regexes(KVs) when is_list(KVs) ->
@@ -342,6 +375,30 @@ repeat_period_test() ->
?assertEqual(?WEEK, repeat_period(Now, Now - 1, {weekday, 5})),
?assertEqual(1 * ?DAY, repeat_period(Now, Now - 999999, {weekday, 6})).
+jitter_test() ->
+ ?assertEqual(1, jitter("foo", 1)),
+ ?assertEqual(1, jitter(undefined, 1)),
+ ?assertEqual(1, jitter("", 1)),
+ ?assertEqual(1, jitter("_", 1)),
+ ?assertEqual(1, jitter("1_", 1)),
+ ?assertEqual(1, jitter("_percent", 1)),
+ ?assertEqual(1, jitter("1", 1)),
+ ?assertEqual(1, jitter("X_percent", 1)),
+ ?assertEqual(1, jitter("Z_seconds", 1)),
+ ?assertEqual(1, jitter("1_percent_years", 1)),
+ ?assertEqual(1, jitter("0_percent", 1)),
+ ?assertEqual(1, jitter("50_percent", 1)),
+ ?assertEqual(1, jitter("100_percent", 1)),
+ ?assertEqual(1, jitter("100000000000_percent", 1)),
+ ?assertEqual(1, jitter("1_sec", 1)),
+ ?assertEqual(1, jitter("2_sec", 1)),
+ ?assertEqual(2, jitter("2_sec", 2)),
+ ?assertEqual(2, jitter("2_sec", 3)),
+ ?assertEqual(50, jitter("50_percent", 100)),
+ ?assertEqual(100, jitter("100_percent", 100)),
+ ?assertEqual(100, jitter("10000000000_percent", 100)),
+ ?assertEqual(100, jitter("10000000000_years", 100)).
+
regex_compile_test() ->
KVs = [{"x", "a[d-f]"}, {"y", "**"}],
Regexes = load_regexes(KVs),
diff --git a/src/docs/src/config/scanner.rst b/src/docs/src/config/scanner.rst
index cb49acd6f..98fe2e67c 100644
--- a/src/docs/src/config/scanner.rst
+++ b/src/docs/src/config/scanner.rst
@@ -132,8 +132,8 @@ settings in their ``[{plugin}]`` section.
.. config:option:: repeat
- Run the plugin periodically. By default it will run once after node the
- node starts. Possible period formats are: ``{num}_{timeunit}`` (ex.:
+ Run the plugin periodically. By default it will run once after the node
+ starts. Possible period formats are: ``{num}_{timeunit}`` (ex.:
``1000_sec``, ``30_min``, ``8_hours``, ``24_hour``, ``2_days``,
``3_weeks``, ``1_month``) or ``{weekday}`` (ex.: ``mon``, ``monday``,
``Thu``, etc.) ::
@@ -141,6 +141,18 @@ settings in their ``[{plugin}]`` section.
[{plugin}]
repeat = restart
+ .. config:option:: jitter
+
+ How much jitter to apply to the period. Jitter can spread the load on
+ the cluster by adding some randomness to when the plugins start.
+ Possible formats are ``{num}_percent`` (ex.: ``25_percent``) or
+ ``{num}_{timeunit}`` (ex.: ``1000_sec``, ``30_min``, ``8_hours``,
+ ``24_hour``, ``2_days``). The default is ``10_percent``, which means
+ 10% of the period value ::
+
+ [{plugin}]
+ jitter = 10_percent
+
.. config:section:: {plugin}.skip_dbs :: Skip databases
.. config:option:: {tag}