davisp commented on a change in pull request #3364:
URL: https://github.com/apache/couchdb/pull/3364#discussion_r585075990



##########
File path: src/couch_replicator/src/couch_replicator_share.erl
##########
@@ -0,0 +1,805 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+% This module implements the "Fair Share" algorithm by Judy Kay and Piers
+% Lauder [1] and applies it to the scheduling of replication jobs.
+%
+% The main idea is _replicator dbs can have a configurable number of "shares"
+% assigned to them. Shares is an abstract quantity from 1 to 1000. The default
+% is 100. Jobs from _replicator databases with more shares get proportionally a
+% higher chance to run than those from databases with a lower number of shares.
+%
+% Every scheduler cycle running jobs are "charged" based on how much time they
+% spent running during that cycle. At the end of the cycle the accumulated
+% charges for each job, the number of shares configured, and the total number
+% of jobs in the pending queue from the same _replicator db, are used to
+% calculate new priority values for all the jobs. To match the algorithm from
+% the paper, jobs with lower priority values are the ones at the front of the
+% run queue and have a higher chance of running.
+%
+% Here is how charges, shares, and number of sibling jobs affect the
+% priority value:
+%
+%   1) Jobs from dbs with higher configured shares get assigned lower
+%   priority values and so stay closer to the front of the queue.
+%
+%   2) Jobs from dbs with many other jobs (many siblings) get assigned a
+%   higher priority value, so they get pushed further down the queue
+%   and have a lower chance of running.
+%
+%   3) Jobs which run longer accumulate more charges and get assigned a
+%   higher priority value and get to wait longer to run.
+%
+% In order to prevent job starvation, all job priorities are periodically
+% decayed (decreased). This effectively moves all the jobs towards the front of
+% the run queue. So, in effect, there are two competing processes: one
+% uniformly moves all jobs to the front, and the other throws them back in
+% proportion to those factors mentioned above. The speed of this uniform
+% priority decay is controlled by the priority_coeff parameter.
+%
+% In order to prevent jobs from low shares dbs from "cheating" by getting
+% deleted and immediately re-added, charges are accumulated using a
+% historically decayed usage value. The speed of the usage decay is controlled
+% by the `usage_coeff = 0.5` parameter.
+%
+% [1] : https://proteusmaster.urcf.drexel.edu/urcfwiki/images/KayLauderFairShare.pdf
+
+
+-module(couch_replicator_share).
+
+-export([
+    init/0,
+    clear/0,
+    update_shares/2,
+    reset_shares/1,
+    job_added/1,
+    job_removed/1,
+    update/3,
+    priority/1,
+    charge/3
+]).
+
+
+-include_lib("couch/include/couch_db.hrl").
+-include("couch_replicator.hrl").
+
+
+% Usage coefficient decays historic usage every scheduling cycle. For example,
+% the usage value for a job running 1 minute is 60000000 (i.e. microseconds /
+% minute), then if the job stops running it will take about 26 cycles (minutes)
+% for it to decay to 0 and the system to "forget" about it completely:
+%
+%  trunc(60000000 * math:pow(0.5, 26)) = 0
+%
+-define(DEFAULT_USAGE_COEFF, 0.5).
+
+% Priority coefficient decays all the job priorities such that they slowly
+% drift towards the front of the run queue. This coefficient defines a maximum
+% time window over which this algorithm would operate. For example, if this
+% value is too small (0.1), after a few cycles quite a few jobs would end up at
+% priority 0, and would render this algorithm useless. The default value of
+% 0.98 is picked such that if a job ran for one scheduler cycle, then didn't
+% get to run for 7 hours, it would still have priority > 0. 7 hours was picked
+% as it was close enough to 8 hours, which is the default maximum error backoff
+% interval.
+%
+% Example calculation:
+%   shares = 100
+%   usage after 1 minute cycle run = 60000000
+%   initial priority =  60000000 / (100 * 100) = 6000
+%   trunc(6000 * math:pow(0.98, 431)) = 0
+%   431 / 60 ~= 7 hrs
+%
+-define(DEFAULT_PRIORITY_COEFF, 0.98).
+
+
+-define(MIN_SHARES, 1).
+-define(MAX_SHARES, 1000).
+-define(DEFAULT_SHARES, 100).
+
+-define(SHARES, couch_replicator_shares).
+-define(PRIORITIES, couch_replicator_priorities).
+-define(USAGE, couch_replicator_usage).
+-define(CHARGES, couch_replicator_stopped_usage).
+-define(NUM_JOBS, couch_replicator_num_jobs).
+
+
+init() ->
+    EtsOpts = [named_table, public],
+    ?SHARES = ets:new(?SHARES, EtsOpts), % {Key, Shares}
+    ?PRIORITIES = ets:new(?PRIORITIES, EtsOpts), % {JobId, Priority}
+    ?USAGE = ets:new(?USAGE, EtsOpts), % {Key, Usage}
+    ?CHARGES = ets:new(?CHARGES, EtsOpts), % {Key, Charges}
+    ?NUM_JOBS = ets:new(?NUM_JOBS, EtsOpts), % {Key, NumJobs}
+    lists:foreach(fun({K, V}) -> update_shares(K, V) end, get_config_shares()).
+
+
+clear() ->
+    Tables = [?SHARES, ?PRIORITIES, ?USAGE, ?CHARGES, ?NUM_JOBS],
+    lists:foreach(fun(T) -> catch ets:delete(T) end, Tables).
+
+
+% This should be called when user updates the replicator.shares config section
+%
+update_shares(Key, Shares) when is_integer(Shares) ->
+    ets:insert(?SHARES, {Key, bounded(Shares, ?MIN_SHARES, ?MAX_SHARES)}).
+
+
+% Called when the config value is deleted and shares are reset to the default
+% value.
+reset_shares(Key) ->
+    ets:delete(?SHARES, Key).
+
+
+job_added(#job{} = Job) ->
+    Key = key(Job),
+    % If the entry is not present {Key, 0} is used as the default

Review comment:
       Technically, I think the 0 is ignored and the default value is the 
increment. I dunno if that's important enough of a distinction or not.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to