This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch handle-upgrade-case-for-instance-start-time in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 3b18d04cfc5eb723bd514ebc4659d3fd72842b51 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Wed May 24 23:34:45 2023 -0400 Handle replicator instance start time during upgrades better During cluster upgrades from 3.2 to 3.3, when the instance start time switched from being always `0` to an actual timestamp, replication jobs will crash when endpoints are upgraded. Replication jobs were started when the endpoint emitted a `0`, and then it becomes a non-`0` value, which will crash the next checkpoint attempt. After the crash, jobs will restart and continue fine where they left off without rewinding. However, they will make a logging mess while they crash. All four workers will exit with the `{checkpoint_commit_failure,...}` error. This commit makes the checkpoint ignore mismatches if one of the instance start times is 0. --- .../src/couch_replicator_scheduler_job.erl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl b/src/couch_replicator/src/couch_replicator_scheduler_job.erl index e16412e4a..cd751d8f2 100644 --- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl +++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl @@ -785,9 +785,9 @@ do_checkpoint(State) -> current_through_seq = {_Ts, NewSeq} = NewTsSeq, source_log = SourceLog, target_log = TargetLog, - rep_starttime = ReplicationStartTime, - src_starttime = SrcInstanceStartTime, - tgt_starttime = TgtInstanceStartTime, + rep_starttime = RepStartTs, + src_starttime = SrcStartTs, + tgt_starttime = TgtStartTs, stats = Stats, rep_details = #rep{options = Options}, session_id = SessionId @@ -799,13 +799,16 @@ do_checkpoint(State) -> {target_error, Reason} -> {checkpoint_commit_failure, <<"Failure on target commit: ", (to_binary(Reason))/binary>>}; - {SrcInstanceStartTime, TgtInstanceStartTime} -> + {<<S/binary>>, <<T/binary>>} when + (S =:= SrcStartTs orelse T =:= <<"0">> orelse SrcStartTs =:= <<"0">>) 
andalso + (T =:= TgtStartTs orelse T =:= <<"0">> orelse TgtStartTs =:= <<"0">>) + -> couch_log:notice( "recording a checkpoint for `~s` -> `~s` at source update_seq ~p", [SourceName, TargetName, NewSeq] ), - LocalStartTime = calendar:now_to_local_time(ReplicationStartTime), - StartTime = ?l2b(httpd_util:rfc1123_date(LocalStartTime)), + LocalStartTs = calendar:now_to_local_time(RepStartTs), + StartTime = ?l2b(httpd_util:rfc1123_date(LocalStartTs)), EndTime = ?l2b(httpd_util:rfc1123_date()), NewHistoryEntry = {[ @@ -870,15 +873,15 @@ do_checkpoint(State) -> throw:{checkpoint_commit_failure, _} = Failure -> Failure end; - {SrcInstanceStartTime, _NewTgtInstanceStartTime} -> + {SrcStartTs, _NewTgtStartTs} -> {checkpoint_commit_failure, << "instance_start_time on target database has changed since last checkpoint." >>}; - {_NewSrcInstanceStartTime, TgtInstanceStartTime} -> + {_NewSrcStartTs, TgtStartTs} -> {checkpoint_commit_failure, << "instance_start_time on source database has changed since last checkpoint." >>}; - {_NewSrcInstanceStartTime, _NewTgtInstanceStartTime} -> + {_NewSrcStartTs, _NewTgtStartTs} -> {checkpoint_commit_failure, << "instance_start_time on source and target database has changed since last checkpoint." >>}
