nickva commented on code in PR #5760:
URL: https://github.com/apache/couchdb/pull/5760#discussion_r2563212167
##########
src/mem3/test/eunit/mem3_rep_test.erl:
##########
@@ -173,6 +176,110 @@ replicate_with_purges(#{allsrc := AllSrc, alltgt :=
AllTgt}) ->
?assertEqual(#{}, SDocs),
?assertEqual(#{}, get_all_docs(AllTgt)).
+clean_purge_checkpoints(#{allsrc := AllSrc, alltgt := AllTgt}) ->
+ DocSpec = #{docs => 10, delete => [5, 9], purge => [2, 4]},
+ add_test_docs(AllSrc, DocSpec),
+ % Add and purge some docs on target to exercise the pull_purges code path
+ add_test_docs(AllTgt, #{docs => 3, purge => [0, 2]}),
+ [Src] = lists:sort(mem3:local_shards(AllSrc)),
+ [Tgt1, Tgt2] = lists:sort(mem3:local_shards(AllTgt)),
+ #shard{name = SrcName} = Src,
+
+ % Since we don't have multiple nodes running and are just replicating
+ % from one clustered db to another, we need to patch up the shard map
+ % during the replication so it looks like the targets are part of the shard maps
+ meck:expect(mem3, shards, fun(DbName) ->
+ case DbName == Src#shard.dbname of
+ true -> [Src, Tgt1, Tgt2];
+ false -> meck:passthrough([DbName])
+ end
+ end),
+
+ FakeTarget = '[email protected]',
+
+ % Add a mix of stale, invalid or deprecated purge checkpoints
+ [Uuid1, Uuid2, Uuid3] = [couch_uuids:random() || _ <- lists:seq(1, 3)],
+
+ CheckpointIds = couch_util:with_db(SrcName, fun(Db) ->
+ Uuid = couch_db:get_uuid(Db),
+ Docs = [
+ % This one is ok and should not be cleaned up
+ #doc{
+ id = <<"_local/purge-mem3-", Uuid/binary, "-", Uuid1/binary>>,
+ body =
+ {[
+ {<<"type">>, <<"internal_replicator">>},
+ {<<"updated_on">>, os:system_time(second)},
+ {<<"purge_seq">>, 10042},
+ {<<"source">>, atom_to_binary(Src#shard.node, latin1)},
+ {<<"target">>, atom_to_binary(Tgt1#shard.node, latin1)},
+ {<<"range">>, Tgt1#shard.range}
+ ]}
+ },
+ % Non-existent range. Should be cleaned up.
+ #doc{
+ id = <<"_local/purge-mem3-", Uuid/binary, "-", Uuid2/binary>>,
+ body =
+ {[
+ {<<"type">>, <<"internal_replicator">>},
+ {<<"updated_on">>, os:system_time(second)},
+ {<<"purge_seq">>, 10043},
+ {<<"source">>, atom_to_binary(Src#shard.node, latin1)},
+ {<<"target">>, atom_to_binary(Tgt1#shard.node, latin1)},
+ {<<"range">>, [0, 1]}
+ ]}
+ },
+ % Non-existent target. Should be cleaned up.
+ #doc{
+ id = <<"_local/purge-mem3-", Uuid/binary, "-", Uuid3/binary>>,
+ body =
+ {[
+ {<<"type">>, <<"internal_replicator">>},
+ {<<"updated_on">>, os:system_time(second)},
+ {<<"purge_seq">>, 10044},
+ {<<"source">>, atom_to_binary(Src#shard.node, latin1)},
+ {<<"target">>, atom_to_binary(FakeTarget, latin1)},
+ {<<"range">>, Tgt1#shard.range}
+ ]}
+ },
+ % Deprecated checkpoint format. Should be cleaned up.
Review Comment:
Good point. That's the redundant checkpoint format where the source uuid is
not the shard where the checkpoint is saved. I am thinking of this explanation:
```
% Deprecated checkpoint format before 3.6. These were redundant
% purge checkpoints generated by purge pulls from target to source
% and checkpointed on the target as ...-$source_uuid-$target_uuid.
% The shard we're writing this fake checkpoint to is the "target"
% shard, and the (source) uuid is the remote source uuid, that's
% why it is Uuid1 and doesn't match ours. Another way to look at
% it: we should only have checkpoints for which the $source_uuid
% matches the current shard uuid; anything else will be cleaned up.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]