nickva commented on code in PR #5711:
URL: https://github.com/apache/couchdb/pull/5711#discussion_r2458965552


##########
src/couch/src/couch_auto_purge_plugin.erl:
##########
@@ -73,47 +73,67 @@ doc_fdi(#{} = St, #full_doc_info{deleted = true} = FDI, Db) ->
     ?assert(
        FDI#full_doc_info.update_seq =< EndSeq, "FDI update_seq should not be greater than end seq"
     ),
-    {ok, purge(St, FDI, Db)};
+    {ok, enqueue(St, FDI, Db)};
 doc_fdi(#{} = St, #full_doc_info{}, _Db) ->
     {ok, St}.
 
-purge(#{} = St, #full_doc_info{} = FDI, Db) ->
+enqueue(#{} = St, FDI, Db) ->
     {Id, Revs} = fdi_to_idrevs(FDI),
-    MaxBatchSize = config:get_integer(atom_to_list(?MODULE), "max_batch_size", 500),
-    purge(St, Id, Revs, MaxBatchSize, Db).
+    enqueue(St, Id, Revs, Db).
 
-purge(#{} = St, Id, Revs, MaxBatchSize, Db) when length(Revs) =< MaxBatchSize ->
+enqueue(#{queue := Queue} = St0, Id, Revs, Db) ->
+    CurrentQueueSize = queue_size(Queue),
+    NewQueueSize = CurrentQueueSize + length(Revs),
+    MinBatchSize = min_batch_size(),
+    MaxBatchSize = max_batch_size(),
+
+    if
+        NewQueueSize > MaxBatchSize ->
+            {RevBatch, RevRest} = lists:split(NewQueueSize - MaxBatchSize, Revs),
+            St1 = flush_queue(St0#{queue => [{Id, RevBatch} | Queue]}, Db),
+            enqueue(St1, Id, RevRest, Db);
+        NewQueueSize >= MinBatchSize ->
+            flush_queue(St0#{queue => [{Id, Revs} | Queue]}, Db);
+        NewQueueSize < MinBatchSize ->
+            St0#{queue => [{Id, Revs} | Queue]}
+    end.
+
+flush_queue(#{queue := IdRevs} = St, Db) ->
     DbName = mem3:dbname(couch_db:name(Db)),
-    PurgeFun = fun() -> fabric:purge_docs(DbName, [{Id, Revs}], [?ADMIN_CTX]) end,
+    PurgeFun = fun() -> fabric:purge_docs(DbName, IdRevs, [?ADMIN_CTX]) end,

Review Comment:
   I think there's a chance here that we'll get a duplicate doc_id in the same
   request. At the HTTP API level the request is a JSON object with a `{doc_id:
   [Rev]}` structure, so it implicitly can't contain duplicates. Staring at the
   code in `couch_db_updater`, it seems duplicate doc_ids should be handled, but
   it might be a tiny bit less efficient, since we'd do a partial purge and then
   another one for the subsequent `{Id, Revs}` entry, as opposed to blowing away
   the FDI in one go. Maybe keeping the queue as a map of `#{Id => Revs}` could
   work? It might be less elegant though; I like how clean and small the code
   looks now. A rough sketch of the idea is below.
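
   To make that concrete, here is a rough, untested sketch of the
   `#{Id => Revs}` queue. It reuses `min_batch_size/0` from the diff above,
   drops the `max_batch_size/0` splitting for brevity, and replaces
   `queue_size/1` with a map-aware stand-in; none of this is the actual
   plugin code.

   ```erlang
   %% Queue is a map of Id => Revs instead of a [{Id, Revs}] list.
   enqueue(#{queue := Queue0} = St, Id, Revs, Db) ->
       %% Merge revs for an Id we may have queued already, so a doc_id can
       %% never appear twice in a single purge request and the FDI gets
       %% blown away in one go.
       Queue1 = maps:update_with(Id, fun(Old) -> Old ++ Revs end, Revs, Queue0),
       St1 = St#{queue => Queue1},
       case queue_size(Queue1) >= min_batch_size() of
           true -> flush_queue(St1, Db);
           false -> St1
       end.

   %% Map-aware queue size: total number of queued revs across all Ids.
   queue_size(Queue) ->
       maps:fold(fun(_Id, Revs, Acc) -> Acc + length(Revs) end, 0, Queue).

   flush_queue(#{queue := Queue} = St, Db) ->
       DbName = mem3:dbname(couch_db:name(Db)),
       %% maps:to_list/1 yields the [{Id, Revs}] shape fabric:purge_docs/3 takes.
       fabric:purge_docs(DbName, maps:to_list(Queue), [?ADMIN_CTX]),
       St#{queue => #{}}.
   ```

   The `maps:update_with/4` merge is the part that matters: revs for an
   already-queued Id get appended in place rather than enqueued as a second
   `{Id, Revs}` entry.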



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
