Having many ->delete_document calls in a transaction still causes Xapian to consume a large amount of memory and triggers OOMs on my system.
I may reimplement --prune to avoid blocking ongoing updates, but this is a simple fix for swapping and OOMs for now.

---
 lib/PublicInbox/CodeSearchIdx.pm | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 704baa9c..e353f452 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -622,12 +622,21 @@ sub scan_git_dirs ($) {
 
 sub prune_cb { # git->check_async callback
 	my ($hex, $type, undef, $self_id) = @_;
-	if ($type ne 'commit') {
-		my ($self, $id) = @$self_id;
-		progress($self, "$hex $type");
-		++$self->{pruned};
-		$self->{xdb}->delete_document($id);
-	}
+	return if $type eq 'commit';
+	my ($self, $id) = @$self_id;
+	my $len = $self->{xdb}->get_doclength($id);
+	progress($self, "$hex $type (doclength=$len)");
+	++$self->{pruned};
+	$self->{xdb}->delete_document($id);
+
+	# all math around batch_bytes calculation is pretty fuzzy,
+	# but need a way to regularly flush output to avoid OOM,
+	# so assume the average term + position overhead is the
+	# answer to everything: 42
+	return if ($self->{batch_bytes} -= ($len * 42)) > 0;
+	cidx_ckpoint($self, "[$self->{shard}] $self->{pruned}");
+	$self->{batch_bytes} = $self->{-opt}->{batch_size} //
+			$PublicInbox::SearchIdx::BATCH_BYTES;
 }
 
 sub shard_prune { # via wq_io_do
@@ -639,6 +648,8 @@ sub shard_prune { # via wq_io_do
 	my $cur = $xdb->postlist_begin('Tc');
 	my $end = $xdb->postlist_end('Tc');
 	my ($id, @cmt, $oid);
+	local $self->{batch_bytes} = $self->{-opt}->{batch_size} //
+				$PublicInbox::SearchIdx::BATCH_BYTES;
 	local $self->{pruned} = 0;
 	for (; $cur != $end && !$DO_QUIT; $cur++) {
 		@cmt = xap_terms('Q', $xdb, $id = $cur->get_docid);