This allows changing --indexlevel at the moment and will allow
us to fix some yet-to-be-discovered bugs or backwards-compatible
improvements in the future.
---
 lib/PublicInbox/CodeSearchIdx.pm | 33 ++++++++++++++++++++++----------
 t/cindex.t                       |  4 ++++
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 095c153e..5e6c0d22 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -43,7 +43,8 @@ our (
        @RDONLY_SHARDS, # Xapian::Database
        @IDX_SHARDS, # clones of self
        $MAX_SIZE,
-       $TMP_GIT, # PublicInbox::Git object for --reindex and --prune
+       $TMP_GIT, # PublicInbox::Git object for --prune
+       $REINDEX, # PublicInbox::SharedKV
 );
 
 # stop walking history if we see >$SEEN_MAX existing commits, this assumes
@@ -89,12 +90,13 @@ sub new {
 # TODO: may be used for reshard/compact
 sub count_shards { scalar($_[0]->xdb_shards_flat) }
 
-sub add_commit ($$) {
+sub update_commit ($$) {
        my ($self, $cmt) = @_; # fields from @FMT
        my $x = 'Q'.$cmt->{H};
-       for (docids_by_postlist($self, $x)) {
-               $self->{xdb}->delete_document($_)
-       }
+       my ($docid, @extra) = sort { $a <=> $b } docids_by_postlist($self, $x);
+       @extra and warn "W: $cmt->{H} indexed multiple times, pruning ",
+                       join(', ', map { "#$_" } @extra), "\n";
+       $self->{xdb}->delete_document($_) for @extra;
        my $doc = $PublicInbox::Search::X{Document}->new;
        $doc->add_boolean_term($x);
        $doc->add_boolean_term('G'.$_) for @{$self->{roots}};
@@ -119,7 +121,8 @@ sub add_commit ($$) {
 
        $x = delete $cmt->{b};
        $self->index_body_text($doc, \$x) if $x =~ /\S/s;
-       $self->{xdb}->add_document($doc);
+       defined($docid) ? $self->{xdb}->replace_document($docid, $doc) :
+                       $self->{xdb}->add_document($doc);
 }
 
 sub progress {
@@ -235,7 +238,7 @@ sub shard_index { # via wq_io_do
                        cidx_ckpoint($self, "[$n] $nr");
                        $TXN_BYTES = $batch_bytes - $len;
                }
-               add_commit($self, $cmt);
+               update_commit($self, $cmt);
                ++$nr;
                if ($TXN_BYTES <= 0) {
                        cidx_ckpoint($self, "[$n] $nr");
@@ -398,7 +401,7 @@ sub check_existing { # retry_reopen callback
        my $docid = shift(@docids) // return get_roots($self, $git);
        my $doc = $shard->{xdb}->get_document($docid) //
                        die "BUG: no #$docid ($git->{git_dir})";
-       my $old_fp = $doc->get_data;
+       my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data;
        if ($old_fp eq $git->{-repo}->{fp}) { # no change
                delete $git->{-repo};
                return;
@@ -426,7 +429,10 @@ sub partition_refs ($$$) {
        while (defined(my $cmt = <$rfh>)) {
                chomp $cmt;
                my $n = hex(substr($cmt, 0, 8)) % scalar(@RDONLY_SHARDS);
-               if (seen($RDONLY_SHARDS[$n], 'Q'.$cmt)) {
+               if ($REINDEX && $REINDEX->set_maybe(pack('H*', $cmt), '')) {
+                       say { $shard_in[$n] } $cmt or die "say: $!";
+                       ++$nchange;
+               } elsif (seen($RDONLY_SHARDS[$n], 'Q'.$cmt)) {
                        last if ++$seen > $SEEN_MAX;
                } else {
                        say { $shard_in[$n] } $cmt or die "say: $!";
@@ -687,7 +693,7 @@ sub parent_quit {
 
 sub init_tmp_git_dir ($) {
        my ($self) = @_;
-       return unless ($self->{-opt}->{prune} || $self->{-opt}->{reindex});
+       return unless $self->{-opt}->{prune};
        require File::Temp;
        require PublicInbox::Import;
        my $tmp = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1);
@@ -729,6 +735,13 @@ sub cidx_run { # main entry point
                $cb->($m, @_);
        };
        load_existing($self);
+       local $REINDEX;
+       if ($self->{-opt}->{reindex}) {
+               require PublicInbox::SharedKV;
+               $REINDEX = PublicInbox::SharedKV->new;
+               delete $REINDEX->{lock_path};
+               $REINDEX->dbh;
+       }
        my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}};
        if (@nc) {
                warn "E: BUG? paths in $self->{cidx_dir} not canonicalized:\n";
diff --git a/t/cindex.t b/t/cindex.t
index 5d269217..eb66b2e6 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -93,6 +93,10 @@ EOM
        ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), '-cindex -u');
        $mset = $csrch->reopen->mset('dfn:for-update');
        is(scalar($mset->items), 1, 'got updated result');
+
+       ok(run_script([qw(-cindex -qu --reindex -d), "$tmp/ext"]), 'reindex');
+       $mset = $csrch->reopen->mset('dfn:for-update');
+       is(scalar($mset->items), 1, 'same result after reindex');
 }
 
 if ('--prune') {

Reply via email to