This matches the behavior of mail indexers and ensures `medium'
indices don't grow unexpectedly to become `full' indices.
---
 lib/PublicInbox/CodeSearchIdx.pm | 15 +++++++++--
 lib/PublicInbox/SearchIdx.pm     |  2 +-
 t/cindex.t                       | 45 ++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 5f20325a..3a3fc03e 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -85,7 +85,6 @@ sub new {
                xpfx => "$dir/cidx".  PublicInbox::CodeSearch::CIDX_SCHEMA_VER,
                cidx_dir => $dir,
                creat => 1, # TODO: get rid of this, should be implicit
-               indexlevel => $l,
                transact_bytes => 0, # for checkpoint
                total_bytes => 0, # for lock_release
                current_info => '',
@@ -617,16 +616,28 @@ sub cidx_init ($) {
        }
        $self->lock_acquire;
        my @shards;
+       my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel};
+
        for my $n (0..($self->{nshard} - 1)) {
                my $shard = bless { %$self, shard => $n }, ref($self);
                delete @$shard{qw(lockfh lock_path)};
-               $shard->idx_acquire;
+               my $xdb = $shard->idx_acquire;
+               if (!$n) {
+                       if (($l // '') eq 'medium') {
+                               $xdb->set_metadata('indexlevel', $l);
+                       } elsif (($l // '') eq 'full') {
+                               $xdb->set_metadata('indexlevel', ''); # unset
+                       }
+                       $l ||= $xdb->get_metadata('indexlevel') || 'full';
+               }
+               $shard->{indexlevel} = $l;
                $shard->idx_release;
                $shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, {
                        siblings => \@shards, # for ipc_atfork_child
                }, \&shard_done_wait, $self);
                push @shards, $shard;
        }
+       $self->{indexlevel} //= $l;
        # this warning needs to happen after idx_acquire
        state $once;
        warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index f36c8f97..699af432 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -90,7 +90,7 @@ sub new {
        $self;
 }
 
-sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels }
+sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels }
 
 sub idx_release {
        my ($self, $wake) = @_;
diff --git a/t/cindex.t b/t/cindex.t
index 9da0ba69..d40f73ff 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -4,11 +4,13 @@
 use v5.12;
 use PublicInbox::TestCommon;
 use Cwd qw(getcwd abs_path);
+use List::Util qw(sum);
 require_mods(qw(json Search::Xapian));
 use_ok 'PublicInbox::CodeSearchIdx';
 require PublicInbox::Import;
 my ($tmp, $for_destroy) = tmpdir();
 my $pwd = getcwd();
+my @unused_keys = qw(last_commit has_threadid skip_docdata);
 
 # I reworked CodeSearchIdx->shard_worker to handle empty trees
 # in the initial commit generated by cvs2svn for xapian.git
@@ -71,7 +73,48 @@ ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]),
 ok(-e "$tmp/ext/cidx.lock", 'external dir created');
 ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
 
+ok(run_script([qw(-cindex -L medium --dangerous -q -d),
+       "$tmp/med", $zp, "$tmp/wt0"]), 'cindex external medium');
+
+my $no_metadata_set = sub {
+       my ($i, $extra, $xdb) = @_;
+       for my $xdb (@$xdb) {
+               for my $k (@unused_keys, @$extra) {
+                       is($xdb->get_metadata($k) // '', '',
+                               "metadata $k unset in shard #$i");
+               }
+               ++$i;
+       }
+};
+
+{
+       my $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+       my $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+       ok($full_size > $mid_size, 'full size > mid size') or
+               diag "full=$full_size mid=$mid_size";
+       for my $l (qw(med ext)) {
+               ok(run_script([qw(-cindex -q --reindex -u -d), "$tmp/$l"]),
+                       "reindex $l");
+       }
+       $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+       $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+       ok($full_size > $mid_size, 'full size > mid size after reindex') or
+               diag "full=$full_size mid=$mid_size";
+       my $csrch = PublicInbox::CodeSearch->new("$tmp/med");
+       my ($xdb0, @xdb) = $csrch->xdb_shards_flat;
+       $no_metadata_set->(0, [], [ $xdb0 ]);
+       is($xdb0->get_metadata('indexlevel'), 'medium',
+               'indexlevel set in shard #0');
+       $no_metadata_set->(1, ['indexlevel'], \@xdb);
+
+       ok(run_script([qw(-cindex -q -L full --reindex -u -d), "$tmp/med"]),
+               'reindex medium as full');
+       @xdb = $csrch->xdb_shards_flat;
+       $no_metadata_set->(0, ['indexlevel'], \@xdb);
+}
+
 use_ok 'PublicInbox::CodeSearch';
+
 if ('multi-repo search') {
        my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
        my $mset = $csrch->mset('NUL');
@@ -86,6 +129,8 @@ if ('multi-repo search') {
        $mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") });
        @have = sort(map { $_->get_document->get_data } $mset->items);
        is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
+       my @xdb = $csrch->xdb_shards_flat;
+       $no_metadata_set->(0, ['indexlevel'], \@xdb);
 }
 
 if ('--update') {

Reply via email to