This matches the behavior of mail indexers and ensures `medium' indices don't grow unexpectedly to become `full' indices. --- lib/PublicInbox/CodeSearchIdx.pm | 15 +++++++++-- lib/PublicInbox/SearchIdx.pm | 2 +- t/cindex.t | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 5f20325a..3a3fc03e 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -85,7 +85,6 @@ sub new { xpfx => "$dir/cidx". PublicInbox::CodeSearch::CIDX_SCHEMA_VER, cidx_dir => $dir, creat => 1, # TODO: get rid of this, should be implicit - indexlevel => $l, transact_bytes => 0, # for checkpoint total_bytes => 0, # for lock_release current_info => '', @@ -617,16 +616,28 @@ sub cidx_init ($) { } $self->lock_acquire; my @shards; + my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel}; + for my $n (0..($self->{nshard} - 1)) { my $shard = bless { %$self, shard => $n }, ref($self); delete @$shard{qw(lockfh lock_path)}; - $shard->idx_acquire; + my $xdb = $shard->idx_acquire; + if (!$n) { + if (($l // '') eq 'medium') { + $xdb->set_metadata('indexlevel', $l); + } elsif (($l // '') eq 'full') { + $xdb->set_metadata('indexlevel', ''); # unset + } + $l ||= $xdb->get_metadata('indexlevel') || 'full'; + } + $shard->{indexlevel} = $l; $shard->idx_release; $shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, { siblings => \@shards, # for ipc_atfork_child }, \&shard_done_wait, $self); push @shards, $shard; } + $self->{indexlevel} //= $l; # this warning needs to happen after idx_acquire state $once; warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index f36c8f97..699af432 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -90,7 +90,7 @@ sub new { $self; } -sub need_xapian ($) { $_[0]->{indexlevel} =~ $xapianlevels } +sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels } sub idx_release { my ($self, $wake) = @_; diff --git a/t/cindex.t b/t/cindex.t index 9da0ba69..d40f73ff 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -4,11 +4,13 @@ use v5.12; use PublicInbox::TestCommon; use Cwd qw(getcwd abs_path); +use List::Util qw(sum); 
require_mods(qw(json Search::Xapian)); use_ok 'PublicInbox::CodeSearchIdx'; require PublicInbox::Import; my ($tmp, $for_destroy) = tmpdir(); my $pwd = getcwd(); +my @unused_keys = qw(last_commit has_threadid skip_docdata); # I reworked CodeSearchIdx->shard_worker to handle empty trees # in the initial commit generated by cvs2svn for xapian.git @@ -71,7 +73,48 @@ ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]), ok(-e "$tmp/ext/cidx.lock", 'external dir created'); ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo'); +ok(run_script([qw(-cindex -L medium --dangerous -q -d), + "$tmp/med", $zp, "$tmp/wt0"]), 'cindex external medium'); + +my $no_metadata_set = sub { + my ($i, $extra, $xdb) = @_; + for my $xdb (@$xdb) { + for my $k (@unused_keys, @$extra) { + is($xdb->get_metadata($k) // '', '', + "metadata $k unset in shard #$i"); + } + ++$i; + } +}; + +{ + my $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*")); + my $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*")); + ok($full_size > $mid_size, 'full size > mid size') or + diag "full=$full_size mid=$mid_size"; + for my $l (qw(med ext)) { + ok(run_script([qw(-cindex -q --reindex -u -d), "$tmp/$l"]), + "reindex $l"); + } + $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*")); + $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*")); + ok($full_size > $mid_size, 'full size > mid size after reindex') or + diag "full=$full_size mid=$mid_size"; + my $csrch = PublicInbox::CodeSearch->new("$tmp/med"); + my ($xdb0, @xdb) = $csrch->xdb_shards_flat; + $no_metadata_set->(0, [], [ $xdb0 ]); + is($xdb0->get_metadata('indexlevel'), 'medium', + 'indexlevel set in shard #0'); + $no_metadata_set->(1, ['indexlevel'], \@xdb); + + ok(run_script([qw(-cindex -q -L full --reindex -u -d), "$tmp/med"]), + 'reindex medium as full'); + @xdb = $csrch->xdb_shards_flat; + $no_metadata_set->(0, ['indexlevel'], \@xdb); +} + use_ok 'PublicInbox::CodeSearch'; + if ('multi-repo 
search') { my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); my $mset = $csrch->mset('NUL'); @@ -86,6 +129,8 @@ if ('multi-repo search') { $mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") }); @have = sort(map { $_->get_document->get_data } $mset->items); is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter'); + my @xdb = $csrch->xdb_shards_flat; + $no_metadata_set->(0, ['indexlevel'], \@xdb); } if ('--update') {