Expanding threads via over.sqlite3 for mbox.gz downloads without Xapian effectively collapsing on the THREADID column leads to repeated messages getting downloaded.
To avoid that situation, use a "has_threadid" Xapian metadata flag that's only set on --reindex (and brand new Xapian DBs). This allows admins to upgrade WWW or do --reindex in any order; without worrying about users eating up bandwidth and CPU cycles. --- lib/PublicInbox/Mbox.pm | 2 +- lib/PublicInbox/Search.pm | 10 ++++++++- lib/PublicInbox/SearchIdx.pm | 16 ++++++++++++-- lib/PublicInbox/SearchView.pm | 21 ++++++++++++------- lib/PublicInbox/V2Writable.pm | 11 ++++++++++ t/init.t | 1 + t/psgi_search.t | 39 +++++++++++++++++++++++++++++++++-- 7 files changed, 87 insertions(+), 13 deletions(-) diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index c9b11c21..0223bead 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -262,7 +262,7 @@ sub mbox_all { $ctx->{ids} = $srch->mset_to_artnums($mset); require PublicInbox::MboxGz; my $fn; - if ($q->{t}) { + if ($q->{t} && $srch->has_threadid) { $fn = 'results-thread-'.$q_string; PublicInbox::MboxGz::mbox_gz($ctx, \&results_thread_cb, $fn); } else { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index bc820b64..01bbe73d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -311,6 +311,12 @@ sub _do_enquire { retry_reopen($self, \&_enquire_once, [ $self, $query, $opts ]); } +# returns true if all docs have the THREADID value +sub has_threadid ($) { + my ($self) = @_; + (xdb($self)->get_metadata('has_threadid') // '') eq '1'; +} + sub _enquire_once { # retry_reopen callback my ($self, $query, $opts) = @{$_[0]}; my $xdb = xdb($self); @@ -328,7 +334,9 @@ sub _enquire_once { # retry_reopen callback } # `mairix -t / --threads' or JMAP collapseThreads - $enquire->set_collapse_key(THREADID) if $opts->{thread}; + if ($opts->{thread} && has_threadid($self)) { + $enquire->set_collapse_key(THREADID); + } my $offset = $opts->{offset} || 0; my $limit = $opts->{limit} || 50; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index baa6f41a..ade55756 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -131,6 +131,7 @@ sub idx_acquire { ($is_shard && need_xapian($self)))) { File::Path::mkpath($dir); nodatacow_dir($dir); + $self->{-set_has_threadid_once} = 1; } } return unless defined $flag; @@ -590,9 +591,17 @@ sub v1_checkpoint ($$;$) { $self->{mm}->{dbh}->commit; if ($newest && need_xapian($self)) { - my $cur = $self->{xdb}->get_metadata('last_commit'); + my $xdb = $self->{xdb}; + my $cur = $xdb->get_metadata('last_commit'); if (need_update($self, $cur, $newest)) { - $self->{xdb}->set_metadata('last_commit', $newest); + $xdb->set_metadata('last_commit', $newest); + } + + # let SearchView know a full --reindex was done so it can + # generate ->has_threadid-dependent links + if ($sync->{reindex} && !ref($sync->{reindex})) { + my $n = $xdb->get_metadata('has_threadid'); + $xdb->set_metadata('has_threadid', '1') if $n ne '1'; } } @@ -816,6 +825,9 @@ sub set_metadata_once { return if $self->{shard}; # only continue if undef or 0, not >0 my $xdb = $self->{xdb}; + if (delete($self->{-set_has_threadid_once})) { + $xdb->set_metadata('has_threadid', '1'); + } if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index dd69564a..76428dfb 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -188,13 +188,20 @@ sub search_nav_top { $rv .= qq{<a\nhref="?$s">summary</a>|<b>nested</b>}; } my $A = $q->qs_html(x => 'A', r => undef); - $rv .= qq{|<a\nhref="?$A">Atom feed</a>]} . - qq{\n\t\t\tdownload mbox.gz: } . - # we set name=z w/o using it since it seems required for - # lynx (but works fine for w3m). - qq{<input\ntype=submit\nname=z\nvalue="results only"/>|} . - qq{<input\ntype=submit\nname=x\nvalue="full threads"/>} . - qq{</pre></form><pre>}; + $rv .= qq{|<a\nhref="?$A">Atom feed</a>]}; + if ($ctx->{-inbox}->search->has_threadid) { + $rv .= qq{\n\t\t\tdownload mbox.gz: } . + # we set name=z w/o using it since it seems required for + # lynx (but works fine for w3m). + qq{<input\ntype=submit\nname=z\n} . + q{value="results only"/>} . + qq{|<input\ntype=submit\nname=x\n} . + q{value="full threads"/>}; + } else { # BOFH needs to --reindex + $rv .= qq{\n\t\t\t\t\t\tdownload: } . + qq{<input\ntype=submit\nname=z\nvalue="mbox.gz"/>} + } + $rv .= qq{</pre></form><pre>}; } sub search_nav_bot { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 0a91a132..a32fdcf3 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1337,6 +1337,17 @@ sub index_sync { xapian_only($self, $opt, $sync, $art_beg); } + # --reindex on the command-line + if ($opt->{reindex} && !ref($opt->{reindex}) && $idxlevel ne 'basic') { + local $self->{parallel} = 0; + my $s0 = PublicInbox::SearchIdx->new($self->{ibx}, 0, 0); + if (my $xdb = $s0->idx_acquire) { + my $n = $xdb->get_metadata('has_threadid'); + $xdb->set_metadata('has_threadid', 1) if $n ne '1'; + } + $s0->idx_release; + } + # reindex does not pick up new changes, so we rerun w/o it: if ($opt->{reindex}) { my %again = %$opt; diff --git a/t/init.t b/t/init.t index dad09435..a5a9debc 100644 --- a/t/init.t +++ b/t/init.t @@ -108,6 +108,7 @@ SKIP: { is(PublicInbox::Admin::detect_indexlevel($ibx), 'full', "detected default indexlevel -V$v"); ok($ibx->{-skip_docdata}, "docdata skip set -V$v"); + ok($ibx->search->has_threadid, 'has_threadid flag set on new inbox'); } # loop for idempotency diff --git a/t/psgi_search.t b/t/psgi_search.t index 5d537363..c1677eb3 100644 --- a/t/psgi_search.t +++ b/t/psgi_search.t @@ -3,6 +3,7 @@ use strict; use warnings; use Test::More; +use IO::Uncompress::Gunzip qw(gunzip); use PublicInbox::Eml; use PublicInbox::Config; use PublicInbox::Inbox; @@ -39,6 +40,12 @@ To: git\@vger.kernel.org EOF $im->add($mime); +$im->add(PublicInbox::Eml->new(<<"")); +Message-ID: <reply\@asdf> +From: replier <r\@example.com> +In-Reply-To: <$mid> +Subject: mismatch + $mime = PublicInbox::Eml->new(<<'EOF'); Subject: Message-ID: <blank-subj...@example.com> @@ -79,6 +86,9 @@ test_psgi(sub { $www->call(@_) }, sub { ok(index($html, 'by Ævar Arnfjörð Bjarmason') >= 0, "displayed Ævar's name properly in HTML"); + like($html, qr/download mbox\.gz: .*?"full threads"/s, + '"full threads" download option shown'); + my $warn = []; local $SIG{__WARN__} = sub { push @$warn, @_ }; $res = $cb->(GET('/test/?q=s:test&l=5e')); @@ -118,8 +128,33 @@ test_psgi(sub { $www->call(@_) }, sub { $res = $cb->(GET('/test/no-subject-at-...@example.com/t.mbox.gz')); like($res->header('Content-Disposition'), qr/filename=no-subject\.mbox\.gz/); + + # "full threads" mbox.gz download + $res = $cb->(POST('/test/?q=s:test&x=m&t')); + is($res->code, 200, 'successful mbox download with threads'); + gunzip(\($res->content) => \(my $before)); + is_deeply([ "Message-ID: <$mid>\n", "Message-ID: <reply\@asdf>\n" ], + [ grep(/^Message-ID:/m, split(/^/m, $before)) ], + 'got full thread'); + + # clobber has_threadid to emulate old versions: + { + my $sidx = PublicInbox::SearchIdx->new($ibx, 0); + my $xdb = $sidx->idx_acquire; + $xdb->set_metadata('has_threadid', '0'); + $sidx->idx_release; + } + $config->each_inbox(sub { delete $_[0]->{search} }); + $res = $cb->(GET('/test/?q=s:test')); + is($res->code, 200, 'successful search w/o has_threadid'); + unlike($html, qr/download mbox\.gz: .*?"full threads"/s, + '"full threads" download option not shown w/o has_threadid'); + + # in case somebody uses curl to bypass <form> + $res = $cb->(POST('/test/?q=s:test&x=m&t')); + is($res->code, 200, 'successful mbox download w/ threads'); + gunzip(\($res->content) => \(my $after)); + isnt($before, $after); }); done_testing(); - -1; -- unsubscribe: one-click, see List-Unsubscribe header archive: https://public-inbox.org/meta/