Eric Wong <e...@80x24.org> wrote:
> Konstantin Ryabitsev <konstan...@linuxfoundation.org> wrote:
> > However, if you do want to add ability to cheaply do a "give me just the
> > newest messages in this thread since this datetime", that would be great for
> > my needs. :)
> 
> Per-thread search is something I've wanted for a while, anyways,
> so I think I'll do /$MSGID/?q= in between ongoing work for

This implements the mbox.gz retrieval.  I didn't want to deal
with HTML or figure out how to expose more <form> elements
yet, but I figure mbox.gz is the most important.

Now deployed on 80x24.org/lore:

MSGID=20230327080502.GA570847@ziqianlu-desk2
curl -d '' -sSf \
   https://80x24.org/lore/all/"$MSGID/?x=m&q=rt:2023-03-29.." | \
   zcat | grep -i ^Message-ID:

shows the expected messages.
-----------8<-----------
Subject: [PATCH] www: support POST /$INBOX/$MSGID/?x=m&q=

This allows filtering the contents of any existing thread using
a search query.  It uses the existing THREADID column in Xapian
so we can internally add a Xapian OP_FILTER to the results.

This new functionality is orthogonal to the existing `t=1'
parameter which gives mairix-style thread expansion.  It doesn't
make sense to use `t=1' with this functionality, but it's not
disallowed, either.

The indentation change in Over->next_by_mid is to ensure
DBI->prepare_cached can share across both ->next_by_mid
and ->mid2tid.

I also noticed the existing regex for `POST /$INBOX/?x=m&q=' was
allowing extra characters.  With an added \z, it's now as strict
as was originally intended, and AFAIK nothing was generating invalid
URLs for it.

Reported-by: Konstantin Ryabitsev <konstan...@linuxfoundation.org>
Link: https://public-inbox.org/meta/aaniyhk7wfm4e6m5mbukcrhevzoc6ftctyrfwvmz4fkykwwtlj@mverfng6ytas/T/
---
 lib/PublicInbox/Mbox.pm   |  5 ++++
 lib/PublicInbox/Over.pm   | 24 ++++++++++++++++++-
 lib/PublicInbox/Search.pm |  6 +++++
 lib/PublicInbox/WWW.pm    |  4 +++-
 t/psgi_v2.t               | 50 ++++++++++++++++++++++++++++++++++-----
 5 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 18db9d38..e1abf7ec 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -229,6 +229,11 @@ sub mbox_all {
                return PublicInbox::WWW::need($ctx, 'Overview');
 
        my $qopts = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC
+
+       # {threadid} limits results to a given thread
+       # {threads} collapses results from messages in the same thread,
+       # allowing us to use ->expand_thread w/o duplicates in our own code
+       $qopts->{threadid} = $over->mid2tid($ctx->{mid}) if defined($ctx->{mid});
        $qopts->{threads} = 1 if $q->{t};
        $srch->query_approxidate($ctx->{ibx}->git, $q_string);
        my $mset = $srch->mset($q_string, $qopts);
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 271e2246..6ba27118 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -283,13 +283,35 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ?
        $rows;
 }
 
+sub mid2tid {
+       my ($self, $mid) = @_;
+       my $dbh = dbh($self);
+
+       my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT id FROM msgid WHERE mid = ? LIMIT 1
+
+       $sth->execute($mid);
+       my $id = $sth->fetchrow_array or return;
+       $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT num FROM id2num WHERE id = ? AND num > ?
+ORDER BY num ASC LIMIT 1
+
+       $sth->execute($id, 0);
+       my $num = $sth->fetchrow_array or return;
+       $sth = $dbh->prepare(<<'');
+SELECT tid FROM over WHERE num = ? LIMIT 1
+
+       $sth->execute($num);
+       $sth->fetchrow_array;
+}
+
 sub next_by_mid {
        my ($self, $mid, $id, $prev) = @_;
        my $dbh = dbh($self);
 
        unless (defined $$id) {
                my $sth = $dbh->prepare_cached(<<'', undef, 1);
-       SELECT id FROM msgid WHERE mid = ? LIMIT 1
+SELECT id FROM msgid WHERE mid = ? LIMIT 1
 
                $sth->execute($mid);
                $$id = $sth->fetchrow_array;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 5133a3b7..6c3d9f93 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -386,6 +386,12 @@ sub mset {
                                        sortable_serialise($uid_range->[1]));
                $query = $X{Query}->new(OP_FILTER(), $query, $range);
        }
+       if (defined(my $tid = $opt->{threadid})) {
+               $tid = sortable_serialise($tid);
+               $query = $X{Query}->new(OP_FILTER(), $query,
+                               $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid));
+       }
+
        my $xdb = xdb($self);
        my $enq = $X{Enquire}->new($xdb);
        $enq->set_query($query);
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 9ffcb879..a8f1ad17 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -68,7 +68,9 @@ sub call {
                        my ($idx, $fn) = ($3, $4);
                        return invalid_inbox_mid($ctx, $1, $2) ||
                                get_attach($ctx, $idx, $fn);
-               } elsif ($path_info =~ m!$INBOX_RE/!o) {
+               } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/\z!o) {
+                       return invalid_inbox_mid($ctx, $1, $2) || mbox_results($ctx);
+               } elsif ($path_info =~ m!$INBOX_RE/\z!o) {
                        return invalid_inbox($ctx, $1) || mbox_results($ctx);
                }
        }
diff --git a/t/psgi_v2.t b/t/psgi_v2.t
index 5b197a9f..0a77adfb 100644
--- a/t/psgi_v2.t
+++ b/t/psgi_v2.t
@@ -4,6 +4,7 @@
 use strict;
 use v5.10.1;
 use PublicInbox::TestCommon;
+use IO::Uncompress::Gunzip qw(gunzip);
 require_git(2.6);
 use PublicInbox::Eml;
 use PublicInbox::Config;
@@ -76,6 +77,30 @@ $new_mid //= do {
        local $/;
        <$fh>;
 };
+
+my $m2t = create_inbox 'mid2tid-1', version => 2, indexlevel => 'medium', sub {
+       my ($im, $ibx) = @_;
+       for my $n (1..3) {
+               $im->add(PublicInbox::Eml->new(<<EOM)) or xbail 'add';
+Date: Fri, 02 Oct 1993 00:0$n:00 +0000
+Message-ID: <t\@$n>
+Subject: tid $n
+From: x\@example.com
+References: <a-mid\@b>
+
+$n
+EOM
+               $im->add(PublicInbox::Eml->new(<<EOM)) or xbail 'add';
+Date: Fri, 02 Oct 1993 00:0$n:00 +0000
+Message-ID: <ut\@$n>
+Subject: unrelated tid $n
+From: x\@example.com
+References: <b-mid\@b>
+
+EOM
+       }
+};
+
 my $cfgpath = "$ibx->{inboxdir}/pi_config";
 {
        open my $fh, '>', $cfgpath or BAIL_OUT $!;
@@ -86,6 +111,9 @@ my $cfgpath = "$ibx->{inboxdir}/pi_config";
 [publicinbox "dup"]
        inboxdir = $dibx->{inboxdir}
        address = $dibx->{-primary_address}
+[publicinbox "m2t"]
+       inboxdir = $m2t->{inboxdir}
+       address = $m2t->{-primary_address}
 EOF
        close $fh or BAIL_OUT;
 }
@@ -178,20 +206,18 @@ my $client1 = sub {
        $cfg->each_inbox(sub { $_[0]->search->reopen });
 
        SKIP: {
-               eval { require IO::Uncompress::Gunzip };
-               skip 'IO::Uncompress::Gunzip missing', 6 if $@;
                my ($in, $out, $status);
                my $req = GET('/v2test/a-mid@b/raw');
                $req->header('Accept-Encoding' => 'gzip');
                $res = $cb->($req);
                is($res->header('Content-Encoding'), 'gzip', 'gzip encoding');
                $in = $res->content;
-               IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+               gunzip(\$in => \$out);
                is($out, $raw, 'gzip response matches');
 
                $res = $cb->(GET('/v2test/a-mid@b/t.mbox.gz'));
                $in = $res->content;
-               $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+               $status = gunzip(\$in => \$out);
                unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted');
                like($out, qr/^hello world$/m, 'got first in t.mbox.gz');
                like($out, qr/^hello world!$/m, 'got second in t.mbox.gz');
@@ -202,7 +228,7 @@ my $client1 = sub {
                # search interface
                $res = $cb->(POST('/v2test/?q=m:a-mid@b&x=m'));
                $in = $res->content;
-               $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+               $status = gunzip(\$in => \$out);
                unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted');
                like($out, qr/^hello world$/m, 'got first in mbox POST');
                like($out, qr/^hello world!$/m, 'got second in mbox POST');
@@ -213,7 +239,7 @@ my $client1 = sub {
                # all.mbox.gz interface
                $res = $cb->(GET('/v2test/all.mbox.gz'));
                $in = $res->content;
-               $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+               $status = gunzip(\$in => \$out);
                unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted');
                like($out, qr/^hello world$/m, 'got first in all.mbox');
                like($out, qr/^hello world!$/m, 'got second in all.mbox');
@@ -335,6 +361,18 @@ my $client3 = sub {
        local $SIG{__WARN__} = sub { push @warn, @_ };
        $res = $cb->(GET('/v2test/?t=1970'.'01'.'01'));
        is_deeply(\@warn, [], 'no warnings on YYYYMMDD only');
+
+       $res = $cb->(POST("/m2t/t\@1/?q=dt:19931002000300..&x=m"));
+       is($res->code, 200, 'got 200 on mid2tid query');
+       gunzip(\(my $in = $res->content) => \(my $out));
+       my @m = ($out =~ m!^Message-ID: <([^>]+)>\n!gms);
+       is_deeply(\@m, ['t@3'], 'only got latest result from query');
+
+       $res = $cb->(POST("/m2t/t\@1/?q=dt:19931002000400..&x=m"));
+       is($res->code, 404, '404 on out-of-range mid2tid query');
+
+       $res = $cb->(POST("/m2t/t\@1/?q=s:unrelated&x=m"));
+       is($res->code, 404, '404 on cross-thread search');
 };
 test_psgi(sub { $www->call(@_) }, $client3);
 test_httpd($env, $client3, 4);


Reply via email to