Aside from our prior import bugs (fixed in a0c07cba0e5d8b6a (mda: drop leading "From " lines again, 2016-06-26)), we'll always have to deal with mutt piping messages to us and `git format-patch' output. So just share the regexp so we can use it everywhere.
It may be desirable to allow importing messages with a leading "From " line for FUSE, even. Additionally, some instances of this regexp needlessly added optional `\r?' (CR) checks ahead of the `\n' (LF) element; but they're pointless anyways since [^\n]* is enough to consume all non-LF bytes (CR included). --- lib/PublicInbox/Eml.pm | 6 ++++++ lib/PublicInbox/IMAP.pm | 2 +- lib/PublicInbox/Import.pm | 8 +++----- lib/PublicInbox/LeiInput.pm | 5 +---- lib/PublicInbox/LeiInspect.pm | 2 +- lib/PublicInbox/LeiToMail.pm | 3 +-- lib/PublicInbox/Mbox.pm | 16 +++++++--------- lib/PublicInbox/MboxReader.pm | 2 +- lib/PublicInbox/NNTP.pm | 3 +-- script/public-inbox-edit | 5 ++--- script/public-inbox-learn | 2 +- script/public-inbox-mda | 4 ++-- script/public-inbox-purge | 4 ++-- 13 files changed, 29 insertions(+), 33 deletions(-) diff --git a/lib/PublicInbox/Eml.pm b/lib/PublicInbox/Eml.pm index 8b999e1a..24060ec8 100644 --- a/lib/PublicInbox/Eml.pm +++ b/lib/PublicInbox/Eml.pm @@ -528,4 +528,10 @@ sub willneed { re_memo($_) for @_ } willneed(qw(From To Cc Date Subject Content-Type In-Reply-To References Message-ID X-Alt-Message-ID)); +# This fixes an old bug from import (pre-a0c07cba0e5d8b6a) +# mutt also pipes single RFC822 messages with a "From " line, +# but no Content-Length or "From " escaping. +# "git format-patch" also generates such files by default. 
+sub strip_from { $_[0] =~ s/\A[\r\n]*From [^\n]*\n//s } + 1; diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index 3c64cefa..e4a9e304 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -664,7 +664,7 @@ sub op_eml_new { $_[4] = PublicInbox::Eml->new($_[3]) } # s/From / fixes old bug from import (pre-a0c07cba0e5d8b6a) sub to_crlf_full { ${$_[0]} =~ s/(?<!\r)\n/\r\n/sg; - ${$_[0]} =~ s/\A[\r\n]*From [^\r\n]*\r\n//s; + PublicInbox::Eml::strip_from(${$_[0]}); } sub op_crlf_bref { to_crlf_full($_[3]) } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 7175884c..cd03da05 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -118,9 +118,6 @@ sub _cat_blob ($$$) { $n == $len or croak "cat-blob: short read: $n < $len"; my $lf = chop $buf; croak "bad read on final byte: <$lf>" if $lf ne "\n"; - - # fixup some bugginess in old versions: - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; \$buf; } @@ -136,8 +133,9 @@ sub check_remove_v1 { my $info = _check_path($r, $w, $tip, $path) or return ('MISSING',undef); $info =~ m!\A100644 blob ([a-f0-9]{40,})\t!s or die "not blob: $info"; my $oid = $1; - my $msg = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed"; - my $cur = PublicInbox::Eml->new($msg); + my $bref = _cat_blob($r, $w, $oid) or die "BUG: cat-blob $1 failed"; + PublicInbox::Eml::strip_from($$bref); + my $cur = PublicInbox::Eml->new($bref); my $cur_s = $cur->header('Subject') // ''; my $cur_m = $mime->header('Subject') // ''; if ($cur_s ne $cur_m || norm_body($cur) ne norm_body($mime)) { diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index 93f8b6b8..28b73ca9 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -84,10 +84,7 @@ sub input_fh { return $self->{lei}->child_error(0, <<""); error reading $name: $! - # mutt pipes single RFC822 messages with a "From " line, - # but no Content-Length or "From " escaping. 
- # "git format-patch" also generates such files by default. - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($buf); # a user may feed just a body: git diff | lei rediff -U9 if ($self->{-force_eml}) { diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm index f801610f..65c64cf2 100644 --- a/lib/PublicInbox/LeiInspect.pm +++ b/lib/PublicInbox/LeiInspect.pm @@ -254,7 +254,7 @@ sub inspect_start ($$) { sub do_inspect { # lei->do_env cb my ($lei) = @_; my $str = delete $lei->{istr}; - $str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($str); my $eml = PublicInbox::Eml->new(\$str); inspect_start($lei, [ 'blob:'.$lei->git_oid($eml)->hexdigest, map { "mid:$_" } @{mids($eml)} ]); diff --git a/lib/PublicInbox/LeiToMail.pm b/lib/PublicInbox/LeiToMail.pm index 8771592d..ead60b38 100644 --- a/lib/PublicInbox/LeiToMail.pm +++ b/lib/PublicInbox/LeiToMail.pm @@ -53,8 +53,7 @@ sub _mbox_hdr_buf ($$$) { } my $buf = delete $eml->{hdr}; - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $$buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($$buf); my $ident = $smsg->{blob} // 'lei'; if (defined(my $pct = $smsg->{pct})) { $ident .= "=$pct" } diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index bf61bb0e..52f88ae3 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -89,17 +89,15 @@ sub emit_raw { sub msg_hdr ($$) { my ($ctx, $eml) = @_; - my $header_obj = $eml->header_obj; - # drop potentially confusing headers, ssoma already should've dropped - # Lines and Content-Length - foreach my $d (qw(Lines Bytes Content-Length Status)) { - $header_obj->header_set($d); + # drop potentially confusing headers, various importers should've + # already dropped these, but we can't trust stuff we've cloned + for my $d (qw(Lines Bytes Content-Length Status)) { + $eml->header_set($d); } - my $crlf = $header_obj->crlf; - my $buf = $header_obj->as_string; - # fixup old bug from 
import (pre-a0c07cba0e5d8b6a) - $buf =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + my $crlf = $eml->crlf; + my $buf = $eml->header_obj->as_string; + PublicInbox::Eml::strip_from($buf); "From mboxrd\@z Thu Jan 1 00:00:00 1970" . $crlf . $buf . $crlf; } diff --git a/lib/PublicInbox/MboxReader.pm b/lib/PublicInbox/MboxReader.pm index e4209022..d67fb4eb 100644 --- a/lib/PublicInbox/MboxReader.pm +++ b/lib/PublicInbox/MboxReader.pm @@ -93,7 +93,7 @@ sub _mbox_cl ($$$;@) { undef $mbfh; } while (my $hdr = _extract_hdr(\$buf)) { - $$hdr =~ s/\A[\r\n]*From [^\n]*\n//s or + PublicInbox::Eml::strip_from($$hdr) or die "E: no 'From ' line in:\n", Dumper($hdr); my $eml = PublicInbox::Eml->new($hdr); next unless $eml->raw_size; diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 316b7775..603cf094 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -523,8 +523,7 @@ sub msg_hdr_write ($$) { set_nntp_headers($eml, $smsg); my $hdr = $eml->{hdr} // \(my $x = ''); - # fixup old bug from import (pre-a0c07cba0e5d8b6a) - $$hdr =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($$hdr); $$hdr =~ s/(?<!\r)\n/\r\n/sg; # Alpine barfs without this # for leafnode compatibility, we need to ensure Message-ID headers diff --git a/script/public-inbox-edit b/script/public-inbox-edit index 1fbaf5a7..1fb6f32b 100755 --- a/script/public-inbox-edit +++ b/script/public-inbox-edit @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used for editing messages in a public-inbox. 
@@ -188,8 +188,7 @@ retry_edit: "read $edit_fn: $!\n"; if (!$opt->{raw}) { - # get rid of the From we added - $new_raw =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($new_raw); # check if user forgot to purge (in mutt) after editing if ($new_raw =~ /^From /sm) { diff --git a/script/public-inbox-learn b/script/public-inbox-learn index 8b8e1b77..6e1978a7 100755 --- a/script/public-inbox-learn +++ b/script/public-inbox-learn @@ -40,7 +40,7 @@ my $pi_cfg = PublicInbox::Config->new; my $err; my $mime = PublicInbox::Eml->new(do{ defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n"; - $data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; + PublicInbox::Eml::strip_from($data); if ($train ne 'rm') { eval { diff --git a/script/public-inbox-mda b/script/public-inbox-mda index ba498956..cac819ac 100755 --- a/script/public-inbox-mda +++ b/script/public-inbox-mda @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2013-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Mail delivery agent for public-inbox, run from your MTA upon mail delivery @@ -39,7 +39,7 @@ use PublicInbox::Spamcheck; my $emergency = $ENV{PI_EMERGENCY} || "$ENV{HOME}/.public-inbox/emergency/"; $ems = PublicInbox::Emergency->new($emergency); my $str = do { local $/; <STDIN> }; -$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; +PublicInbox::Eml::strip_from($str); $ems->prepare(\$str); my $eml = PublicInbox::Eml->new(\$str); my $cfg = PublicInbox::Config->new; diff --git a/script/public-inbox-purge b/script/public-inbox-purge index 121027cc..8f9b0b16 100755 --- a/script/public-inbox-purge +++ b/script/public-inbox-purge @@ -1,5 +1,5 @@ #!/usr/bin/perl -w -# Copyright (C) 2019-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> # # Used for purging messages 
entirely from a public-inbox. Currently @@ -34,7 +34,7 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt); PublicInbox::AdminEdit::check_editable(\@ibxs); defined(my $data = do { local $/; <STDIN> }) or die "read STDIN: $!\n"; -$data =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; +PublicInbox::Eml::strip_from($data); my $n_purged = 0; foreach my $ibx (@ibxs) {