>how about avoiding the me toos...
[44 lines of crud we've already seen before]
Hm... something wrong's with that message...
% msgchk `mhpath cur`
Quote follows response, Jeopardy style #2
Overquoted: 30 lines quoted out of 41: 73%
Non-canonical signature separator:
`--------------------------------------------------------------------------'
Signature too long: 7 lines
More than just *some*thing: that's four different violations.
print "BAD!\n" for 1..4;
--tom
#!/usr/bin/perl -w
# msgchk - check mail and news messages for netiquette violations
# [EMAIL PROTECTED]
use strict;
my $Msg = get_message();
missing_headers();
bogus_address();
allcap_subject();
annoying_subject();
mimes();
lines_too_long();
control_characters();
miswrapped();
jeopardy_quoted();
overquoted();
good_signature();
exit(0);
#######################
sub AUTOLOAD {
use vars '$AUTOLOAD';
my $field;
($field = uc($AUTOLOAD)) =~ s/.*:://;
if (!defined wantarray) {
require Carp;
Carp::confess("Undefined function call: $AUTOLOAD");
}
$Msg->$field();
}
sub bogus_address {
my $address = from();
if ($address !~ /\@\w.*\.\w/) {
print "From address must contain an at sign, etc.\n";
return;
}
if ($address =~ /(remove|spam)/i) {
print "Munged return address suspected, found `$1' in from.\n";
}
ck822($address); # inscrutable
my($host) = $address =~ /\@([a-zA-Z0-9_.-]+)/;
dns_check($host); # very slow!
}
sub control_characters {
my $lineno = 0;
my $MAX_CONTROL = 5;
for (lines()) {
$lineno++;
if (/(?=[^\s\b])([\000-\037])/) {
printf "Control character (char %#o) appears at line %d of body.\n",
ord $1, $lineno;
}
if (/([\202-\237])/) {
printf "Evil and rude MS-ASCII character (char %#o) appears at line %d of
body.\n",
ord $1, $lineno;
}
last if --$MAX_CONTROL < 0;
}
}
sub lines_too_long {
my $MAX_LINE_LEN = 80;
my $line_count = scalar @{ [ lines() ] };
my ($long_lines, $longest_line_data, $longest_line_number) = (0,'',0);
my $lineno = 0;
for (lines()) {
$lineno++;
next if /^[>+-]/; # skip quotes and patch diffs
if (length() > $MAX_LINE_LEN) {
$long_lines++;
if (length() > length($longest_line_data)) {
$longest_line_data = $_;
$longest_line_number = $lineno;
}
}
}
if ($long_lines) {
printf "%d of %d lines exceed maxlen %d, ",
$long_lines, $line_count, $MAX_LINE_LEN;
printf "longest is #%d at %d bytes\n",
$longest_line_number, length($longest_line_data);
}
}
sub missing_headers {
if (subject() !~ /\S/) {
print "Missing required subject header.\n";
}
if (newsgroups() && subject() =~ /^\s*Re:/i && !references()) {
print "Followup posting missing required references header.\n";
}
}
sub allcap_subject {
my $subject = subject();
$subject =~ s/^(\s*Re:\s*)+//i;
if ($subject !~ /[a-z]/) {
print "No lower-case letters in subject header.\n";
}
}
sub miswrapped {
my($bq1, $bq2);
for (paragraphs()) {
next unless /\A(^\S.*){2,}\Z/ms; # no indented code
if (!$bq1 && /^>\S.*\n\s*[a-zA-Z]/) {
print "Incorrectly wrapped quoted text.\n";
$bq1++;
}
next if $bq2;
my $count = 0;
while (/^[^>].{60,}\n[^>].{1,20}\n(?=[^>].{60,}\n)/gm) {
$count++;
}
if ($count > 1) {
print "Incorrectly wrapped regular text.\n";
$bq2++;
### print "OOPS count = $count:\n$_\n\n";
}
}
}
sub jeopardy_quoted {
local $_ = body();
$_ = unquote_wrap($_);
$_ = strip_signature($_);
$_ = strip_attribution($_);
# check quotation at bottom but nowhere else
# XXX: these can go superlong superlong! i've added
# some more anchors and constraints to try to avoid this,
# but I still mistrust it
if (/ ((^\s*>.*){2,}) \s* \Z/mx
&&
!/ (\n>.*?)+ (\n[^>].*?)+ (\n>.*?)+ /x )
{
print "Quote follows response, Jeopardy style #1\n";
}
# completely at bottom
elsif (/^.* wr(?:ote|ites):\n(>.*\n)+\s*\Z/m) {
print "Quote follows response, Jeopardy style #2\n";
}
# another way of saying the same
elsif (/^(?:>+\s*)?-[_+]\s*Original Message\s*-[_+]\s.*\Z/ms) {
print "Quote follows response, Jeopardy style #3\n";
}
# another way of saying the same
elsif (/^(?:>+\s*)?[_-]+\s*Reply Separator\s*[_-]+\s.*\Z/ms) {
print "Quote follows response, Jeopardy style #4\n";
}
}
sub overquoted {
# cfoq: check fascistly overquoted by [EMAIL PROTECTED]
# (wants perl 5.0 or better; developed under 5.002)
my (
$total, # total number of lines, minus sig and attribution
$quoted_lines, # how many lines were quoted
$percent, # what percentage this in
$pcount, # how many in this paragraph were counted
$match_part, # holding space for current match
$gotsig, # is this the sig paragraph?
);
$total = $quoted_lines = $pcount = $percent = 0;
my $MINLINES = 20;
my $TOLERANCE = 50;
my $VERBOSE = 0;
if (body() =~ /^-+\s*Original Message\s*-+$/m) {
my $body = strip_signature(body());
my($text,$quote) = body() =~ /(.*)(^-+\s*Original Message\s*-+.*\Z)/ms;
$total = ((my $x = body()) =~ y/\n//);
$quoted_lines = ($quote =~ y/\n//);
}
else {
for (paragraphs()) {
# strip sig line, remember we found it
$gotsig = s/^-- \n.*//ms;
# strip attribution, possibly multiline
if ($. == 2) { s/\A.*?(<.*?>|\@).*?:\n//s }
# toss trailing blank lines into one single line
s/\n+\Z/\n/;
# now reduce miswrapped lines from idiotic broken PC newsreaders
# into what they should have been
s/(>.*)\n\s*([a-zA-Z])/$1 $2/g;
# count lines in this paragraph
$total++ while /^./mg;
# is it a single line, quoted in the customary fashion?
if ( /^(>+).*\n\Z/ ) {
$quoted_lines++;
print " 1 line quoted with $1\n" if $VERBOSE;
next;
}
# otherwise, it's a multiline block, which may be quoted
# with any leading repeated string that's neither alphanumeric
# nor string
while (/^(([^\w\s]+).*\n)(\2.*\n)+/mg) { # YANETUT
$quoted_lines += $pcount = ($match_part = $&) =~ tr/\n//;
printf "%2d lines quoted with $2\n", $pcount if $VERBOSE;
}
last if $gotsig;
}
}
$percent = int($quoted_lines / $total * 100);
if ($total == $quoted_lines) {
print "All $total lines were quoted lines!\n";
# not ok
}
elsif ($percent > $TOLERANCE && $total > $MINLINES) {
print "Overquoted: $quoted_lines lines quoted out of $total: $percent%\n";
}
}
sub unquote_wrap {
my $chunk = shift;
# reduce miswrapped lines from idiotic broken PC newsreaders
# into what they should have been
$chunk =~ s/(>.*)\n\s*([a-zA-Z])/$1 $2/g;
return $chunk;
}
sub good_signature {
my $MAX_SIGLINES = 4;
my $sig = '';
my($is_canon, $separator);
my $body = body();
# sometimes the ms idiotware quotes at the bottom this way
$body =~ s/^-+\s*Original Message\s*-+\s.*\Z//ms;
# first check regular signature
if ($body =~ /\n-- \n(.*)/s) {
$sig = $1;
$is_canon = 1;
}
elsif ($body =~ /\n([_-]{2,}\s*)\n(.*?)$/s) {
$separator = $1;
$sig = $2;
}
for ($separator, $sig) { s/\n\Z// if defined }
my $siglines = $sig =~ tr/\n//;
if ($separator && ($siglines && $siglines < 20)) {
if ($separator eq '--') {
print "Double-dash in signature missing trailing space.\n";
} else {
print "Non-canonical signature separator: `$separator'\n";
}
}
if ($siglines > $MAX_SIGLINES && $siglines < 20) {
printf "Signature too long: %d lines\n", $siglines;
}
}
sub strip_signature {
local $_ = shift;
s/\n-- \n(.*)//s
||
s/\n([_-]{2,}\s*)\n(.*?)$//s;
return $_;
}
sub attribution {
local $_ = body();
s/^\s*\[.*\]\s*//; # remove [courtesy cc]
if (/\A(.*wr(?:ote|ites):?)\n/) {
return $1;
} elsif (/\A(.*?(<.*?>|\@).*?:)\n/s) {
return $1;
} else {
return '';
}
}
sub strip_attribution {
local $_ = shift;
s/^\s*\[.*\]\s*//; # remove [courtesy cc]
# XXX: duplicate code with previous function
s/\A(.*wr(?:ote|ites):?)\n//
||
s/\A(.*?(<.*?>|\@).*?:)\n//s;
return $_;
}
sub annoying_subject {
local $_ = subject();
if ( / ( [?!]{3,} ) /x ||
/ ( HELP ) /x ||
/ ( PLEASE ) /x
)
{
print "Subject line contains annoying `$1' in it.\n";
}
}
sub mimes {
my $mime_crap = 0;
for (content_type()) {
last unless defined;
$mime_crap++;
if (/multipart/i) {
print "Multipart MIME detected.\n";
}
elsif (/html/i) {
print "HTML encrypting detected.\n";
}
elsif (! (/^text$/i || m#^text/plain#i)) {
print "Strange content type detected: $_\n";
}
}
for (content_transfer_encoding()) {
last unless defined;
if (/quoted-printable/i) {
print "Gratuitously quoted-illegible MIMEing detected.\n";
}
}
unless ($mime_crap) {
for (body()) {
if (/\A\s*This message is in MIME format/i) {
print "Gratuitous but unadvertised MIME detected.\n";
}
elsif ( /\A\s*This is a multi-part message in MIME format/i) {
print "Unadvertised multipart MIME detected.\n";
}
}
}
}
sub dns_check {
my $NSLOOKUP = 'nslookup'; # or /usr/ucb?
# first try an MX record, then an A rec (for badly configged hosts)
my $host = shift;
local $/ = undef;
local *NS;
local $_;
# the following is commented out for security reasons:
# if ( `nslookup -query=mx $host` =~ /mail exchanger/
# otherwise there could be naughty bits in $host
# we'll bypass system() and get right at execvp()
my $pid;
if ($pid = open(NS, "-|")) {
$_ = <NS>;
kill 'TERM', $pid if $pid; # just in case
close NS;
return if /mail exchanger/;
# else fall trohugh to next test
} else {
die "cannot fork: $!" unless defined $pid;
open(SE, ">&STDERR");
open(STDERR, ">/dev/null");
{ exec $NSLOOKUP, '-query=mx', $host; } # braces for -w
open(STDERR, ">&SE");
die "can't exec nslookup: $!";
}
if ($pid = open(NS, "-|")) {
$_ = <NS>;
kill 'TERM', $pid if $pid; # just in case
close NS;
unless (/answer:.*Address/s || /Name:.*$host.*Address:/si) {
print "No DNS for \@$host in return address.\n";
}
} else {
die "cannot fork: $!" unless defined $pid;
open(SE, ">&STDERR");
open(STDERR, ">/dev/null");
{ exec $NSLOOKUP, '-query=a', $host; } # braces for -w
open(STDERR, ">&SE");
die "can't exec nslookup: $!";
}
}
sub ck822 {
# ck822 -- check whether address is valid rfc 822 address
# [EMAIL PROTECTED]
#
# pattern developed in program by jfriedl;
# see "Mastering Regular Expressions" from ORA for details
# this will error on something like "ftp.perl.com." because
# even though dns wants it, rfc822 hates it. shucks.
my $what = 'address';
my $address = shift;
local $_;
my $is_a_valid_rfc_822_addr;
($is_a_valid_rfc_822_addr = <<'EOSCARY') =~ s/\n//g;
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\
xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"
]|\\[^\x80-\xff])*")(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[
^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\
xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;
:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))
*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\
n\015()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\
\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\04
0)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-
\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80
-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\(
(?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]
\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\
\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*|(?:[^(\040)<>@,;:".\\\[\]\000-\0
37\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xf
f\n\015"]|\\[^\x80-\xff])*")(?:[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\03
7]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\
\[^\x80-\xff])*\))*\)|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")*<(?:[\04
0\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]
|\\[^\x80-\xff])*\))*\))*(?:@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x
80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@
,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]
)|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\
\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff
])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-
\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-
\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\01
5()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*,(?
:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\0
15()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^
\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xf
f])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\x
ff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:
[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\00
0-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x8
0-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)*:(?:[\040\t]|\((?:[^\\\x80-\xff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*)
?(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")(?:(?:[\040\t]
|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[
^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\
\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:
[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*@
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff
]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\
]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\
xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80
-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@
,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff
])*\]))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x8
0-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*>)(?:[\040\t]|\((?:[^\\\x80-\xff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*
EOSCARY
if ($address !~ /^${is_a_valid_rfc_822_addr}$/o) {
print "rfc822 failure on $address";
}
}
##############################
package Mail_Message;
use Carp;
use vars qw/$AUTOLOAD/;
# process <ARGV> for a message header and body
# currently this assumes one message per file!
sub main::get_message {
my $msg = bless {}, __PACKAGE__;
local $/ = '';
$msg->{HEADER_STRING} = <>;
chomp $msg->{HEADER_STRING};
for (split /\n(?!\s)/, $msg->{HEADER_STRING}) {
my($tag, $value) = /^([^\s:]+):\s*(.*)\s*\Z/s;
push @{ $msg->{HEADERS}{$tag} }, $value;
$tag =~ tr/-/_/;
$tag = uc($tag);
push @{ $msg->{$tag} }, $value;
}
local $/ = undef;
for ($msg->{BODY} = <>) {
$msg->{PARAGRAPHS} = [ split /\n\n+/ ];
$msg->{LINES} = [ split /\n/ ];
}
return $msg;
}
sub AUTOLOAD {
use vars '$AUTOLOAD';
my $self = shift;
my $field;
($field = uc($AUTOLOAD)) =~ s/.*:://;
my $xfield = "x_" . $field;
if (!exists $self->{$field} && exists $self->{$xfield}) {
$field = $xfield;
}
unless (exists $self->{$field}) {
return undef;
# NOT REACHED
confess "No field $field in message";
}
my $data = $self->{$field};
my @data = ref $data ? @$data : $data;
if (wantarray) {
return @data;
}
else {
return join("\n", @data);
}
}