Re: [admin] NO HTML posts please!

Tom Christiansen Thu, 30 Mar 2000 12:06:36 -0800
>how about avoiding the me toos...

[44 lines of crud we've already seen before]

Hm... something wrong's with that message...

    % msgchk `mhpath cur`
    Quote follows response, Jeopardy style #2
    Overquoted: 30 lines quoted out of 41: 73%
    Non-canonical signature separator: 
`--------------------------------------------------------------------------'
    Signature too long: 7 lines

More than just *some*thing: that's four different violations.  

    print "BAD!\n" for 1..4;

--tom

#!/usr/bin/perl -w
# msgchk - check mail and news messages for netiquette violations
# [EMAIL PROTECTED]

use strict;

my $Msg = get_message();

missing_headers();
bogus_address();
allcap_subject();
annoying_subject();

mimes();

lines_too_long();
control_characters();

miswrapped();
jeopardy_quoted();
overquoted();

good_signature();

exit(0);

#######################

sub AUTOLOAD {
    use vars '$AUTOLOAD';
    my $field;
    ($field = uc($AUTOLOAD)) =~ s/.*:://;
    if (!defined wantarray) {
        require Carp;
        Carp::confess("Undefined function call: $AUTOLOAD");
    } 
    $Msg->$field();
} 

sub bogus_address {
    my $address = from();

    if ($address !~ /\@\w.*\.\w/) {
        print "From address must contain an at sign, etc.\n";
        return;
    } 

    if ($address =~ /(remove|spam)/i) {
        print "Munged return address suspected, found `$1' in from.\n";
    } 

    ck822($address);    # inscrutable

    my($host) = $address =~ /\@([a-zA-Z0-9_.-]+)/;
    dns_check($host);   # very slow!
}

sub control_characters {
    my $lineno = 0;
    my $MAX_CONTROL = 5;

    for (lines()) { 
        $lineno++;
        if (/(?=[^\s\b])([\000-\037])/) {
            printf "Control character (char %#o) appears at line %d of body.\n", 
                ord $1, $lineno;
        }

        if (/([\202-\237])/) {
            printf "Evil and rude MS-ASCII character (char %#o) appears at line %d of 
body.\n", 
                ord $1, $lineno;
        } 
        last if --$MAX_CONTROL < 0;
    }

} 

sub lines_too_long {
    my $MAX_LINE_LEN = 80;
    my $line_count = scalar @{ [ lines() ] };
    my ($long_lines, $longest_line_data, $longest_line_number) = (0,'',0);
    my $lineno = 0;
    for (lines()) {
        $lineno++;
        next if /^[>+-]/;  # skip quotes and patch diffs
        if (length() > $MAX_LINE_LEN) {
            $long_lines++;
            if (length() > length($longest_line_data)) {
                $longest_line_data = $_;
                $longest_line_number = $lineno;
            } 
        } 
    } 
    if ($long_lines) {
        printf "%d of %d lines exceed maxlen %d, ",
           $long_lines, $line_count, $MAX_LINE_LEN;
        printf "longest is #%d at %d bytes\n",
            $longest_line_number, length($longest_line_data);
    } 
}

sub missing_headers {
    if (subject() !~ /\S/) {
        print "Missing required subject header.\n";
    } 
    if (newsgroups() && subject() =~ /^\s*Re:/i && !references()) {
        print "Followup posting missing required references header.\n";
    } 
}

sub allcap_subject {
    my $subject = subject();
    $subject =~ s/^(\s*Re:\s*)+//i;
    if ($subject !~ /[a-z]/) {
        print "No lower-case letters in subject header.\n";
    } 
}

sub miswrapped {
    my($bq1, $bq2);
    for (paragraphs()) {
        next unless /\A(^\S.*){2,}\Z/ms;  # no indented code

        if (!$bq1 && /^>\S.*\n\s*[a-zA-Z]/) { 
            print "Incorrectly wrapped quoted text.\n";
            $bq1++;
        }

        next if $bq2;

        my $count = 0;
        while (/^[^>].{60,}\n[^>].{1,20}\n(?=[^>].{60,}\n)/gm) {
            $count++;
        } 

        if ($count > 1) {
            print "Incorrectly wrapped regular text.\n";
            $bq2++;
            ### print "OOPS count = $count:\n$_\n\n";
        } 
    } 
}

sub jeopardy_quoted {
    local $_ = body();
    $_ = unquote_wrap($_);

    $_ = strip_signature($_);
    $_ = strip_attribution($_);

    # check quotation at bottom but nowhere else
    # XXX: these can go superlong superlong!  i've added
    #      some more anchors and constraints to try to avoid this,
    #      but I still mistrust it

    if (/ ((^\s*>.*){2,}) \s* \Z/mx   
        && 
        !/ (\n>.*?)+ (\n[^>].*?)+ (\n>.*?)+ /x ) 
    {
        print "Quote follows response, Jeopardy style #1\n";
    } 

    # completely at bottom 
    elsif (/^.* wr(?:ote|ites):\n(>.*\n)+\s*\Z/m) {
        print "Quote follows response, Jeopardy style #2\n";
    } 

    # another way of saying the same
    elsif (/^(?:>+\s*)?-[_+]\s*Original Message\s*-[_+]\s.*\Z/ms) {
        print "Quote follows response, Jeopardy style #3\n";
    }

    # another way of saying the same
    elsif (/^(?:>+\s*)?[_-]+\s*Reply Separator\s*[_-]+\s.*\Z/ms) {
        print "Quote follows response, Jeopardy style #4\n";
    }

}

sub overquoted {

    # cfoq: check fascistly overquoted by [EMAIL PROTECTED]
    #   (wants perl 5.0 or better; developed under 5.002)

    my (
        $total,         # total number of lines, minus sig and attribution
        $quoted_lines,  # how many lines were quoted
        $percent,       # what percentage this in
        $pcount,        # how many in this paragraph were counted
        $match_part,    # holding space for current match
        $gotsig,        # is this the sig paragraph?
    );

    $total = $quoted_lines = $pcount = $percent = 0;

    my $MINLINES  = 20;
    my $TOLERANCE = 50; 
    my $VERBOSE   = 0;

    if (body() =~ /^-+\s*Original Message\s*-+$/m) {
        my $body = strip_signature(body());
        my($text,$quote) = body() =~ /(.*)(^-+\s*Original Message\s*-+.*\Z)/ms;
        $total = ((my $x = body()) =~ y/\n//);
        $quoted_lines = ($quote =~ y/\n//);
    } 
    else { 
        for (paragraphs()) {

            # strip sig line, remember we found it
            $gotsig = s/^-- \n.*//ms;

            # strip attribution, possibly multiline
            if ($. == 2) { s/\A.*?(<.*?>|\@).*?:\n//s }  

            # toss trailing blank lines into one single line
            s/\n+\Z/\n/;

            # now reduce miswrapped lines from idiotic broken PC newsreaders
            # into what they should have been
            s/(>.*)\n\s*([a-zA-Z])/$1 $2/g;

            # count lines in this paragraph
            $total++ while  /^./mg;

            # is it a single line, quoted in the customary fashion?
            if ( /^(>+).*\n\Z/ ) {
                $quoted_lines++;
                print " 1 line quoted with $1\n" if $VERBOSE;
                next;
            } 

            # otherwise, it's a multiline block, which may be quoted
            # with any leading repeated string that's neither alphanumeric
            # nor string
            while (/^(([^\w\s]+).*\n)(\2.*\n)+/mg) {  # YANETUT
                $quoted_lines += $pcount = ($match_part = $&) =~ tr/\n//;
                printf "%2d lines quoted with $2\n", $pcount    if $VERBOSE;
            } 

            last if $gotsig;
        } 

    }

    $percent = int($quoted_lines / $total * 100);

    if ($total == $quoted_lines) {
        print "All $total lines were quoted lines!\n";
        # not ok
    } 
    elsif ($percent > $TOLERANCE && $total > $MINLINES) {
        print "Overquoted: $quoted_lines lines quoted out of $total: $percent%\n";
    } 

}

sub unquote_wrap {
    my $chunk = shift;
    # reduce miswrapped lines from idiotic broken PC newsreaders
    # into what they should have been
    $chunk =~ s/(>.*)\n\s*([a-zA-Z])/$1 $2/g;
    return $chunk;
} 

sub good_signature {
    my $MAX_SIGLINES = 4;

    my $sig = '';
    my($is_canon, $separator);

    my $body = body();

    # sometimes the ms idiotware quotes at the bottom this way
    $body =~ s/^-+\s*Original Message\s*-+\s.*\Z//ms;

    # first check regular signature
    if ($body =~ /\n-- \n(.*)/s) {
        $sig = $1;
        $is_canon = 1;
    } 
    elsif ($body =~ /\n([_-]{2,}\s*)\n(.*?)$/s) {
        $separator = $1;
        $sig = $2;
    } 

    for ($separator, $sig) { s/\n\Z// if defined }

    my $siglines = $sig =~ tr/\n//;

    if ($separator && ($siglines && $siglines < 20)) { 
        if ($separator eq '--') {
            print "Double-dash in signature missing trailing space.\n";
        } else { 
            print "Non-canonical signature separator: `$separator'\n";
        }
    }

    if ($siglines > $MAX_SIGLINES && $siglines < 20) { 
        printf "Signature too long: %d lines\n", $siglines;
    }
} 

sub strip_signature {
    local $_ = shift;

    s/\n-- \n(.*)//s
        ||
    s/\n([_-]{2,}\s*)\n(.*?)$//s;

    return $_;
} 

sub attribution {
    local $_ = body();
    s/^\s*\[.*\]\s*//;  # remove [courtesy cc]
    if (/\A(.*wr(?:ote|ites):?)\n/) {
        return $1;
    } elsif (/\A(.*?(<.*?>|\@).*?:)\n/s) {
        return $1;
    } else {
        return '';
    } 
} 

sub strip_attribution {
    local $_ = shift;

    s/^\s*\[.*\]\s*//;  # remove [courtesy cc]

    # XXX: duplicate code with previous function
    s/\A(.*wr(?:ote|ites):?)\n// 
        ||
    s/\A(.*?(<.*?>|\@).*?:)\n//s;

    return $_;
}

sub annoying_subject {
    local $_ = subject();

    if ( / ( [?!]{3,} ) /x   ||
         / ( HELP     ) /x   ||
         / ( PLEASE   ) /x
       ) 
    {
        print "Subject line contains annoying `$1' in it.\n";
    } 
}

sub mimes {

    my $mime_crap = 0;

    for (content_type()) { 
        last unless defined;
        $mime_crap++;
        if (/multipart/i) {
            print "Multipart MIME detected.\n";
        } 
        elsif (/html/i) {
            print "HTML encrypting detected.\n";
        } 
        elsif (! (/^text$/i || m#^text/plain#i)) {
            print "Strange content type detected: $_\n";
        } 
    }

    for (content_transfer_encoding()) { 
        last unless defined;
        if (/quoted-printable/i) {
            print "Gratuitously quoted-illegible MIMEing detected.\n";
        } 
    }

    unless ($mime_crap) {
        for (body()) { 
            if (/\A\s*This message is in MIME format/i) {
                print "Gratuitous but unadvertised MIME detected.\n";
            } 
            elsif ( /\A\s*This is a multi-part message in MIME format/i) {
                print "Unadvertised multipart MIME detected.\n";
            } 
        }
    } 


}

sub dns_check {
    my $NSLOOKUP = 'nslookup';  # or /usr/ucb?

    # first try an MX record, then an A rec (for badly configged hosts)

    my $host = shift;
    local $/ = undef;
    local *NS;
    local $_;

    # the following is commented out for security reasons:
    #   if ( `nslookup -query=mx $host` =~ /mail exchanger/
    # otherwise there could be naughty bits in $host
    # we'll bypass system() and get right at execvp()

    my $pid;

    if ($pid = open(NS, "-|")) {
        $_ = <NS>;
        kill 'TERM', $pid if $pid;  # just in case
        close NS;
        return if /mail exchanger/;
        # else fall trohugh to next test
    } else {
        die "cannot fork: $!" unless defined $pid;
        open(SE, ">&STDERR");
        open(STDERR, ">/dev/null");
        { exec $NSLOOKUP, '-query=mx', $host; }  # braces for -w
        open(STDERR, ">&SE");
        die "can't exec nslookup: $!";
    } 

    if ($pid = open(NS, "-|")) {
        $_ = <NS>;
        kill 'TERM', $pid if $pid;  # just in case
        close NS;
        unless (/answer:.*Address/s || /Name:.*$host.*Address:/si) {
            print "No DNS for \@$host in return address.\n";
        }
    } else { 
        die "cannot fork: $!" unless defined $pid;
        open(SE, ">&STDERR");
        open(STDERR, ">/dev/null");
        { exec $NSLOOKUP, '-query=a', $host; }  # braces for -w
        open(STDERR, ">&SE");
        die "can't exec nslookup: $!";
    }

} 


sub ck822 { 

    # ck822 -- check whether address is valid rfc 822 address
    # [EMAIL PROTECTED]
    #
    # pattern developed in program by jfriedl; 
    # see "Mastering Regular Expressions" from ORA for details

    # this will error on something like "ftp.perl.com." because
    # even though dns wants it, rfc822 hates it.  shucks.

    my $what = 'address';

    my $address = shift;
    local $_;

    my $is_a_valid_rfc_822_addr;

    ($is_a_valid_rfc_822_addr = <<'EOSCARY') =~ s/\n//g;
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\
xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"
]|\\[^\x80-\xff])*")(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[
^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\
xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;
:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))
*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\
n\015()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\
\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\04
0)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-
\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80
-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\(
(?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]
\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\
\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*|(?:[^(\040)<>@,;:".\\\[\]\000-\0
37\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:[^\\\x80-\xf
f\n\015"]|\\[^\x80-\xff])*")(?:[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\03
7]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\
\[^\x80-\xff])*\))*\)|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")*<(?:[\04
0\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]
|\\[^\x80-\xff])*\))*\))*(?:@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x
80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@
,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]
)|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\
\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff
])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-
\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-
\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\01
5()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*,(?
:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\0
15()]|\\[^\x80-\xff])*\))*\))*@(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^
\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xf
f])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^
\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\x
ff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:
[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\00
0-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x8
0-\xff\n\015\[\]]|\\[^\x80-\xff])*\]))*)*:(?:[\040\t]|\((?:[^\\\x80-\xff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*)
?(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000
-\037\x80-\xff])|"(?:[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*")(?:(?:[\040\t]
|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[
^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xf
f]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\
\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"(?:
[^\\\x80-\xff\n\015"]|\\[^\x80-\xff])*"))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*@
(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n
\015()]|\\[^\x80-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff
]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\
]]|\\[^\x80-\xff])*\])(?:(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\
xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*\.(?:[\040\t]|\((?
:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80
-\xff])*\))*\))*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@
,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff
])*\]))*(?:[\040\t]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff]|\((?:[^\\\x8
0-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*>)(?:[\040\t]|\((?:[^\\\x80-\xff\n\
015()]|\\[^\x80-\xff]|\((?:[^\\\x80-\xff\n\015()]|\\[^\x80-\xff])*\))*\))*
EOSCARY

    if ($address !~ /^${is_a_valid_rfc_822_addr}$/o) { 
        print "rfc822 failure on $address"; 
    }
}


##############################

package Mail_Message;

use Carp;

use vars qw/$AUTOLOAD/;

# process <ARGV> for a message header and body
# currently this assumes one message per file!
sub main::get_message {
    my $msg = bless {}, __PACKAGE__;
    local $/ = '';
    $msg->{HEADER_STRING} = <>;
    chomp $msg->{HEADER_STRING};
    for (split /\n(?!\s)/, $msg->{HEADER_STRING}) {
        my($tag, $value) = /^([^\s:]+):\s*(.*)\s*\Z/s;
        push @{ $msg->{HEADERS}{$tag} }, $value;
        $tag =~ tr/-/_/;
        $tag = uc($tag);
        push @{ $msg->{$tag} }, $value;
    } 
    local $/ = undef;
    for ($msg->{BODY} = <>) { 
        $msg->{PARAGRAPHS} = [ split /\n\n+/ ];
        $msg->{LINES}      = [ split /\n/    ];
    }

    return $msg;
} 

sub AUTOLOAD {
    use vars '$AUTOLOAD';
    my $self = shift;
    my $field;
    ($field = uc($AUTOLOAD)) =~ s/.*:://;
    my $xfield = "x_" . $field;

    if (!exists $self->{$field} && exists $self->{$xfield}) {
        $field = $xfield;
    } 

    unless (exists $self->{$field}) {
        return undef;
        # NOT REACHED
        confess "No field $field in message";
    } 

    my $data = $self->{$field};
    my @data = ref $data ? @$data : $data;

    if (wantarray) { 
        return @data;
    }
    else {
        return join("\n", @data);
    } 

}
Re: [admin] NO HTML posts please!

Reply via email to