|> Nov 26 17:22:12 zamboni mailme: Now throwing back sea turtles.
|
|Sea turtles?
"Sea turtles" is the term I've given to crossposted messages. It is used
in the function greenpeace, below. By the way, the backlog is down to
7500 messages and falling slowly.
Jeff
========================
#!/bin/bash
####################################################
#
# This program retrieves, sorts, and archives
# mailing lists. It uses MH and MHonArc.
#
# In a nutshell: Get new mail, look at the first
# message in the inbox, and decide what list it
# belongs to. Take it and all similar messages
# and archive them.
#
#
# Time-stamp: <Fri 09/15/2000 20:02:13>
#
####################################################
######################################
####### set globals #########
######################################
# Start from a clean slate for the state variables filled in further down.
unset NEWLIST REDCARD MAILLIST EXISTING_LISTS

# Search path: local helpers first, then the inherited path, then MH.
MHPATH=/usr/bin/mh
MYPATH=/home/archive/bin
PATH=${MYPATH}:${PATH}:${MHPATH}:/usr/local/bin

# MH commands. Each value is a command plus arguments and is expanded
# UNQUOTED at the call sites so that it splits into separate words.
INBOX=inbox
FOLDER="quietly folder"
REFILE="quietly refile -unlink"
PICK="quietly pick"
INC="quietly inc -silent"
SHOW="show -noheader -nocheckmime -showproc cat"
GREP=grep
EGREP=egrep
MHONARC=/usr/bin/mhonarc

# Per-run scratch folder: timestamp plus PID.
# %M is minutes; the previous %m repeated the month in the minutes slot.
FILTER=filter.$(date +%Y.%m.%d-%H:%M:%S)-$$

# Directory layout.
CONFDIR=/home/archive/conf
MAIL=/home/archive/Mail
VAULT=/home/archive/vault
ARCHIVE=/home/archive/archive

# Message-range argument handed to pick in the cleanup functions.
BATCHSIZE=first:7000
ACTIVELISTS=/home/archive/archive/activelists.txt
######################################
### functions for error handling ###
######################################
# Send one message to the system log via logger(1), tagged "mailme",
# on facility local0 at the given level.
# Usage: echok <err|notice|info> message
# Example usage: echok notice "Shields are down to 14 percent."
echok () {
    local level=$1
    local text=$2
    logger -t mailme -p "local0.$level" "$text"
}
# Diverts mail from processing area to a safe place.
# Example usage: emergency_divert LIMBO "You are not worthy!"
emergency_divert() {
[ "$2" ] && echok err "$2"
echok err "Now diverting mail to $1 folder."
$FOLDER -create +$1
$REFILE -src +$FILTER `$PICK +$FILTER` +$1
rm -f $MAIL/$FILTER/.mh_sequences
rmdir $MAIL/$FILTER
}
# MH commands are very chatty, and clutter up the
# logs. There is no -quiet option, unfortunately.
# This runs a command and filters the routine chatter out of its
# stdout ("<folder> has <n> messages") and stderr ("no messages match
# specification", "no mail to incorporate", "no messages in <folder>").
# Everything else is passed through on the original stream.
#
# Globals read: EGREP, INBOX, FILTER
#
# Example usage:
# quietly pick -src inbox -subject "some subject"
# quietly refile -src inbox 72 +somewhere
# quietly folder -create somewhere
# quietly inc -silent +inbox
quietly() {
    local errlog
    # Unpredictable name: a fixed /tmp/mm-quietly.$$ could be pre-created
    # (or symlinked) by another local user.
    errlog=$(mktemp "${TMPDIR:-/tmp}/mm-quietly.XXXXXX") || return 1
    "$@" 2> "$errlog" | $EGREP -v "^..* has ..* message..*$"
    # The alternation is grouped so that only the exact routine messages
    # are dropped; "^(pick)|(refile): ..." matched ANY line starting
    # with "pick" and so hid real pick errors.
    $EGREP -v \
        -e "^(pick|refile): no messages match specification$" \
        -e "^inc: no mail to incorporate$" \
        -e "^(pick|refile): no messages in ($INBOX|$FILTER)$" \
        "$errlog" 1>&2
    rm -f "$errlog"
}
################################################
### functions involving email headers #####
################################################
# Read a message on stdin and print its header block, folded to lower case.
# Relies on 822header from the mess822 package (http://cr.yp.to/mess822.html).
# Example usage: cat message | get_headers
get_headers () {
    822header |
        tr '[:upper:]' '[:lower:]'
}
# Read message headers on stdin and print the field named by $1.
# Relies on 822field from the mess822 package (http://cr.yp.to/mess822.html).
# Example usage: cat messageheaders | grab "reply-to"
grab() {
    local field=$1
    # Left unquoted on purpose: an empty name passes no argument at all.
    822field $field
}
# Get all email addresses, and precede them with a caret.
#
# Pipeline: break stdin into one token per line on separators and
# punctuation, keep tokens containing '@', strip a leading "owner-",
# prefix a caret, apply the site rewrite rules in $CONFDIR/aliases,
# strip one trailing dot, and finally drop local/administrative
# addresses and tokens with an empty user or host part.
#
# Example usage: cat RMAIL | waterfall
waterfall () {
    tr -s '[\\/;:[:space:][:cntrl:],`()<>|\042\047]' '[\n*]' |
        grep @ |
        sed -e 's/^owner-//g' -e 's/^/^/g' -f $CONFDIR/aliases -e 's/\.$//g' |
        # By this point every line starts with a literal caret, so the
        # start-anchored filters must allow for it ("\^*"); the bare
        # '^majordomo@' form could never match a careted line.
        grep -v -e '@localhost' -e '@127.0.0.1' -e '^\^*majordomo@' \
            -e '^\^*majordom@' -e '^\^*postmaster@' -e '^\^*qmailr@' \
            -e '^\^*@' -e '@$'
}
# Remove self references
# Example usage: cat RMAIL | grep_out_self
[EMAIL PROTECTED]
[EMAIL PROTECTED]
[EMAIL PROTECTED]
# Filter stdin, dropping every line that matches one of the archiver's
# own addresses ($ARCHIVER1..$ARCHIVER3, used as extended regexps).
grep_out_self () {
    local self_re="($ARCHIVER1)|($ARCHIVER2)|($ARCHIVER3)"
    $EGREP -v -e "$self_re"
}
########################################################################
#
# Functions for doing matching of email addresses. All functions take
# a list of addresses to match against, and a list of data. They
# answer the question, "Ok, I have good warm feeling about the
# contents of $1. Can I find something in $2 that I feel good about,
# too?" We only return one match (unless otherwise noted) and there
# aren't any guarantees about which one we will return if there are
# multiple possible correct answers. Some effort will be made towards
# favoring items at the top of lists, but don't count on this. If
# nothing feels good, nothing is returned.
#
# Example usage: function "$EXISTING_LISTS" "$NEW_DATA"
#
########################################################################
# Find all exact matches.
#   $1 - newline-separated strings to look for (fixed strings, not regexps)
#   $2 - newline-separated data to search
# Prints every line of $2 that contains one of the strings, sorted and
# de-duplicated. Prints nothing when $1 is empty.
all_matches () {
    if [ -z "$1" ]
    then
        return
    fi
    echo "$2" |
        $GREP -F "$1" |
        sort -u
}
# Find exact first match (exact_match, here) and the additional exact
# matches (additional_matches, below). Both build on all_matches.
# exact_match prints only the first line that all_matches produces.
exact_match () {
    all_matches "$1" "$2" |
        head -n 1
}
# Print every exact match except the first one, i.e. whatever
# all_matches finds beyond the line exact_match would return.
additional_matches () {
    # "-n +2" is the portable spelling; the bare "+2" form is obsolete and
    # is treated as a file name by some tail implementations.
    all_matches "$1" "$2" | tail -n +2
}
# Usage example: fuzzy_match "$EXISTING_LISTS" "$NEW_DATA" [--dont-double-check-results]
#
# Picks the first line of $1 that contains both the "user@" part of some
# address in $2 and the host part of some address in $2. Those two
# parts may come from different addresses, so unless
# --dont-double-check-results is given the candidate is only printed if
# a single address in $2 supplies both its user prefix and its host
# suffix. At most one line is printed.
fuzzy_match () {
    local users domains cand addr user host
    users=$(echo "$2" | sed 's/@/@ /g' | cut -f1 -d" ")
    domains=$(echo "$2" | cut -f2 -d@)
    [ ! "$users" -o ! "$domains" ] && return
    cand=$(echo "$1" | $GREP -F "$users" | $GREP -F "$domains" | head -n 1)
    [ ! "$cand" ] && return
    if [ "$3" = "--dont-double-check-results" ]
    then
        # The caller asked for the raw candidate. Previously nothing at
        # all was printed on this path, which left fluffy_match without
        # any input.
        echo "$cand"
        return
    fi
    # Check for false positive: $1 is a@b, $2 is a@x y@b
    for addr in $2
    do
        user=$(echo "$addr" | sed 's/@/@ /g' | cut -f1 -d" ")
        host=$(echo "$addr" | cut -f2 -d@)
        if echo "$cand" | $GREP -e "^$user" | $GREP -q -e "$host"'$'
        then
            # One confirmation is enough; stop so the candidate is not
            # printed once per confirming address.
            echo "$cand"
            return
        fi
    done
}
# Usage example: fluffy_match "$EXISTING_LISTS" "$NEW_DATA"
#
# Loosest matcher: takes the user part of an unchecked fuzzy_match
# candidate, then prints the first entry of $1 with that user whose host
# also appears after "user@" somewhere in $2.
fluffy_match () {
    CAND=$(fuzzy_match "$2" "$1" --dont-double-check-results | cut -f1 -d@)
    if [ -z "$CAND" ]
    then
        return
    fi
    for i in $(echo "$1" | $GREP -F "$CAND@")
    do
        host=$(echo $i | cut -f2 -d@)
        # Escape the leading caret so grep takes it literally. # uncool
        user=$(echo "$CAND" | sed 's/^\^/\\^/g')
        MMATCH=$(echo "$2" | $GREP -e "$user"@.*"$host")
        [ "$MMATCH" ] || continue
        echo $i
        break
    done
}
# Try to find a match.
# Runs the matchers from strictest to loosest (exact, fuzzy, fluffy) with
# the same two arguments and prints the output of the first one that
# produces anything; prints an empty line if none does.
try_to_match () {
    local matcher
    unset MATCH
    for matcher in exact_match fuzzy_match fluffy_match
    do
        MATCH=$($matcher "$1" "$2")
        [ "$MATCH" ] && break
    done
    echo "$MATCH"
}
##########################################################
# Function for doing sanity/security check on listnames. #
# I've seen spammers send mail to weird addresses. #
# There's also the possibility that the heuristics might #
# not do such a studly job picking out a listname. #
# #
# Catches the following: #
# 0) Trailing garbage #
# 1) Illegal punctuation #
# 2) Missing @ symbol #
# 3) Multiple @ symbols #
# 4) Blank username/host #
# 5) Ridiculously long addresses #
##########################################################
# Argument: <listname>
# Return: 0 (true) if valid, 1 (false) if invalid
#
# Implemented with shell pattern matching on the quoted argument. The
# previous version echoed the name unquoted and handed grep an unquoted
# bracket pattern, so a file in the current directory whose name was one
# of the forbidden characters could disable the punctuation check.
# A name containing whitespace is rejected outright (it would be split
# into several words by the commands that use it later).
sane () {
    local suspect=$1 user domain

    case $suspect in
        '')                   return 1 ;;  # nothing to check
        *[[:space:]]*)        return 1 ;;  # whitespace / several names
        *[%,\"\'\(\)\<\>]*)   return 1 ;;  # 1) illegal punctuation
        *[![:alpha:]])        return 1 ;;  # 0) trailing garbage
        *@*@*)                return 1 ;;  # 3) multiple @ symbols
        @*)                   return 1 ;;  # 4) blank username
        *@*)                  ;;           # exactly one @, carry on
        *)                    return 1 ;;  # 2) missing @ symbol
    esac

    user=${suspect%%@*}
    domain=${suspect#*@}
    [ -n "$domain" ] || return 1           # 4) blank host
    # 5) Ridiculously long addresses
    [ "${#user}" -lt 60 ] || return 1
    [ "${#domain}" -lt 255 ] || return 1
    return 0
}
################################################
### function for removing gunk from filters ####
################################################
# Cleans unarchivable gunk out of filter: every message in the scratch
# folder ($FILTER) carrying "X-No-Archive: yes" or
# "Restrict: no-external-archive" is refiled to the TRASH folder.
# Returns 0 if the scratch folder ended up empty and was removed,
# 1 if messages remain (the folder is kept).
purge_filter () {
    local trash=TRASH
    local pickargs="--x-no-archive yes -or --restrict no-external-archive"

    $FOLDER -create +$trash
    $REFILE -src +$FILTER $($PICK $pickargs +$FILTER) +$trash
    # Remove MH's sequence file so rmdir succeeds on a folder that holds
    # no messages.
    rm -f $MAIL/$FILTER/.mh_sequences
    # rmdir only succeeds on an empty directory, which doubles as the
    # "anything left?" test. Explicit statuses replace "return `true`",
    # which depended on a command substitution leaking its exit status.
    if rmdir $MAIL/$FILTER 2> /dev/null
    then
        return 0
    fi
    return 1
}
# An environmental group got together and decided that there needed to
# be a way to let the sea turtles (crossposted messages) escape. Takes
# a list of endangered species as an argument and throws them back
# from the nets.
# Scratch file that greenpeace fills with the list names to search.
FSGFILE=/tmp/whatever
# Run look(1) against $FSGFILE once for each whitespace-separated word of
# $1, printing whatever look prints for each word.
fsgaux() {
    for i in $1; do look $i $FSGFILE; done
}
# Look up every address in $1 with fsgaux and print all distinct hits
# except the first. Prints nothing when $1 is empty.
fsg_additional_matches () {
    [ -n "$1" ] || return 0
    # "-n +2" is the portable spelling of the obsolete "tail +2".
    fsgaux "$1" | sort -u | tail -n +2
}
# Throw crossposted messages ("sea turtles") back into the inbox.
#   $1 - newline-separated list names to test against
# Every file in the scratch folder except the first is inspected: the
# addresses found in the headers within its first 200 lines are looked up
# via fsg_additional_matches, and a message that produces any output
# there is refiled from $FILTER back to $INBOX.
# Side effect: leaves the shell's working directory at $MAIL/$FILTER.
greenpeace () {
    local catch msg addrs turtle

    # Without this check a failed cd would make the glob below pick up
    # files from whatever directory we happened to be in.
    if ! cd $MAIL/$FILTER
    then
        echok err "Cannot enter $MAIL/$FILTER; leaving catch untouched."
        return 1
    fi
    catch=$(ls -dU * | tail -n +2)
    [ -n "$catch" ] || return 0

    echok info "Catch: $catch"
    echok info "Now throwing back sea turtles."
    # The lookup file is the same for every message; write it once.
    echo "$1" > $FSGFILE
    for msg in $catch ; do
        addrs=$(head -n 200 $msg | get_headers | waterfall | sort -u )
        turtle=$(fsg_additional_matches "$addrs")
        if [ -n "$turtle" ]
        then
            $REFILE -src +$FILTER "$msg" +$INBOX
            echok info "Got one!"
        fi
    done
}
#####-------------------------#####
##### LET THE GAMES BEGIN! #####
#####-------------------------#####
# Check usage
# "-src <folder>" processes an existing folder instead of the inbox and
# swaps the mail fetch for a log line; any other argument prints the
# usage text and exits.
if [ -n "$1" ]
then
    if [ "$1" = "-src" ] && [ -n "$2" ]
    then
        INBOX="$2"
        INC="echok info manual_override_disable_mailcheck"
    else
        echo "usage: $0 [-src <folder>]"
        exit 0
    fi
fi
####################################
##### first, get new mail ######
####################################
# Log the start of the run.
echok info "Now checking mail."
# Make sure the inbox folder exists ($FOLDER holds the folder command).
$FOLDER -create +$INBOX
# Incorporate new mail into the inbox. When the script was started with
# -src, $INC has been replaced by a log-only command, so nothing is fetched.
$INC +$INBOX
##############################################
##### Pick up the first letter ######
##############################################
# Log that there is nothing to do and end the run successfully.
exit_bored () {
    echok info "Now yawning. Nothing to archive."
    exit 0
}

# The first message in the inbox decides which list this run handles.
FIRST_LETTER=$($PICK first:1 +$INBOX)
[ "$FIRST_LETTER" ] || exit_bored

# Move it into a fresh scratch folder and renumber that folder.
$FOLDER -create +$FILTER
$REFILE -src +$INBOX $FIRST_LETTER +$FILTER
$FOLDER -pack +$FILTER
# No junk allowed: stop if the message turned out to be unarchivable.
if purge_filter
then
    exit_bored
fi
################################################
### Anatomy and physiology of headers ###
################################################
# Grab the relevant email headers from message 1 of the scratch folder.
# $T holds the whole lower-cased header block; each variable below holds
# one field taken from it.
T=$($SHOW 1 +$FILTER | get_headers)
TO=$(echo "$T" | grab "to")
CC=$(echo "$T" | grab "cc")
X1=$(echo "$T" | grab "x-mailing-list")
X2=$(echo "$T" | grab "reply-to")
X3=$(echo "$T" | grab "owner")
X4=$(echo "$T" | grab "sender")
X5=$(echo "$T" | grab "received" | grep_out_self)
X6=$(echo "$T" | grab "resent-from")
X7=$(echo "$T" | grab "x-orcpt")
X8=$(echo "$T" | grab "apparently-to")
X9=$(echo "$T" | grab "delivered-to")
X10=$(echo "$T" | grab "resent-to" | grep_out_self)
X11=$(echo "$T" | grab "x-loop")
X12=$(echo "$T" | grab "return-path")   #use later
X13=$(echo "$T" | grab "mailing-list")  #use later
X14=$(echo "$T" | grab "list-post")
X15=$(echo "$T" | grab "x-beenthere")   #use later (mailman)
X16=$(echo "$T" | grab "list-post")
X17=$(echo "$T" | grab "x-post")

# Extract unique email addresses from the fields that may name the list.
CHANCE=$(
    echo $TO $CC $X1 $X2 $X3 $X4 $X5 $X6 $X7 $X8 $X9 $X10 $X11 $X14 $X16 $X17 |
        waterfall |
        sort -u
)
# No address at all: park the message and give up on this run.
if [ -z "$CHANCE" ]
then
    emergency_divert CORRUPT "Unable to find listname."
    exit -1
fi
#############################################
### heuristics : figure out list name #####
#############################################
# Things get trickier when a single message is sent to multiple
# archived lists. When we detect such a situation, we throw down a
# REDCARD. All the extra matches are on the redcard.
echok info "Now looking for a match."
cd $MAIL
# Folders under $MAIL that look like list addresses (onelist.com
# excluded), each prefixed with a caret like waterfall's output.
EXISTING_LISTS=$(
    ls |
        grep '^[-a-z0-9].*@.*$' |
        grep -v onelist\.com |
        sed 's/^/^/g'
)
# Try to match, stripping the carets from the result.
MAILLIST=$(try_to_match "$EXISTING_LISTS" "$CHANCE" | sed 's/\^//g')
# Any further exact matches go on the redcard, also without carets.
REDCARD=$(additional_matches "$EXISTING_LISTS" "$CHANCE" | sed 's/\^//g')
###########################################################
# If no match found, assume it is a new list to archive. #
# Recognizing a new list is a little tricky. #
###########################################################
# Entered when the message matched several lists (REDCARD) or none at
# all. Works through a fixed sequence of address pairings until one of
# them yields a list name; if none does, the message is diverted to
# CORRUPT and the run ends.
if [ "$REDCARD" -o ! "$MAILLIST" ]
then
    # Let's get some addresses to play with.
    NTO=$(echo $TO | waterfall | sort )
    NCC=$(echo $CC | waterfall | sort )
    NX8=$(echo $X8 | waterfall | sort )
    NX13=$(echo $X13 | waterfall | sort )
    NX14=$(echo $X14 | waterfall | sort )
    NRC=$(echo $REDCARD | waterfall | sort )
    NTMP=$(echo $MAILLIST | waterfall | sort )
    # $X15 (x-beenthere) was written as "$15", which expands to the
    # first positional parameter followed by a literal 5.
    ANYONE=$(echo $X1 $X15 $TO $CC $X5 $X7 $X9 $X12 | waterfall | sort )
    SUSPECTS=$(echo $X1 $X15 $X5 $X7 $X9 $X16 $X17 | waterfall | sort )

    # We're going to try a bunch of tricks. If any one of these
    # works, jump for joy and break out from this guessing game.
    #
    # how_about stores the result of try_to_match in MAILLIST and
    # returns true when it found something. The caller does the break:
    # since bash 4.4 a "break" inside a function no longer affects a
    # loop in the calling context, so breaking from within how_about
    # would let every attempt run and always end in emergency_divert.
    how_about () {
        MAILLIST=$(try_to_match "$1" "$2" )
        [ "$MAILLIST" ]
    }

    while true ; do
        if [ "$REDCARD" ]
        then
            how_about "$NTMP" "$SUSPECTS" && break
            how_about "$NRC" "$SUSPECTS" && break
        fi
        how_about "$NX13" "$SUSPECTS" && break
        how_about "$NX14" "$SUSPECTS" && break

        # Majordomo: the Sender field starts with "owner-"; what follows
        # the first dash is used as a hint for the list address.
        if [ "$(echo $X4 | cut -f1 -d-)" = "sender: owner" ]
        then
            HINT=$(echo "$X4" | cut -f2- -d- | waterfall | sort )
            how_about "$NTMP" "$HINT" && break
            how_about "$NRC" "$HINT" && break
            how_about "$HINT" "$ANYONE" && break
        fi

        how_about "$NTO" "$SUSPECTS" && break
        how_about "$NCC" "$SUSPECTS" && break
        how_about "$NX8" "$SUSPECTS" && break

        # Getting desperate
        how_about "$REDCARD" "$REDCARD" && break
        how_about "$MAILLIST" "$MAILLIST" && break
        how_about "$HINT" "$HINT" && break
        how_about "$NTO" "$NTO" && break
        how_about "$NCC" "$NCC" && break

        # Spam or corrupt, almost certainly.
        emergency_divert CORRUPT "Unable to find listname."
        exit -1
    done

    # Clean up
    MAILLIST=$(echo "$MAILLIST" | sed 's/\^//g')
    if [ "$REDCARD" ]
    then
        echok info "Being careful $MAILLIST"
    else
        echok notice "New list! $MAILLIST"
        NEWLIST=true
    fi
fi
#################################################
# Sanity checks before proceeding #
#################################################
# Refuse to continue with a list name that fails the sanity checks.
# sane is called directly: wrapping it in $( ... ) ran it in a
# subshell and would have executed anything it printed as a command.
if ! sane "$MAILLIST"
then
    emergency_divert CORRUPT "Invalid listname: $MAILLIST"
    exit -1
fi
# By now, we are confident of the listname
# ESCAPED_NAME: the address with '@' and '.' turned into '_'.
ESCAPED_NAME=$(echo $MAILLIST | tr '@.' '__')
# NICKNAME: the part before the '@'.
NICKNAME=$(echo $MAILLIST | cut -f1 -d@)
###########################################
### optionally, group mail by month ####
###########################################
# The general strategy is to consider each month an independent
# list. [EMAIL PROTECTED] (July), [EMAIL PROTECTED] (Aug) and [EMAIL PROTECTED]
# are completely independent as far as the sorting engine is
# concerned. Dates are selected from headers, using the following
# preferences to guard against misset clocks: 'x-archive-with-date'
# 'received' 'date'.
# $MONFLAG -- config file which triggers month grouping
# $YYYYMM -- directory where output (HTML) is stored
# $MONBEG/$MONEND -- demarcate month for 'pick'
# $PICKMON -- arguments to 'pick' for month grab
#
# Notes: 'pick' doesn't know that Sept only has 30 days.
# Set environment for per-month handling, if needed.
# Monthly grouping is switched on per list by the presence of this file.
MONFLAG=$VAULT/$ESCAPED_NAME/monthly
if [ -f $MONFLAG ]
then
    # Date source, in order of preference: x-archive-with-date, the
    # first "received" line, then "date".
    XDATE=$(echo "$T" | grab "x-archive-with-date")
    if [ -z "$XDATE" ]
    then
        XDATE=$(echo "$T" | grab "received" | head -1)
    fi
    if [ -z "$XDATE" ]
    then
        XDATE=$(echo "$T" | grab "date")
    fi
    if [ -z "$XDATE" ]
    then
        emergency_divert NODATE "Unable to find any date field."
        exit -1
    fi

    # Reduce to a bare date and make sure date(1) can parse it.
    JUSTDATE=$(echo "$XDATE" | 822date | head -1)
    date -d "$JUSTDATE" > /dev/null 2>&1
    ex=$?
    if [ "$ex" != "0" ]
    then
        emergency_divert NODATE "Unable to find valid date field."
        exit -1
    fi

    echok info "Now switching to monthly indexing."
    YYYYMM=$(date -d "$JUSTDATE" +"%Y-month-%m")
    # Day 31 is used as the end of every month.
    MONBEG=$(date -d "$JUSTDATE" +"%m/01/%Y:00:00:00.00")
    MONEND=$(date -d "$JUSTDATE" +"%m/31/%Y:23:59:59.99")
    PICKMON="-and -after $MONBEG -and -before $MONEND"
else
    # Not a monthly list: leave all month settings empty.
    YYYYMM= MONBEG= MONEND= PICKMON=
fi
#####################################
##### grab the relevant mail ######
#####################################
# Pull the rest of a list's mail out of the inbox.
#   $1 - list address
# Picks up to $BATCHSIZE inbox messages whose To or Cc matches $1
# (narrowed by $PICKMON when monthly grouping is on) and refiles them
# into the scratch folder.
cleanup () {
    local picked
    echok info "Now cleaning up $1"
    picked=$($PICK $BATCHSIZE \
        -lbrace -to $1 -or -cc $1 -rbrace \
        $PICKMON \
        +$INBOX)
    $REFILE -src +$INBOX $picked +$FILTER
}
# Broader variant of cleanup: besides To and Cc it also matches $1
# against reply-to, x-mailing-list, owner, sender, resent-from,
# delivered-to and x-orcpt.
#   $1 - list address
cleanupbak () {
    local picked
    echok info "Now cleaning up $1"
    picked=$($PICK $BATCHSIZE \
        -lbrace \
        -to $1 -or -cc $1 \
        -or --reply-to $1 -or --x-mailing-list $1 \
        -or --owner $1 -or --sender $1 -or --resent-from $1 \
        -or --delivered-to $1 -or --x-orcpt $1 \
        -rbrace \
        $PICKMON \
        +$INBOX)
    $REFILE -src +$INBOX $picked +$FILTER
}
# Sweep up the rest of the list's mail when the inbox is known to be
# presorted, or when the match was unambiguous (no redcard) and the list
# already existed.
if [ "$MAILME_PRESORTED" ] || { [ -z "$REDCARD" ] && [ -z "$NEWLIST" ]; }
then
    cleanup $MAILLIST
fi
#######################
### post processing ###
#######################
# Halt if nothing is archivable.
if purge_filter
then
    exit 0
fi
# Throw back (possible) accidental catches. Slow.
# The archiver's own addresses are removed from the list first; the
# step is skipped entirely for presorted input.
EXISTING_LISTS=$(echo "$EXISTING_LISTS" | grep_out_self)
if [ -z "$MAILME_PRESORTED" ]
then
    greenpeace "$EXISTING_LISTS"
fi
#ACTIVE=`cat $ACTIVELISTS | grep_out_self`
#[ ! "$MAILME_PRESORTED" ] && greenpeace "$ACTIVE"
#####################################
#### html archive mail #######
#####################################
echok info "Now updating HTML archives."
# Configure and run MHonArc. The base rcfile always applies; the
# monthly rcfile and a per-list custom rcfile are cascaded after it when
# present. Output goes under $YYYYMM IF monthly grouping is switched on.
CUSTOM=$VAULT/$ESCAPED_NAME/rcfile
MONTHRC=$CONFDIR/month.rc
RCFILE="-rcfile $CONFDIR/rcfile"
if [ -f $MONFLAG ]
then
    RCFILE="$RCFILE -rcfile $MONTHRC"
fi
if [ -f $CUSTOM ]
then
    RCFILE="$RCFILE -rcfile $CUSTOM"
fi

if [ "$YYYYMM" ]
then
    X_MAILLIST=$MAILLIST/$YYYYMM
else
    X_MAILLIST=$MAILLIST
fi

# FLAGS is expanded unquoted below so that it splits into arguments.
FLAGS="-outdir $ARCHIVE/$MAILLIST/$YYYYMM -add $RCFILE"
FLAGS="$FLAGS -tidxfname index.html"
FLAGS="$FLAGS -ttitle $ESCAPED_NAME -title $NICKNAME -nolock -quiet"
FLAGS="$FLAGS -definevar X-MAILLIST=$X_MAILLIST"

mkdir -p $ARCHIVE/$MAILLIST
mkdir -p $ARCHIVE/$MAILLIST/$YYYYMM
$MHONARC $FLAGS $MAIL/$FILTER
# If MHonArc fails to work, put the mail in a safe place
# and complain, loudly, about the error.
ex=$?
if [ "$ex" != "0" ]
then
    emergency_divert LIMBO "MHonArc returned exit code $ex for $MAILLIST."
    exit -1
fi
# Generate monthly index for lists that have monthly grouping switched
# on; the output of monthme is appended to /tmp/biglog.
[ -f $MONFLAG ] && monthme $MAILLIST >> /tmp/biglog
##############################################
##### Inaugurate new lists #####
##############################################
# Extra steps the first time a list is seen (NEWLIST was set above).
if [ -n "$NEWLIST" ]
then
    # Prime for searching
    digger $MAILLIST

    # Rebuild home page
    echok info "Now rebuilding master index page."
    $FOLDER -create +$MAILLIST
    # splashme
fi
#####################################
###### file away originals ######
#####################################
echok info "Now filing away originals."
# Move every message from the scratch folder into the list's own folder
# (its month subfolder when monthly grouping is on).
$FOLDER -create +$MAILLIST/$YYYYMM
$REFILE -src +$FILTER $($PICK +$FILTER) +$MAILLIST/$YYYYMM
# Remove the sequence file, then the empty scratch folder.
rm -f $MAIL/$FILTER/.mh_sequences
rmdir $MAIL/$FILTER
echok info "All done."
_______________________________________________
Gossip mailing list
[EMAIL PROTECTED]
http://jab.org/cgi-bin/mailman/listinfo/gossip