#!/usr/bin/perl -w

use strict;

# db-to-text2.pl exports and imports bayes DB into/from a text file

# adapted a bit by Andy Spiegl (spamassassin.andy@spiegl.de)
#  from http://spamassassin.taint.org/devel/db-to-text.pl.txt
# to check for (and remove) atimes which lie in the future
#
# this also fixes a bug of "inventing" new keys because of newlines in the
# value part of the token.  But db-text2.pl CANNOT be used to dump bayes_seen!
# You can safely use db-to-text.pl for that.

############################################################
# How to use this script:
#
# First sync the journal before running this script!
# One way to do so is:
#  sa-learn --rebuild
#
# After (backing up! and) running
#  db-to-text2.pl -o bayes_toks > bayes_toks.txt
#  db-to-text2.pl -i bayes_toks < bayes_toks.txt
# you probably want to run
#  sa-learn -D --force-expire
# and then should NOT get this message anymore:
# "debug: bayes: couldn't find a good delta atime, need more token difference, skipping expire."
#

# WARNING: please set this variable to the correct value!
# check "sa-learn --dump magic | head -1"
my $db_version=2;

# '^M^A^G^I^C'
my $MAGIC_STRING = "\015\001\007\011\003";

my $into;
if ($ARGV[0] && $ARGV[0] eq '-i')
{
  $into = 1;
}
elsif ($ARGV[0] && $ARGV[0] eq '-o')
{
  $into = 0;
}
else
{
  die "
usage: db-to-text -o dbfile > textdump
usage: db-to-text -i dbfile < textdump
";
}

use DB_File;
use Fcntl;

shift @ARGV;
my $file = shift @ARGV;
my ($k, $v, $count, $resetcount);
my ($ts, $th, $atime);
$count = 0;
$resetcount = 0;

# TEXT to DB
if ($into)
{
  my %out;
  tie (%out, 'DB_File', $file, O_WRONLY|O_CREAT, 0600);
  while (<>)
  {
    $k="";$v="";$ts=0;$th=0;$atime=0;
    /<key>(.*?)<\/key>/ and $k = $1;
    /<val>(.*?)<\/val>/ and $v = $1;
    /<ts>(.*?)<\/ts>/ and $ts = $1;
    /<th>(.*?)<\/th>/ and $th = $1;
    /<atime>(.*?)<\/atime>/ and $atime = $1;

    $v = &tok_pack($ts, $th, $atime)  unless $v;

    $out{$k} = $v;
    $count++; print STDERR "." if ($count % 1000 == 0);
  }
  untie %out;
  print STDERR "\n$file: $count keys copied\n";

}
else

# DB to TEXT
{
  my %in;
  my %magic;
  my $magic;

  tie (%in, 'DB_File', $file, O_RDONLY, 0600);
  while (($k, $v) = each %in)
  {
    if ($k =~ /<\/key>/ || $v =~ /<\/val>/) { print STDERR "oops"; next; }

    # save the ^M^A^G^I^C tokens for later
    if ( $k =~ /^$MAGIC_STRING/ )
    {
      $magic = $k;
      $magic =~ s/^$MAGIC_STRING//;
      #print STDERR "\nMagic token: $magic: $v\n";
      $magic{$magic} = $v;

      next;
    }

    # unpack the token
    ($ts, $th, $atime) = &tok_unpack($v);

    # reset atime if it is in the future
    if ($atime > time)
    {
      print STDERR "\nResetting atime of key in the future:\n <key>$k</key><ts>$ts</ts><th>$th</th><atime>$atime</atime>\n";
      $atime = time;
      $resetcount++;
    }

    print "<key>$k</key><ts>$ts</ts><th>$th</th><atime>$atime</atime>\n";

    $count++;
    print STDERR "." if ($count % 1000 == 0);
  }
  untie %in;
  print STDERR "\n$file: $count keys copied\n";
  print STDERR "$file: $resetcount future-keys reset\n\n";

  # print magic tokens
  my ($magickey, $magicval);
  foreach $magickey (keys %magic)
  {
    $magicval = $magic{$magickey};
    # only check atime values
    if ($magickey =~ /^(LASTEXPIRE|LASTJOURNALSYNC|NEWESTAGE|OLDESTAGE|RUNNINGEXPIRE)$/)
    {
      if ($magicval > time)
      {
        # resetting can't be later than today
        print STDERR "resetting magic token \"$magickey\"\n";
        $magicval = time;
      }
    }
    print "<key>$MAGIC_STRING$magickey</key><val>$magicval</val>\n";
  }
}

print STDERR "done.\n";
exit;


# helpers copied from Mail::SpamAssassin::BayesStore.pm

use constant FORMAT_FLAG	=> 0xc0;	# 11000000
use constant ONE_BYTE_FORMAT	=> 0xc0;	# 11000000
use constant TWO_LONGS_FORMAT	=> 0x00;	# 00000000

use constant ONE_BYTE_SSS_BITS	=> 0x38;	# 00111000
use constant ONE_BYTE_HHH_BITS	=> 0x07;	# 00000111

sub tok_unpack
{
  my ($value) = @_;
  $value ||= 0;

  my ($packed, $atime);
  if ( $db_version == 0 ) {
    ($packed, $atime) = unpack("CS", $value);
  }
  elsif ( $db_version == 1 || $db_version == 2 ) {
    ($packed, $atime) = unpack("CV", $value);
  }

  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
		$packed & ONE_BYTE_HHH_BITS,
		$atime || 0);
  }
  elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
    my ($packed, $ts, $th, $atime);
    if ( $db_version == 0 ) {
      ($packed, $ts, $th, $atime) = unpack("CLLS", $value);
    }
    elsif ( $db_version == 1 ) {
      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
    }
    elsif ( $db_version == 2 ) {
      ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
    }
    return ($ts || 0, $th || 0, $atime || 0);
  }
  # other formats would go here...
  else {
    warn "unknown packing format for Bayes db, please re-learn: $packed";
    return (0, 0, 0);
  }
}

sub tok_pack {
  my ($ts, $th, $atime) = @_;
  $ts ||= 0; $th ||= 0; $atime ||= 0;

  if ($ts < 8 && $th < 8)
  {
    return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
  }
  else
  {
    return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
  }
}
