#!/usr/bin/perl -w

use strict;
use lib '..', '.';

use threads;
use Thread::Queue;

use Mail::Box::Manager 2.00;
use LWP::Simple;
use URI;
use HTML::Entities;

#
# start up the worker threads and create the queues.
#
my $NumThreads = 20;
my $workers    = Thread::Queue->new;   # inboxes of the workers that are idle
my $results_q  = Thread::Queue->new;   # URLs that a worker fetched successfully
start_threads ();

#
# The outline for using Mail::Box::Manager is from:
# http://radio.weblogs.com/0111823/2003/11/16.html#a373
#
my $mgr=Mail::Box::Manager->new;
my $mail_file = $ARGV[0];
my $folder = $mgr->open
       ( $mail_file,
         extract => 'ALWAYS' # Take the body
       );
unless (defined $folder)
{
   # Remember the error text first: shutting the workers down may clobber $!.
   my $open_error = "$!";
   # The workers are already running and blocked on their in boxes; stop
   # and join them so the script does not exit with active threads.
   stop_work();
   die "Cannot open '$mail_file': $open_error\n";
}

# Canonical URLs already handed to a worker; each URL is checked only once.
my %url_seen = ();

# Process all messages in this folder.
my @messages = $folder->messages;
foreach my $message (@messages)
{
   # Collect every http(s) href value in the decoded body.  The pattern
   # stops at the closing quote: it must neither require '>' right after
   # the quote (href is often followed by other attributes) nor consume
   # the rest of the line (that would hide further links on the same line).
   my @match=($message->decoded =~ /\bhref="(http[^>"]*)"/gi );
   foreach my $match(@match)
   {
      my $u   = URLDecode(HTML::Entities::decode($match));
      my $uri = URI->new($u)->canonical;
      my $url = $uri->as_string;
      next if ($url_seen{$url}++);
      # check for answers back from the workers
      get_results ();
      # find a worker; this blocks until one of them is idle
      my $worker_inbox = bless $workers->dequeue, "Thread::Queue";
      # place this URL into its in box.
      $worker_inbox->enqueue($url);
   }
}
$folder->close;

# Let the workers drain, join them and print the remaining results.
stop_work();

exit 0;

# Create $NumThreads worker threads running url_check().  Every worker gets
# a private in box queue, and that in box is registered in $workers so the
# main thread can find an idle worker.
# Dies if a thread cannot be created: an in box without a worker behind it
# would swallow URLs and keep stop_work() waiting forever.
sub start_threads {
  for my $n (1..$NumThreads) {
     my $inbox  = Thread::Queue->new;
     my $thread = threads->create(\&url_check, $inbox);
     die "Cannot create worker thread $n of $NumThreads\n"
        unless defined $thread;
     $workers->enqueue($inbox);
  }
}

# Drain the results queue without blocking and print one line per URL
# found there, formatted as "host<TAB>url".
sub get_results {
  while (1) {
    my $checked_url = $results_q->dequeue_nb;
    # dequeue_nb hands back undef once the queue is empty
    last unless defined $checked_url;

    my $host = URI->new($checked_url)->host;
    print $host . "\t" . $checked_url . "\n";
  }
}

# Shut the worker pool down: send every worker the undef that ends its
# loop, join all threads, then print whatever results are still queued.
sub stop_work {
  # One undef per worker.  A worker's in box only shows up in $workers
  # while that worker is idle, so the dequeue waits for busy workers.
  my $pending = $NumThreads;
  while ($pending-- > 0) {
    my $idle_inbox = bless $workers->dequeue, "Thread::Queue";
    $idle_inbox->enqueue(undef);
  }

  # Wait until every thread has returned.
  $_->join for threads->list;

  # Anything enqueued after the last get_results() call is printed here.
  get_results ();
}

#
# initial implementation is from:
# http://glennf.com/writing/hexadecimal.url.encoding.html
#
# Undo %XX escaping in a URL and scrub the result.
#   $_[0]   - the (possibly escaped) URL text
#   returns - the cleaned-up copy; the argument itself is not modified
sub URLDecode {
    my ($url) = @_;

    for ($url) {
        # '+' is deliberately left alone (it is not rewritten into a space).
        s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;   # %XX -> the character itself
        tr/\200-\377/\000-\177/;               # clear the 8th bit
        s/[^[:print:]]+//g;                    # drop non-printable characters
        s/\s+//g;                              # drop all white space
        s/<!--.*-->//gs;                       # cut from first "<!--" to last "-->"
    }

    return $url;
}

# Worker thread body.
#   $_[0] - this worker's private in box (a Thread::Queue)
# Takes URLs from the in box until it receives undef.  A URL whose fetch
# returns content is reported on $results_q; after every URL the in box is
# put back on $workers to mark this worker as idle again.
sub url_check {
  my $inbox = bless $_[0], "Thread::Queue";
  while (defined (my $url = $inbox->dequeue)) {
    # Guard the fetch: if it died, the thread would end without handing its
    # in box back, and stop_work() would wait for this worker forever.
    my $page;
    my $ok = eval { $page = get($url); 1 };
    warn "Fetching '$url' failed: $@" unless $ok;

    $results_q->enqueue($url) if defined $page;
    $workers->enqueue($inbox);
  }
}
