On Mon, 26 Jul 2004 11:27:50 -0400, perl.org wrote
> I want to get a list of the distinct domains (like perl.org) in all href
> attribute values of anchor tags in all files under a given directory 
> that end with the extensions .htm and .html.  I don't need to know 
> which files contain the links, I just want to know what domains are 
> referenced.  I don't care about JavaScript links, and the Perl code 
> would not have to crawl over http, just scan a filesystem.  It seems 
> really easy - File::Find and HTML::TokeParser, parse each file 
> matching the criteria and populate an associative array.  If someone 
> has already written and tested something that functions like this 
> and would be willing to share the code, I would only have to make 
> slight modifications, which would save me some time.  Otherwise I 
> would be interested in suggestions for how to write one.

Here's what I came up with.  Suggestions greatly appreciated...

use strict;

use File::Basename;
use File::Find;

use Getopt::Std;

use HTML::TokeParser;

my %{opts} = ();
main();
exit( 0 );

sub main
{
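  # parse the command line, then walk the directory tree counting file
  # extensions and collecting the domains that anchor tags link to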
  lsHandleArgs();
  my %{exts} = ();
  my %{found} = ();

  # pass no_chdir so ${File::Find::name} stays a path that can be opened
  # from the current directory; without it, find() chdir()s into each
  # directory and the -f test / HTML::TokeParser would miss the files
  File::Find::find( { 'no_chdir' => 1, 'wanted' => sub
  {
    my ${path} = ${File::Find::name};
    lsMessage( ${path} );

    if ( -f ${path} && ${path} =~ m#\.([^./]+)$# )
    {
      my ${ext} = lc( ${1} );

      # count files seen per extension; ++ creates the entry as 1 first time
      ${exts{${ext}}}++;

      # only parse files whose extension is exactly .htm or .html
      if ( ${ext} =~ m#^html?$# )
      {
        # HTML::TokeParser->new() returns undef if the file cannot be read
        my ${parser} = HTML::TokeParser->new( ${path} );

        if ( defined( ${parser} ))
        {
          # get_tag( 'a' ) skips ahead to each <a> start tag
          while ( my ${token} = ${parser}->get_tag( 'a' ))
          {
            my ${url} = lc( ${token}->[1]{'href'} || '-' );
            ${url} =~ s#\#.*##;

            if ( ${url} =~ m#^(https?|ftp)://# )
            {
              # keep only the host part between the scheme and the first /
              ${url} =~ s#^[^/]+//([^/]+).*#${1}#;
              ${found{${url}}} = ${path};
            }
          }
        }
      }
    }
  }}, ${opts{'d'}} );

  foreach my ${key} ( sort( keys( %{exts} )))
  {
    print ${exts{${key}}} . ' ' . ${key} . ' files.' . ${/};
  }

  # finally, the sorted list of distinct domains that were linked to
  print join( ${/}, '', sort( keys( %{found} ))) . ${/};
}

sub lsHandleArgs()
{
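  # Getopt::Std::getopts() consumes @ARGV, so keep a copy and restore it
  # once the options have been validated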
  my @{args} = @{ARGV};

  if ( ! Getopt::Std::getopts( 'd:hv', \%{opts} ))
  {
    lsUsage( 'invalid command line parameters' );
  }
  elsif ( defined( ${opts{'h'}} ))
  {
    lsUsage( 'usage information requested' );
  }
  elsif ( $#{ARGV} > -1 )
  {
    lsUsage( 'unrecognized command line parameters : ' . join( ' ', @{ARGV} ));
  }

  if ( ! defined( ${opts{'d'}} ))
  {
    lsUsage( '-d : directory required' );
  }
  elsif ( ! -d ${opts{'d'}} )
  {
    lsUsage( '-d : ' . ${opts{'d'}} .
      ' does not exist or is not a directory : ' . ${!} );
  }

  @{ARGV} = @{args};
}

sub lsUsage( $ )
{
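  # print usage information plus the reason we are bailing out, then exit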
  my ${msg} = shift( @{_} );
  my ${script} = File::Basename::basename( ${0} );
  my ${params} = join( ' ', @{ARGV} );
  print <<END_USAGE;

${script} : find domains linked to in content

-d <dir>          Directory (or workarea) to scan
-h                Help (display usage information)
-v                Enable verbose (debug) mode

${script} : parameter dump : ${params}

${script} : ${msg} : exit 1

END_USAGE

  exit( 1 );
}

sub lsMessage
{
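  # print timestamped progress messages, but only when -v (verbose) is set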
  if ( defined( ${opts{'v'}} ))
  {
    foreach my ${msg} ( @{_} )
    {
      print scalar( localtime( time())) . ' : ' . ${msg} . ${/};
    }
  }
}
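
One thing I am still not happy with is the substitution that boils each
link down to its host.  If the URI module from CPAN is available, the
sketch below might be more robust.  It is rough and untested, and just
reuses ${url}, ${path} and %{found} from the script above:

  use URI;

  # let URI parse the href instead of a hand rolled regex
  my ${uri} = URI->new( ${url} );

  # host() only exists for schemes that have one (http, https, ftp, ...),
  # so mailto:, javascript: and relative links are skipped automatically
  if ( ${uri}->can( 'host' ))
  {
    ${found{lc( ${uri}->host() )}} = ${path};
  }

It would also drop any user info or port number from the link, which the
regex version above keeps as part of the "domain".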

