RE: Viewing exchange between browser and website

bruce Thu, 15 Jul 2004 06:02:42 -0700

colin...

a resend of the 1st email in case you didn't get it...

colin...

if you have access to mozilla/linux, you can get a plugin that will allow
you to see/view the complete header/data transfer between the
browser/server... get to the mozilla site, and search for the plugins... i
can't recall the name right now.

that said, if you use www::mechanize, it would probably help you get what
you're trying to do..

i'm including the following example for you to access the www.utexas.edu
web site for their class schedule section.. 

it's ugly.. but it works.... and it should point out to you how you can
access a user/passwd site..

for you to actually run the script... you'd have to get your own passwd/user
and search/replace the following lines within the script..

==>>> my $username="xxxxx";
==>>> my $password="nnnnnnn";

good luck and let me know if you have any questions...

-bruce

#
#############################################################
#!/usr/bin/perl -w

use HTML::TreeBuilder;
use LWP::UserAgent;
use WWW::Mechanize;

# Plain LWP user agent, used only for the public (no login) schedule pages.
my $ua = LWP::UserAgent->new;
$ua->timeout(30);
$ua->agent("AgentName/0.1 " . $ua->agent);

#
# get the top level university class site
#
my $base_url = "http://www.utexas.edu/student/registrar/schedules/";

my $req = HTTP::Request->new(GET => $base_url);
$req->content_type('application/x-www-form-urlencoded');

my $res = $ua->request($req);
# stop here instead of parsing an error page further down
die "GET $base_url failed: " . $res->status_line . "\n"
    unless $res->is_success;
print $res->content;

# get 046/049/etc...
# (collected for reference only; the session used below is hard-coded)
my @sessions = getsessions($res->content);

#
# we'll parse the returned contents and use them to get to the
# next page...
#
my $sched_url = $base_url . "046/";
$req = HTTP::Request->new(GET => $sched_url);
$res = $ua->request($req);
die "GET $sched_url failed: " . $res->status_line . "\n"
    unless $res->is_success;
print $res->content;

my $ut_tree = HTML::TreeBuilder->new_from_content($res->content);

# should be only one.. need to get the schedules..
my @sem_td = $ut_tree->look_down("_tag" => "td", "bgcolor" => "#eeeeee");
die "no <td bgcolor=\"#eeeeee\"> cell found in $sched_url\n"
    unless @sem_td;

my @sem_a = $sem_td[0]->look_down("_tag" => "a");
die "no link found inside the schedule cell of $sched_url\n"
    unless @sem_a;

# the href of the first link is the page that leads to the logon form
my $sem_page_url = $sem_a[0]->attr('href');

# the href string has been copied out, so the tree can be released
$ut_tree->delete;

print "href# $sem_page_url\n";

#
# now it gets tricky.. we parse the returned contents and use
# them to get to the next page. however, the next page is now the
# ut logon page.
#
# this requires that we get the returned results of the logon page
# and that we fill in the user/password...
#
# we should probably use mechanize.. if it works..
#
#Configuration:#########################
#my $ut_url="https://utdirect.utexas.edu/registrar/clavnew/clav3.WBX?s_semester=su2004";
my $ut_url=$sem_page_url;
my $username="xxxxx";
my $password="nnnnnnn";
#End Configuration######################
use Crypt::SSLeay;

# one Mechanize object carries the logon session for every later request
my $agent = WWW::Mechanize->new();

# Retrieve main page
$agent->get($ut_url);

# select the logon form; without it there is nothing to fill in
my $login = $agent->form_name('logonform2');
if ( ! $login )
{
    # die (not warn + exit) so the failure shows up in the exit status
    die "Nothing came back";
}

$agent->set_fields('LOGON'     => $username,
                   'PASSWORDS' => $password);
$agent->click_button('value' => 'Log In');

print "qqqqqqqq..\n";
# for now, we assume that this is the returned javascript dialog...
# it should be checked.. to see if it really is
# the app should include error checking...
#
print $agent->content();

#assuming that we got a good dialog.. means the login
#was good, go ahead and resend the registration page again...
#this gets the app to the course schedule page..
# now we begin to extract the information
# !!
#
$agent->get($ut_url);
print "7777777777777777777777..\n\n\n\n";
print $agent->content();

# keep a copy of the page on disk; the get* subs parse it from the file
open(my $out_fh, '>', 'out.html')
    or die "cannot open out.html for writing: $!";
print {$out_fh} $agent->content();
close($out_fh)
    or die "cannot close out.html: $!";

my $filename = "out.html";

#
# go ahead and get the levels (grad/under/etc...
# and the list of depts...
#
my @level_array = getlevels($filename);
my @dept_array  = getdepts($filename);

#
# pieces of the class listing url; they are glued together in the loop
#
my $ut_base_class_url = "https://utdirect.utexas.edu/registrar/clavnew/clav3_results.wb";
my $dept      = "?c-key=";
my $class_str = "&c-crs=%20&c-unique-single=&c-unique-begin=&c-unique-end=&c-cmd=lc";
my $semester  = "&c-sem=";
my $level     = "&c-sub=";

# the semester is still fixed; only level and dept vary in the loop
my $semester_var = "su2004";

#
# start the loop for depts/levels
#
print "level = ". scalar @level_array ."\n";
print "dept = " . scalar @dept_array ."\n";

foreach my $level_code (@level_array)
{
   foreach my $dept_code (@dept_array)
   {
      #####
      #
      # the url--> ut_base_class + $dept.$dept_code + $class_str +
      #            $semester + $semester_var + $level + $level_code
      #
      ######
      # built fresh on every pass, so nothing has to be reset afterwards
      my $class_url = $ut_base_class_url
                    . $dept . $dept_code
                    . $class_str
                    . $semester . $semester_var
                    . $level . $level_code;

      print "$class_url \n";
      $agent->get($class_url);
      print "5555555555555555..\n\n\n\n";
      print $agent->content();

      # getclasses() reads the page from disk, so save it first
      open(my $page_fh, '>', $filename)
          or die "cannot open $filename for writing: $!";
      print {$page_fh} $agent->content();
      close($page_fh)
          or die "cannot close $filename: $!";

      # the result is not consumed yet; getclasses prints what it finds
      my @class_array = getclasses($filename);
   }
}

print "we're here...\n";
# leave before the sub definitions; a plain exit keeps the status at 0
exit;

#################################
#
#
#################################
#
# get the session directories (046/049/etc...) from the top level page
#       $res    - html content of the top level schedules page
#       return the list of link hrefs with every "." and "/" removed;
#       the list is empty when the expected table/cell is missing
#
sub getsessions
{
   my ($res) = @_;

   my $ut_tree = HTML::TreeBuilder->new_from_content($res);
   my @semester;

   # should be only one.. need to get the schedules..
   my ($semester_tbl) = $ut_tree->look_down("_tag"=>"table", "class"=>"tbg");
   my $semester_td;
   if ($semester_tbl)
   {
      # dump() prints the element itself, so it is called, not concatenated
      print "tbl\n";
      $semester_tbl->dump();
      ($semester_td) = $semester_tbl->look_down("_tag"=>"td",
                                                "align"=>"center");
   }

   if ($semester_td)
   {
      my @semester_a = $semester_td->look_down("_tag"=>"a");

      # debug output of the second link, only when there is one
      if (@semester_a > 1)
      {
         print "<a\n";
         $semester_a[1]->dump();
      }

      foreach my $link (@semester_a)
      {
         my $href = $link->attr('href');
         next unless defined $href;

         #quickly strip extraneous chars...
         $href =~ tr{./}{}d;
         push @semester, $href;
      }
   }
   else
   {
      warn "getsessions: session table/cell not found in page\n";
   }

   # only plain strings are returned, so the tree can be released
   $ut_tree->delete;
   return(@semester);
}

#
# get the class information from the given page content
#       return the class_array/class_hash
#
sub getclasses
{
   # $filename - saved html page of one class listing; it is edited in
   #             place and then parsed
   # returns one hash ref per section row with the keys number, name and
   # text; the list is empty when the listing table is not found
   my ($filename) = @_;
   my @classes;

   # add a "," to aid in processing...
   #
   #<p class="tbtx"> --> '<p class="tbtx">,'<<<<<
   #
   # list form of system: no shell is involved, so the file name is passed
   # through untouched and the command cannot be split by stray whitespace
   my $status = system($^X, '-i', '-p', '-e',
                       's:<p class="tbtx">:<p class="tbtx">,:g;',
                       $filename);
   warn "getclasses: in-place edit of $filename failed (status $status)\n"
       if $status != 0;

   my $tree = HTML::TreeBuilder->new(); # empty tree
   unless ($tree->parse_file($filename))
   {
      warn "getclasses: cannot parse $filename: $!\n";
      $tree->delete;
      return;
   }

   my ($class_table) = $tree->look_down("_tag"=>"table", "cellpadding"=>"2",
                                        "bgcolor"=>"#cccccc");
   unless ($class_table)
   {
      warn "getclasses: class table not found in $filename\n";
      $tree->delete;
      return;
   }

   # number/name come from an "em" row and apply to the section rows
   # that follow it
   my ($classnumber, $classname) = ('', '');

   #iterate through the list...
   #skip the header rows
   #get the class name/section information
   foreach my $row ($class_table->look_down("_tag"=>"tr"))
   {
      my $row_html = $row->as_HTML();

      if ($row_html =~ /class="tbh"/)
      {
         # header row.. go ahead and skip
         next;
      }
      elsif ($row_html =~ /class="em"/)
      {
         # only take the captures when the pattern really matched
         if ($row_html =~ /em">([^<]+)<[^>]+>([^<]+)/)
         {
            ($classnumber, $classname) = ($1, $2);
            $classnumber =~ s/&nbsp;//;
         }
      }
      elsif ($row_html =~ /<a href="clavcdet/)
      {
         my $section_text = $row->as_text;
         print "number = $classnumber   name = $classname\n";
         print "q = $section_text\n";
         push @classes, { number => $classnumber,
                          name   => $classname,
                          text   => $section_text };
      }
   }

   # everything kept in @classes is a plain string, so release the tree
   $tree->delete;
   return(@classes);
}

# }

# get the levels (under/grad/etc...)
#       return the level_array
#
sub getlevels
{
   # $filename - saved html page that holds the s_course_level radio buttons
   # returns the 'value' attribute of every such button, in page order;
   # the list is empty when the file cannot be opened
   my ($filename) = @_;

   my $tree = HTML::TreeBuilder->new(); # empty tree
   unless ($tree->parse_file($filename))
   {
      warn "getlevels: cannot parse $filename: $!\n";
      $tree->delete;
      return;
   }

   my @level_array =
      map { scalar $_->attr('value') }
      $tree->look_down("_tag"=>"input", "type"=>"radio",
                       "name"=>"s_course_level");

   # only the attribute strings are kept, so the tree can be released
   $tree->delete;
   return(@level_array);
}

#
# get the depts (/etc...)
#       return the depts_array
#
sub getdepts
{
   # $filename - saved html page that holds the s_dept_abbr_select list; it
   #             is edited in place and then parsed
   # returns the option values with every " " replaced by "%20";
   # the list is empty when the select element is not found
   my ($filename) = @_;
   my @dept_array;

   # prepare file for use
   #
   # we insert a "," to aid in data extraction
   #
   #<p class="tbtx">  ==> '<p class="tbtx">,'
   #
   # list form of system: no shell is involved, so the file name is passed
   # through untouched and the command cannot be split by stray whitespace
   my $status = system($^X, '-i', '-p', '-e',
                       's/<p class="tbtx">/<p class="tbtx">,/g;',
                       $filename);
   warn "getdepts: in-place edit of $filename failed (status $status)\n"
       if $status != 0;

   my $tree = HTML::TreeBuilder->new(); # empty tree
   unless ($tree->parse_file($filename))
   {
      warn "getdepts: cannot parse $filename: $!\n";
      $tree->delete;
      return;
   }

   my ($dept_select) = $tree->look_down("_tag"=>"select",
                                        "name"=>"s_dept_abbr_select",
                                        "size"=>"1");
   unless ($dept_select)
   {
      warn "getdepts: dept select list not found in $filename\n";
      $tree->delete;
      return;
   }

   foreach my $option ($dept_select->look_down("_tag"=>"option"))
   {
      my $value = $option->attr('value');
      next unless defined $value;

      $value =~ s: :%20:g;   # sub/replace for " "s...

      print "-----> $value \n";
      push @dept_array, $value;
   }

   # only the value strings are kept, so the tree can be released
   $tree->delete;
   return(@dept_array);
}

-----Original Message-----
From: Colin Magee [mailto:[EMAIL PROTECTED]
Sent: Thursday, July 15, 2004 5:28 AM
To: [EMAIL PROTECTED]; [EMAIL PROTECTED]
Subject: RE: Viewing exchange between browser and website

Hi Bret,

Thanks for the swift reply.

I was aware of the deficiency of Activestate's PPM, having had the same
problem not finding Crypt-SSLeay.  However, I know that's not the solution
in
itself since I had tried my scripts on a Linux box which had this installed.
I also turned the cookie jar on (not knowing what it was doing particularly)
and that did not get me any further.  

The advice you give about proxy servers looks very interesting though, so I
shall download SSLeay (am now writing these scripts on Windows, mainly
because I find using IE is the easiest way to develop web scraping scripts
concurrently with the Perl script rather than any advantage of Perl on
Windows) and try that.

Regards
Colin 

-----Original Message-----
From: Bret Swedeen [mailto:[EMAIL PROTECTED]
Sent: 15 July 2004 13:09
To: [EMAIL PROTECTED]; Colin Magee
Subject: Re: Viewing exchange between browser and website

Hi Colin,

I recently attempted a similar task.  I'll try to outline as clearly as
possible what 
worked.

Using some of the examples from "Perl & LWP" book was a disaster.  None of
it

worked.  One important point I did pick up however was to make sure you have

cookies enabled:

use HTTP::Cookies;
$agent->cookie_jar(HTTP::Cookies->new());

Once I added those lines things started to look promising. Unfortunately, I
was still 
having problems.  Primary reason:  I needed an extra Perl Mod to make 
communication across https possible.  I'm using Perl for Win32 so I needed
to

install the Perl Mod Crypt::SSLeay.  Problem was doing so from the ppm
prompt

(part of the ActiveState Perl installation...makes mod installation very
easy) wasn't 
working.  For whatever reason I couldn't find Crypt::SSLeay for Perl on
Win32.  
Finally, after searching forums on ActiveState I found the mod and installed
from 
the ppm prompt with the following command:

install http://theoryx5.uwinnipeg.ca/ppms/Crypt-SSLeay.ppd

Take the defaults through the entire installation (there are a couple of
DLLs
that it 
will ask you about as well.  Just answer yes).

Ok, now I'm getting real close, but still not working.  I posted on the
Usenet forum 
for Perl Mods and got two extremely helpful tips.

First, install a local proxy of sorts to capture and view the back and forth

communication between browser and web site.  Something I think you are
looking 
for now.  Proxomitron was what I used.  I turned it on and went through the
web 
interaction steps with a standard browser.  While this tool didn't really
resolve my 
problems, it did help me understand more of what was going on between the 
browser and the site.

Second, and the most helpful of all, install the Perl Mod WWW::Mechanize.
This 
mod allows you to easily automate the steps of interacting with a site.
From
simply 
following links to logging on and communicating over https.  This mod was
what 
finally worked for me.  There was a problem with pressing certain buttons on
the 
page.  Seems it doesn't really know what to do with Javascript buttons, but
I
worked 
around that by simply making a URL with all of the form variables set and
passed it 
in to get what I wanted.  May not be a problem for you, but keep in mind
that
it 
really doesn't work with all form buttons exactly as you might think.

Anyway, another very useful thing during script development is to turn on
the
LWP 
debugging.  With this turned on you get to see all of the communication
details 
between your script and the site.  It really helps with troubleshooting as
you can see 
exactly where things are falling apart.  Add this line near the top of your
script after 
the use LWP statement.

use LWP::Debug qw(+);

Anyway, my experience was somewhat frustrated but little by little I did
make

progress and finally resolve my problem.  Here is a quick glimpse at what I
put 
together.  Please keep in mind I had to remove some detail as it is company 
specific which I cannot disclose here.  Also, at the end I dump the page
content that 
I get back after I send $bigprobeurl into a file with an html extension.  I
would then 
open this file in a browser to see if I got what I wanted.  The final
version
removes 
some of this code and actually acts upon the page returned.  I believe,
however, 
this example should help get you closer to what you want.  Of course, as I
found, no 
one example addresses your problem exactly the way you need.  Keep working
on 
it.  You'll get there in the end.

use LWP;
use LWP::UserAgent;
use LWP::Protocol::https;
use LWP::Debug qw(+);
use WWW::Mechanize;
use HTTP::Cookies;

my $agent = WWW::Mechanize->new();
my $intranetsite = "http://some company intranet site/index.html";
my $bigurl = "https://big url here with form variables and their values";

# username and password come from the command line, or are prompted for.
# both are declared out here so the login code below sees the same
# variables whichever branch filled them in
my ($un, $pw);
if (@ARGV == 2) {
        ($un, $pw) = @ARGV;
}
else {
        print "Please enter your username: ";
        $un = <STDIN>;
        print "Please enter your password: ";
        $pw = <STDIN>;
        die "username and password are required\n"
            unless defined $un && defined $pw;
        chomp($un, $pw);
}

$agent->cookie_jar(HTTP::Cookies->new());
$agent->agent_alias( 'Windows IE 6' );

#Navigate the intranet web site
# NOTE(review): follow() may be deprecated in the installed WWW::Mechanize
# in favour of follow_link() -- confirm before relying on it
$agent->get($intranetsite);
$agent->follow("Sign In"); # a link on the page
$agent->form_name('login'); # the name of the form on the sign in page
$agent->field(username => $un);
$agent->field(password => $pw);
$agent->click(); # simulate clicking the button on the login page
$agent->follow("Internal Application Link"); # a link on the new page
$agent->follow("Application Charts"); # a link on the next new page
$agent->get($bigurl); # finally, send the URL with form variables and values

# dump page content into a file for viewing in a browser
open(my $logfile, '>', 'output.html')
    or die "cannot open output.html for writing: $!";
my $page = $agent->content();
print {$logfile} $page;
close($logfile)
    or die "cannot close output.html: $!";
__END__

On 15 Jul 2004 at 12:14, Colin Magee wrote:

> Hi,
> 
> I've been trying to use LWP to programmatically log in to a favourite
> password protected website.  
> 
> Problem is that I've worked through all the standard examples on LWP
> and I'm not getting through - the login mechanism doesn't conform to
> the examples, so I was wondering if there is any way I can see exactly
> what my browser is sending and receiving (while I'm using the browser)
> and therefore what I have to replicate in the code.  As you can
> probably tell I'm fairly novicey so I need to see some output where it
> will be fairly clear what I have to code in Perl.  I seem to recall
> some thread on this forum about using Mechanize in this way.  Is that
> correct?  If so is there an example script that shows how to record
> this?
> 
> Thanks
> Colin
>

RE: Viewing exchange between browser and website

Reply via email to