I've been working on Hans Anderson's Browser class, and I've adapted it to do
what you're looking for. It's not complete, but there should be a lot here
for you to go on. Again, I haven't had time to fine-tune things at all, so
it's a very rough hack right now. Hope it helps.

The dirty on how to use it:


  // Usage sketch: fetch a page with Browser::get_url(), then rewrite its
  // relative links with Browser::translate().
  // NOTE(review): this sketch is incomplete in this copy -- the argument
  // array passed to get_url() and the closing brace of the if-block are
  // missing, and $file is not assigned in the visible lines.
  require_once ('class_browser.php');

  // new browser
  $browser = new Browser();

  $file_array = $browser->get_url(array(

  // the example only uses the content when errcode equals 1
  if ($file_array["errcode"] == 1) {
   $file_text = $file_array["content"];

  // convert relative links to absolute
  $file_text = $browser->translate($file_text, $file);



class Browser {

Class Browser by Hans Anderson.
This code is released under the GPL license.
Modifications by Aral Balkan

  06.22.01 - Added two new methods:

     (1) $string = translate_links($string, $url);

      Translates all relative links in argument $string to
      absolute links using the full URL to the page being
      accessed in the $url argument. Returns the translated string.

     (2) $string = translate_images($string, $url);

      Translates all relative image links in argument $string to
      absolute links using the full URL to the page being
      accessed in the $url argument. Returns the translated string.

     Aral Balkan ([EMAIL PROTECTED])

 /**
  * Performs the HTTP request described by $array over a raw socket and
  * reports the outcome as an array.
  *
  * Local defaults set up below: $robot_rules (TRUE), $req_mthd ('GET'),
  * $protocol ('HTTP/1.0'), $user_agent ('PHP3 Browser') and $time_out (10).
  * Per the inline comments, each key of $array overwrites the local of the
  * same name; 'url' and 'content' have no default.
  *
  * Every return statement visible here yields an array with the keys
  * "content", "headers", "errcode" and "errmsg":
  *   errcode -1  'Fatal Error: No URL defined'
  *   errcode  0  robots.txt refusal, connection failure, or timeout
  *
  * NOTE(review): many lines are missing from this copy (closing braces, the
  * body of the override loop, the conditions guarding several returns, and
  * the return used on success), so the function is not runnable as shown.
  */
 function get_url($array) {

  /* defaults (there is no default for 'url' or 'content') */
  $robot_rules = TRUE; /* follow the robots.txt standard */
  $req_mthd = 'GET';
  $protocol = 'HTTP/1.0';
  $user_agent = 'PHP3 Browser';
  $time_out = 10;

  /* for each argument set in the array, overwrite default */
  while(list($k,$v) = each($array)) {

  /* set up the cookies.  If it exists, the straight variable
     will be written above ($$k=$v). */

  if(is_array($cookies)) {
   $cookies2send = '';
   while(list($k,$v) = each($cookies) ) {
    $cookies2send .= "Cookie: $k=" . urlencode($v) . "\n";

  return array("content"=>' ',"headers"=>' ',"errcode"=>-1,"errmsg"=>'Fatal
Error: No URL defined');

        $parsed_url = parse_url("$url");

  if($robot_rules) {

   $robots_url = $parsed_url["scheme"] . "://" . $parsed_url["host"];
   if($parsed_url["port"]) $robots_url .= ":" . $parsed_url["port"];
   $robots_url .= "/robots.txt";
     return array("content"=>' ',"headers"=>'
',"errcode"=>0,"errmsg"=>"Non-fatal Error: Robot Rules do not permit this
browser to access $url");


       $req_mthd = strtoupper($req_mthd); // 2068 rfc says it's case sensitive

              $host = $parsed_url["host"];

              if(!$host || $host=='' || !isset($host))
  array("content"=>' ',"headers"=>' ',"errcode"=>-1,"errmsg"=>'Fatal Error:
No URL defined');

              $path = $parsed_url["path"];
              if(!$path || $path=='' || !isset($path))
  $path = "/";

              $query = $parsed_url["query"];
  $path = "$path?$query";

              if(!isset($parsed_url["port"])) {
               $port = 80;
              } else {
               $port = $parsed_url["port"];

  $timeout = time() + $time_out;

  $fp = fsockopen("$host",$port,$errno,$errstring,$time_out);

 if(!$fp) {
  return array("content"=>' ',"headers"=>'
',"errcode"=>0,"errmsg"=>"Non-Fatal Error: Could not make connection to url
$url (not found in DNS or you are not connected to the Internet)");
 } else {

       set_socket_blocking($fp,1); // aral: set to 1 for it to work on
Windows & Unix

       $REQUEST = "$req_mthd $path $protocol\n";
       if(eregi("^HTTP\/1\.[1-9]",$protocol)) $REQUEST .= "Host: $host\n";
       $REQUEST .= "User-Agent: $user_agent\n";
  if($referer) {
   $REQUEST .= "Referer: $referer\n";
       $REQUEST .= "Connection: close\n";

  if($cookies) {
   $REQUEST .= $cookies2send;

  if($req_mthd=="POST") {
   $REQUEST .= "Content-length: " . (strlen($content)) . "\n";
   $REQUEST .= "Content-type: application/x-www-form-urlencoded\n";
   $REQUEST .= "\n$content\n";
  fputs($fp,"$REQUEST\n"); // complete the request
#  print "$REQUEST\n";

   return array("content"=>' ',"headers"=>'
',"errcode"=>0,"errmsg"=>"Non-Fatal Error: Timed out while downloading
  while (!feof($fp) && time()<$timeout) {
      $output = fgets($fp,255);

      $view_output .= $output;

            if(!isset($header)) {
       if($output=="\n" || $output == "\r\n" || $output == "\n\l") {
                          $header = $view_output;
                     $view_output = '';




n-Fatal Error: Timed out while downloading page");


} // end function get_url

/* ************************************* */

/**
 * Parses a raw HTTP response header block into an associative array.
 *
 * A status line such as "HTTP/1.0 200 OK" contributes the keys 'version',
 * 'status_code' and 'status_text'. Every "Name: value" line contributes an
 * entry keyed by the lower-cased header name with '-' replaced by '_'
 * (for example 'content_type'). All values are trimmed.
 *
 * @param string $h Raw header text; lines separated by "\n" or "\r\n".
 * @return array Parsed headers, or an empty array when nothing matched.
 */
function get_headers($h) {
  // Always hand back an array, even when no line can be parsed.
  $hdrs = array();

  foreach (explode("\n", $h) as $line) {
    // The status-line pattern must stay on a single line: a line break
    // inside the pattern would require a newline in the subject, which a
    // line produced by explode("\n") can never contain.
    if (preg_match('#([A-Za-z]+)/([0-9]\.[0-9]) +([0-9]+) +([A-Za-z]+)#', $line, $r)) {
      $hdrs['version'] = trim($r[2]);
      $hdrs['status_code'] = trim($r[3]);
      $hdrs['status_text'] = trim($r[4]);
    } elseif (preg_match('/([^:]*): +(.*)/', $line, $r)) {
      $hdr = str_replace('-', '_', trim(strtolower($r[1])));
      // trim() also removes the "\r" left behind by "\r\n" line endings
      $hdrs[$hdr] = trim($r[2]);
    }
  }

  return $hdrs;
} // end function get_headers

/* ************************************* */

/**
 * Returns a single value from a raw HTTP response header block.
 *
 * Keys follow the same convention as get_headers(): 'version',
 * 'status_code' and 'status_text' come from the status line; any other
 * header is keyed by its lower-cased name with '-' replaced by '_'.
 *
 * @param string $h Raw header text; lines separated by "\n" or "\r\n".
 * @param string $w Key to look up, e.g. 'content_type' or 'status_code'.
 * @return string|null The trimmed value, or null when the key is absent.
 */
function get_a_header($h,$w) {
  $hdrs = array();

  foreach (explode("\n", $h) as $line) {
    // Single-line pattern: a line break inside it could never match a line
    // produced by explode("\n").
    if (preg_match('#([A-Za-z]+)/([0-9]\.[0-9]) +([0-9]+) +([A-Za-z]+)#', $line, $r)) {
      $hdrs['version'] = trim($r[2]);
      $hdrs['status_code'] = trim($r[3]);
      $hdrs['status_text'] = trim($r[4]);
    } elseif (preg_match('/([^:]*): +(.*)/', $line, $r)) {
      $hdr = str_replace('-', '_', trim(strtolower($r[1])));
      // trim() drops the "\r" that "\r\n" line endings leave on the value
      $hdrs[$hdr] = trim($r[2]);
    }
  }

  // Avoid an undefined-index notice when the header was not sent.
  return isset($hdrs[$w]) ? $hdrs[$w] : null;
} // end function get_a_header

/* ************************************* */

/**
 * Replaces short HTML entities with a single space.
 *
 * An entity here is '&', then two to four characters other than ';', then
 * ';' (so "&amp;" and "&nbsp;" match, while "&eacute;" does not).
 *
 * @param string $string Text that may contain entities.
 * @return string The text with each matching entity replaced by ' '.
 */
function strip_entities($string) {

 // preg_replace replaces ereg_replace, which was removed in PHP 7.
 return preg_replace('/&[^;]{2,4};/', ' ', $string);

} // end function strip_entities


/**
 * Strips HTML tags and collapses whitespace.
 *
 * Each "<...>" tag and each run of tabs, newlines or carriage returns
 * becomes a space, after which runs of spaces are reduced to one space.
 *
 * @param string $string HTML or text to clean.
 * @return string The text without tags and with single-space separation.
 */
function strip_html($string) {

  // One global replacement per step is enough; no loop is required because
  // replacing a match with a space cannot create a new match.
  $string = preg_replace('/<[^>]*>/', ' ', $string);

  // Tabs, newlines and carriage returns all turn into spaces. The former
  // separate "\l\r+" pass is dropped: carriage returns are already gone by
  // then, so it could never match.
  $string = preg_replace('/[\t\n\r]+/', ' ', $string);

  // Finally squeeze the spaces the steps above may have stacked up.
  $string = preg_replace('/ {2,}/', ' ', $string);

 return $string;
} // end function strip_html

/* ************************************* */

 /**
  * Builds a numeric timestamp from the string returned by microtime().
  *
  * The visible part of the expression adds the first 10 characters of that
  * string (as a double) to the 10 characters starting at offset 11.
  *
  * NOTE(review): the return expression ends in a dangling "+" and the
  * closing brace is missing in this copy; the final operand is unknown,
  * and $l is not used in the visible lines.
  */
 function full_time($l=0) {
   $microtime = microtime();
   return doubleval(substr($microtime,0,10))  +  substr($microtime,11,10) +

/* ************************************* */

/**
 * Collects the anchor tags found in $s.
 *
 * Each entry appended to $links is array(entire link, link target, link
 * title). When $url is given, targets that are not full URLs are combined
 * with the scheme, host, optional port and path of $url; otherwise the
 * href is stored as written.
 *
 * NOTE(review): this copy is missing lines -- closing braces, the initial
 * assignment of $pos_start, the body of an else branch, and the tails of
 * two concatenations that end in "." -- so it is not runnable as shown.
 */
function get_links($s,$url=''){

 if($url) {
   $p = parse_url($url);

  if($p["port"]) {
   $port = ":$p[port]";
  } else {
   $port = '';

 $copy = $s; // so we can return links and titles in their proper case
 $s = strtolower($s); // or else the strstr and strpos searches are case sensitive

  // NOTE(review): $pos_start is not assigned in the visible lines
  while($pos_start) {
    $pos_close = strpos($s,"</a",$pos_start);
 if($pos_close) {
    $pos_close += 4;
 } else {
    $array[] = substr( $copy , $pos_start , $pos_close-$pos_start );

  for($i=0;$i<count($array);$i++) {
   eregi('href *= *"?([^" >]*)"?[^>]*>(.*)</a *>?',$array[$i],$r);

 if($url) {
    if(!eregi("^mailto",$r[1])) {

  if(eregi("^(f|ht)tp",$r[1])) {
   /* full url */
   $this_url = $r[1];
  } elseif(eregi("^/",$r[1])) {
   /* absolute path, but not full url */
   $this_url = $p["scheme"] . "://" . $p["host"] . $port . $r[1];
  } else {
   if($p["path"] == "/" || $p["path"] == '') {
   /* relative link, but no url path */
    $this_url = $p["scheme"] . "://" . $p["host"] . $port . "/" .  $r[1];
   } else {
   /* relative link, with url path */
    if(ereg("/$",$p["path"])) {
    /* and the path ends in '/', so not a file */
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . $p["path"] .
    } else {
    /* and the path doesn't end in '/', so
       probably a file (but it *could* be
       a directory, we can't really know) */
     $remove = strrchr($p["path"],"/");
     $path = ereg_replace("$remove","/",$p["path"]);
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . "$path" .



    $links[] = array($array[$i],$this_url,$r[2]);


 } else {

    $links[] = array($array[$i],$r[1],$r[2]);



 return $links; /*
   array[$i][0] = entire link,
   array[$i][1] = link,
   array[$i][2] = link title

} // end function get_links

/* ************************************* */
/* Translate functions by Aral Balkan    */
/* ************************************* */

/**
 * Runs every translate_* pass over a page and returns the result.
 *
 * The passes are applied in sequence -- links, images, Flash objects,
 * embeds -- each one receiving the output of the previous pass.
 *
 * @param string $s   Page content to rewrite.
 * @param string $url Full URL of the page, forwarded to every pass.
 * @return string The content after all four passes.
 */
function translate($s, $url=''){

 // Calls all translate functions and returns the result
 $translated = $this->translate_links($s, $url);
 $translated = $this->translate_images($translated, $url);
 $translated = $this->translate_flash_objects($translated, $url);
 $translated = $this->translate_embeds($translated, $url);

 return $translated;

} // end function translate

/* ************************************* */

/**
 * Rebuilds $s with the hrefs of its anchor tags made absolute.
 *
 * Anchor tags are cut out of the page, the text between them is kept in
 * $rest_of_page, each rewritten tag is stored in $absolute_links, and the
 * pieces plus the trailing $footer are concatenated into the return value.
 * Targets are resolved against the scheme, host, optional port and path
 * of $url.
 *
 * NOTE(review): this copy is missing lines -- closing braces, the initial
 * assignment of $pos_start, the end of the substr() call that fills
 * $rest_of_page, the tails of two concatenations ending in ".", and any
 * increment of $count -- so it is not runnable as shown.
 */
function translate_links($s,$url=''){

 // added 6.22.01 Aral Balkan
 // Replaces relative links in the body of passed string with
 // absolute links.

 if($url) {
   $p = parse_url($url);

  if($p["port"]) {
   $port = ":$p[port]";
  } else {
   $port = '';

 $copy = $s; // so we can return links and titles in their proper case
 $s = strtolower($s); // or else the strstr and strpos searches are case sensitive

 $previous_close = 0; // save the previous closing location (top = 0)

 // NOTE(review): $pos_start is not assigned in the visible lines
 while($pos_start) {
    $pos_close = strpos($s,"</a",$pos_start);

 if($pos_close) {
  $pos_close += 4;
 } else {
    $array[] = substr( $copy , $pos_start , $pos_close-$pos_start );

 // save the rest of the page
 // (to be used when stringing it back together again)
 // - aral 6/22/01
 $rest_of_page[] = substr($copy, $previous_close,
 $previous_close = $pos_close;


 for($i=0;$i<count($array);$i++) {
   eregi('href *= *"?([^" >]*)"?[^>]*>(.*)</a *>?',$array[$i],$r);

 if($url) {
    if(!eregi("^mailto",$r[1])) {

  if(eregi("^(f|ht)tp",$r[1])) {
   /* full url */
   $this_url = $r[1];
  } elseif(eregi("^/",$r[1])) {
   /* absolute path, but not full url */
   $this_url = $p["scheme"] . "://" . $p["host"] . $port . $r[1];
   // replace the relative link with the absolute one
   // NOTE(review): $r[1] is passed to eregi_replace as a regex pattern
   // without escaping -- confirm hrefs with metacharacters are handled
   if ( ('' !== $r[1]) && (NULL !== $r[1]) )  {
    $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
  } else {
   if($p["path"] == "/" || $p["path"] == '') {
    /* relative link, but no url path */
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . "/" .  $r[1];
    // replace the relative link with the absolute one
    if ( ('' !== $r[1]) && (NULL !== $r[1]) )  {
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);

   } else {
    /* relative link, with url path */
    if(ereg("/$",$p["path"])) {
     /* and the path ends in '/', so not a file */
     // added extra back slash - aral
      $this_url = $p["scheme"] . "://" . $p["host"] . $port . $p["path"] .
     // replace the relative link with the absolute one
     if ( ('' !== $r[1]) && (NULL !== $r[1]) )  {
      $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
    } else {
    /* and the path doesn't end in '/', so
       probably a file (but it *could* be
       a directory, we can't really know) */

     $remove = strrchr($p["path"],"/");
     $path = ereg_replace("$remove","/",$p["path"]);
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . "$path" .

     // replace the relative link with the absolute one
     if ( ('' !== $r[1]) && (NULL !== $r[1]) )  {
      $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);


    $links[] = array($array[$i],$this_url,$r[2]);


 } else {

    $links[] = array($array[$i],$r[1],$r[2]);



// save all the stuff after the very last link:

$footer = substr($copy, $pos_close);

// string together the page with all absolute links:

$count = 0;
$new_content = ''; // this will store the new page/content

// NOTE(review): no increment of $count is visible in this copy
while ($absolute_links[$count]) {
 $new_content .= $rest_of_page[$count].$absolute_links[$count];

$new_content .= $footer;

// return the new content:
return $new_content;

} // end function translate_links

/* ************************************* */

/**
 * Rebuilds $s with the src attributes of the collected tags made absolute.
 *
 * Tags are cut out of the page, the text between them is kept in
 * $rest_of_page, each rewritten tag is stored in $absolute_links, and the
 * pieces plus the trailing $footer are concatenated into the return value.
 * Targets are resolved against the scheme, host, optional port and path
 * of $url.
 *
 * NOTE(review): this copy is missing lines -- closing braces, the initial
 * assignment of $pos_start (so the tag being searched for is not visible),
 * the end of the substr() call that fills $rest_of_page, the tails of two
 * concatenations ending in ".", and any increment of $count -- so it is
 * not runnable as shown.
 */
function translate_images($s,$url=''){

 // added 6.22.01 Aral Balkan
 // Replaces relative links to images in the body of passed string with
 // absolute links to images.

 if($url) {
   $p = parse_url($url);

  if($p["port"]) {
   $port = ":$p[port]";
  } else {
   $port = '';

 $copy = $s; // so we can return links and titles in their proper case
 $s = strtolower($s); // or else the strstr and strpos searches are case sensitive

 $previous_close = 0; // save the previous closing location (top = 0)

 // NOTE(review): $pos_start is not assigned in the visible lines
 while($pos_start) {
    $pos_close = strpos($s,">",$pos_start);
 if($pos_close) {
    // NOTE(review): 4 is the length of "</a>" as used in translate_links;
    // after a single ">" this looks like it overshoots -- confirm
    $pos_close += 4;
 } else {
    $array[] = substr( $copy , $pos_start , $pos_close-$pos_start );

 // save the rest of the page
 // (to be used when stringing it back together again)
 // - aral 6/22/01
 $rest_of_page[] = substr($copy, $previous_close,
 $previous_close = $pos_close;



  for($i=0;$i<count($array);$i++) {
   eregi('src *= *"?([^" >]*)"?[^>]*>(.*) *>?',$array[$i],$r);

 if($url) {
    if(!eregi("^mailto",$r[1])) {

  if(eregi("^(f|ht)tp",$r[1])) {
   /* full url */
   // NOTE(review): nothing is appended to $absolute_links in the visible
   // lines of this branch, unlike the branches below -- confirm
   $this_url = $r[1];
  } elseif(eregi("^/",$r[1])) {
   /* absolute path, but not full url */
   $this_url = $p["scheme"] . "://" . $p["host"] . $port . $r[1];
   // $r[1] is used as an unescaped regex pattern here
   $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
  } else {
   if($p["path"] == "/" || $p["path"] == '') {
   /* relative link, but no url path */
    $this_url = $p["scheme"] . "://" . $p["host"] . $port . "/" .  $r[1];
    $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
   } else {
   /* relative link, with url path */
    if(ereg("/$",$p["path"])) {
    /* and the path ends in '/', so not a file */
    // added extra back slash - aral
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . $p["path"] .
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
    } else {
    /* and the path doesn't end in '/', so
       probably a file (but it *could* be
       a directory, we can't really know) */

     $remove = strrchr($p["path"],"/");
     $path = ereg_replace("$remove","/",$p["path"]);
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . "$path" .
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);


    $links[] = array($array[$i],$this_url,$r[2]);


 } else {

    $links[] = array($array[$i],$r[1],$r[2]);



// save all the stuff after the very last link:

$footer = substr($copy, $pos_close);

// string together the page with all absolute links:

$count = 0;
$new_content = ''; // this will store the new page/content

// NOTE(review): no increment of $count is visible in this copy
while ($absolute_links[$count]) {
 $new_content .= $rest_of_page[$count].$absolute_links[$count];

$new_content .= $footer;

// return the new content:
return $new_content;

} // end function translate_images

/* ************************************* */

/**
 * Rebuilds $s with the value attribute of each "<param name=movie" tag
 * made absolute.
 *
 * Matching tags are cut out of the page, the text between them is kept in
 * $rest_of_page, each rewritten tag is stored in $absolute_links, and the
 * pieces plus the trailing $footer are concatenated into the return value.
 * Targets are resolved against the scheme, host, optional port and path
 * of $url.
 *
 * NOTE(review): this copy is missing lines -- closing braces, the end of
 * the substr() call that fills $rest_of_page, the tails of two
 * concatenations ending in ".", and any increment of $count -- so it is
 * not runnable as shown.
 */
function translate_flash_objects($s,$url=''){

 // added 6.22.01 Aral Balkan
 // Replaces relative links to Flash movies in Object tags of passed string
 // with absolute links to Flash movies.

 if($url) {
   $p = parse_url($url);

  if($p["port"]) {
   $port = ":$p[port]";
  } else {
   $port = '';

 $copy = $s; // so we can return links and titles in their proper case
 $s = strtolower($s); // or else the strstr and strpos searches are case sensitive
 // the search string is unquoted, so only tags written exactly as
 // <param name=movie (after lower-casing) are found
 $pos_start=strpos($s,"<param name=movie");

 $previous_close = 0; // save the previous closing location (top = 0)

 while($pos_start) {
    $pos_close = strpos($s,">",$pos_start);
 if($pos_close) {
    // NOTE(review): 4 is the length of "</a>" as used in translate_links;
    // after a single ">" this looks like it overshoots -- confirm
    $pos_close += 4;
 } else {
    $array[] = substr( $copy , $pos_start , $pos_close-$pos_start );

 // save the rest of the page
 // (to be used when stringing it back together again)
 // - aral 6/22/01
 $rest_of_page[] = substr($copy, $previous_close,
 $previous_close = $pos_close;

    $pos_start=strpos($s,"<param name=movie",$pos_close);


  for($i=0;$i<count($array);$i++) {
   eregi('value *= *"?([^" >]*)"?[^>]*>(.*) *>?',$array[$i],$r);

 if($url) {
    if(!eregi("^mailto",$r[1])) {

  if(eregi("^(f|ht)tp",$r[1])) {
   /* full url */
   // NOTE(review): nothing is appended to $absolute_links in the visible
   // lines of this branch, unlike the branches below -- confirm
   $this_url = $r[1];
  } elseif(eregi("^/",$r[1])) {
   /* absolute path, but not full url */
   $this_url = $p["scheme"] . "://" . $p["host"] . $port . $r[1];
   // $r[1] is used as an unescaped regex pattern here
   $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
  } else {
   if($p["path"] == "/" || $p["path"] == '') {
   /* relative link, but no url path */
    $this_url = $p["scheme"] . "://" . $p["host"] . $port . "/" .  $r[1];
    $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
   } else {
   /* relative link, with url path */
    if(ereg("/$",$p["path"])) {
    /* and the path ends in '/', so not a file */
    // added extra back slash - aral
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . $p["path"] .
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
    } else {
    /* and the path doesn't end in '/', so
       probably a file (but it *could* be
       a directory, we can't really know) */

     $remove = strrchr($p["path"],"/");
     $path = ereg_replace("$remove","/",$p["path"]);
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . "$path" .
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);


    $links[] = array($array[$i],$this_url,$r[2]);


 } else {

    $links[] = array($array[$i],$r[1],$r[2]);



// save all the stuff after the very last link:

$footer = substr($copy, $pos_close);

// string together the page with all absolute links:

$count = 0;
$new_content = ''; // this will store the new page/content

// NOTE(review): no increment of $count is visible in this copy
while ($absolute_links[$count]) {
 $new_content .= $rest_of_page[$count].$absolute_links[$count];

$new_content .= $footer;

// return the new content:
return $new_content;

} // end function translate_flash_objects

/* ************************************* */

/**
 * Rebuilds $s with the src attributes of the collected tags made absolute.
 *
 * Tags are cut out of the page, the text between them is kept in
 * $rest_of_page, each rewritten tag is stored in $absolute_links, and the
 * pieces plus the trailing $footer are concatenated into the return value.
 * Targets are resolved against the scheme, host, optional port and path
 * of $url.
 *
 * NOTE(review): this copy is missing lines -- closing braces, the initial
 * assignment of $pos_start (so the tag being searched for is not visible),
 * the end of the substr() call that fills $rest_of_page, the tails of two
 * concatenations ending in ".", and any increment of $count -- so it is
 * not runnable as shown.
 */
function translate_embeds($s,$url=''){

 // added 6.22.01 Aral Balkan
 // Replaces relative links to <EMBED>ed plugins in the body of passed
string with absolute links.

 if($url) {
   $p = parse_url($url);

  if($p["port"]) {
   $port = ":$p[port]";
  } else {
   $port = '';

 $copy = $s; // so we can return links and titles in their proper case
 $s = strtolower($s); // or else the strstr and strpos searches are case sensitive

 $previous_close = 0; // save the previous closing location (top = 0)

 // NOTE(review): $pos_start is not assigned in the visible lines
 while($pos_start) {
    $pos_close = strpos($s,">",$pos_start);
 if($pos_close) {
    // NOTE(review): 4 is the length of "</a>" as used in translate_links;
    // after a single ">" this looks like it overshoots -- confirm
    $pos_close += 4;
 } else {
    $array[] = substr( $copy , $pos_start , $pos_close-$pos_start );

 // save the rest of the page
 // (to be used when stringing it back together again)
 // - aral 6/22/01
 $rest_of_page[] = substr($copy, $previous_close,
 $previous_close = $pos_close;



  for($i=0;$i<count($array);$i++) {
   eregi('src *= *"?([^" >]*)"?[^>]*>(.*) *>?',$array[$i],$r);

 if($url) {
    if(!eregi("^mailto",$r[1])) {

  if(eregi("^(f|ht)tp",$r[1])) {
   /* full url */
   // NOTE(review): nothing is appended to $absolute_links in the visible
   // lines of this branch, unlike the branches below -- confirm
   $this_url = $r[1];
  } elseif(eregi("^/",$r[1])) {
   /* absolute path, but not full url */
   $this_url = $p["scheme"] . "://" . $p["host"] . $port . $r[1];
   // $r[1] is used as an unescaped regex pattern here
   $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
  } else {
   if($p["path"] == "/" || $p["path"] == '') {
   /* relative link, but no url path */
    $this_url = $p["scheme"] . "://" . $p["host"] . $port . "/" .  $r[1];
    $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
   } else {
   /* relative link, with url path */
    if(ereg("/$",$p["path"])) {
    /* and the path ends in '/', so not a file */
    // added extra back slash - aral
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . $p["path"] .
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);
    } else {
    /* and the path doesn't end in '/', so
       probably a file (but it *could* be
       a directory, we can't really know) */

     $remove = strrchr($p["path"],"/");
     $path = ereg_replace("$remove","/",$p["path"]);
     $this_url = $p["scheme"] . "://" . $p["host"] . $port . "$path" .
     $absolute_links[] = eregi_replace($r[1], $this_url, $array[$i]);


    $links[] = array($array[$i],$this_url,$r[2]);


 } else {

    $links[] = array($array[$i],$r[1],$r[2]);



// save all the stuff after the very last link:

$footer = substr($copy, $pos_close);

// string together the page with all absolute links:

$count = 0;
$new_content = ''; // this will store the new page/content

// NOTE(review): no increment of $count is visible in this copy
while ($absolute_links[$count]) {
 $new_content .= $rest_of_page[$count].$absolute_links[$count];

$new_content .= $footer;

// return the new content:
return $new_content;

} // end function translate_embeds

/* ************************************* */

/**
 * Extracts the text of the first <title> element from an HTML document.
 *
 * The tag names are matched case-insensitively and the title text may not
 * contain '<'.
 *
 * @param string $s HTML source.
 * @return string|int The title text, or 0 when no title element is found.
 */
function get_page_title($s){

 // preg_match with the /i flag replaces eregi, which was removed in PHP 7.
 if(preg_match('#<title *>([^<]*)</title *>#i',$s,$r)) {
  return $r[1];
 } else {
  return 0;
 }

} // end function get_page_title

/* ************************************* */

/**
 * Collects <meta> tags from $s into an array keyed by the lower-cased
 * name / http-equiv attribute value.
 *
 * The first loop slices out every tag that starts with "<meta" up to and
 * including the next ">". strstr() is case-sensitive, so only the
 * lower-case spelling "<meta" is found by that loop.
 *
 * NOTE(review): this copy is missing lines -- the closing braces of both
 * loops and the remainder of the eregi() pattern after "content *=" -- so
 * how the content value ($r[3]) is captured is not visible here.
 */
function get_meta_tags($s){

  while($s = strstr($s,"<meta")) {
    $pos_close = strpos($s,">") + 1;
    $array[] = substr( $s , 0 , $pos_close );
    $s=substr( $s , $pos_close  );

  for($i=0;$i<count($array);$i++) {
   eregi('<meta +(name|httpd-equiv|http-equiv) *= *"?([^">]*)"? +content *=
   $meta[strtolower($r[2])] = $r[3];

 return $meta;

} // end function get_meta_tags

/**
 * Decides whether robots.txt permits $user_agent to fetch $url.
 *
 * The robots file is downloaded with get_url() (with robot checking turned
 * off for that request) and its "User-agent" / "Disallow" lines are grouped
 * per agent. The URL is refused when it contains any path disallowed for
 * "*" or for $user_agent.
 *
 * @param string $url        URL about to be fetched.
 * @param string $robots_url Location of the site's robots.txt.
 * @param string $user_agent User-agent name; compared case-insensitively.
 * @return bool TRUE when access is allowed (including when robots.txt is
 *              missing or not served with a 2xx status), FALSE otherwise.
 */
function robot_rules ($url,$robots_url,$user_agent) {
   $a = $this->get_url(array("url"=>"$robots_url","robot_rules"=>FALSE));
   $h = $this->get_headers($a["headers"]);

   // robots.txt doesn't exist (or no status could be read), we can index
   if(!isset($h["status_code"]) || $h["status_code"]<200 || $h["status_code"]>299) return TRUE;

   $hash = array();  // agent name (lower case) => array(path => 1)
   $ua = '';         // agent the current Disallow lines belong to

   foreach (explode("\n",$a["content"]) as $line) {
     // drop trailing comments
     $comment_at = strpos($line,'#');
     if($comment_at !== false) $line = substr($line,0,$comment_at);

     // split on the first ':' only, so values may contain colons
     $entry = explode(':',$line,2);
     if(count($entry) < 2) continue;

     $type = strtolower(trim($entry[0]));
     // trim() also removes the "\r" of "\r\n" line endings; the path keeps
     // its case because it is compared against $url as given
     $value = trim($entry[1]);

     if($type == "user-agent") {
       $ua = strtolower($value);
     } elseif($type == "disallow" && $value !== '') {
       // an empty Disallow value means "nothing is disallowed"
       $hash[$ua][$value] = 1;
     }
   }

   // Agent names were lower-cased while parsing, so lower-case the lookup.
   foreach (array("*",strtolower($user_agent)) as $agent) {
     if(!isset($hash[$agent])) continue;

     foreach (array_keys($hash[$agent]) as $path) {
       // numeric-looking paths become integer keys, hence the cast
       if(strpos($url,(string)$path)>0) return FALSE;
     }
   }

   return TRUE;

} // end function robot_rules

} // End Browser Class

