For example, I downloaded this script from the web....

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Crawl a site and extract all unique URLs for html pages.

This script takes one argument: the url of the site to crawl.
If you want to store the output, redirect it to a file.

Usage example (output to console):
python crawlsite.py http://www.mysite.com

Usage example (output to file in Windows):
python crawlsite.py http://www.mysite.com > mylinks.txt

This script was written in haste. Please report errors to
[EMAIL PROTECTED]

This script uses the htmldata library by Connelly Barnes. Please
make sure it is available in the same folder.

"""

__author__ = 'Peter Krantz'
__version__ = '0.1'
__date__ = '2005/04/01'

import urllib2
import htmldata
import httplib
import sys
import urlparse
import codecs
import datetime


#Setup some basic parameters
useragentFirefox = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6) Gecko/20050223 Firefox/1.0.1" useragentIE6 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)" useragentSelf = "Sitecrawler " + __version__ + " " + __date__ + " by " + __author__

skippedProtocols = ("javascript", "mailto", "ftp", "gopher")
validContentTypes = ("text/html", "application/xhtml+xml")


#get command line parameters
#Starting url
url = sys.argv[1]

#Get root url
urlparts = urlparse.urlsplit(url)
rootUrl = urlparts[0] + "://" + urlparts[1]

#List of parsed urls
parsedurls = []

#Is contenttype parsable?
def isParsable(contentType):
   result = False
   for validContentType in validContentTypes:
       if validContentType in contentType:
           result = True
           break

   return result


def stripFragment(url):
    #Return the url without its #fragment part.
    protocol, server, path, query, fragment = urlparse.urlsplit(url)

    #urlunsplit re-inserts the "?" only when a query string is present.
    return urlparse.urlunsplit((protocol, server, path, query, ""))




def addUrlToHistory(url):
    global parsedurls

    #Add url without fragment to list of parsed urls
    parsedurls.append(stripFragment(url))



#Check if URL exists. Returns status and content type.
def urlIsOk(url):
    global rootUrl
    global parsedurls

    try:
        #split the url to get the request item
        protocol, server, path, query, fragment = urlparse.urlsplit(url)

        #Skip links where protocol is one of skippedProtocols
        if protocol in skippedProtocols:
            return (True, "unknown", 0)

        #Skip links to other sites
        if len(server) > 0:
            if url.find(rootUrl) == -1:
                return (False, "unknown", 0)

        #Skip same page links
        if len(fragment) > 0:
            if stripFragment(url) in parsedurls:
                return (False, "unknown", 0)

        #Check the url with a HEAD request so the body is not downloaded
        httpObj = httplib.HTTPConnection(server, 80)
        httpObj.connect()
        httpObj.putrequest('HEAD', path or "/")
        httpObj.putheader('Accept', '*/*')
        httpObj.putheader('User-Agent', useragentSelf)
        httpObj.endheaders()
        response = httpObj.getresponse()
        contentType = response.getheader("content-type", "unknown")
        httpObj.close()

        if response.status == 200:
            #Server reports url is OK.
            return (True, contentType, 200)
        elif response.status in (301, 302):
            #Moved permanently or redirected - follow the location header
            return urlIsOk(response.getheader("location"))
        else:
            #Server error message
            return (False, contentType, response.status)

    except Exception:
        return (False, "unknown", 999)



def checkUrl(url):
    #A url is worth crawling if the server says it exists and the
    #content type is one we can parse for further links.
    ok, contentType, status = urlIsOk(url)
    return ok and isParsable(contentType)




#get html for a page
def getContent(url):
    try:
        contents = urllib2.urlopen(url).read()
        return contents
    except Exception:
        #Return an empty document if the page could not be fetched
        return ""



#Get data
def printlinks(url, currentlevel):
    global parsedurls
    global currentUrl

    #Check if URL already parsed
    if not (stripFragment(url) in parsedurls):

        #check if url is ok
        if checkUrl(url):
            #Get doc
            currentUrl = url
            contents = getContent(url)

            #add title and url to list
            addUrlToHistory(url)

            #print url
            print url

            #recurse into the links found on this page
            links = htmldata.urlextract(contents, url)

            for u in links:
                printlinks(u.url, currentlevel)




#start script
printlinks(url, 0)

However, grabbing the links from a site of about 10 pages takes quite a while, and I would like to speed it up. Can you tell me how, please?
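
One thing visible in the script itself is that every page that gets crawled is requested twice: urlIsOk sends a HEAD request and getContent then downloads the same page again with a GET. Below is a minimal sketch of one possible change, assuming it is acceptable to read the Content-Type from the GET response instead; the helper name fetchIfHtml is hypothetical, and it reuses the script's isParsable.

import urllib2

def fetchIfHtml(url):
    #Fetch the url once; return the body only if the server reports a
    #parsable (html) content type, otherwise return an empty string.
    try:
        response = urllib2.urlopen(url)
        contentType = response.info().getheader("Content-Type", "")
        if isParsable(contentType):
            return response.read()
        return ""
    except Exception:
        return ""

With something like this, getContent and the Content-Type check in urlIsOk could be collapsed into a single request per page.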

Valentino Volonghi aka Dialtone wrote:
On Sun, 18 Mar 2007 22:36:05 +0100, "A.recca" <[EMAIL PROTECTED]> wrote:

My only problem is that I would like to speed up the program, and therefore make it multithreaded, setting the number of threads from the command line. Can you give me some advice?
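
For reference, here is a minimal sketch of the thread-pool pattern described in the question, with the number of worker threads read from the command line; the url list, the worker function and the output are placeholders, and the crawling logic itself is left out.

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import threading
import urllib2
import Queue

def worker(urlQueue, results):
    #Each worker pulls urls from the shared queue until it sees the
    #None sentinel, then exits.
    while True:
        url = urlQueue.get()
        if url is None:
            break
        try:
            results.append((url, urllib2.urlopen(url).read()))
        except Exception:
            results.append((url, ""))

if __name__ == "__main__":
    numThreads = int(sys.argv[1])
    urls = sys.argv[2:]

    urlQueue = Queue.Queue()
    results = []

    threads = [threading.Thread(target=worker, args=(urlQueue, results))
               for _ in range(numThreads)]
    for t in threads:
        t.start()

    for url in urls:
        urlQueue.put(url)
    #One sentinel per worker so every thread terminates.
    for _ in threads:
        urlQueue.put(None)

    for t in threads:
        t.join()

    for url, body in results:
        print url, len(body)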

The 'therefore' in the middle of that sentence sounds odd to me... You make a program fast by not using multi-threading, certainly not by using it :). Web clients in particular are written asynchronously;
have a look at libevent/pyevent or, better in my opinion, Twisted Matrix.
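
For what it is worth, a minimal sketch of the asynchronous style suggested here, using Twisted's getPage; the urls list is a placeholder, the function names are just for illustration, and only the concurrent downloading is shown, not the crawling logic.

#!/usr/bin/python
# -*- coding: utf-8 -*-
from twisted.internet import reactor, defer
from twisted.web.client import getPage

#Placeholder urls - in a real crawler these would come from the parsed pages.
urls = ["http://www.mysite.com/", "http://www.mysite.com/about.html"]

def printBody(body, url):
    #Called when a page has been downloaded.
    print url, len(body)

def printError(failure, url):
    #Called when a download fails.
    print url, failure.getErrorMessage()

deferreds = []
for url in urls:
    d = getPage(url)
    d.addCallback(printBody, url)
    d.addErrback(printError, url)
    deferreds.append(d)

#Stop the reactor once every request has finished, one way or the other.
defer.DeferredList(deferreds).addCallback(lambda _: reactor.stop())

reactor.run()

All requests are issued without waiting for the previous one to complete, which is where the speed-up over the sequential script comes from.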
