Hey list,
I've been working on my spider program (albeit not very much :P) and I
would appreciate it if I could get some comments on the code. I'm fairly
sure I haven't chosen the best method to do what I want to do, but if we
can just assume that I have, that'll make things easier. ;)
In particular, comments on the "Things to do" list at the end would be
much appreciated.
The file is attached.
Cheers,
Dan
#!/usr/bin/python
### sPyder - A program designed to retrieve a particular, related set of web pages, using a spider-type search.
### sPyder is released under the GPL.
import re
import urllib
import distutils.dir_util
import os
from optparse import OptionParser
# Regex alternation describing which link attributes to follow: html pages,
# the common image types, and directory references like c="/images/.
# Raw string so the \w escape reaches the regex engine literally (a plain
# string here triggers an invalid-escape warning on modern Python).
# TODO: these should be moved to a configuration file, ideally.
DEFAULT_DEFS = r'[cf]=".*html"|c=".*jpg"|c=".*gif"|c=".*bmp"|[cf]="/\w*/'
def main():
    """Command-line entry point: parse the options and start the copy.

    Requires --site and --local_destination; exits with a usage error if
    either is missing.
    """
    # Usage string fixed to match the actual option flags (-l is the
    # local destination, -d is the definitions file).
    usage = "usage: %prog -s <remote site> -l <local destination> -f <source file> -d <definitions file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--site",
                      help="the site to be copied (must start with 'http://')")
    parser.add_option("-l", "--local_destination", dest="localdir",
                      help="the destination the site will be copied to")
    parser.add_option("-f", "--file", dest="source",
                      help="the file to start at (assumed to be 'index.html' if not specified)",
                      default="index.html")
    parser.add_option("-d", "--definitions",
                      help="a file containing definitions of what files to copy (reverts to defaults if not specified)",
                      default=None)
    (options, args) = parser.parse_args()

    site = options.site
    localdir = options.localdir
    source = options.source
    # Validate the required options BEFORE touching them: previously the
    # trailing-slash normalisation ran first and raised TypeError on None.
    if not (site and localdir):
        parser.error("both --site and --local_destination are required")
    # TODO: -d is meant to name a definitions *file*; reading that file is
    # not implemented yet, so the raw option value is used as the pattern.
    if options.definitions:
        definitions = options.definitions
    else:
        definitions = DEFAULT_DEFS
    # Normalise both locations to end with exactly one trailing slash.
    if not site.endswith('/'):
        site = site + "/"
    if not localdir.endswith('/'):
        localdir = localdir + "/"
    copy_items(site, localdir, source, definitions)
def get_page_items(site, localdir, source, defs):
    """Fetch site+source and return [source] plus every link in that page
    whose attribute matches the regex `defs`.

    Matches are snippets like c="foo.html"; the [3:-1] slice strips the
    leading letter, '="', and the trailing quote.  `localdir` is unused
    here but kept so all helpers share one signature.
    """
    found = []
    text = urllib.urlopen(site + source).read()
    # findall returns [] when nothing matches, so no separate re.search
    # guard is needed.
    for match in re.findall(defs, text):
        item = match[3:-1]
        if re.search(r'/\w*/\Z', item):
            # Directory reference.  NOTE(review): os.listdir cannot list a
            # remote URL -- this branch only works when `site` is a local
            # path; see the "straight references to a folder" TODO.
            # Was next.append(listing), which nested a whole list inside
            # the result; extend() flattens it as the callers expect.
            found.extend(os.listdir(site + item))
        else:
            found.append(item)
    src = [source] + found
    print(src)
    return src
def get_list(site, localdir, source, defs):
    """Return the de-duplicated list of pages reachable from `source`.

    Breadth-first crawl: each unseen entry on the worklist is scanned via
    get_page_items, and the links it yields are appended to the same list
    we are iterating, so they get visited later in this loop.
    """
    items = []
    # Seed with the start page only; previously the seed was a full
    # get_page_items() call AND the loop scanned the start page again,
    # fetching it twice.  The returned list is the same.
    pending = [source]
    for item in pending:
        if item not in items:
            items.append(item)
            pending.extend(get_page_items(site, localdir, item, defs))
    return items
def copy_items(site, localdir, source, defs):
    """Crawl from `source` and copy every discovered file under localdir."""
    items = get_list(site, localdir, source, defs)
    # Create all needed subdirectories once, up front; previously this ran
    # identically on every loop iteration.
    distutils.dir_util.create_tree(localdir, items)
    for item in items:
        original = urllib.urlopen(site + item)
        # 'wb', not 'w': many of the fetched files are images, and text
        # mode would corrupt them on platforms that translate newlines.
        local = open(localdir + item, 'wb')
        try:
            local.write(original.read())
        finally:
            local.close()
            original.close()
# Guard the entry point so importing this module doesn't start a crawl.
if __name__ == "__main__":
    main()
#Things to do:
# - Create 'definitions' file
# - Create support for straight references to a folder
# - Look into compiling RE for greater efficiency
_______________________________________________
Tutor maillist - [email protected]
http://mail.python.org/mailman/listinfo/tutor