On Wednesday, July 06, 2011 18:15:45 Torsten Dreyer wrote:
> Hi,
>
Actually the correct script is the one attached here (it's late).
Hope it helps.
Adrian
#!/usr/bin/env python
# Copyright (C) 2011 Adrian Musceac
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os, sys
import io
import time
from urllib2 import *
from HTMLParser import *
import re
class Parserul(HTMLParser):
    """Collect the text found inside ``<div>`` elements tagged 'commit_message'.

    A ``<div>`` starts a capture when any of its attribute values equals
    'commit_message'.  After a page has been fed, ``text`` holds every
    captured message, each one preceded by a newline.
    """

    def __init__(self):
        # True while the parser is inside a commit-message <div>.
        self.getit = False
        # All completed messages, each prefixed with '\n'.
        self.text = ''
        # Text of the message currently being captured.
        self.bufr = ''
        # Number of nested <div> elements currently open inside the
        # captured one; needed so that an inner </div> does not end capture.
        self.div_depth = 0
        # Explicit base-class call: this file targets Python 2, where
        # HTMLParser is an old-style class and super() cannot be used.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start capturing on a commit-message <div>; count nested <div>s."""
        if tag.lower() != 'div':
            return
        if self.getit:
            self.div_depth += 1
        elif any(value == 'commit_message' for _, value in attrs):
            self.getit = True
            self.div_depth = 0

    def handle_data(self, data):
        """Accumulate text while inside a commit-message <div>."""
        if self.getit and data:
            self.bufr += data

    def handle_endtag(self, tag):
        """Finish the current message when its own </div> is reached."""
        if tag.lower() != 'div' or not self.getit:
            return
        if self.div_depth:
            # This </div> closes a nested element, not the message itself.
            self.div_depth -= 1
            return
        self.text += '\n' + self.bufr
        self.bufr = ''
        self.getit = False
class Parserul2(HTMLParser):
    """Find the pagination "next page" link of a page.

    ``next_link`` is the ``href`` of the most recent ``<a>`` element seen
    with ``class="next_page"`` and ``rel="next"``; it stays '' until such a
    link is found.
    """

    def __init__(self):
        # Not used by this class; kept because it is a public attribute.
        self.getit = False
        self.next_link = ''
        # Explicit base-class call: this file targets Python 2, where
        # HTMLParser is an old-style class and super() cannot be used.
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record the href of an <a class="next_page" rel="next"> element."""
        if tag.lower() != 'a':
            return
        # Look attributes up by name so the match does not depend on the
        # order in which they appear in the markup.
        attributes = dict(attrs)
        if (attributes.get('class') == 'next_page'
                and attributes.get('rel') == 'next'
                and attributes.get('href') is not None):
            self.next_link = attributes['href']

    def get_next_link(self):
        """Return the recorded link, or '' if none was found."""
        return self.next_link
def crawl_gitorious():
    """Append commit messages from a Gitorious commit log to ``./<repo>.log``.

    The repository is selected by ``sys.argv[1]`` ('fg', 'sg' or 'fgdata').
    Pages are fetched one after another by following the "next page" link.
    Crawling stops when that link's page number exceeds the per-repository
    limit, or when a page has no such link.

    Raises:
        SystemExit: if ``sys.argv[1]`` is missing or names no known
            repository.
    """
    prefix = 'http://gitorious.org'
    # argument -> (first page to fetch, log file base name, pages to crawl)
    repos = {
        ######### Edit number of pages to crawl in flightgear commit log #############
        'fg': ('/fg/flightgear/commits/next', 'flightgear', 20),
        ######### Edit number of pages to crawl in simgear commit log #############
        'sg': ('/fg/simgear/commits/next', 'simgear', 10),
        ######### Edit number of pages to crawl in fgdata commit log #############
        'fgdata': ('/fg/fgdata/commits/master', 'fgdata', 25),
    }
    if len(sys.argv) < 2 or sys.argv[1] not in repos:
        # Without this check an unknown argument left the variables below
        # unbound and the function died with an UnboundLocalError.
        sys.exit('Usage: crawler.py fg | sg | fgdata')
    next_link, git, xx = repos[sys.argv[1]]

    page_number = re.compile(r'page=(\d+)')
    while True:
        # Parenthesised single argument: prints the same text whether
        # print is a statement or a function.
        print(next_link)

        response = urlopen(prefix + next_link)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        parser = Parserul()
        parser_link = Parserul2()
        parser.feed(html)
        parser_link.feed(html)
        buf = re.sub('<[^<]+?>', '', parser.text)
        next_link = parser_link.get_next_link()
        parser.close()
        parser_link.close()

        # Opened only once there is something to write, and closed by the
        # context manager even if the write raises.
        with open('./' + git + '.log', 'ab') as log:
            log.write(buf)

        # No page number means there is no further page to follow.
        found = page_number.search(next_link)
        if found is None or int(found.group(1)) > xx:
            break
        time.sleep(3)  # DOS is bad manners
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # A string argument makes sys.exit write the message to stderr and
        # exit with status 1, so a missing argument is reported as a failure
        # instead of a successful run.
        sys.exit('Usage: crawler.py fg | sg | fgdata')
    crawl_gitorious()
------------------------------------------------------------------------------
All of the data generated in your IT infrastructure is seriously valuable.
Why? It contains a definitive record of application performance, security
threats, fraudulent activity, and more. Splunk takes this data and makes
sense of it. IT sense. And common sense.
http://p.sf.net/sfu/splunk-d2d-c2
_______________________________________________
Flightgear-devel mailing list
Flightgear-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/flightgear-devel