On Wednesday, July 06, 2011 18:15:45 Torsten Dreyer wrote:
> Hi,
> 

Actually, the correct script is the one attached here (it's late).
Hope it helps.

Adrian
#!/usr/bin/env python

# Copyright (C) 2011 Adrian Musceac 
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import sys
import time
import re
from urllib2 import urlopen
from HTMLParser import HTMLParser

# Collects the text of every <div class="commit_message"> element.
class Parserul(HTMLParser):

	def __init__(self):
		HTMLParser.__init__(self)
		self.getit=False  # currently inside a commit_message div?
		self.text=''      # all messages collected so far
		self.bufr=''      # text of the div currently being read

	def handle_starttag(self,tag,attrs):
		if tag.lower()=='div':
			for att in attrs:
				if att[1]=='commit_message':
					self.getit=True

	def handle_data(self,data):
		if self.getit and data:
			self.bufr=self.bufr+data

	def handle_endtag(self,tag):
		if tag.lower()=='div' and self.getit:
			self.text=self.text+'\n'+self.bufr
			self.bufr=''
			self.getit=False
			
# Finds the "next page" link, i.e. <a href=... class="next_page" rel="next">.
class Parserul2(HTMLParser):

	def __init__(self):
		HTMLParser.__init__(self)
		self.next_link=''

	def handle_starttag(self,tag,attrs):
		if tag.lower()=='a':
			# expects the attributes in the order: href, class, rel
			if len(attrs)>2:
				if attrs[1][0]=='class' and attrs[1][1]=='next_page' and attrs[2][0]=='rel' and attrs[2][1]=='next':
					self.next_link=attrs[0][1]

	def get_next_link(self):
		return self.next_link

		
def crawl_gitorious():

	prefix='http://gitorious.org'
	if sys.argv[1]=='fg':
		next_link='/fg/flightgear/commits/next'
		git='flightgear'
		# Edit the number of pages to crawl in the flightgear commit log
		xx=20
	elif sys.argv[1]=='sg':
		next_link='/fg/simgear/commits/next'
		git='simgear'
		# Edit the number of pages to crawl in the simgear commit log
		xx=10
	elif sys.argv[1]=='fgdata':
		next_link='/fg/fgdata/commits/master'
		git='fgdata'
		# Edit the number of pages to crawl in the fgdata commit log
		xx=25
	else:
		print 'Usage: crawler.py fg | sg | fgdata'
		sys.exit(1)
	while True:
		parser=Parserul()
		parser_link=Parserul2()
		# Append this page's commit messages to ./<repo>.log
		ff=open('./'+git+'.log','ab')
		f=urlopen(prefix+next_link)
		print next_link
		g=f.read()
		parser.feed(g)
		parser_link.feed(g)
		next_link=parser_link.get_next_link()

		# Strip any markup left inside the collected messages
		buf=re.sub('<[^<]+?>','',parser.text)
		ff.write(buf)
		ff.close()
		f.close()
		parser.close()
		parser_link.close()
		# Stop when there is no further page or the page limit is reached
		if not next_link: break
		tmp=next_link.split('page=')
		if len(tmp)<2 or int(tmp[1])>xx: break
		time.sleep(3) # DOS is bad manners
		
if __name__ == "__main__":
	if len(sys.argv) < 2:
		print 'Usage: crawler.py fg | sg | fgdata'
		sys.exit()
	else:
		crawl_gitorious()
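
For anyone who wants to sanity-check the commit-message parsing without fetching anything from Gitorious, here is a minimal Python 2 sketch. The file name crawler.py and the HTML fragment are my own assumptions (the fragment only approximates the Gitorious markup, it is not a captured page):

# test_parser.py -- assumes the script above is saved as crawler.py
from crawler import Parserul

# made-up fragment approximating the commit list markup
sample = '''
<div class="commit_message">Fix the autopilot altitude hold</div>
<div class="commit_message">Sync <b>JSBSim</b> with upstream</div>
'''

p = Parserul()
p.feed(sample)
p.close()
# prints the extracted messages, one per line (a leading blank line is expected)
print p.text

If the two messages come out with the markup stripped, the parsing side is fine and any remaining trouble is down to the URLs or the paging logic.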
	