On Tue, Dec 16, 2008 at 2:20 PM, Jean-Paul Calderone <[email protected]> wrote:
> On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin <[email protected]> > wrote: > >> I have an HTTP Proxy made with twisted.web and want to change the request >> that the browser sends to the Proxy such that I erase the value of the >> 'accept-encoding' key from 'gzip,deflate' to ' '. >> >> I use the example from the Twisted Book: >> >> By adding the overridden process method in WordCountProxyRequest I can get >> the request header but have found no way to set a key, value pair. >> I want to make the server think that the browser does not support gzip >> because >> twisted seems to not support gzip as the response from www.google.com and >> many (but not all) sites appears still encoded. www.dpreview.com seems >> not >> to gzip the response, and so the response is processed correctly. >> >> What can I do to either correctly decode gzip responses or modify the >> 'accept-encoding' value to nothing so the server does not compress the >> response? >> >> Thank you! >> *Example 4-8. wordcountproxy.py* >> >> import sgmllib, re >> from twisted.web import proxy, http >> import sys >> from twisted.python import log >> log.startLogging(sys.stdout) >> >> WEB_PORT = 8000 >> PROXY_PORT = 8001 >> >> class WordParser(sgmllib.SGMLParser): >> def __init__(self): >> sgmllib.SGMLParser.__init__(self) >> self.chardata = [] >> self.inBody = False >> >> def start_body(self, attrs): >> self.inBody = True >> >> def end_body(self): >> self.inBody = False >> >> def handle_data(self, data): >> if self.inBody: >> self.chardata.append(data) >> >> def getWords(self): >> # extract words >> wordFinder = re.compile(r'\w*') >> words = wordFinder.findall("".join(self.chardata)) >> words = filter(lambda word: word.strip( ), words) >> print "WORDS ARE", words >> return words >> >> class WordCounter(object): >> ignoredWords = "the a of in from to this that and or but is was be >> can could i you they we at".split( ) >> >> def __init__(self): >> self.words = {} >> >> def addWords(self, words): >> for word in 
words: >> word = word.lower( ) >> if not word in self.ignoredWords: >> currentCount = self.words.get(word, 0) >> self.words[word] = currentCount + 1 >> >> class WordCountProxyClient(proxy.ProxyClient): >> def handleHeader(self, key, value): >> proxy.ProxyClient.handleHeader(self, key, value) >> > > How about skipping it here? > If I use here the following: print "[", key, ":", value,"]" I get: 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Cache-Control : no-cache, no-store, max-age=0, must-revalidate ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Pragma : no-cache ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Expires : Fri, 01 Jan 1990 00:00:00 GMT ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec 2008 13:37:21 GMT ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Type : text/javascript; charset=UTF-8 ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie : GMAIL_STAT_3492=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/ dragusin.ro ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie : GMAIL_IMP=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/ dragusin.ro ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Encoding : gzip ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ X-Content-Type-Options : nosniff ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Length : 14340 ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Server : GFE/1.3 ] 2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Connection : Close ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Cache-Control : private, max-age=0 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec 2008 13:37:21 GMT ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Expires : -1 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Type : text/html; charset=UTF-8 ] 2008-12-16 
15:37:22+0200 [WordCountProxyClient,client] [ Content-Encoding : gzip ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Server : gws ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Length : 2597 ] 2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Connection : Close ] So that is the response header. I need to override the request header, the one that the browser sends to the proxy server. See below: > > > if key.lower( ) == "content-type": >> if value.split(';')[0] == 'text/html': >> self.parser = WordParser( ) >> >> def handleResponsePart(self, data): >> proxy.ProxyClient.handleResponsePart(self, data) >> if hasattr(self, 'parser'): self.parser.feed(data) >> >> >> def handleResponseEnd(self): >> proxy.ProxyClient.handleResponseEnd(self) >> if hasattr(self, 'parser'): >> self.parser.close( ) >> self.father.wordCounter.addWords(self.parser.getWords( )) >> del(self.parser) >> >> class WordCountProxyClientFactory(proxy.ProxyClientFactory): >> def buildProtocol(self, addr): >> client = proxy.ProxyClientFactory.buildProtocol(self, addr) >> # upgrade proxy.proxyClient object to WordCountProxyClient >> client.__class__ = WordCountProxyClient >> return client >> >> class WordCountProxyRequest(proxy.ProxyRequest): >> protocols = {'http': WordCountProxyClientFactory} >> >> def __init__(self, wordCounter, *args): >> self.wordCounter = wordCounter >> proxy.ProxyRequest.__init__(self, *args) >> >> * def process(self): >> proxy.ProxyRequest.process(self) >> print "received_headers", proxy.ProxyRequest.getAllHeaders(self)* > > the print above prints: received_headers: {'accept-language': 'en-us,en;q=0.5', 'accept-encoding': 'gzip,deflate', 'keep-alive': '300', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'user-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.4) Gecko/2008111318 Ubuntu/8.10 (intrepid) Firefox/3.0.4', 'accept-charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'host': 'www.google.com', 'cookie': 
'PREF=ID=cfb3eb179de0c1e6:LD=en:NR=100:CR=2:TM=1228315308:LM=1229032156:GM=1:S=ImAuEufbnV6S7BAz; NID=17=lOVMiFLculcrfN-zUO7xxFTTUFzqQqaHOFHcG_BDmYFX8QKYbMoo7GrDoYH-8ASPBlVijG_Hstp7HSDQ_8WQexHPjwz6g_7ZVpBhwmh3vkKuO3jpf9dnzrnWthcW1mGh; S=photos_html=6ScUGfd699g4Xuuh0FeizA; TZ=-120', 'cache-control': 'max-age=0', 'proxy-connection': 'keep-alive'} these are the values I want to modify, the 'accept-encoding', to be specific. How can I do it? Thank you! > class WordCountProxy(proxy.Proxy): >> def __init__(self, wordCounter): >> self.wordCounter = wordCounter >> proxy.Proxy.__init__(self) >> >> def requestFactory(self, *args): >> return WordCountProxyRequest(self.wordCounter, *args) >> >> class WordCountProxyFactory(http.HTTPFactory): >> def __init__(self, wordCounter): >> self.wordCounter = wordCounter >> http.HTTPFactory.__init__(self) >> >> def buildProtocol(self, addr): >> protocol = WordCountProxy(self.wordCounter) >> return protocol >> >> # classes for web reporting interface >> class WebReportRequest(http.Request): >> def __init__(self, wordCounter, *args): >> self.wordCounter = wordCounter >> http.Request.__init__(self, *args) >> >> def process(self): >> self.setHeader("Content-Type", "text/html") >> words = self.wordCounter.words.items( ) >> words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1)) >> for word, count in words: >> self.write("<li>%s %s</li>" % (word, count)) >> self.finish( ) >> >> class WebReportChannel(http.HTTPChannel): >> def __init__(self, wordCounter): >> self.wordCounter = wordCounter >> http.HTTPChannel.__init__(self) >> >> def requestFactory(self, *args): >> return WebReportRequest(self.wordCounter, *args) >> >> class WebReportFactory(http.HTTPFactory): >> def __init__(self, wordCounter): >> self.wordCounter = wordCounter >> http.HTTPFactory.__init__(self) >> >> def buildProtocol(self, addr): >> return WebReportChannel(self.wordCounter) >> >> if __name__ == "__main__": >> from twisted.internet import reactor >> counter = WordCounter( ) >> prox = 
WordCountProxyFactory(counter) >> reactor.listenTCP(PROXY_PORT, prox) >> reactor.listenTCP(WEB_PORT, WebReportFactory(counter)) >> reactor.run( ) >> >> >> > Jean-Paul > > _______________________________________________ > Twisted-web mailing list > [email protected] > http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-web > -- Radu
_______________________________________________ Twisted-web mailing list [email protected] http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-web
