Sorry, I found that there is already a solution. See this link: Scraping multiple JavaScript webpages with webkit <http://webscraping.com/blog/Scraping-multiple-JavaScript-webpages-with-webkit/>
On Wed, Oct 24, 2012 at 2:06 PM, flyer <[email protected]> wrote: > Thanks anyway. > > I found this > example<http://webscraping.com/blog/Scraping-JavaScript-webpages-with-webkit/> > . > After modifying this example to crawl a web page whose AJAX content couldn't be > parsed properly by the previous code, I get nearly all of the info of the > web page, including some info generated by AJAX when scrolling down the > scroll bar. > > The new example code also solved another problem of the previous code, namely that > the code could not quit properly. It had to be killed with a shell command or > by using some tricky statements which would cause a segmentation fault. > > The new example code works well for one page at a time. When I tried to > get many web pages consecutively, it could only get part of the info of the first > url; namely, the AJAX of the first url couldn't be parsed, the other urls > couldn't be handled, and I got the following errors: > > QObject::connect: Cannot connect >>> (null)::configurationAdded(QNetworkConfiguration) to >>> QNetworkConfigurationManager::configurationAdded(QNetworkConfiguration) >> >> QObject::connect: Cannot connect >>> (null)::configurationRemoved(QNetworkConfiguration) to >>> QNetworkConfigurationManager::configurationRemoved(QNetworkConfiguration) >> >> QObject::connect: Cannot connect (null)::configurationUpdateComplete() to >>> QNetworkConfigurationManager::updateCompleted() >> >> QObject::connect: Cannot connect (null)::onlineStateChanged(bool) to >>> QNetworkConfigurationManager::onlineStateChanged(bool) >> >> QObject::connect: Cannot connect >>> (null)::configurationChanged(QNetworkConfiguration) to >>> QNetworkConfigurationManager::configurationChanged(QNetworkConfiguration) >> >> Segmentation fault >> >> >> > I googled this problem and searched for it on Stack Overflow, but couldn't > find a good solution. > > Please give me some tips on this. Thank you in advance. 
> > The following is the new code I wrote: > > > > #!/usr/bin/env python > > #coding: utf-8 > > >> import sys > > import re > > import time > > >> from PyQt4.QtCore import SIGNAL, QUrl, QSize > > from PyQt4.QtGui import QApplication > > from PyQt4.QtWebKit import QWebView, QWebPage, QWebFrame, QWebSettings > > from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkRequest > > >> reload(sys) > > sys.setdefaultencoding('utf-8') > > >> fn_url = 'url.txt' > > fp_url = open(fn_url, 'ab+') > > >> >> class Render(QWebPage): > > >> def __init__(self, url): > > self.url = url > > self.app = QApplication(sys.argv) > > QWebPage.__init__(self) > > > > self.viewport = self.setViewportSize(QSize(1600, 9000)) > > > > self.network = NetworkAccessManager() > > self.setNetworkAccessManager(self.network) > > >> self.loadFinished.connect(self._loadFinished) > > >> QWebSettings.clearMemoryCaches() > > >> self.mainFrame().load(QUrl(self.url)) > > >> self.app.exec_() > > >> def _loadFinished(self, res): > > self.frame = self.mainFrame() > > self.app.quit() > > self.deleteLater() > > >> >> class NetworkAccessManager(QNetworkAccessManager): > > > > def __init__(self): > > super(NetworkAccessManager, self).__init__() > > >> self.connect(self, SIGNAL('finished (QNetworkReply *)'), >> self.finishd) > > >> self.ban = ( > > '.*\.css', > > '.*\.jpg', > > '.*\.png', > > ) > > > > def createRequest(self, operation, request, data): > > url = str(request.url().toString()) > > if re.search('.*\.css', url): > > self.setNetworkAccessible(QNetworkAccessManager.NotAccessible) > > else: > > self.setNetworkAccessible(QNetworkAccessManager.Accessible) > > > > return QNetworkAccessManager.createRequest(self, operation, >> request, data) > > >> def finishd(self, reply): > > print 'In NetworkAccessManager finishd' > > url = str(reply.url().toString()) > > > > log = '%s: %s\n' % (time.ctime(), url) > > fp_url.write(log) > > >> print url > > >> >> if __name__ == '__main__': > > urls_jd = ( > > 
'http://www.360buy.com/product/135896.html', > > 'http://www.360buy.com/product/742573.html', > > 'http://www.360buy.com/product/724557.html', > > 'http://www.360buy.com/product/690189.html', > > 'http://www.360buy.com/product/721948.html', > > 'http://www.360buy.com/product/722933.html', > > 'http://book.360buy.com/10120243.html', > > 'http://book.360buy.com/10009164.html', > > 'http://book.360buy.com/10875531.html', > > 'http://mvd.360buy.com/20003405.html', > > 'http://mvd.360buy.com/20064481.html', > > 'http://mvd.360buy.com/20063053.html', > > 'http://mvd.360buy.com/20061277.html', > > 'http://mvd.360buy.com/20006893.html', > > ) > > >> for url in urls_jd: > > r = Render(url) > > html = r.frame.toHtml().toUtf8() > > >> fn = '%s.txt' % (url.split('=')[-1], ) > > with open(fn, 'ab+') as fp: > > fp.write(html) > > del r > > print 'File %s' % (fn, ) > > > > On Mon, Oct 22, 2012 at 11:52 PM, Srini Kommoori <[email protected]> wrote: > >> Initially I thought you are missing scrolling but looks like you are >> only interested in getting the content either in text or html. I am >> not definite but some websites respond to particular clients really >> well. You can play with client string and see how it behaves. >> >> >> On Mon, Oct 22, 2012 at 1:11 AM, flyer <[email protected]> wrote: >> > I wrote a python script using QtWebKit to get all page info including >> info >> > generated by AJAX requests. I run the following code on CentOS Server >> and do >> > the following settings: >> > >> >> >> >> $ Xvfb :100 -screen 0 9000x15000x24 & >> >> >> >> $ export DISPLAY=:100 >> > >> > >> > The following code worked, however, it could only get one-screen info >> of the >> > web page, namely, getting different amount of info according to the >> screen >> > resolution. I could only get part of the info of the webpage. >> > >> > I have tried using selenium and I can get all web info if I set large >> screen >> > resolution using Xvfb . 
>> > >> > Please give me some tips about how to solve the problem and any manual >> for >> > QtWebKit is also appreciated because I can't find more materials about >> it. >> > >> > The following is my code: >> > >> >>> #!/usr/bin/env python >> >>> >> >>> #coding: utf-8 >> >>> >> >>> >> >>> >> >>> import sys >> >>> >> >>> import time >> >>> >> >>> >> >>> from PySide.QtCore import QUrl, SIGNAL >> >>> >> >>> from PySide.QtGui import QApplication >> >>> >> >>> from PySide.QtWebKit import QWebPage, QWebView, QWebSettings >> >>> >> >>> from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest >> >>> >> >>> >> >>> reload(sys) >> >>> >> >>> sys.setdefaultencoding('utf-8') >> >>> >> >>> >> >>> fn_log = 'url_dd.txt' >> >>> >> >>> fp_log = open(fn_log, 'ab+') >> >>> >> >>> >> >>> class WebPage(QWebPage): >> >>> >> >>> >> >>> def __init__(self, logger=None, parent=None): >> >>> >> >>> super(WebPage, self).__init__(parent) >> >>> >> >>> >> >>> >> >>> def javaScriptConsoleMessage(self, message, lineNumber, sourceID): >> >>> >> >>> sys.stderr.write('Javascritp error at line number %d\n' % >> >>> (lineNumber)) >> >>> >> >>> sys.stderr.write('%s\n' % (message, )) >> >>> >> >>> sys.stderr.write('Source ID: %s\n' % (sourceID, )) >> >>> >> >>> >> >>> >> >>> class Crawler(QApplication): >> >>> >> >>> >> >>> >> >>> def __init__(self, url): >> >>> >> >>> super(Crawler, self).__init__(sys.argv) >> >>> >> >>> >> >>> >> >>> self.url = url >> >>> >> >>> self.web_view = QWebView() >> >>> >> >>> self.web_page = WebPage() >> >>> >> >>> self.web_view.setPage(self.web_page) >> >>> >> >>> self.web_frame = self.web_page.mainFrame() >> >>> >> >>> >> >>> self.network = NetworkAccessManager() >> >>> >> >>> self.web_page.setNetworkAccessManager(self.network) >> >>> >> >>> >> >>> >> >>> self.settings = self.web_page.settings().globalSettings() >> >>> >> >>> self.settings.setAttribute(QWebSettings.AutoLoadImages, False) >> >>> >> >>> self.settings.setAttribute(QWebSettings.PluginsEnabled, 
False) >> >>> >> >>> QWebSettings.clearMemoryCaches() >> >>> >> >>> >> >>> self.web_view.resize(1024, 9000) >> >>> >> >>> >> >>> self.connect(self.web_page, SIGNAL('loadFinished(bool)'), >> >>> self.loadFinished) >> >>> >> >>> >> >>> print 'Before loading' >> >>> >> >>> self.web_view.load(QUrl(self.url)) >> >>> >> >>> print 'After loading' >> >>> >> >>> >> >>> def loadFinished(self, ok): >> >>> >> >>> print 'Start loadFinished()' >> >>> >> >>> >> >>> print 'Start writing' >> >>> >> >>> with open('content_dd.txt', 'ab+') as fp: >> >>> >> >>> fp.write(self.web_frame.toHtml().toUtf8()) >> >>> >> >>> print 'End writing' >> >>> >> >>> >> >>> >> >>> print 'End loadFinished()' >> >>> >> >>> >> >>> try: >> >>> >> >>> self.quit() >> >>> >> >>> except Exception, e: >> >>> >> >>> print 'FATAL ERROR: %s' % (str(e), ) >> >>> >> >>> >> >>> >> >>> class NetworkAccessManager(QNetworkAccessManager): >> >>> >> >>> >> >>> >> >>> def __init__(self): >> >>> >> >>> super(NetworkAccessManager, self).__init__() >> >>> >> >>> # QNetworkAccessManager.__init__(self) >> >>> >> >>> self.connect(self, SIGNAL('finished (QNetworkReply *)'), >> >>> self.finishd) >> >>> >> >>> >> >>> >> >>> def createRequest(self, operation, request, data): >> >>> >> >>> # url = request.url().toString() >> >>> >> >>> self.setNetworkAccessible(self.Accessible) >> >>> >> >>> >> >>> >> >>> return QNetworkAccessManager.createRequest(self, operation, >> >>> request, data) >> >>> >> >>> >> >>> def finishd(self, reply): >> >>> >> >>> print 'In NetworkAccessManager finishd' >> >>> >> >>> url = str(reply.url().toString()) >> >>> >> >>> >> >>> >> >>> log = '%s: %s\n' % (time.ctime(), url) >> >>> >> >>> fp_log.write(log) >> >>> >> >>> >> >>> print url >> >>> >> >>> >> >>> >> >>> if __name__ == '__main__': >> >>> >> >>> # url = >> >>> 'http://product.dangdang.com/product.aspx?product_id=22822333' >> >>> >> >>> url = ' >> http://product.dangdang.com/product.aspx?product_id=22848707' >> >>> >> >>> crawler = Crawler(url) >> >>> 
>> >>> sys.exit(crawler.exec_()) >> >> >> >> >> > >> > -- >> > 宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。 >> > >> > >> > >> > _______________________________________________ >> > PySide mailing list >> > [email protected] >> > http://lists.qt-project.org/mailman/listinfo/pyside >> > >> > > > > -- > 宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。 > > > -- 宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。
_______________________________________________ PySide mailing list [email protected] http://lists.qt-project.org/mailman/listinfo/pyside
