在 2012年12月24日星期一UTC+8上午8时34分47秒,iMath写道:
> How can I detect the character encoding of a web page,
> such as this page?
>
> http://python.org/
# I found that PyQt's QTextStream can very accurately detect the character
# encoding of a web page. Even for this bad page, where chardet and
# Beautiful Soup failed, QTextStream gets the right result.
#
# Here is my code:

import sys

from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork import *


def slotSourceDownloaded(reply):
    """Print the resolved URL and the decoded body of a finished reply.

    Intended as the slot for ``QNetworkAccessManager.finished``.

    The Location header of the reply is printed when present; otherwise the
    reply's own URL is printed. On a network error the error string is
    printed; on an empty body a "cannot find any resource" message is
    printed; otherwise the text read through ``QTextStream`` is printed.

    Whatever the outcome, the reply is scheduled for deletion and the
    application is asked to quit, so the event loop never keeps running
    after a failed or empty download.

    Args:
        reply: The finished ``QNetworkReply``.
    """
    try:
        redirect_location = reply.header(QNetworkRequest.LocationHeader)
        # Fall back to the reply's own URL when no Location header was sent.
        resolved_url = redirect_location if redirect_location else reply.url()
        print(resolved_url)

        if reply.error() != QNetworkReply.NoError:
            print('11111111', reply.errorString())
            return

        # NOTE(review): decoding is left entirely to QTextStream's default
        # codec handling / auto-detection -- confirm it covers the charsets
        # of the pages you care about.
        content = QTextStream(reply).readAll()
        if content == '':
            print('---------', 'cannot find any resource !')
            return

        print(content)
    finally:
        # Runs on the early returns above as well: without this the error
        # and empty-body paths would leave app.exec_() running forever.
        reply.deleteLater()
        qApp.quit()


if __name__ == '__main__':
    app = QCoreApplication(sys.argv)
    manager = QNetworkAccessManager()
    # Hook up the handler before any request is issued.
    manager.finished.connect(slotSourceDownloaded)

    url = input('input url :')
    # Round-trip through fromUserInput()/toEncoded() as in the original post.
    request = QNetworkRequest(
        QUrl.fromEncoded(QUrl.fromUserInput(url).toEncoded()))
    request.setRawHeader(
        "User-Agent",
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17 SE 2.X MetaSr 1.0')
    manager.get(request)

    sys.exit(app.exec_())

# --
# http://mail.python.org/mailman/listinfo/python-list