Beta16 has uploaded a new change for review. https://gerrit.wikimedia.org/r/104015
Change subject: weblinkchecker.py : XML and archived URL ...................................................................... weblinkchecker.py : XML and archived URL Same as the following for compat: * 7ba4f460897316ae1f5cbcca0080f8c3262d9abf read XML dump * I46c1737aea471691cd90f9ec21e3592ce0c69fde Internet Archive and Web Citation Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43 --- M pywikibot/textlib.py M scripts/weblinkchecker.py 2 files changed, 106 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/15/104015/1 diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 751634c..2da2452 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -888,6 +888,44 @@ linkR = re.compile(regex) return linkR +def getInternetArchiveURL(site, url, timestamp=None): + """Return archived URL by Internet Archive.""" + # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php + import json + query = u'http://archive.org/wayback/available?' + query += u'url=' + query += url + if not timestamp is None: + query += u'&timestamp=' + query += timestamp + if pywikibot.verbose: + pywikibot.output(u"Requesting query from Internet Archive: %s" % query) + jsontext = site.getUrl(query, retry=True, no_hostname=True) + if "closest" in jsontext: + data = json.loads(jsontext) + return data['archived_snapshots']['closest']['url'] + else: + return None + +def getWebCitationURL(site, url, timestamp=None): + """Return archived URL by Web Citation.""" + # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf + from BeautifulSoup import BeautifulStoneSoup + query = u'http://www.webcitation.org/query?' 
+ query += u'returnxml=true' + query += u'&url=' + query += url + if not timestamp is None: + query += u'&date=' + query += timestamp + if pywikibot.verbose: + pywikibot.output(u"Requesting query from Web Citation: %s" % query) + xmltext = site.getUrl(query, retry=True, no_hostname=True) + if "success" in xmltext: + data = BeautifulStoneSoup(xmltext) + return data.find('webcite_url').string + else: + return None #---------------------------------- # Functions dealing with templates diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py index fe138c7..28675e5 100644 --- a/scripts/weblinkchecker.py +++ b/scripts/weblinkchecker.py @@ -36,6 +36,11 @@ -namespace Only process templates in the namespace with the given number or name. This parameter may be used multiple times. +-xml Should be used instead of a simple page fetching method from + pagegenerators.py for performance and load issues + +-xmlstart Page to start with when using an XML dump + -ignore HTTP return codes to ignore. 
Can be provided several times : -ignore:401 -ignore:500 @@ -112,6 +117,7 @@ from pywikibot import i18n from pywikibot import config from pywikibot import pagegenerators +from pywikibot import xmlreader docuReplacements = { '&params;': pagegenerators.parameterHelp @@ -176,31 +182,45 @@ else: yield m.group('urlb') +class XmlDumpPageGenerator: + """Xml generator that yields pages containing a web link""" -class InternetArchiveConsulter: - def __init__(self, url): - self.url = url + def __init__(self, xmlFilename, xmlStart, namespaces): + self.xmlStart = xmlStart + self.namespaces = namespaces + self.skipping = bool(xmlStart) + self.site = pywikibot.getSite() - def getArchiveURL(self): - pywikibot.output(u'Consulting the Internet Archive for %s' % self.url) - archiveURL = 'http://web.archive.org/web/*/%s' % self.url + dump = xmlreader.XmlDump(xmlFilename) + self.parser = dump.parse() + + def __iter__(self): + return self + + def next(self): try: - f = urllib2.urlopen(archiveURL) - except urllib2.HTTPError: - # The Internet Archive yields a 403 error when the site was not - # archived due to robots.txt restrictions. - return - except UnicodeEncodeError: - return - data = f.read() - if f.headers.get('content-encoding', None) == 'gzip': - # Since 2008, the Internet Archive returns pages in GZIPed - # compression format. Unfortunatelly urllib2 doesn't handle - # the decompression for us, so we have to do it ourselves. 
- data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read() - if "Search Results for " in data: - return archiveURL - + for entry in self.parser: + if self.skipping: + if entry.title != self.xmlStart: + continue + self.skipping = False + page=pywikibot.Page(self.site, entry.title) + if not self.namespaces == []: + if page.namespace() not in self.namespaces: + continue + found = False + for url in weblinksIn(entry.text): + found = True + if found: + return page + except KeyboardInterrupt: + try: + if not self.skipping: + pywikibot.output( + u'To resume, use "-xmlstart:%s" on the command line.' + % entry.title) + except NameError: + pass class LinkChecker(object): """ @@ -509,10 +529,10 @@ def __init__(self, reportThread): self.reportThread = reportThread - site = pywikibot.getSite() + self.site = pywikibot.getSite() self.semaphore = threading.Semaphore() self.datfilename = pywikibot.config.datafilepath( - 'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.code)) + 'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.code)) # Count the number of logged links, so that we can insert captions # from time to time self.logCount = 0 @@ -528,7 +548,6 @@ """ Logs an error report to a text file in the deadlinks subdirectory. """ - site = pywikibot.getSite() if archiveURL: errorReport = u'* %s ([%s archive])\n' % (url, archiveURL) else: @@ -541,8 +560,8 @@ pywikibot.output(u"** Logging link for deletion.") txtfilename = pywikibot.config.datafilepath('deadlinks', 'results-%s-%s.txt' - % (site.family.name, - site.lang)) + % (self.site.family.name, + self.site.lang)) txtfile = codecs.open(txtfilename, 'a', 'utf-8') self.logCount += 1 if self.logCount % 30 == 0: @@ -573,8 +592,9 @@ # We'll list it in a file so that it can be removed manually. 
if timeSinceFirstFound > 60 * 60 * 24 * day: # search for archived page - iac = InternetArchiveConsulter(url) - archiveURL = iac.getArchiveURL() + archiveURL = pywikibot.getInternetArchiveURL(self.site, url) + if archiveURL is None: + archiveURL = pywikibot.getWebCitationURL(self.site, url) self.log(url, error, page, archiveURL) else: self.historyDict[url] = [(page.title(), now, error)] @@ -781,6 +801,7 @@ def main(): gen = None singlePageTitle = [] + xmlFilename = None # Which namespaces should be processed? # default to [] which means all namespaces will be processed namespaces = [] @@ -807,6 +828,17 @@ HTTPignore.append(int(arg[8:])) elif arg.startswith('-day:'): day = int(arg[5:]) + elif arg.startswith('-xmlstart'): + if len(arg) == 9: + xmlStart = pywikibot.input( + u'Please enter the dumped article to start with:') + else: + xmlStart = arg[10:] + elif arg.startswith('-xml'): + if len(arg) == 4: + xmlFilename = i18n.input('pywikibot-enter-xml-filename') + else: + xmlFilename = arg[5:] else: if not genFactory.handleArg(arg): singlePageTitle.append(arg) @@ -816,6 +848,13 @@ page = pywikibot.Page(pywikibot.getSite(), singlePageTitle) gen = iter([page]) + if xmlFilename: + try: + xmlStart + except NameError: + xmlStart = None + gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces) + if not gen: gen = genFactory.getCombinedGenerator() if gen: -- To view, visit https://gerrit.wikimedia.org/r/104015 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Beta16 <l.rabine...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits