Beta16 has uploaded a new change for review. https://gerrit.wikimedia.org/r/104013
Change subject: weblinkchecker.py : get archived URL
......................................................................

weblinkchecker.py : get archived URL

* use API for querying Internet Archive: [[:mw:Archived Pages]]
* add query for Web Citation: bug 58815

Change-Id: I46c1737aea471691cd90f9ec21e3592ce0c69fde
---
M pywikibot/textlib.py
M weblinkchecker.py
2 files changed, 45 insertions(+), 33 deletions(-)


git pull ssh://gerrit.wikimedia.org:29418/pywikibot/compat refs/changes/13/104013/1

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 30d78b6..b2af9fb 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -886,6 +886,44 @@
     linkR = re.compile(regex)
     return linkR

+def getInternetArchiveURL(site, url, timestamp=None):
+    """Return archived URL by Internet Archive."""
+    # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+    import json
+    query = u'http://archive.org/wayback/available?'
+    query += u'url='
+    query += url
+    if not timestamp is None:
+        query += u'&timestamp='
+        query += timestamp
+    if pywikibot.verbose:
+        pywikibot.output(u"Requesting query from Internet Archive: %s" % query)
+    jsontext = site.getUrl(query, retry=True, no_hostname=True)
+    if "closest" in jsontext:
+        data = json.loads(jsontext)
+        return data['archived_snapshots']['closest']['url']
+    else:
+        return None
+
+def getWebCitationURL(site, url, timestamp=None):
+    """Return archived URL by Web Citation."""
+    # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+    from BeautifulSoup import BeautifulStoneSoup
+    query = u'http://www.webcitation.org/query?'
+    query += u'returnxml=true'
+    query += u'&url='
+    query += url
+    if not timestamp is None:
+        query += u'&date='
+        query += timestamp
+    if pywikibot.verbose:
+        pywikibot.output(u"Requesting query from Web Citation: %s" % query)
+    xmltext = site.getUrl(query, retry=True, no_hostname=True)
+    if "success" in xmltext:
+        data = BeautifulStoneSoup(xmltext)
+        return data.find('webcite_url').string
+    else:
+        return None

 #----------------------------------
 # Functions dealing with templates
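A note on the Wayback "availability" API used by getInternetArchiveURL() above: the check for "closest" in the raw reply works because the API only includes a "closest" entry under "archived_snapshots" when a snapshot exists. Below is a minimal, standalone sketch of that parsing step; the sample JSON payload is illustrative only (not a recorded API reply), while the key path mirrors the patch.

# Minimal sketch of parsing a Wayback availability reply, as done by
# getInternetArchiveURL() above. The sample payload is illustrative only.
import json

sample_with_snapshot = """{
  "archived_snapshots": {
    "closest": {
      "available": true,
      "url": "http://web.archive.org/web/20130101000000/http://example.com/",
      "timestamp": "20130101000000",
      "status": "200"
    }
  }
}"""

sample_without_snapshot = '{"archived_snapshots": {}}'


def closest_snapshot_url(jsontext):
    # Mirrors the patch: only parse the JSON when a "closest" entry is present.
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    return None


print(closest_snapshot_url(sample_with_snapshot))     # the archived URL
print(closest_snapshot_url(sample_without_snapshot))  # None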
diff --git a/weblinkchecker.py b/weblinkchecker.py
index 1eaa96b..1fdaa8e 100644
--- a/weblinkchecker.py
+++ b/weblinkchecker.py
@@ -176,32 +176,6 @@
         else:
             yield m.group('urlb')

-
-class InternetArchiveConsulter:
-    def __init__(self, url):
-        self.url = url
-
-    def getArchiveURL(self):
-        pywikibot.output(u'Consulting the Internet Archive for %s' % self.url)
-        archiveURL = 'http://web.archive.org/web/*/%s' % self.url
-        try:
-            f = urllib2.urlopen(archiveURL)
-        except urllib2.HTTPError:
-            # The Internet Archive yields a 403 error when the site was not
-            # archived due to robots.txt restrictions.
-            return
-        except UnicodeEncodeError:
-            return
-        data = f.read()
-        if f.headers.get('content-encoding', None) == 'gzip':
-            # Since 2008, the Internet Archive returns pages in GZIPed
-            # compression format. Unfortunatelly urllib2 doesn't handle
-            # the decompression for us, so we have to do it ourselves.
-            data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-        if "Search Results for " in data:
-            return archiveURL
-
-
 class LinkChecker(object):
     """
     Given a HTTP URL, tries to load the page from the Internet and checks if it
@@ -509,10 +483,10 @@

     def __init__(self, reportThread):
         self.reportThread = reportThread
-        site = pywikibot.getSite()
+        self.site = pywikibot.getSite()
         self.semaphore = threading.Semaphore()
         self.datfilename = pywikibot.config.datafilepath(
-            'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.lang))
+            'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.lang))
         # Count the number of logged links, so that we can insert captions
         # from time to time
         self.logCount = 0
@@ -528,7 +502,6 @@
         """
        Logs an error report to a text file in the deadlinks subdirectory.
        """
-        site = pywikibot.getSite()
         if archiveURL:
             errorReport = u'* %s ([%s archive])\n' % (url, archiveURL)
         else:
@@ -541,8 +514,8 @@
             pywikibot.output(u"** Logging link for deletion.")
             txtfilename = pywikibot.config.datafilepath('deadlinks',
                                                         'results-%s-%s.txt'
-                                                        % (site.family.name,
-                                                           site.lang))
+                                                        % (self.site.family.name,
+                                                           self.site.lang))
             txtfile = codecs.open(txtfilename, 'a', 'utf-8')
             self.logCount += 1
             if self.logCount % 30 == 0:
@@ -573,8 +546,9 @@
                 # We'll list it in a file so that it can be removed manually.
                 if timeSinceFirstFound > 60 * 60 * 24 * day:
                     # search for archived page
-                    iac = InternetArchiveConsulter(url)
-                    archiveURL = iac.getArchiveURL()
+                    archiveURL = pywikibot.getInternetArchiveURL(self.site, url)
+                    if archiveURL is None:
+                        archiveURL = pywikibot.getWebCitationURL(self.site, url)
                     self.log(url, error, page, archiveURL)
             else:
                 self.historyDict[url] = [(page.title(), now, error)]

--
To view, visit https://gerrit.wikimedia.org/r/104013
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I46c1737aea471691cd90f9ec21e3592ce0c69fde
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Beta16 <l.rabine...@gmail.com>
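For the Web Citation half of the change, getWebCitationURL() relies on BeautifulStoneSoup to pull the webcite_url element out of the returnxml=true reply. The standalone sketch below shows the same extraction with xml.etree.ElementTree from the standard library instead; the sample XML is a guessed response shape, with only the webcite_url element name and the "success" marker taken from the patch.

# Illustrative sketch of the extraction done by getWebCitationURL() above,
# using xml.etree.ElementTree instead of BeautifulStoneSoup. The sample XML
# is an assumed response shape, not a recorded webcitation.org reply.
import xml.etree.ElementTree as ET

sample_xml = """<webcitation>
  <result status="success">
    <webcite_url>http://www.webcitation.org/EXAMPLE123</webcite_url>
  </result>
</webcitation>"""


def webcitation_url(xmltext):
    # Mirrors the patch: bail out early unless the reply reports success.
    if "success" not in xmltext:
        return None
    root = ET.fromstring(xmltext)
    element = root.find('.//webcite_url')
    return element.text if element is not None else None


print(webcitation_url(sample_xml))  # the archived copy's URL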