Beta16 has uploaded a new change for review. https://gerrit.wikimedia.org/r/104015
Change subject: weblinkchecker.py : XML and archived URL ...................................................................... weblinkchecker.py : XML and archived URL Same as the following for compat: * 7ba4f460897316ae1f5cbcca0080f8c3262d9abf read XML dump * I46c1737aea471691cd90f9ec21e3592ce0c69fde Internet Archive and Web Citation Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43 --- M pywikibot/textlib.py M scripts/weblinkchecker.py 2 files changed, 106 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/15/104015/1 diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 751634c..2da2452 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -888,6 +888,44 @@ linkR = re.compile(regex) return linkR +def getInternetArchiveURL(site, url, timestamp=None): + """Return archived URL by Internet Archive.""" + # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php + import json + query = u'http://archive.org/wayback/available?' + query += u'url=' + query += url + if not timestamp is None: + query += u'&timestamp=' + query += timestamp + if pywikibot.verbose: + pywikibot.output(u"Requesting query from Internet Archive: %s" % query) + jsontext = site.getUrl(query, retry=True, no_hostname=True) + if "closest" in jsontext: + data = json.loads(jsontext) + return data['archived_snapshots']['closest']['url'] + else: + return None + +def getWebCitationURL(site, url, timestamp=None): + """Return archived URL by Web Citation.""" + # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf + from BeautifulSoup import BeautifulStoneSoup + query = u'http://www.webcitation.org/query?' 
+ query += u'returnxml=true' + query += u'&url=' + query += url + if not timestamp is None: + query += u'&date=' + query += timestamp + if pywikibot.verbose: + pywikibot.output(u"Requesting query from Web Citation: %s" % query) + xmltext = site.getUrl(query, retry=True, no_hostname=True) + if "success" in xmltext: + data = BeautifulStoneSoup(xmltext) + return data.find('webcite_url').string + else: + return None #---------------------------------- # Functions dealing with templates diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py index fe138c7..28675e5 100644 --- a/scripts/weblinkchecker.py +++ b/scripts/weblinkchecker.py @@ -36,6 +36,11 @@ -namespace Only process templates in the namespace with the given number or name. This parameter may be used multiple times. +-xml Should be used instead of a simple page fetching method from + pagegenerators.py for performance and load issues + +-xmlstart Page to start with when using an XML dump + -ignore HTTP return codes to ignore. 
Can be provided several times : -ignore:401 -ignore:500 @@ -112,6 +117,7 @@ from pywikibot import i18n from pywikibot import config from pywikibot import pagegenerators +from pywikibot import xmlreader docuReplacements = { '&params;': pagegenerators.parameterHelp @@ -176,31 +182,45 @@ else: yield m.group('urlb') +class XmlDumpPageGenerator: + """Xml generator that yields pages containing a web link""" -class InternetArchiveConsulter: - def __init__(self, url): - self.url = url + def __init__(self, xmlFilename, xmlStart, namespaces): + self.xmlStart = xmlStart + self.namespaces = namespaces + self.skipping = bool(xmlStart) + self.site = pywikibot.getSite() - def getArchiveURL(self): - pywikibot.output(u'Consulting the Internet Archive for %s' % self.url) - archiveURL = 'http://web.archive.org/web/*/%s' % self.url + dump = xmlreader.XmlDump(xmlFilename) + self.parser = dump.parse() + + def __iter__(self): + return self + + def next(self): try: - f = urllib2.urlopen(archiveURL) - except urllib2.HTTPError: - # The Internet Archive yields a 403 error when the site was not - # archived due to robots.txt restrictions. - return - except UnicodeEncodeError: - return - data = f.read() - if f.headers.get('content-encoding', None) == 'gzip': - # Since 2008, the Internet Archive returns pages in GZIPed - # compression format. Unfortunatelly urllib2 doesn't handle - # the decompression for us, so we have to do it ourselves. 
- data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read() - if "Search Results for " in data: - return archiveURL - + for entry in self.parser: + if self.skipping: + if entry.title != self.xmlStart: + continue + self.skipping = False + page=pywikibot.Page(self.site, entry.title) + if not self.namespaces == []: + if page.namespace() not in self.namespaces: + continue + found = False + for url in weblinksIn(entry.text): + found = True + if found: + return page + except KeyboardInterrupt: + try: + if not self.skipping: + pywikibot.output( + u'To resume, use "-xmlstart:%s" on the command line.' + % entry.title) + except NameError: + pass class LinkChecker(object): """ @@ -509,10 +529,10 @@ def __init__(self, reportThread): self.reportThread = reportThread - site = pywikibot.getSite() + self.site = pywikibot.getSite() self.semaphore = threading.Semaphore() self.datfilename = pywikibot.config.datafilepath( - 'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.code)) + 'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.code)) # Count the number of logged links, so that we can insert captions # from time to time self.logCount = 0 @@ -528,7 +548,6 @@ """ Logs an error report to a text file in the deadlinks subdirectory. """ - site = pywikibot.getSite() if archiveURL: errorReport = u'* %s ([%s archive])\n' % (url, archiveURL) else: @@ -541,8 +560,8 @@ pywikibot.output(u"** Logging link for deletion.") txtfilename = pywikibot.config.datafilepath('deadlinks', 'results-%s-%s.txt' - % (site.family.name, - site.lang)) + % (self.site.family.name, + self.site.lang)) txtfile = codecs.open(txtfilename, 'a', 'utf-8') self.logCount += 1 if self.logCount % 30 == 0: @@ -573,8 +592,9 @@ # We'll list it in a file so that it can be removed manually. 
if timeSinceFirstFound > 60 * 60 * 24 * day: # search for archived page - iac = InternetArchiveConsulter(url) - archiveURL = iac.getArchiveURL() + archiveURL = pywikibot.getInternetArchiveURL(self.site, url) + if archiveURL is None: + archiveURL = pywikibot.getWebCitationURL(self.site, url) self.log(url, error, page, archiveURL) else: self.historyDict[url] = [(page.title(), now, error)] @@ -781,6 +801,7 @@ def main(): gen = None singlePageTitle = [] + xmlFilename = None # Which namespaces should be processed? # default to [] which means all namespaces will be processed namespaces = [] @@ -807,6 +828,17 @@ HTTPignore.append(int(arg[8:])) elif arg.startswith('-day:'): day = int(arg[5:]) + elif arg.startswith('-xmlstart'): + if len(arg) == 9: + xmlStart = pywikibot.input( + u'Please enter the dumped article to start with:') + else: + xmlStart = arg[10:] + elif arg.startswith('-xml'): + if len(arg) == 4: + xmlFilename = i18n.input('pywikibot-enter-xml-filename') + else: + xmlFilename = arg[5:] else: if not genFactory.handleArg(arg): singlePageTitle.append(arg) @@ -816,6 +848,13 @@ page = pywikibot.Page(pywikibot.getSite(), singlePageTitle) gen = iter([page]) + if xmlFilename: + try: + xmlStart + except NameError: + xmlStart = None + gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces) + if not gen: gen = genFactory.getCombinedGenerator() if gen: -- To view, visit https://gerrit.wikimedia.org/r/104015 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Beta16 <l.rabine...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits