jenkins-bot has submitted this change and it was merged.

Change subject: Improve HTML parser to detect all IWM MW sites
......................................................................


Improve HTML parser to detect all IWM MW sites

The HTML parser previously used only the EditURI link to determine the
api.php endpoint.

Add HTML parser support for
* OpenSearch's opensearch_desc.php, introduced in MW 1.8, and
* Resource Loader's load.php, introduced in MW 1.17.

Also raise an exception for unsupported versions lower than MW 1.14.

Also fix flake8 issues in the site_detect module.
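
For illustration, a minimal sketch of driving the new parser against
typical MediaWiki <head> markup (the markup, URLs, and resulting values
below are invented examples, not taken from this change):

    from pywikibot.site_detect import WikiHTMLPageParser

    html = (
        '<link rel="EditURI" type="application/rsd+xml"'
        ' href="//example.org/w/api.php?action=rsd"/>'
        '<link rel="search" type="application/opensearchdescription+xml"'
        ' href="/w/opensearch_desc.php" title="Example (en)"/>'
        '<script src="/w/load.php?debug=false&amp;modules=startup"></script>'
    )
    parser = WikiHTMLPageParser('http://example.org/wiki/Main_Page')
    parser.feed(html)
    # Each recognised endpoint refines server and scriptpath; the
    # load.php link additionally implies MW 1.17+.
    # parser.server     -> 'http://example.org'
    # parser.scriptpath -> '/w'
    # parser.version    -> MediaWikiVersion('1.17.0')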

Bug: T111007
Change-Id: Iebadd7f782ab1b471d3de71fa1c5a52c0d6f1018
---
M pywikibot/site_detect.py
M tests/site_detect_tests.py
M tox.ini
3 files changed, 244 insertions(+), 56 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, but someone else must approve
  XZise: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/site_detect.py b/pywikibot/site_detect.py
index 13bfe27..7dd2587 100644
--- a/pywikibot/site_detect.py
+++ b/pywikibot/site_detect.py
@@ -13,17 +13,21 @@
 import json
 import re
 
-from distutils.version import LooseVersion as V
+import pywikibot
 
 from pywikibot.comms.http import fetch
-from pywikibot.tools import PY2, PYTHON_VERSION
+from pywikibot.exceptions import ServerError
+from pywikibot.tools import MediaWikiVersion, PY2, PYTHON_VERSION
 
 if not PY2:
     from html.parser import HTMLParser
-    from urllib.parse import urljoin
+    from urllib.parse import urljoin, urlparse
 else:
-    from HTMLParser import HTMLParser
-    from urlparse import urljoin
+    try:
+        from future.backports.html.parser import HTMLParser
+    except ImportError:
+        from HTMLParser import HTMLParser
+    from urlparse import urljoin, urlparse
 
 
 class MWSite(object):
@@ -38,25 +42,59 @@
     REwgVersion = re.compile(r'wgVersion ?= ?"([^"]*)"')
 
     def __init__(self, fromurl):
-        self.fromurl = fromurl
+        """
+        Constructor.
+
+        @raises ServerError: a server error occurred while loading the site
+        @raises Timeout: a timeout occurred while loading the site
+        @raises RuntimeError: Version not found or version less than 1.14
+        """
         if fromurl.endswith("$1"):
             fromurl = fromurl[:-2]
-        data = fetch(fromurl).content
+        r = fetch(fromurl)
+        if r.status == 503:
+            raise ServerError('Service Unavailable')
 
-        wp = WikiHTMLPageParser()
+        if fromurl != r.data.url:
+            pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url))
+            fromurl = r.data.url
+
+        self.fromurl = fromurl
+
+        data = r.content
+
+        wp = WikiHTMLPageParser(fromurl)
         wp.feed(data)
-        try:
-            self.version = wp.generator.replace("MediaWiki ", "")
-        except Exception:
-            self.version = "0.0"
 
-        if V(self.version) < V("1.17.0"):
+        self.version = wp.version
+        self.server = wp.server
+        self.scriptpath = wp.scriptpath
+        self.articlepath = None
+
+        try:
             self._parse_pre_117(data)
-        else:
-            self._parse_post_117(wp, fromurl)
+        except Exception as e:
+            pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e))
+
+        if self.api:
+            try:
+                self._parse_post_117()
+            except Exception as e:
+                pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e))
+
+            if not self.version:
+                self._fetch_old_version()
+
+        if not self.api:
+            raise RuntimeError('Unsupported url: {0}'.format(self.fromurl))
+
+        if (not self.version or
+                self.version < MediaWikiVersion('1.14')):
+            raise RuntimeError('Unsupported version: {0}'.format(self.version))
 
     @property
     def langs(self):
+        """Build interwikimap."""
         response = fetch(
             self.api +
             "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")
@@ -69,52 +107,78 @@
         return self.langs
 
     def _parse_pre_117(self, data):
+        """Parse HTML."""
         if not self.REwgEnableApi.search(data):
-            print("*** WARNING: Api does not seem to be enabled on %s"
-                  % self.fromurl)
+            pywikibot.log(
+                'wgEnableApi is not enabled in HTML of %s'
+                % self.fromurl)
         try:
-            self.version = self.REwgVersion.search(data).groups()[0]
+            self.version = MediaWikiVersion(
+                self.REwgVersion.search(data).groups()[0])
         except AttributeError:
-            self.version = None
+            pass
 
         self.server = self.REwgServer.search(data).groups()[0]
         self.scriptpath = self.REwgScriptPath.search(data).groups()[0]
         self.articlepath = self.REwgArticlePath.search(data).groups()[0]
         self.lang = self.REwgContentLanguage.search(data).groups()[0]
 
+    def _fetch_old_version(self):
+        """Extract the version from API help with ?version enabled."""
         if self.version is None:
-            # try to get version using api
             try:
-                d = json.loads(fetch(self.api + '?version&format=json').content)
+                d = fetch(self.api + '?version&format=json').content
+                try:
+                    d = json.loads(d)
+                except ValueError:
+                    # Fallback for old versions which didn't wrap help in JSON
+                    d = {'error': {'*': d}}
+
                 self.version = list(filter(
                     lambda x: x.startswith("MediaWiki"),
                     [l.strip()
                      for l in d['error']['*'].split("\n")]))[0].split()[1]
             except Exception:
                 pass
+            else:
+                self.version = MediaWikiVersion(self.version)
 
-    def _parse_post_117(self, wp, fromurl):
-        apipath = wp.edituri.split("?")[0]
-        fullurl = urljoin(fromurl, apipath)
-        response = fetch(fullurl + '?action=query&meta=siteinfo&format=json')
+    def _parse_post_117(self):
+        """Parse 1.17+ siteinfo data."""
+        response = fetch(self.api + '?action=query&meta=siteinfo&format=json')
         info = json.loads(response.content)['query']['general']
-        self.server = urljoin(fromurl, info['server'])
+        self.version = MediaWikiVersion.from_generator(info['generator'])
+        if self.version < MediaWikiVersion('1.17'):
+            return
+
+        self.server = urljoin(self.fromurl, info['server'])
         for item in ['scriptpath', 'articlepath', 'lang']:
             setattr(self, item, info[item])
 
-    def __cmp__(self, other):
+    def __eq__(self, other):
+        """Return True if equal to other."""
         return (self.server + self.scriptpath ==
                 other.server + other.scriptpath)
 
     def __hash__(self):
+        """Get hashable representation."""
         return hash(self.server + self.scriptpath)
 
     @property
     def api(self):
+        """
+        Get api URL.
+
+        @rtype: str or None
+        """
+        if self.server is None or self.scriptpath is None:
+            return
+
         return self.server + self.scriptpath + "/api.php"
 
     @property
     def iwpath(self):
+        """Get article path URL."""
         return self.server + self.articlepath
 
 
@@ -122,18 +186,85 @@
 
     """Wiki HTML page parser."""
 
-    def __init__(self):
+    def __init__(self, url):
+        """Constructor."""
         if PYTHON_VERSION < (3, 4):
             HTMLParser.__init__(self)
         else:
             super().__init__(convert_charrefs=True)
+        self.url = urlparse(url)
         self.generator = None
+        self.version = None
+        self._parsed_url = None
+        self.server = None
+        self.scriptpath = None
+
+    def set_version(self, value):
+        """Set highest version."""
+        if self.version and value < self.version:
+            return
+
+        self.version = value
+
+    def set_api_url(self, url):
+        """Set api_url."""
+        url = url.split('.php', 1)[0]
+        (value, script_name) = url.rsplit('/', 1)
+        if script_name not in ('api', 'load', 'opensearch_desc'):
+            return
+
+        if script_name == 'load':
+            self.set_version(MediaWikiVersion('1.17.0'))
+            if self._parsed_url:
+                # A Resource Loader link is less reliable than other links.
+                # Resource Loader can load resources from a different site.
+                # e.g. http://kino.skripov.com/index.php/$1
+                # loads resources from http://megawiki.net/
+                return
+
+        new_parsed_url = urlparse(value)
+        if self._parsed_url:
+            assert new_parsed_url.path == self._parsed_url.path
+
+        if not new_parsed_url.scheme or not new_parsed_url.netloc:
+            new_parsed_url = urlparse(
+                '{0}://{1}{2}'.format(
+                    new_parsed_url.scheme or self.url.scheme,
+                    new_parsed_url.netloc or self.url.netloc,
+                    new_parsed_url.path))
+        else:
+            if self._parsed_url:
+                # allow upgrades to https, but not downgrades
+                if self._parsed_url.scheme == 'https':
+                    if new_parsed_url.scheme != self._parsed_url.scheme:
+                        return
+
+                # allow http://www.brickwiki.info/ vs http://brickwiki.info/
+                if (new_parsed_url.netloc in self._parsed_url.netloc or
+                        self._parsed_url.netloc in new_parsed_url.netloc):
+                    return
+
+                assert new_parsed_url == self._parsed_url, '{0} != {1}'.format(
+                    self._parsed_url, new_parsed_url)
+
+        self._parsed_url = new_parsed_url
+        self.server = '{0}://{1}'.format(
+            self._parsed_url.scheme, self._parsed_url.netloc)
+        self.scriptpath = self._parsed_url.path
 
     def handle_starttag(self, tag, attrs):
+        """Handle an opening tag."""
         attrs = dict(attrs)
         if tag == "meta":
             if attrs.get('name') == 'generator':
                 self.generator = attrs["content"]
-        if tag == "link":
-            if attrs.get('rel') == 'EditURI':
-                self.edituri = attrs["href"]
+                try:
+                    self.version = MediaWikiVersion.from_generator(
+                        self.generator)
+                except ValueError:
+                    pass
+        elif tag == 'link' and 'rel' in attrs and 'href' in attrs:
+            if attrs['rel'] in ('EditURI', 'stylesheet', 'search'):
+                self.set_api_url(attrs['href'])
+        elif tag == 'script' and 'src' in attrs:
+            self.set_api_url(attrs['src'])
diff --git a/tests/site_detect_tests.py b/tests/site_detect_tests.py
index f13105f..bcad8cb 100644
--- a/tests/site_detect_tests.py
+++ b/tests/site_detect_tests.py
@@ -11,8 +11,9 @@
 
 from requests.exceptions import Timeout
 
+from pywikibot.exceptions import ServerError
 from pywikibot.site_detect import MWSite
-from pywikibot.tools import PY2
+from pywikibot.tools import MediaWikiVersion, PY2
 
 from tests.aspects import unittest, TestCase
 
@@ -23,10 +24,6 @@
 class TestWikiSiteDetection(TestCase):
 
     """Test Case for MediaWiki detection and site object creation."""
-
-    family = 'meta'
-    code = 'meta'
-    net = True
 
     def setUp(self):
         """Set up test."""
@@ -75,7 +72,7 @@
         self.all += [url]
         try:
             site = MWSite(url)
-        except Timeout as e:
+        except (ServerError, Timeout) as e:
             self.skips[url] = e
             return
         except Exception as e:
@@ -88,7 +85,7 @@
                 self.assertIsNone(site)
             else:
                 self.assertIsInstance(site, result)
-            self.passes[url] = result
+            self.passes[url] = site
         except AssertionError as error:
             self.failures[url] = error
 
@@ -102,15 +99,24 @@
 
     def assertAllPass(self):
         """Assert that all urls were detected as a MediaWiki site."""
-        self.assertEqual(len(self.passes), len(self.all) - len(self.skips))
-        self.assertEqual(len(self.failures), 0)
-        self.assertEqual(len(self.errors), 0)
+        self.assertEqual(set(self.passes), set(self.all) - set(self.skips))
+        self.assertEqual(self.failures, {})
+        self.assertEqual(self.errors, {})
 
     def assertAllError(self):
         """Assert that all urls were not detected as a MediaWiki site."""
-        self.assertEqual(len(self.passes), 0)
-        self.assertEqual(len(self.failures), 0)
-        self.assertEqual(len(self.errors), len(self.all) - len(self.skips))
+        self.assertEqual(self.passes, {})
+        self.assertEqual(self.failures, {})
+        self.assertEqual(set(self.errors), set(self.all) - set(self.skips))
+
+
+class InterWikiMapDetection(TestWikiSiteDetection):
+
+    """Test all urls on the interwiki map."""
+
+    family = 'meta'
+    code = 'meta'
+    net = True
 
     def test_IWM(self):
         """Test the load_site method for MW sites on the IWM list."""
@@ -133,39 +139,85 @@
                         self.errors[url] = error
                     else:
                         try:
-                            self.assertIsInstance(version, basestring)
-                            self.assertRegex(version, r'^\d\.\d+.*')
+                            self.assertIsInstance(version, MediaWikiVersion)
                             self.passes[url] = site
                         except AssertionError as error:
                             print('failed to parse version of ' + url)
                             self.failures[url] = error
 
+
+class SiteDetectionTestCase(TestWikiSiteDetection):
+
+    """Test all urls on the interwiki map."""
+
+    net = True
+
     def test_detect_site(self):
         """Test detection of MediaWiki sites."""
         self.assertSite('http://botwiki.sno.cc/wiki/$1')
-        self.assertSite('http://glossary.reuters.com/index.php?title=$1')
-        self.assertSite('http://www.livepedia.gr/index.php?title=$1')
         self.assertSite('http://guildwars.wikia.com/wiki/$1')
-        self.assertSite('http://www.hrwiki.org/index.php/$1')
+        self.assertSite('http://www.hrwiki.org/index.php/$1')  # v 1.15
         self.assertSite('http://www.proofwiki.org/wiki/$1')
         self.assertSite(
             'http://www.ck-wissen.de/ckwiki/index.php?title=$1')
         self.assertSite('http://en.citizendium.org/wiki/$1')
         self.assertSite(
             'http://www.lojban.org/tiki/tiki-index.php?page=$1')
-        self.assertSite('http://www.EcoReality.org/wiki/$1')
         self.assertSite('http://www.wikichristian.org/index.php?title=$1')
-        self.assertSite('http://wikitree.org/index.php?title=$1')
+        self.assertSite('https://en.wikifur.com/wiki/$1')
+        self.assertSite('http://bluwiki.com/go/$1')
+        self.assertSite('http://kino.skripov.com/index.php/$1')
+        self.assertAllPass()
+
+    def test_wikisophia(self):
+        """Test wikisophia.org which has redirect problems."""
+        # /index.php?title=$1 returns 404 although a wiki exists there,
+        # and the API is also hidden.
+        self.assertNoSite('http://wikisophia.org/index.php?title=$1')
+        self.assertAllError()
+
+    def test_pre_114_sites(self):
+        """Test pre 1.14 sites which should be detected as unsupported."""
+        # v1.12
+        self.assertNoSite('http://www.livepedia.gr/index.php?title=$1')
+        # v1.11
+        self.assertNoSite('http://www.wikifon.org/$1')
+        self.assertNoSite('http://glossary.reuters.com/index.php?title=$1')
+        # v1.11, with no query module
+        self.assertNoSite('http://wikitree.org/index.php?title=$1')
+        # v1.9
+        self.assertNoSite('http://www.wikinvest.com/$1')
+        self.assertAllError()
+
+    def test_non_standard_version_sites(self):
+        """Test non-standard version string sites."""
+        self.assertSite('https://wiki.gentoo.org/wiki/$1')
+        self.assertSite('http://wiki.arabeyes.org/$1')
+        self.assertSite('http://tfwiki.net/wiki/$1')
         self.assertAllPass()
 
     def test_detect_failure(self):
         """Test detection failure for MediaWiki sites with an API."""
-        self.assertNoSite('https://en.wikifur.com/wiki/$1')
+        # SSL certificate verification fails
+        self.assertNoSite('http://hackerspaces.org/wiki/$1')
+        self.assertAllError()
+
+    @unittest.expectedFailure
+    def test_api_hidden(self):
+        """Test MediaWiki sites with a hidden enabled API."""
         # api.php is not available
         self.assertNoSite('http://wiki.animutationportal.com/index.php/$1')
-        # API is disabled
+        # HTML looks like it has an API, but redirect rules prevent access
+        self.assertNoSite('http://www.EcoReality.org/wiki/$1')
+        self.assertAllError()
+
+    def test_api_disabled(self):
+        """Test MediaWiki sites without an enabled API."""
         self.assertNoSite('http://wiki.linuxquestions.org/wiki/$1')
-        # offline
+        self.assertAllError()
+
+    def test_offline_sites(self):
+        """Test offline sites."""
         self.assertNoSite('http://seattlewiki.org/wiki/$1')
         self.assertAllError()
 
@@ -181,7 +233,6 @@
 
     def test_detect_nosite(self):
         """Test detection of non-wiki sites."""
-        self.assertNoSite('http://bluwiki.com/go/$1')
         self.assertNoSite('http://www.imdb.com/name/nm$1/')
         self.assertNoSite('http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=$1')
         self.assertNoSite('http://operawiki.info/$1')
@@ -192,8 +243,13 @@
         self.assertNoSite(
             'http://www.merriam-webster.com/cgi-bin/dictionary?book=Dictionary&va=$1')
         self.assertNoSite('http://arxiv.org/abs/$1')
+        self.assertAllError()
+
+    def test_musicbrainz_doc(self):
+        """Test http://musicbrainz.org/doc/ which has a page 'api.php'."""
+        # Possible false positive caused by the existence of a page
+        # called http://musicbrainz.org/doc/api.php
         self.assertNoSite('http://musicbrainz.org/doc/$1')
-        self.assertNoSite('http://wiki.animutationportal.com/index.php/$1')
         self.assertAllError()
 
 
diff --git a/tox.ini b/tox.ini
index 728dc58..15ecea9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -64,6 +64,7 @@
     pywikibot/pagegenerators.py \
     pywikibot/plural.py \
     pywikibot/proofreadpage.py \
+    pywikibot/site_detect.py \
     pywikibot/textlib.py \
     pywikibot/throttle.py \
     pywikibot/titletranslate.py \

-- 
To view, visit https://gerrit.wikimedia.org/r/230512
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Iebadd7f782ab1b471d3de71fa1c5a52c0d6f1018
Gerrit-PatchSet: 16
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
