jenkins-bot has submitted this change and it was merged.

Change subject: Improve HTML parser to detect all IWM MW sites
......................................................................
Improve HTML parser to detect all IWM MW sites

The HTML parser used only EditURI to determine the api.php endpoint.
Add HTML parser support for
* OpenSearch's opensearch_desc.php, introduced in MW 1.8, and
* Resource Loader's load.php, introduced in MW 1.17.

Also raise an exception for unsupported versions lower than MW 1.14,
and fix flake8 issues in the site_detect module.

Bug: T111007
Change-Id: Iebadd7f782ab1b471d3de71fa1c5a52c0d6f1018
---
M pywikibot/site_detect.py
M tests/site_detect_tests.py
M tox.ini
3 files changed, 244 insertions(+), 56 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, but someone else must approve
  XZise: Looks good to me, approved
  jenkins-bot: Verified
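For illustration, here is a minimal sketch (not part of this change) of
what the parser extracts from the three kinds of head markup named
above. The sample HTML and the wiki.example.org URLs are invented for
the demo:

    from pywikibot.site_detect import WikiHTMLPageParser

    # Invented markup: an EditURI link, an OpenSearch description link
    # and a Resource Loader script, as typically emitted by MW skins.
    SAMPLE = """
    <html><head>
    <link rel="EditURI" type="application/rsd+xml"
          href="//wiki.example.org/w/api.php?action=rsd"/>
    <link rel="search" type="application/opensearchdescription+xml"
          href="/w/opensearch_desc.php" title="Example"/>
    <script src="/w/load.php?debug=false&modules=startup"></script>
    </head><body></body></html>
    """

    parser = WikiHTMLPageParser('http://wiki.example.org/wiki/Main_Page')
    parser.feed(SAMPLE)
    print(parser.server)      # http://wiki.example.org
    print(parser.scriptpath)  # /w
    print(parser.version)     # 1.17.0, implied by the load.php link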
diff --git a/pywikibot/site_detect.py b/pywikibot/site_detect.py
index 13bfe27..7dd2587 100644
--- a/pywikibot/site_detect.py
+++ b/pywikibot/site_detect.py
@@ -13,17 +13,21 @@
 import json
 import re
 
-from distutils.version import LooseVersion as V
+import pywikibot
 
 from pywikibot.comms.http import fetch
-from pywikibot.tools import PY2, PYTHON_VERSION
+from pywikibot.exceptions import ServerError
+from pywikibot.tools import MediaWikiVersion, PY2, PYTHON_VERSION
 
 if not PY2:
     from html.parser import HTMLParser
-    from urllib.parse import urljoin
+    from urllib.parse import urljoin, urlparse
 else:
-    from HTMLParser import HTMLParser
-    from urlparse import urljoin
+    try:
+        from future.backports.html.parser import HTMLParser
+    except ImportError:
+        from HTMLParser import HTMLParser
+    from urlparse import urljoin, urlparse
 
 
 class MWSite(object):
@@ -38,25 +42,59 @@
     REwgVersion = re.compile(r'wgVersion ?= ?"([^"]*)"')
 
     def __init__(self, fromurl):
-        self.fromurl = fromurl
+        """
+        Constructor.
+
+        @raises ServerError: a server error occurred while loading the site
+        @raises Timeout: a timeout occurred while loading the site
+        @raises RuntimeError: Version not found or version less than 1.14
+        """
         if fromurl.endswith("$1"):
             fromurl = fromurl[:-2]
-        data = fetch(fromurl).content
+        r = fetch(fromurl)
+        if r.status == 503:
+            raise ServerError('Service Unavailable')
 
-        wp = WikiHTMLPageParser()
+        if fromurl != r.data.url:
+            pywikibot.log('{0} redirected to {1}'.format(fromurl, r.data.url))
+            fromurl = r.data.url
+
+        self.fromurl = fromurl
+
+        data = r.content
+
+        wp = WikiHTMLPageParser(fromurl)
         wp.feed(data)
-        try:
-            self.version = wp.generator.replace("MediaWiki ", "")
-        except Exception:
-            self.version = "0.0"
-        if V(self.version) < V("1.17.0"):
+        self.version = wp.version
+        self.server = wp.server
+        self.scriptpath = wp.scriptpath
+        self.articlepath = None
+
+        try:
             self._parse_pre_117(data)
-        else:
-            self._parse_post_117(wp, fromurl)
+        except Exception as e:
+            pywikibot.log('MW pre-1.17 detection failed: {0!r}'.format(e))
+
+        if self.api:
+            try:
+                self._parse_post_117()
+            except Exception as e:
+                pywikibot.log('MW 1.17+ detection failed: {0!r}'.format(e))
+
+            if not self.version:
+                self._fetch_old_version()
+
+        if not self.api:
+            raise RuntimeError('Unsupported url: {0}'.format(self.fromurl))
+
+        if (not self.version or
+                self.version < MediaWikiVersion('1.14')):
+            raise RuntimeError('Unsupported version: {0}'.format(self.version))
 
     @property
     def langs(self):
+        """Build interwikimap."""
         response = fetch(
             self.api +
             "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")
@@ -69,52 +107,78 @@
         return self.langs
 
     def _parse_pre_117(self, data):
+        """Parse HTML."""
         if not self.REwgEnableApi.search(data):
-            print("*** WARNING: Api does not seem to be enabled on %s"
-                  % self.fromurl)
+            pywikibot.log(
+                'wgEnableApi is not enabled in HTML of %s'
+                % self.fromurl)
         try:
-            self.version = self.REwgVersion.search(data).groups()[0]
+            self.version = MediaWikiVersion(
+                self.REwgVersion.search(data).group(1))
         except AttributeError:
-            self.version = None
+            pass
 
         self.server = self.REwgServer.search(data).groups()[0]
         self.scriptpath = self.REwgScriptPath.search(data).groups()[0]
         self.articlepath = self.REwgArticlePath.search(data).groups()[0]
         self.lang = self.REwgContentLanguage.search(data).groups()[0]
 
+    def _fetch_old_version(self):
+        """Extract the version from API help with ?version enabled."""
         if self.version is None:
-            # try to get version using api
             try:
-                d = json.loads(fetch(self.api + '?version&format=json').content)
+                d = fetch(self.api + '?version&format=json').content
+                try:
+                    d = json.loads(d)
+                except ValueError:
+                    # Fallback for old versions which didn't wrap help in json
+                    d = {'error': {'*': d}}
+
                 self.version = list(filter(
                     lambda x: x.startswith("MediaWiki"),
                     [l.strip()
                      for l in d['error']['*'].split("\n")]))[0].split()[1]
             except Exception:
                 pass
+            else:
+                self.version = MediaWikiVersion(self.version)
 
-    def _parse_post_117(self, wp, fromurl):
-        apipath = wp.edituri.split("?")[0]
-        fullurl = urljoin(fromurl, apipath)
-        response = fetch(fullurl + '?action=query&meta=siteinfo&format=json')
+    def _parse_post_117(self):
+        """Parse 1.17+ siteinfo data."""
+        response = fetch(self.api + '?action=query&meta=siteinfo&format=json')
         info = json.loads(response.content)['query']['general']
-        self.server = urljoin(fromurl, info['server'])
+        self.version = MediaWikiVersion.from_generator(info['generator'])
+        if self.version < MediaWikiVersion('1.17'):
+            return
+
+        self.server = urljoin(self.fromurl, info['server'])
         for item in ['scriptpath', 'articlepath', 'lang']:
             setattr(self, item, info[item])
 
-    def __cmp__(self, other):
+    def __eq__(self, other):
+        """Return True if equal to other."""
         return (self.server + self.scriptpath ==
                 other.server + other.scriptpath)
 
     def __hash__(self):
+        """Get hashable representation."""
         return hash(self.server + self.scriptpath)
 
     @property
     def api(self):
+        """
+        Get api URL.
+
+        @rtype: str or None
+        """
+        if self.server is None or self.scriptpath is None:
+            return
+
         return self.server + self.scriptpath + "/api.php"
 
     @property
     def iwpath(self):
+        """Get article path URL."""
         return self.server + self.articlepath
@@ -122,18 +186,85 @@
 
     """Wiki HTML page parser."""
 
-    def __init__(self):
+    def __init__(self, url):
+        """Constructor."""
         if PYTHON_VERSION < (3, 4):
             HTMLParser.__init__(self)
         else:
             super().__init__(convert_charrefs=True)
+        self.url = urlparse(url)
         self.generator = None
+        self.version = None
+        self._parsed_url = None
+        self.server = None
+        self.scriptpath = None
+
+    def set_version(self, value):
+        """Set highest version."""
+        if self.version and value < self.version:
+            return
+
+        self.version = value
+
+    def set_api_url(self, url):
+        """Set api_url."""
+        url = url.split('.php', 1)[0]
+        (value, script_name) = url.rsplit('/', 1)
+        if script_name not in ('api', 'load', 'opensearch_desc'):
+            return
+
+        if script_name == 'load':
+            self.set_version(MediaWikiVersion('1.17.0'))
+            if self._parsed_url:
+                # A Resource Loader link is less reliable than other links.
+                # Resource Loader can load resources from a different site.
+                # e.g. http://kino.skripov.com/index.php/$1
+                # loads resources from http://megawiki.net/
+                return
+
+        new_parsed_url = urlparse(value)
+        if self._parsed_url:
+            assert new_parsed_url.path == self._parsed_url.path
+
+        if not new_parsed_url.scheme or not new_parsed_url.netloc:
+            new_parsed_url = urlparse(
+                '{0}://{1}{2}'.format(
+                    new_parsed_url.scheme or self.url.scheme,
+                    new_parsed_url.netloc or self.url.netloc,
+                    new_parsed_url.path))
+        else:
+            if self._parsed_url:
+                # allow upgrades to https, but not downgrades
+                if self._parsed_url.scheme == 'https':
+                    if new_parsed_url.scheme != self._parsed_url.scheme:
+                        return
+
+                # allow http://www.brickwiki.info/ vs http://brickwiki.info/
+                if (new_parsed_url.netloc in self._parsed_url.netloc or
+                        self._parsed_url.netloc in new_parsed_url.netloc):
+                    return
+
+                assert new_parsed_url == self._parsed_url, '{0} != {1}'.format(
+                    self._parsed_url, new_parsed_url)
+
+        self._parsed_url = new_parsed_url
+        self.server = '{0}://{1}'.format(
+            self._parsed_url.scheme, self._parsed_url.netloc)
+        self.scriptpath = self._parsed_url.path
 
     def handle_starttag(self, tag, attrs):
+        """Handle an opening tag."""
         attrs = dict(attrs)
         if tag == "meta":
             if attrs.get('name') == 'generator':
                 self.generator = attrs["content"]
-        if tag == "link":
-            if attrs.get('rel') == 'EditURI':
-                self.edituri = attrs["href"]
+                try:
+                    self.version = MediaWikiVersion.from_generator(
+                        self.generator)
+                except ValueError:
+                    pass
+        elif tag == 'link' and 'rel' in attrs and 'href' in attrs:
+            if attrs['rel'] in ('EditURI', 'stylesheet', 'search'):
+                self.set_api_url(attrs['href'])
+        elif tag == 'script' and 'src' in attrs:
+            self.set_api_url(attrs['src'])
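To see the detection flow end to end, a hypothetical usage sketch of
the entry point after this change (the URL is a placeholder; the
printed values assume a standard /w script path and MW 1.17+):

    from pywikibot.site_detect import MWSite

    try:
        site = MWSite('http://wiki.example.org/wiki/$1')
    except RuntimeError as e:
        # No api.php endpoint was found, or the version is below 1.14.
        # Network failures (ServerError, Timeout) propagate separately.
        print('Unsupported: {0}'.format(e))
    else:
        print(site.api)      # http://wiki.example.org/w/api.php
        print(site.version)  # a MediaWikiVersion instance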
diff --git a/tests/site_detect_tests.py b/tests/site_detect_tests.py
index f13105f..bcad8cb 100644
--- a/tests/site_detect_tests.py
+++ b/tests/site_detect_tests.py
@@ -11,8 +11,9 @@
 
 from requests.exceptions import Timeout
 
+from pywikibot.exceptions import ServerError
 from pywikibot.site_detect import MWSite
-from pywikibot.tools import PY2
+from pywikibot.tools import MediaWikiVersion, PY2
 
 from tests.aspects import unittest, TestCase
 
@@ -23,10 +24,6 @@
 class TestWikiSiteDetection(TestCase):
 
     """Test Case for MediaWiki detection and site object creation."""
-
-    family = 'meta'
-    code = 'meta'
-    net = True
 
     def setUp(self):
         """Set up test."""
@@ -75,7 +72,7 @@
         self.all += [url]
         try:
             site = MWSite(url)
-        except Timeout as e:
+        except (ServerError, Timeout) as e:
             self.skips[url] = e
             return
         except Exception as e:
@@ -88,7 +85,7 @@
                 self.assertIsNone(site)
             else:
                 self.assertIsInstance(site, result)
-            self.passes[url] = result
+            self.passes[url] = site
         except AssertionError as error:
             self.failures[url] = error
 
@@ -102,15 +99,24 @@
 
     def assertAllPass(self):
        """Assert that all urls were detected as a MediaWiki site."""
-        self.assertEqual(len(self.passes), len(self.all) - len(self.skips))
-        self.assertEqual(len(self.failures), 0)
-        self.assertEqual(len(self.errors), 0)
+        self.assertEqual(set(self.passes), set(self.all) - set(self.skips))
+        self.assertEqual(self.failures, {})
+        self.assertEqual(self.errors, {})
 
     def assertAllError(self):
         """Assert that all urls were not detected as a MediaWiki site."""
-        self.assertEqual(len(self.passes), 0)
-        self.assertEqual(len(self.failures), 0)
-        self.assertEqual(len(self.errors), len(self.all) - len(self.skips))
+        self.assertEqual(self.passes, {})
+        self.assertEqual(self.failures, {})
+        self.assertEqual(set(self.errors), set(self.all) - set(self.skips))
+
+
+class InterWikiMapDetection(TestWikiSiteDetection):
+
+    """Test all urls on the interwiki map."""
+
+    family = 'meta'
+    code = 'meta'
+    net = True
 
     def test_IWM(self):
         """Test the load_site method for MW sites on the IWM list."""
@@ -133,39 +139,85 @@
                 self.errors[url] = error
             else:
                 try:
-                    self.assertIsInstance(version, basestring)
-                    self.assertRegex(version, r'^\d\.\d+.*')
+                    self.assertIsInstance(version, MediaWikiVersion)
                     self.passes[url] = site
                 except AssertionError as error:
                     print('failed to parse version of ' + url)
                     self.failures[url] = error
+
+
+class SiteDetectionTestCase(TestWikiSiteDetection):
+
+    """Test detection of MediaWiki sites."""
+
+    net = True
+
     def test_detect_site(self):
         """Test detection of MediaWiki sites."""
         self.assertSite('http://botwiki.sno.cc/wiki/$1')
-        self.assertSite('http://glossary.reuters.com/index.php?title=$1')
-        self.assertSite('http://www.livepedia.gr/index.php?title=$1')
         self.assertSite('http://guildwars.wikia.com/wiki/$1')
-        self.assertSite('http://www.hrwiki.org/index.php/$1')
+        self.assertSite('http://www.hrwiki.org/index.php/$1')  # v 1.15
         self.assertSite('http://www.proofwiki.org/wiki/$1')
         self.assertSite(
             'http://www.ck-wissen.de/ckwiki/index.php?title=$1')
         self.assertSite('http://en.citizendium.org/wiki/$1')
         self.assertSite(
             'http://www.lojban.org/tiki/tiki-index.php?page=$1')
-        self.assertSite('http://www.EcoReality.org/wiki/$1')
         self.assertSite('http://www.wikichristian.org/index.php?title=$1')
-        self.assertSite('http://wikitree.org/index.php?title=$1')
+        self.assertSite('https://en.wikifur.com/wiki/$1')
+        self.assertSite('http://bluwiki.com/go/$1')
+        self.assertSite('http://kino.skripov.com/index.php/$1')
+        self.assertAllPass()
+
+    def test_wikisophia(self):
+        """Test wikisophia.org which has redirect problems."""
+        # /index.php?title=$1 reports 404; a wiki exists there,
+        # but the API is also hidden.
+        self.assertNoSite('http://wikisophia.org/index.php?title=$1')
+        self.assertAllError()
+
+    def test_pre_114_sites(self):
+        """Test pre 1.14 sites which should be detected as unsupported."""
+        # v1.12
+        self.assertNoSite('http://www.livepedia.gr/index.php?title=$1')
+        # v1.11
+        self.assertNoSite('http://www.wikifon.org/$1')
+        self.assertNoSite('http://glossary.reuters.com/index.php?title=$1')
+        # v1.11, with no query module
+        self.assertNoSite('http://wikitree.org/index.php?title=$1')
+        # v1.9
+        self.assertNoSite('http://www.wikinvest.com/$1')
+        self.assertAllError()
+
+    def test_non_standard_version_sites(self):
+        """Test non-standard version string sites."""
+        self.assertSite('https://wiki.gentoo.org/wiki/$1')
+        self.assertSite('http://wiki.arabeyes.org/$1')
+        self.assertSite('http://tfwiki.net/wiki/$1')
         self.assertAllPass()
 
     def test_detect_failure(self):
         """Test detection failure for MediaWiki sites with an API."""
-        self.assertNoSite('https://en.wikifur.com/wiki/$1')
+        # SSL certificate verification fails
+        self.assertNoSite('http://hackerspaces.org/wiki/$1')
+        self.assertAllError()
+
+    @unittest.expectedFailure
+    def test_api_hidden(self):
+        """Test MediaWiki sites with a hidden but enabled API."""
         # api.php is not available
         self.assertNoSite('http://wiki.animutationportal.com/index.php/$1')
-        # API is disabled
+        # HTML looks like it has an API, but redirect rules prevent access
+        self.assertNoSite('http://www.EcoReality.org/wiki/$1')
+        self.assertAllError()
+
+    def test_api_disabled(self):
+        """Test MediaWiki sites without an enabled API."""
         self.assertNoSite('http://wiki.linuxquestions.org/wiki/$1')
-        # offline
+        self.assertAllError()
+
+    def test_offline_sites(self):
+        """Test offline sites."""
         self.assertNoSite('http://seattlewiki.org/wiki/$1')
         self.assertAllError()
@@ -181,7 +233,6 @@
 
     def test_detect_nosite(self):
         """Test detection of non-wiki sites."""
-        self.assertNoSite('http://bluwiki.com/go/$1')
         self.assertNoSite('http://www.imdb.com/name/nm$1/')
         self.assertNoSite('http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=$1')
         self.assertNoSite('http://operawiki.info/$1')
@@ -192,8 +243,13 @@
         self.assertNoSite(
             'http://www.merriam-webster.com/cgi-bin/dictionary?book=Dictionary&va=$1')
         self.assertNoSite('http://arxiv.org/abs/$1')
+        self.assertAllError()
+
+    def test_musicbrainz_doc(self):
+        """Test http://musicbrainz.org/doc/ which has a page 'api.php'."""
+        # Possible false positive caused by the existence of a page
+        # called http://musicbrainz.org/doc/api.php
         self.assertNoSite('http://musicbrainz.org/doc/$1')
-        self.assertNoSite('http://wiki.animutationportal.com/index.php/$1')
         self.assertAllError()
 
diff --git a/tox.ini b/tox.ini
index 728dc58..15ecea9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -64,6 +64,7 @@
             pywikibot/pagegenerators.py \
             pywikibot/plural.py \
             pywikibot/proofreadpage.py \
+            pywikibot/site_detect.py \
             pywikibot/textlib.py \
             pywikibot/throttle.py \
             pywikibot/titletranslate.py \

-- 
To view, visit https://gerrit.wikimedia.org/r/230512
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Iebadd7f782ab1b471d3de71fa1c5a52c0d6f1018
Gerrit-PatchSet: 16
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits