John Vandenberg has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/150872

Change subject: Introduce static method Link.normalize(title)
......................................................................

Introduce static method Link.normalize(title)

Move the normalisation and basic validation logic from
Link.__init__ and Link.parse into a new static method that
can be applied to a bare title string when validation of
site namespaces and interwikis is not desirable.

Fixes bug 61832

Change-Id: I7021e3d7e40d72fd74709f396e967c9803248c52
---
M pywikibot/page.py
M pywikibot/site.py
M scripts/cosmetic_changes.py
M tests/wikibase_tests.py
4 files changed, 92 insertions(+), 43 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/72/150872/1
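
For reviewers, here is a minimal usage sketch of the new static method
introduced by this change (illustrative only; it assumes the patch below is
applied in a working pywikibot environment, and the input strings are made
up):

    import pywikibot
    from pywikibot.page import Link

    # Underscores and runs of whitespace are normalised without needing
    # a Site, namespace or interwiki lookup.
    Link.normalize(u'Foo__bar_ baz')   # returns u'Foo bar baz'

    # Basic validation now happens here and raises InvalidTitle.
    try:
        Link.normalize(u'Foo/../Bar')
    except pywikibot.InvalidTitle:
        pass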

diff --git a/pywikibot/page.py b/pywikibot/page.py
index 7a0d1bd..b17cded 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -1234,12 +1234,15 @@
         # element into a list in the format used by old scripts
         result = []
         for template in templates:
-            link = pywikibot.Link(template[0], self.site,
-                                  defaultNamespace=10)
             try:
+                link = pywikibot.Link(template[0], self.site,
+                                      defaultNamespace=10)
                 if link.canonical_title() not in titles:
                     continue
-            except pywikibot.Error:
+            except pywikibot.InvalidTitle:
+                # TODO: this exception handling should not be necessary;
+                # however, extract_templates_and_params on the en.wp main
+                # page returns titles like '#if:{{Main Page banner}}'.
                 # this is a parser function or magic word, not template name
                 continue
             args = template[1]
@@ -3505,10 +3508,14 @@
         following a '|' character inside the link
 
     """
-
+    # The hash/number symbol and vertical pipe symbol are valid in a Link,
+    # but are not valid in a Title.
     illegal_titles_pattern = re.compile(
+        r'''[\x23\x7c]'''
+    )
+    illegal_link_pattern = re.compile(
         # Matching titles will be held as illegal.
-        r'''[\x00-\x1f\x23\x3c\x3e\x5b\x5d\x7b\x7c\x7d\x7f]'''
+        r'''[\x00-\x1f\x3c\x3e\x5b\x5d\x7b\x7d\x7f]'''
         # URL percent encoding sequences interfere with the ability
         # to round-trip titles -- you can't link to them consistently.
         u'|%[0-9A-Fa-f]{2}'
@@ -3531,27 +3538,40 @@
             contain one (defaults to 0)
         @type defaultNamespace: int
 
+        @exception InvalidTitle: The title is not valid.
         """
         assert source is None or isinstance(source, pywikibot.site.BaseSite), \
             "source parameter should be a Site object"
+
+        if not text:
+            raise pywikibot.InvalidTitle('No title')
 
         self._text = text
         self._source = source or pywikibot.Site()
         self._defaultns = defaultNamespace
 
-        # preprocess text (these changes aren't site-dependent)
-        # First remove anchor, which is stored unchanged, if there is one
-        if u"|" in self._text:
-            self._text, self._anchor = self._text.split(u"|", 1)
-        else:
-            self._anchor = None
-
         # Clean up the name, it can come from anywhere.
-        # Convert HTML entities to unicode
-        t = html2unicode(self._text)
 
         # Convert URL-encoded characters to unicode
-        t = url2unicode(t, site=self._source)
+        # FIXME: to be moved into normalize after
+        # I9ca2a933d227afa79de8ce402304592682785d17
+        t = url2unicode(self._text, site=self._source)
+
+        self._text = Link.normalize(t)
+
+    @staticmethod
+    def normalize(title):
+        """
+        Normalise a title, with basic non-site-specific validation.
+
+        @param title: title to normalise
+        @type title: unicode
+        @return: unicode
+
+        @exception InvalidTitle: The title is not valid.
+        """
+        # Convert HTML entities to unicode
+        t = html2unicode(title)
 
         # Normalize unicode string to a NFC (composed) format to allow
         # proper string comparisons. According to
@@ -3563,8 +3583,8 @@
         # This code was adapted from Title.php : secureAndSplit()
         #
         if u'\ufffd' in t:
-            raise pywikibot.Error(
-                "Title contains illegal char (\\uFFFD 'REPLACEMENT CHARACTER')")
+            raise pywikibot.InvalidTitle(
+                "Title contains illegal char \\uFFFD (REPLACEMENT CHARACTER)")
 
         # Replace underscores by spaces
         t = t.replace(u"_", u" ")
@@ -3572,10 +3592,41 @@
         while u"  " in t:
             t = t.replace(u"  ", u" ")
         # Strip spaces at both ends
+        # TODO: Stripping trailing spaces breaks linktrails
+        #       and may cause the same issue with leading spaces.
         t = t.strip()
         # Remove left-to-right and right-to-left markers.
         t = t.replace(u"\u200e", u"").replace(u"\u200f", u"")
-        self._text = t
+
+        # Reject illegal characters.
+        m = Link.illegal_link_pattern.search(t)
+        if m:
+            raise pywikibot.InvalidTitle(
+                u"%s contains illegal char(s) %s"
+                % (repr(t), repr(m.group(0))))
+
+        # Pages with "/./" or "/../" appearing in the URLs will
+        # often be unreachable due to the way web browsers deal
+        # with 'relative' URLs. Forbid them explicitly.
+
+        if u'.' in t and (
+                t == u'.' or t == u'..'
+                or t.startswith(u"./")
+                or t.startswith(u"../")
+                or u"/./" in t
+                or u"/../" in t
+                or t.endswith(u"/.")
+                or t.endswith(u"/..")
+        ):
+            raise pywikibot.InvalidTitle(
+                "(contains . / combinations): '%s'"
+                % title)
+
+        # Magic tilde sequences? Nu-uh!
+        if u"~~~" in t:
+            raise pywikibot.InvalidTitle("(contains ~~~): '%s'" % title)
+
+        return t
 
     def __repr__(self):
         """Return a more complete string representation."""
@@ -3628,6 +3679,14 @@
         """
         self._site = self._source
         self._namespace = self._defaultns
+
+        # preprocess text (these changes aren't site-dependent)
+        # First remove anchor, which is stored unchanged, if there is one
+        if u"|" in self._text:
+            self._text, self._anchor = self._text.split(u"|", 1)
+        else:
+            self._anchor = None
+
         t = self._text
 
         # This code was adapted from Title.php : secureAndSplit()
@@ -3698,27 +3757,6 @@
         if m:
             raise pywikibot.InvalidTitle(
                 u"%s contains illegal char(s) %s" % (repr(t), 
repr(m.group(0))))
-
-        # Pages with "/./" or "/../" appearing in the URLs will
-        # often be unreachable due to the way web browsers deal
-        # * with 'relative' URLs. Forbid them explicitly.
-
-        if u'.' in t and (
-                t == u'.' or t == u'..'
-                or t.startswith(u"./")
-                or t.startswith(u"../")
-                or u"/./" in t
-                or u"/../" in t
-                or t.endswith(u"/.")
-                or t.endswith(u"/..")
-        ):
-            raise pywikibot.InvalidTitle(
-                "(contains . / combinations): '%s'"
-                % self._text)
-
-        # Magic tilde sequences? Nu-uh!
-        if u"~~~" in t:
-            raise pywikibot.InvalidTitle("(contains ~~~): '%s'" % self._text)
 
         if self._namespace != -1 and len(t) > 255:
             raise pywikibot.InvalidTitle("(over 255 bytes): '%s'" % t)
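
To summarise the page.py change, the character classes are now split as
follows (a sketch of the expected behaviour assuming the patch applies as
posted; the input strings are made up):

    import pywikibot
    from pywikibot.page import Link

    # Characters illegal in any link (illegal_link_pattern) are rejected
    # by normalize(), and therefore already by Link.__init__:
    for bad in (u'Foo<bar', u'Foo[bar]', u'../Foo', u'Foo~~~bar'):
        try:
            Link.normalize(bad)
        except pywikibot.InvalidTitle as error:
            print('rejected %r: %s' % (bad, error))

    # '#' and '|' (now the only characters in illegal_titles_pattern) are
    # still accepted here; they mark the section and the anchor and are
    # split off and checked later by the site-aware Link.parse().
    Link.normalize(u'Foo#Bar|baz')   # passes through unchanged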
diff --git a/pywikibot/site.py b/pywikibot/site.py
index c26e8cd..b00c16c 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -171,7 +171,12 @@
                 user = user[0].upper() + user[1:]
             if sysop:
                 sysop = sysop[0].upper() + sysop[1:]
+        if user:
+            user = pywikibot.Link.normalize(user)
+        if sysop:
+            sysop = pywikibot.Link.normalize(sysop)
         self._username = [user, sysop]
+
         self.use_hard_category_redirects = (
             self.code in self.family.use_hard_category_redirects)
 
@@ -424,6 +429,14 @@
 
         if title1 == title2:
             return True
+        try:
+            title1 = pywikibot.Link.normalize(title1)
+            title2 = pywikibot.Link.normalize(title2)
+        except pywikibot.InvalidTitle:
+            return False
+        if title1 == title2:
+            return True
+
         # determine whether titles contain namespace prefixes
         if ":" in title1:
             ns1, name1 = title1.split(":", 1)
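
The sametitle() addition amounts to the following early-exit logic (a
sketch of just the new fast path; the helper name is hypothetical, and the
real method continues with the namespace-aware comparison shown above):

    import pywikibot

    def sametitle_fast_path(title1, title2):
        """Return True or False when the answer is known, else None."""
        if title1 == title2:
            return True
        try:
            title1 = pywikibot.Link.normalize(title1)
            title2 = pywikibot.Link.normalize(title2)
        except pywikibot.InvalidTitle:
            return False
        if title1 == title2:
            return True
        return None   # caller falls through to the namespace comparison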
diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py
index 34a2d4f..a305c31 100755
--- a/scripts/cosmetic_changes.py
+++ b/scripts/cosmetic_changes.py
@@ -405,6 +405,8 @@
                 except pywikibot.InvalidTitle:
                     return match.group()
                 if namespace == 0:
+                    # TODO: This logic could be merged with Link.normalize
+
                     # Replace underlines by spaces, also multiple underlines
                     titleWithSection = re.sub('_+', ' ', titleWithSection)
                     # Remove double spaces
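
Regarding the TODO added above: the hand-rolled clean-up of
titleWithSection overlaps with Link.normalize() roughly as follows (a
sketch of the shared behaviour only, with made-up variable names; the
surrounding link clean-up also handles section anchors, so this is not a
drop-in replacement):

    import re
    from pywikibot.page import Link

    title_with_section = u'Some__page_ title'

    # What the script currently does by hand:
    cleaned = re.sub('_+', ' ', title_with_section)
    while u'  ' in cleaned:
        cleaned = cleaned.replace(u'  ', u' ')

    # What Link.normalize() produces for the same input:
    assert Link.normalize(title_with_section) == cleaned.strip()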
diff --git a/tests/wikibase_tests.py b/tests/wikibase_tests.py
index aca60b3..a76457e 100644
--- a/tests/wikibase_tests.py
+++ b/tests/wikibase_tests.py
@@ -289,10 +289,6 @@
         self.assertEquals(hasattr(item, '_content'), True)
         self.assertEquals(item.exists(), True)
 
-    def test_fromPage_invalid_title(self):
-        page = pywikibot.Page(pywikibot.page.Link("[]", site))
-        self.assertRaises(pywikibot.InvalidTitle, pywikibot.ItemPage.fromPage, page)
-
     def _test_fromPage_noitem(self, link):
         """Helper function to test a page without an associated item.
 
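
For reference, the deleted test would now fail during setup, because "[]"
is rejected by Link.normalize() inside Link.__init__ itself. If coverage
for that path is still wanted, a test along these lines would exercise the
new behaviour (an illustrative sketch, not part of this patch; the method
name is made up):

    def test_link_invalid_title(self):
        """Creating the Link itself should raise InvalidTitle."""
        self.assertRaises(pywikibot.InvalidTitle,
                          pywikibot.page.Link, "[]", site)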

-- 
To view, visit https://gerrit.wikimedia.org/r/150872
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7021e3d7e40d72fd74709f396e967c9803248c52
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jay...@gmail.com>
