jenkins-bot has submitted this change and it was merged.

Change subject: Don't ignore "DoNotArchiveUntil" timestamps
......................................................................


Don't ignore "DoNotArchiveUntil" timestamps

Don't ignore timestamps written in HTML comments, e.g. as used by
"DoNotArchiveUntil".

See:
- https://commons.wikimedia.org/wiki/Template:DNAU
- https://en.wikipedia.org/wiki/Template:Do_not_archive_until

Analyze comments separately from the rest of each line to avoid skipping
dates in comments, as the date matched by timestripper is the rightmost
one.

Bug: T102423
Change-Id: I079d9f6b636ac0a145dd04a3190a65c61b9d1b31
---
M pywikibot/textlib.py
M tests/timestripper_tests.py
2 files changed, 72 insertions(+), 3 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 2fae512..de627c2 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -1609,6 +1609,10 @@
         ]
 
         self.linkP = compileLinkR()
+        self.comment_pattern = re.compile(r'<!--(.*?)-->')
+
+        self.tzinfo = tzoneFixedOffset(self.site.siteinfo['timeoffset'],
+                                       self.site.siteinfo['timezone'])
 
     def findmarker(self, text, base=u'@@', delta='@'):
         """Find a string which is not part of text."""
@@ -1661,6 +1665,17 @@
         """
         # match date fields
         dateDict = dict()
+        # Analyze comments separately from rest of each line to avoid to skip
+        # dates in comments, as the date matched by timestripper is the
+        # rightmost one.
+        most_recent = []
+        for comment in self.comment_pattern.finditer(line):
+            # Recursion levels can be maximum two. If a comment is found, it will
+            # not for sure be found in the next level.
+            # Nested comments are excluded by design.
+            timestamp = self.timestripper(comment.group(1))
+            most_recent.append(timestamp)
+
         # Remove parts that are not supposed to contain the timestamp, in order
         # to reduce false positives.
         line = removeDisabledParts(line)
@@ -1696,12 +1711,17 @@
                                      % (v, k))
 
             # find timezone
-            dateDict['tzinfo'] = tzoneFixedOffset(self.site.siteinfo['timeoffset'],
-                                                  self.site.siteinfo['timezone'])
+            dateDict['tzinfo'] = self.tzinfo
 
             timestamp = datetime.datetime(**dateDict)
-
         else:
             timestamp = None
 
+        most_recent.append(timestamp)
+
+        try:
+            timestamp = max(ts for ts in most_recent if ts is not None)
+        except ValueError:
+            timestamp = None
+
         return timestamp
diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py
index 80d70e0..a3f7e77 100644
--- a/tests/timestripper_tests.py
+++ b/tests/timestripper_tests.py
@@ -231,6 +231,55 @@
         self.assertEqual(self.ts.timestripper(txtNoMatch), None)
 
 
+class TestTimeStripperDoNotArchiveUntil(TestCase):
+
+    """Test cases for Do Not Archive Until templates.
+
+    See https://commons.wikimedia.org/wiki/Template:DNAU and
+    https://en.wikipedia.org/wiki/Template:Do_not_archive_until.
+    """
+
+    family = 'wikisource'
+    code = 'en'
+
+    cached = True
+
+    username = '[[User:DoNotArchiveUntil]]'
+    date = '06:57 06 June 2015 (UTC)'
+    user_and_date = username + ' ' + date
+    tzone = tzoneFixedOffset(0, 'UTC')
+
+    def test_timestripper_match(self):
+        """Test that dates in comments  are correctly recognised."""
+        ts = TimeStripper(self.get_site())
+
+        txt_match = '<!-- [[User:Do___ArchiveUntil]] ' + self.date + ' -->'
+        res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone)
+        self.assertEqual(ts.timestripper(txt_match), res)
+
+        txt_match = '<!-- --> <!-- ' + self.user_and_date + ' <!-- -->'
+        res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone)
+        self.assertEqual(ts.timestripper(txt_match), res)
+
+        txt_match = '<!-- ' + self.user_and_date + ' -->'
+        res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone)
+        self.assertEqual(ts.timestripper(txt_match), res)
+
+    def test_timestripper_match_only(self):
+        """Test that latest date is used instead of other dates."""
+        ts = TimeStripper(self.get_site())
+
+        later_date = '10:57 06 June 2015 (UTC)'
+        txt_match = '<!-- --> ' + self.user_and_date + ' <!-- -->' + later_date
+        res = datetime.datetime(2015, 6, 6, 10, 57, tzinfo=self.tzone)
+        self.assertEqual(ts.timestripper(txt_match), res)
+
+        earlier_date = '02:57 06 June 2015 (UTC)'
+        txt_match = '<!-- ' + self.user_and_date + ' --> ' + earlier_date
+        res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone)
+        self.assertEqual(ts.timestripper(txt_match), res)
+
+
 if __name__ == '__main__':
     try:
         unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/218436
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I079d9f6b636ac0a145dd04a3190a65c61b9d1b31
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.w...@gmail.com>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com>
Gerrit-Reviewer: XZise <commodorefabia...@gmx.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
Pywikibot-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to