jenkins-bot has submitted this change and it was merged. Change subject: Don't ignore "DoNotArchiveUntil" timestamps ......................................................................
Don't ignore "DoNotArchiveUntil" timestamps Don't ignore timestamps written in HTML comments, e.g. as used by "DoNotArchiveUntil". See: - https://commons.wikimedia.org/wiki/Template:DNAU - https://en.wikipedia.org/wiki/Template:Do_not_archive_until Analyze comments separately from the rest of each line to avoid skipping dates in comments, as the date matched by timestripper is the rightmost one. Bug: T102423 Change-Id: I079d9f6b636ac0a145dd04a3190a65c61b9d1b31 --- M pywikibot/textlib.py M tests/timestripper_tests.py 2 files changed, 72 insertions(+), 3 deletions(-) Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 2fae512..de627c2 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -1609,6 +1609,10 @@ ] self.linkP = compileLinkR() + self.comment_pattern = re.compile(r'<!--(.*?)-->') + + self.tzinfo = tzoneFixedOffset(self.site.siteinfo['timeoffset'], + self.site.siteinfo['timezone']) def findmarker(self, text, base=u'@@', delta='@'): """Find a string which is not part of text.""" @@ -1661,6 +1665,17 @@ """ # match date fields dateDict = dict() + # Analyze comments separately from rest of each line to avoid to skip + # dates in comments, as the date matched by timestripper is the + # rightmost one. + most_recent = [] + for comment in self.comment_pattern.finditer(line): + # Recursion levels can be maximum two. If a comment is found, it will + # not for sure be found in the next level. + # Nested cmments are excluded by design. + timestamp = self.timestripper(comment.group(1)) + most_recent.append(timestamp) + # Remove parts that are not supposed to contain the timestamp, in order # to reduce false positives. 
line = removeDisabledParts(line) @@ -1696,12 +1711,17 @@ % (v, k)) # find timezone - dateDict['tzinfo'] = tzoneFixedOffset(self.site.siteinfo['timeoffset'], - self.site.siteinfo['timezone']) + dateDict['tzinfo'] = self.tzinfo timestamp = datetime.datetime(**dateDict) - else: timestamp = None + most_recent.append(timestamp) + + try: + timestamp = max(ts for ts in most_recent if ts is not None) + except ValueError: + timestamp = None + return timestamp diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py index 80d70e0..a3f7e77 100644 --- a/tests/timestripper_tests.py +++ b/tests/timestripper_tests.py @@ -231,6 +231,55 @@ self.assertEqual(self.ts.timestripper(txtNoMatch), None) +class TestTimeStripperDoNotArchiveUntil(TestCase): + + """Test cases for Do Not Archive Until templates. + + See https://commons.wikimedia.org/wiki/Template:DNAU and + https://en.wikipedia.org/wiki/Template:Do_not_archive_until. + """ + + family = 'wikisource' + code = 'en' + + cached = True + + username = '[[User:DoNotArchiveUntil]]' + date = '06:57 06 June 2015 (UTC)' + user_and_date = username + ' ' + date + tzone = tzoneFixedOffset(0, 'UTC') + + def test_timestripper_match(self): + """Test that dates in comments are correctly recognised.""" + ts = TimeStripper(self.get_site()) + + txt_match = '<!-- [[User:Do___ArchiveUntil]] ' + self.date + ' -->' + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + txt_match = '<!-- --> <!-- ' + self.user_and_date + ' <!-- -->' + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + txt_match = '<!-- ' + self.user_and_date + ' -->' + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + def test_timestripper_match_only(self): + """Test that latest date is used instead of other dates.""" + ts = TimeStripper(self.get_site()) + + later_date = '10:57 
06 June 2015 (UTC)' + txt_match = '<!-- --> ' + self.user_and_date + ' <!-- -->' + later_date + res = datetime.datetime(2015, 6, 6, 10, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + earlier_date = '02:57 06 June 2015 (UTC)' + txt_match = '<!-- ' + self.user_and_date + ' --> ' + earlier_date + res = datetime.datetime(2015, 6, 6, 6, 57, tzinfo=self.tzone) + self.assertEqual(ts.timestripper(txt_match), res) + + if __name__ == '__main__': try: unittest.main() -- To view, visit https://gerrit.wikimedia.org/r/218436 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I079d9f6b636ac0a145dd04a3190a65c61b9d1b31 Gerrit-PatchSet: 6 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Mpaa <mpaa.w...@gmail.com> Gerrit-Reviewer: John Vandenberg <jay...@gmail.com> Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com> Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl> Gerrit-Reviewer: Mpaa <mpaa.w...@gmail.com> Gerrit-Reviewer: XZise <commodorefabia...@gmx.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ Pywikibot-commits mailing list Pywikibot-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits