jenkins-bot has submitted this change and it was merged.

Change subject: [FEAT] Category: Get newest pages
......................................................................


[FEAT] Category: Get newest pages

This returns the pages in a category ordered by creation date, from
newest to oldest. To do that, it iterates over the category members in
the order they were added to the category (most recently added first),
caches every page it has checked, and on each iteration checks whether
any cached page was created after the current page was added. If so,
those cached pages are newer than the current page and than any page
still to be checked: the remaining pages were all added to the category
before the current page, so they must also have been created before it
was added.

Change-Id: I9bb3f74bbe2e3319ed2dbcdefab414a3204c2c32
---
M pywikibot/page.py
M tests/category_tests.py
2 files changed, 90 insertions(+), 0 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/page.py b/pywikibot/page.py
index c63af01..61e1d1b 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -2439,6 +2439,74 @@
         """
         return self.site.categoryinfo(self)
 
+    def newest_pages(self, total=None):
+        """
+        Return pages in a category ordered by the creation date.
+
+        If two or more pages are created at the same time, the pages are
+        returned in the order they were added to the category. The most recently
+        added page is returned first.
+
+        Pages can only be returned ordered from newest to oldest, as it is
+        impossible to determine the oldest page in a category without
+        checking all pages. But it is possible to check the category in order
+        with the most recently added first, yielding all cached pages which
+        were created after the currently checked page was added (no page added
+        before the currently checked one can have been created after them).
+
+        @param total: The total number of pages queried.
+        @type total: int
+        @return: A page generator of all pages in a category ordered by the
+            creation date. From newest to oldest. Note: It currently only
+            returns Page instances and not a subclass of it if possible. This
+            might change so don't expect to only get Page instances.
+        @rtype: generator
+        """
+        def check_cache(latest):
+            """Return the cached pages in order and not more than total."""
+            cached = []
+            for timestamp in sorted((ts for ts in cache if ts > latest),
+                                    reverse=True):
+                # The complete list can be removed, it'll either yield all of
+                # them, or only a portion but will skip the rest anyway
+                cached += cache.pop(timestamp)[:None if total is None else
+                                                total - len(cached)]
+                if total and len(cached) >= total:
+                    break  # already got enough
+            assert(total is None or len(cached) <= total)
+            return cached
+
+        # All pages which have been checked but were created before the
+        # current page was added; at some point they will have been created
+        # after the page currently checked was added. The pages are stored by
+        # creation timestamp; several pages may share the same timestamp.
+        cache = defaultdict(list)
+        # TODO: Make site.categorymembers usable here, as it returns pages
+        # There is no total defined, as it's not known how many pages need to be
+        # checked before the total amount of new pages is found. In the worst
+        # case all pages of a category need to be checked.
+        for member in pywikibot.data.api.QueryGenerator(
+                site=self.site, list='categorymembers', cmsort='timestamp',
+                cmdir='older', cmprop='timestamp|title',
+                cmtitle=self.title()):
+            # TODO: Upcast to suitable class
+            page = pywikibot.Page(self.site, member['title'])
+            assert(page.namespace() == member['ns'])
+            cached = check_cache(pywikibot.Timestamp.fromISOformat(
+                member['timestamp']))
+            for cached_page in cached:
+                yield cached_page
+            if total is not None:
+                total -= len(cached)
+                if total <= 0:
+                    break
+            cache[page.oldest_revision.timestamp] += [page]
+        else:
+            # clear cache
+            assert(total is None or total > 0)
+            for cached_page in check_cache(pywikibot.Timestamp.min):
+                yield cached_page
+
 # ### DEPRECATED METHODS ####
     @deprecated("list(Category.subcategories(...))")
     def subcategoriesList(self, recurse=False):
diff --git a/tests/category_tests.py b/tests/category_tests.py
index 8923884..d9d543c 100644
--- a/tests/category_tests.py
+++ b/tests/category_tests.py
@@ -209,6 +209,28 @@
         self.assertEqual(cat.aslink(sortKey='Foo'), '[[Category:Wikipedia categories|Foo]]')
 
 
+class CategoryNewestPages(TestCase):
+
+    """Test newest_pages feature on French Wikinews."""
+
+    family = 'wikinews'
+    code = 'fr'
+
+    cached = True
+
+    def test_newest_pages(self):
+        """Test that the pages are getting older."""
+        cat = pywikibot.Category(self.get_site(), u'Catégorie:Yukon Quest 2015')
+        last = pywikibot.Timestamp.max
+        count = 0
+        for page in cat.newest_pages():
+            creation_stamp = page.oldest_revision.timestamp
+            self.assertLessEqual(creation_stamp, last)
+            last = creation_stamp
+            count += 1
+        self.assertEqual(count, cat.categoryinfo['size'])
+
+
 if __name__ == '__main__':
     try:
         unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/190137
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I9bb3f74bbe2e3319ed2dbcdefab414a3204c2c32
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: XZise <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to