hroest has uploaded a new change for review. https://gerrit.wikimedia.org/r/272655
Change subject: [FEATURE] add feature to check snippet in search ...................................................................... [FEATURE] add feature to check snippet in search - using the new checkSnippet option, only pages are returned that actually contain the search-word. This can substantially reduce the number of returned pages and thus speed up the bot. [FEATURE] add option checkSnippet to generator [TEST] add test [FIX] add maximum items [FEATURE] skip after a certain number of misses Change-Id: I62c91758fa33620215fd85ed583da56623f73b65 --- M pywikibot/pagegenerators.py M pywikibot/site.py M tests/site_tests.py 3 files changed, 78 insertions(+), 11 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/55/272655/1 diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index 7ccf929..e70f369 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -2241,7 +2241,7 @@ @deprecated_args(number='total') def SearchPageGenerator(query, step=None, total=None, namespaces=None, - site=None): + site=None, checkSnippet=True, skipAfterNrMisses=-1): """ Yield pages from the MediaWiki internal search engine. @@ -2255,7 +2255,7 @@ if site is None: site = pywikibot.Site() for page in site.search(query, step=step, total=total, - namespaces=namespaces): + namespaces=namespaces, checkSnippet=checkSnippet, skipAfterNrMisses=skipAfterNrMisses): yield page diff --git a/pywikibot/site.py b/pywikibot/site.py index fb92abe..b3a988c 100644 --- a/pywikibot/site.py +++ b/pywikibot/site.py @@ -4400,11 +4400,15 @@ @deprecated_args(number='total', key='searchstring', getredirects='get_redirects') def search(self, searchstring, namespaces=None, where="text", - get_redirects=False, step=None, total=None, content=False): + get_redirects=False, step=None, total=None, content=False, + checkSnippet=False, skipAfterNrMisses=-1): """Iterate Pages that contain the searchstring. Note that this may include non-existing Pages if the wiki's database - table contains outdated entries. + table contains outdated entries. Note that sometimes articles can be + returned that do not actually contain the searchterm but a closely + related term. This can be prevented by checking that the term is + actually present (see checkSnippet parameter). @param searchstring: the text to search for @type searchstring: unicode @@ -4416,8 +4420,13 @@ list of namespace identifiers. @param get_redirects: if True, include redirects in results. Since version MediaWiki 1.23 it will always return redirects. + @param total: Maximum number of items to retrieve @param content: if True, load the current content of each iterated page (default False) + @param checkSnippet: if True, only yield pages that contain an exact match + (default False) + @param skipAfterNrMisses: Stop retrieving items after this many + non-matches were retrieved (default -1) @raises KeyError: a namespace identifier was not resolved @raises TypeError: a namespace identifier has an inappropriate type such as NoneType or bool @@ -4431,13 +4440,65 @@ if not namespaces: pywikibot.warning(u"search: namespaces cannot be empty; using [0].") namespaces = [0] - srgen = self._generator(api.PageGenerator, type_arg="search", - gsrsearch=searchstring, gsrwhat=where, - namespaces=namespaces, step=step, - total=total, g_content=content) - if MediaWikiVersion(self.version()) < MediaWikiVersion('1.23'): - srgen.request['gsrredirects'] = get_redirects - return srgen + + if MediaWikiVersion(self.version()) > MediaWikiVersion('1.11') and checkSnippet: + + if len(namespaces) > 1: + raise Error("Cannot do more than one namespace and check snippets") + + # If we want to get the snippets, we have to use a ListGenerator + # and manually yield those pages that contain the correct words. + srgen = api.ListGenerator("search", site=self, + srsearch=searchstring, srwhat=where, + srnamespace=namespaces[0], step=step, + g_content=content) + + if total is not None: + srgen.set_maximum_items(total) + + if MediaWikiVersion(self.version()) < MediaWikiVersion('1.23'): + srgen.request['srredirects'] = get_redirects + + import re + wordsearch = re.compile(r'<span class="searchmatch">([^<]*)</span>') + nr_misses = 0 + for pagedata in srgen: + + snippet = pagedata["snippet"] + match = wordsearch.search(snippet) + if match and match.group(1) == searchstring: + + # Matching page found, create it and yield + p = pywikibot.Page(self, pagedata['title'], pagedata['ns']) + ns = pagedata['ns'] + # Upcast to proper Page subclass. + if ns == 6: + p = pywikibot.FilePage(p) + elif ns == 14: + p = pywikibot.Category(p) + # api.update_page(p, pagedata) + yield p + else: + nr_misses += 1 + if skipAfterNrMisses > 0 and nr_misses > skipAfterNrMisses: + break + + else: + + # Pagegenerator solution (does not check snippets) + srgen = self._generator(api.PageGenerator, type_arg="search", + gsrsearch=searchstring, gsrwhat=where, + namespaces=namespaces, step=step, + total=total, g_content=content) + + if total is not None: + srgen.set_maximum_items(total) + + if MediaWikiVersion(self.version()) < MediaWikiVersion('1.23'): + srgen.request['gsrredirects'] = get_redirects + + for p in srgen: + yield p def usercontribs(self, user=None, userprefix=None, start=None, end=None, reverse=False, namespaces=None, showMinor=None, diff --git a/tests/site_tests.py b/tests/site_tests.py index 989e940..5af9f68 100644 --- a/tests/site_tests.py +++ b/tests/site_tests.py @@ -1380,6 +1380,12 @@ get_redirects=True): self.assertIsInstance(hit, pywikibot.Page) self.assertEqual(hit.namespace(), 0) + + se = list(mysite.search("wiki", total=100, checkSnippet=True)) + self.assertLessEqual(len(se), 100) + self.assertTrue(all(isinstance(hit, pywikibot.Page) + for hit in se)) + except pywikibot.data.api.APIError as e: if e.code == "gsrsearch-error" and "timed out" in e.info: raise unittest.SkipTest("gsrsearch returned timeout on site: %r" % e) -- To view, visit https://gerrit.wikimedia.org/r/272655 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I62c91758fa33620215fd85ed583da56623f73b65 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: hroest <hannesro...@gmx.ch> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits