hroest has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/272655

Change subject: [FEATURE] add feature to check snippet in search
......................................................................

[FEATURE] add feature to check snippet in search

- when the new checkSnippet option is enabled, only pages that
  actually contain the search word are returned. This can substantially
  reduce the number of returned pages and thus speed up the bot.

[FEATURE] add option checkSnippet to generator

[TEST] add test

[FIX] add maximum items

[FEATURE] skip after a certain number of misses

Change-Id: I62c91758fa33620215fd85ed583da56623f73b65
---
M pywikibot/pagegenerators.py
M pywikibot/site.py
M tests/site_tests.py
3 files changed, 78 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/55/272655/1

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 7ccf929..e70f369 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -2241,7 +2241,7 @@
 
 @deprecated_args(number='total')
 def SearchPageGenerator(query, step=None, total=None, namespaces=None,
-                        site=None):
+                        site=None, checkSnippet=True, skipAfterNrMisses=-1):
     """
     Yield pages from the MediaWiki internal search engine.
 
@@ -2255,7 +2255,7 @@
     if site is None:
         site = pywikibot.Site()
     for page in site.search(query, step=step, total=total,
-                            namespaces=namespaces):
+                            namespaces=namespaces, checkSnippet=checkSnippet, 
skipAfterNrMisses=skipAfterNrMisses):
         yield page
 
 
diff --git a/pywikibot/site.py b/pywikibot/site.py
index fb92abe..b3a988c 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -4400,11 +4400,15 @@
     @deprecated_args(number='total', key='searchstring',
                      getredirects='get_redirects')
     def search(self, searchstring, namespaces=None, where="text",
-               get_redirects=False, step=None, total=None, content=False):
+               get_redirects=False, step=None, total=None, content=False, 
+               checkSnippet=False, skipAfterNrMisses=-1):
         """Iterate Pages that contain the searchstring.
 
         Note that this may include non-existing Pages if the wiki's database
-        table contains outdated entries.
+        table contains outdated entries. Note that sometimes articles can be
+        returned that do not actually contain the searchterm but a closely
+        related term. This can be prevented by checking that the term is
+        actually present (see checkSnippet parameter).
 
         @param searchstring: the text to search for
         @type searchstring: unicode
@@ -4416,8 +4420,13 @@
             list of namespace identifiers.
         @param get_redirects: if True, include redirects in results. Since
             version MediaWiki 1.23 it will always return redirects.
+        @param total: Maximum number of items to retrieve
         @param content: if True, load the current content of each iterated page
             (default False)
+        @param checkSnippet: if True, only yield pages that contain an exact 
match 
+            (default False)
+        @param skipAfterNrMisses: Stop retrieving items after this many
+            non-matches were retrieved (default -1)
         @raises KeyError: a namespace identifier was not resolved
         @raises TypeError: a namespace identifier has an inappropriate
             type such as NoneType or bool
@@ -4431,13 +4440,65 @@
         if not namespaces:
             pywikibot.warning(u"search: namespaces cannot be empty; using 
[0].")
             namespaces = [0]
-        srgen = self._generator(api.PageGenerator, type_arg="search",
-                                gsrsearch=searchstring, gsrwhat=where,
-                                namespaces=namespaces, step=step,
-                                total=total, g_content=content)
-        if MediaWikiVersion(self.version()) < MediaWikiVersion('1.23'):
-            srgen.request['gsrredirects'] = get_redirects
-        return srgen
+
+        if MediaWikiVersion(self.version()) > MediaWikiVersion('1.11') and 
checkSnippet:
+
+            if len(namespaces) > 1:
+                raise Error("Cannot do more than one namespace and check 
snippets")
+
+            # If we want to get the snippets, we have to use a ListGenerator
+            # and manually yield those pages that contain the correct words.
+            srgen = api.ListGenerator("search", site=self,
+                                      srsearch=searchstring, srwhat=where,
+                                      srnamespace=namespaces[0], step=step,
+                                      g_content=content)
+
+            if total is not None:
+                srgen.set_maximum_items(total)
+
+            if MediaWikiVersion(self.version()) < MediaWikiVersion('1.23'):
+                srgen.request['srredirects'] = get_redirects
+
+            import re
+            wordsearch = re.compile(r'<span 
class="searchmatch">([^<]*)</span>')
+            nr_misses = 0
+            for pagedata in srgen:
+
+                snippet = pagedata["snippet"]
+                match = wordsearch.search(snippet)
+                if match and match.group(1) == searchstring:
+
+                    # Matching page found, create it and yield
+                    p = pywikibot.Page(self, pagedata['title'], pagedata['ns'])
+                    ns = pagedata['ns']
+                    # Upcast to proper Page subclass.
+                    if ns == 6:
+                        p = pywikibot.FilePage(p)
+                    elif ns == 14:
+                        p = pywikibot.Category(p)
+                    # api.update_page(p, pagedata)
+                    yield p
+                else:
+                    nr_misses += 1
+                    if skipAfterNrMisses > 0 and nr_misses > skipAfterNrMisses:
+                        break
+
+        else:
+
+            # Pagegenerator solution (does not check snippets) 
+            srgen = self._generator(api.PageGenerator, type_arg="search",
+                                    gsrsearch=searchstring, gsrwhat=where,
+                                    namespaces=namespaces, step=step,
+                                    total=total, g_content=content)
+
+            if total is not None:
+                srgen.set_maximum_items(total)
+
+            if MediaWikiVersion(self.version()) < MediaWikiVersion('1.23'):
+                srgen.request['gsrredirects'] = get_redirects
+
+            for p in srgen:
+                yield p
 
     def usercontribs(self, user=None, userprefix=None, start=None, end=None,
                      reverse=False, namespaces=None, showMinor=None,
diff --git a/tests/site_tests.py b/tests/site_tests.py
index 989e940..5af9f68 100644
--- a/tests/site_tests.py
+++ b/tests/site_tests.py
@@ -1380,6 +1380,12 @@
                                      get_redirects=True):
                 self.assertIsInstance(hit, pywikibot.Page)
                 self.assertEqual(hit.namespace(), 0)
+
+            se = list(mysite.search("wiki", total=100, checkSnippet=True))
+            self.assertLessEqual(len(se), 100)
+            self.assertTrue(all(isinstance(hit, pywikibot.Page)
+                                for hit in se))
+
         except pywikibot.data.api.APIError as e:
             if e.code == "gsrsearch-error" and "timed out" in e.info:
                 raise unittest.SkipTest("gsrsearch returned timeout on site: 
%r" % e)

-- 
To view, visit https://gerrit.wikimedia.org/r/272655
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I62c91758fa33620215fd85ed583da56623f73b65
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: hroest <hannesro...@gmx.ch>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to