Strainu has submitted this change and it was merged.

Change subject: Port makecat.py to core
......................................................................


Port makecat.py to core

Change-Id: Icb7c690bf3e8625ad4a10e0f3e9f7b523f54c059
---
A scripts/makecat.py
1 file changed, 304 insertions(+), 0 deletions(-)

Approvals:
  Strainu: Verified; Looks good to me, approved



diff --git a/scripts/makecat.py b/scripts/makecat.py
new file mode 100644
index 0000000..e269620
--- /dev/null
+++ b/scripts/makecat.py
@@ -0,0 +1,304 @@
+# -*- coding: UTF-8 -*-
+"""
+This bot takes as its argument (or, if no argument is given, asks for it), the
+name of a new or existing category. It will then try to find new articles for
+this category (pages linked to and from pages already in the category), asking
+the user which pages to include and which not.
+
+Arguments:
+   -nodates  automatically skip all pages that are years or dates (years
+             only work AD, dates only for certain languages)
+   -forward  only check pages linked from pages already in the category,
+             not pages linking to them. Is less precise but quite a bit
+             faster.
+   -exist    only ask about pages that do actually exist; drop any
+             titles of non-existing pages silently. If -forward is chosen,
+             -exist is automatically implied.
+   -keepparent  do not remove parent categories of the category to be
+             worked on.
+   -all      work on all pages (default: only main namespace)
+
+When running the bot, you will get, one by one, a number of pages. You can
+choose:
+Y(es) - include the page
+N(o) - do not include the page or
+I(gnore) - do not include the page, but if you meet it again, ask again.
+X - add the page, but do not check links to and from it
+Other possibilities:
+A(dd) - add another page, which may have been one that was included before
+C(heck) - check links to and from the page, but do not add the page itself
+R(emove) - remove a page that is already in the list
+L(ist) - show current list of pages to include or to check
+"""
+
+# (C) Andre Engels, 2004
+# (C) Pywikipedia bot team 2005-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+
+import codecs
+import pywikibot
+from pywikibot import date, catlib, pagegenerators, i18n
+
+
def rawtoclean(c):
    """Return the 'clean' category page for a 'raw' category link.

    Strips any sort key (the part after '|') from the title and builds
    a fresh Page object on the working site.
    """
    title_without_sortkey = c.title().split('|')[0]
    return pywikibot.Page(mysite, title_without_sortkey)
+
+
def isdate(s):
    """Return True iff s parses as a date or a year.

    Uses pywikibot's date-format auto-detection for the language of the
    current site.
    """
    # The first result was originally bound to a name that shadowed the
    # builtin 'dict'; renamed to 'fmt'.  Only its presence matters here.
    fmt, val = date.getAutoFormat(pywikibot.getSite().language(), s)
    return fmt is not None
+
+
def needcheck(pl):
    """Decide whether page pl still needs to be offered to the user.

    A page is skipped when work is restricted to the main namespace and
    the page lives elsewhere, when it was already handled, or when date
    pages are being skipped and its title parses as a date/year.
    """
    if main and pl.namespace() != 0:
        return False
    if pl in checked:
        return False
    if skipdates and isdate(pl.title()):
        return False
    return True
+
+
def include(pl, checklinks=True, realinclude=True, linkterm=None):
    """Add page pl to the working category and queue its links.

    @param checklinks: also queue pages linked to/from pl for checking
        (forced on when pl turns out to be a redirect).
    @param realinclude: actually edit the page; when False only the
        link check is performed.
    @param linkterm: optional sort key to use in the category link.
    """
    cl = checklinks
    if linkterm:
        actualworkingcat = catlib.Category(mysite, workingcat.title(),
                                           sortKey=linkterm)
    else:
        actualworkingcat = workingcat
    if realinclude:
        try:
            text = pl.get()
        except pywikibot.NoPage:
            pass
        except pywikibot.IsRedirectPage:
            # Redirects themselves are not categorized, but their links
            # are still followed.
            cl = True
        else:
            # Originally this list was built twice; once is enough.
            cats = [x for x in pl.categories()]
            if workingcat not in cats:
                for c in cats:
                    if c in parentcats:
                        if removeparent:
                            catlib.change_category(pl, c, actualworkingcat)
                            break
                else:
                    # Loop finished without replacing a parent category:
                    # append the working category instead.
                    pl.put(pywikibot.replaceCategoryLinks(
                        text, cats + [actualworkingcat]))
    if cl:
        if checkforward:
            for page2 in pl.linkedPages():
                if needcheck(page2):
                    tocheck.append(page2)
                    checked[page2] = page2
        if checkbackward:
            for refPage in pl.getReferences():
                if needcheck(refPage):
                    tocheck.append(refPage)
                    checked[refPage] = refPage
+
+
def exclude(pl, real_exclude=True):
    """Record that pl must not be added to the category.

    When real_exclude is true the title is appended to the exclusion
    file so that future runs skip it as well.
    """
    if not real_exclude:
        return
    excludefile.write('%s\n' % pl.title())
+
+
def asktoadd(pl):
    """Interactively ask whether page pl belongs in the category.

    Pages on another site are ignored; redirects are resolved and their
    target queued instead.  Otherwise the page title is shown and
    one-letter commands are read in a loop until a decision is made.
    """
    if pl.site != mysite:
        return
    if pl.isRedirectPage():
        target = pl.getRedirectTarget()
        if needcheck(target):
            tocheck.append(target)
            checked[target] = target
        return
    shown = 500  # number of characters displayed by the 't' command
    pywikibot.output(u'')
    pywikibot.output(u"==%s==" % pl.title())
    while True:
        answer = raw_input("y(es)/n(o)/i(gnore)/(o)ther options? ")
        if answer == 'y':
            include(pl)
            break
        elif answer == 'c':
            include(pl, realinclude=False)
            break
        elif answer == 'z':
            if pl.exists() and not pl.isRedirectPage():
                linkterm = pywikibot.input(
                    u"In what manner should it be alphabetized?")
                include(pl, linkterm=linkterm)
            else:
                include(pl)
            break
        elif answer == 'n':
            exclude(pl)
            break
        elif answer == 'i':
            exclude(pl, real_exclude=False)
            break
        elif answer == 'o':
            # Show the less common commands.
            pywikibot.output(u"t: Give the beginning of the text of the page")
            pywikibot.output(
                u"z: Add under another title (as [[Category|Title]])")
            pywikibot.output(
                u"x: Add the page, but do not check links to and from it")
            pywikibot.output(u"c: Do not add the page, but do check links")
            pywikibot.output(u"a: Add another page")
            pywikibot.output(u"l: Give a list of the pages to check")
        elif answer == 'a':
            pagetitle = raw_input("Specify page to add:")
            page = pywikibot.Page(pywikibot.getSite(), pagetitle)
            if page not in checked:
                include(page)
        elif answer == 'x':
            if not pl.exists():
                pywikibot.output(u"Page does not exist; not added.")
                exclude(pl, real_exclude=False)
            elif pl.isRedirectPage():
                pywikibot.output(
                    u"Redirect page. Will be included normally.")
                include(pl, realinclude=False)
            else:
                include(pl, checklinks=False)
            break
        elif answer == 'l':
            pywikibot.output(u"Number of pages still to check: %s"
                             % len(tocheck))
            pywikibot.output(u"Pages to be checked:")
            pywikibot.output(u" - ".join(page.title() for page in tocheck))
            pywikibot.output(u"==%s==" % pl.title())
        elif answer == 't':
            pywikibot.output(u"==%s==" % pl.title())
            try:
                pywikibot.output(u'' + pl.get(get_redirect=True)[0:shown])
            except pywikibot.NoPage:
                pywikibot.output(u"Page does not exist.")
            shown += 500
        else:
            pywikibot.output(u"Not understood.")
+
try:
    # Global state shared with the helper functions above.
    checked = {}            # pages already seen (page -> page)
    skipdates = False       # -nodates: skip year/date pages
    checkforward = True     # follow links from included pages
    checkbackward = True    # follow links to included pages (-forward: off)
    checkbroken = True      # also offer non-existing pages (-exist: off)
    removeparent = True     # replace parent categories (-keepparent: off)
    main = True             # restrict to the main namespace (-all: off)
    workingcatname = []
    tocheck = []            # queue of pages still to offer to the user
    for arg in pywikibot.handleArgs():
        if arg.startswith('-nodate'):
            skipdates = True
        elif arg.startswith('-forward'):
            checkbackward = False
            checkbroken = False
        elif arg.startswith('-exist'):
            checkbroken = False
        elif arg.startswith('-keepparent'):
            removeparent = False
        elif arg.startswith('-all'):
            main = False
        else:
            workingcatname.append(arg)

    if len(workingcatname) == 0:
        workingcatname = raw_input("Which page to start with? ")
    else:
        workingcatname = ' '.join(workingcatname)
    mysite = pywikibot.getSite()
    workingcatname = unicode(workingcatname, 'utf-8')
    pywikibot.setAction(i18n.twtranslate(mysite, 'makecat-create') + u' ' +
                        workingcatname)
    workingcat = catlib.Category(mysite,
                                 u'%s:%s'
                                 % (mysite.category_namespace(),
                                    workingcatname))
    filename = pywikibot.config.datafilepath(
        'category',
        workingcatname.encode('ascii', 'xmlcharrefreplace') + '_exclude.txt')
    try:
        f = codecs.open(filename, 'r', encoding=mysite.encoding())
        for line in f.readlines():
            # remove trailing newlines and carriage returns
            try:
                while line[-1] in ['\n', '\r']:
                    line = line[:-1]
            except IndexError:
                pass
            exclude(line, real_exclude=False)
            pl = pywikibot.Page(mysite, line)
            checked[pl] = pl
        f.close()
        excludefile = codecs.open(filename, 'a', encoding=mysite.encoding())
    except IOError:
        # File does not exist
        excludefile = codecs.open(filename, 'w', encoding=mysite.encoding())
    try:
        parentcats = workingcat.categories()
    except pywikibot.Error:
        parentcats = []
    # Do not include articles already in subcats; only checking direct subcats
    subcatlist = list(workingcat.subcategories())
    if subcatlist:
        subcatlist = pagegenerators.PreloadingGenerator(subcatlist)
        for cat in subcatlist:
            artlist = list(cat.articles())
            for page in artlist:
                exclude(page.title(), real_exclude=False)
                checked[page] = page
    # NOTE: this variable was originally named 'list', shadowing the builtin.
    articles = [x for x in workingcat.articles()]
    if articles:
        for pl in articles:
            checked[pl] = pl
        articles = pagegenerators.PreloadingGenerator(articles)
        for pl in articles:
            include(pl)
    else:
        pywikibot.output(
            u"Category %s does not exist or is empty. Which page to start with?"
            % workingcatname)
        answer = pywikibot.input(u"(Default is [[%s]]):" % workingcatname)
        if not answer:
            answer = workingcatname
        pywikibot.output(u'' + answer)
        pl = pywikibot.Page(mysite, answer)
        tocheck = []
        checked[pl] = pl
        include(pl)
    # Work through the queue, preloading up to 50 pages at a time.
    loaded = 0
    while tocheck:
        if loaded == 0:
            if len(tocheck) < 50:
                loaded = len(tocheck)
            else:
                loaded = 50
            # BUGFIX: keep the unpreloaded tail of the queue; the original
            # code dropped tocheck[loaded:], silently losing queued pages.
            tocheck = ([x for x in
                        pagegenerators.PreloadingGenerator(tocheck[:loaded])]
                       + tocheck[loaded:])
        # With -exist/-forward, silently drop titles of non-existing pages.
        if checkbroken or tocheck[0].exists():
            asktoadd(tocheck[0])
        tocheck = tocheck[1:]
        loaded -= 1

finally:
    pywikibot.stopme()
    try:
        excludefile.close()
    except Exception:
        # excludefile may never have been opened if setup failed early.
        pass

-- 
To view, visit https://gerrit.wikimedia.org/r/102837
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Icb7c690bf3e8625ad4a10e0f3e9f7b523f54c059
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: M4tx <m...@m4tx.pl>
Gerrit-Reviewer: Ladsgroup <ladsgr...@gmail.com>
Gerrit-Reviewer: M4tx <m...@m4tx.pl>
Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl>
Gerrit-Reviewer: Strainu <w...@strainu.ro>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to