Prianka has uploaded a new change for review. https://gerrit.wikimedia.org/r/194824
Change subject: Port and re-package copyright*.py ...................................................................... Port and re-package copyright*.py Bug:T66848 Change-Id: Ia0c3a9fe6a2c3be3cdbad517ac9dbf3249c197ab --- A copyright/exclusion_list.txt A copyright/site_protected_list.txt M pywikibot/config2.py A scripts/copyright/__init__.py A scripts/copyright/copyright.py A scripts/copyright/copyright_clean.py A scripts/copyright/copyright_put.py 7 files changed, 1,933 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/24/194824/1 diff --git a/copyright/exclusion_list.txt b/copyright/exclusion_list.txt new file mode 100644 index 0000000..0f92ced --- /dev/null +++ b/copyright/exclusion_list.txt @@ -0,0 +1,37 @@ +# +# +# You can place here part of URLs that are to reject by +# exclusion system. Please note, there is an online and +# active version of this file, automaticaly loaded by +# copyright.py. +# +# You could also consider to report SPAM to: +# +# http://www.google.com/contact/spamreport.html +# http://add.yahoo.com/fast/help/us/ysearch/cgi_reportsearchspam +# +# Please inform about mirrors and forks of Wikipedia to: +# +# http://en.wikipedia.org/wiki/Wikipedia:Mirrors_and_forks +# +# or equivalent page in others Wikipedia according to language +# used by the clone. +# + +# +# This let ignore URLs that containing 'wikipedia', 'wikibooks', +# 'wikiquote'... + +wikipedia +wikibooks +wikiquote +wikisource +wikimedia +wikinews +wiktionary +wikiversity + +# +# Uncomment here if you want to exclude URL containing '.ebay.' +#.ebay. + diff --git a/copyright/site_protected_list.txt b/copyright/site_protected_list.txt new file mode 100644 index 0000000..2193037 --- /dev/null +++ b/copyright/site_protected_list.txt @@ -0,0 +1,7 @@ +britannica.com # On-line encyclopedia (publisher: Encyclopædia Britannica) +encarta.msn.com # On-line encyclopedia (publisher: Microsoft) +pbmstoria.it # On-line encycloedia (publisher: Bruno Mondadori) +sapere.it # On-line encyclopedia (publisher: De Agostini) +treccani.it # On-line encyclopedia (publisher: Istituto della Enciclopedia italiana) + +cronologia.it # Amateur site (author: Franco Gonzato) diff --git a/pywikibot/config2.py b/pywikibot/config2.py index 0d9d670..fda1f68 100644 --- a/pywikibot/config2.py +++ b/pywikibot/config2.py @@ -569,6 +569,7 @@ # ############# SEARCH ENGINE SETTINGS ############## +google_key = '' # Some scripts allow using the Yahoo! Search Web Services. To use this feature, # you must install the pYsearch module from http://pysearch.sourceforge.net # and get a Yahoo AppID from https://developer.yahoo.com/ diff --git a/scripts/copyright/__init__.py b/scripts/copyright/__init__.py new file mode 100644 index 0000000..573b320 --- /dev/null +++ b/scripts/copyright/__init__.py @@ -0,0 +1,2 @@ +# THIS DIRECTORY IS TO HOLD BOT SCRIPTS FOR THE NEW FRAMEWORK +"""Copyright tool scripts to handle API.""" diff --git a/scripts/copyright/copyright.py b/scripts/copyright/copyright.py new file mode 100644 index 0000000..f105676 --- /dev/null +++ b/scripts/copyright/copyright.py @@ -0,0 +1,1271 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +This robot checks copyright text in Google, Yahoo! and Live Search. + +Google search requires to install the pyGoogle module from +http://pygoogle.sf.net and get a Google API license key from +http://code.google.com/apis/soapsearch/ (but since December 2006 Google is +no longer issuing new SOAP API keys). + +Yahoo! 
search requires pYsearch module from http://pysearch.sourceforge.net +and a Yahoo AppID from http://developer.yahoo.com. + +Windows Live Search requires to get an AppID from +http://search.msn.com/developer +and to download/install the SOAPpy module from http://pywebsvcs.sf.net or using +SVN with the following command: + +svn co http://pywebsvcs.svn.sourceforge.net/svnroot/pywebsvcs/trunk/SOAPpy/SOAPpy SOAPpy + +Unlike SOAPpy version 0.12, current SVN version has no problem with Python 2.5. + + +You can run the bot with the following commandline parameters: + +-g - Use Google search engine +-ng - Do not use Google +-y - Use Yahoo! search engine +-ny - Do not use Yahoo! +-l - Use Windows Live Search engine +-nl - Do not use Windows Live Search +-maxquery - Stop after a specified number of queries for page (default: 25) +-skipquery - Skip a number specified of queries +-output - Append results to a specified file (default: + 'copyright/output.txt') + +-text:input_text - Work on a specified text + +-file - Work on all pages given in a local text file. + Will read any [[wiki link]] and use these articles. + Argument can also be given as "-file:filename". +-new - Work on the 60 newest pages. If given as -new:x, will work + on the x newest pages. +-cat - Work on all pages which are in a specific category. + Argument can also be given as "-cat:categoryname". +-subcat - When the pages to work on have been chosen by -cat, pages in + subcategories of the selected category are also included. + When -cat has not been selected, this has no effect. +-page - Only check a specific page. + Argument can also be given as "-page:pagetitle". You can give + this parameter multiple times to check multiple pages. +-ref - Work on all pages that link to a certain page. + Argument can also be given as "-ref:referredpagetitle". +-filelinks - Works on all pages that link to a certain image. + Argument can also be given as "-filelinks:ImageName". +-links - Work on all pages that are linked to from a certain page. + Argument can also be given as "-links:linkingpagetitle". +-start - Work on all pages in the wiki, starting at a given page. +-namespace:n - Number or name of namespace to process. The parameter can be used + multiple times. + +Examples: + +If you want to check first 50 new articles then use this command: + + python copyright.py -new:50 + +If you want to check a category with no limit for number of queries to +request, use this: + + python copyright.py -cat:"Wikipedia featured articles" -maxquery:0 + +You can include also the text to examine directly on the command line: + + python copyright.py -text:" + ...text... + " +""" + +# +# (c) Francesco Cosoleto, 2006 +# (c) Pywikibot team 2006-2015 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' +# + +import re +import codecs +import os +import time +import urllib +import urllib2 + +import pywikibot +from pywikibot import pagegenerators +from pywikibot import config + + +# Search keywords added to all the queries. +no_result_with_those_words = '-Wikipedia' + +# Performing a search engine query if string length is greater than the given +# value. +min_query_string_len = 120 + +# Split the text into strings of a specified number of words. +number_of_words = 22 + +# Try to skip quoted text. +exclude_quote = True + +# Enable DOTALL regular expression flag in remove_wikicode() function. 
+remove_wikicode_dotall = True + +# If ratio between query length and number of commas is greater or equal +# to 'comma_ratio' then the script identify a comma separated list and +# don't send data to search engine. +comma_ratio = 5 + +# No checks if the page is a disambiguation page. +skip_disambig = True + +# Parameter used in Live Search query. +# (http://msdn2.microsoft.com/en-us/library/bb266177.aspx) +region_code = 'en-US' + +enable_color = True + +warn_color = 'lightyellow' +error_color = 'lightred' + +appdir = "copyright" +output_file = pywikibot.config.datafilepath(appdir, "output.txt") + +pages_for_exclusion_database = [ + ('it', 'Wikipedia:Sospette violazioni di copyright/Lista di esclusione', + 'exclusion_list.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Abc', 'Abc.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Def', 'Def.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Ghi', 'Ghi.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Jkl', 'Jkl.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Mno', 'Mno.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Pqr', 'Pqr.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Stu', 'Stu.txt'), + ('en', 'Wikipedia:Mirrors_and_forks/Vwxyz', 'Vwxyz.txt'), + ('es', 'Wikipedia:Espejos de Wikipedia/Espejos_que_cumplen_la_GFDL_y_CC-BY-SA', + 'Espejos.txt'), + ('it', 'Wikipedia:Cloni', 'Cloni.txt'), +] + +reports_cat = { + 'it': u'Segnalazioni automatiche sospetti problemi di copyright', + 'es': u'Wikipedia:Páginas para revisar en busca de posible violación de copyright', +} + +wikipedia_names = { + '--': u'Wikipedia', + 'am': u'ááªááµá«', + 'an': u'Biquipedia', + 'ang': u'WicipÇ£dia', + 'ar': u'ÙÙÙÙبÙدÙا', + 'arc': u'ÜÜÜÜܦÜÜÜ', + 'ast': u'Uiquipedia', + 'az': u'Vikipediya', + 'bat-smg': u'VikipedÄjÄ', + 'be': u'ÐÑкÑпÑдÑÑ', + 'be-x-old': u'ÐÑкÑпÑдÑÑ', + 'bg': u'УикипедиÑ', + 'bn': u'à¦à¦à¦à¦¿à¦ªà¦¿à¦¡à¦¿à¦¯à¦¼à¦¾', + 'bpy': u'à¦à¦à¦à¦¿à¦ªà¦¿à¦¡à¦¿à¦¯à¦¼à¦¾', + 'ca': u'Viquipèdia', + 'ceb': u'Wikipedya', + 'chr': u'á«á©ááá¯', + 'cr': u'ááá±áá', + 'cs': u'Wikipedie', + 'csb': u'Wikipedijô', + 'cu': u'ÐикипедÑï', + 'cv': u'Ðикипеди', + 'cy': u'Wicipedia', + 'diq': u'Wikipediya', + 'dv': u'ÞÞ¨ÞÞ¨ÞÞ©ÞÞ¨ÞÞ§', + 'el': u'ÎικιÏαίδεια', + 'eo': u'Vikipedio', + 'et': u'Vikipeedia', + 'fa': u'ÙÛÚ©ÛâپدÛا', + 'fiu-vro': u'Vikipeediä', + 'fr': u'Wikipédia', + 'frp': u'Vuiquipèdia', + 'fur': u'Vichipedie', + 'fy': u'Wikipedy', + 'ga': u'Vicipéid', + 'gu': u'વિàªàª¿àªªà«àª¡àª¿àª¯àª¾', + 'he': u'××ק×פ×××', + 'hi': u'विà¤à¤¿à¤ªà¥à¤¡à¤¿à¤¯à¤¾', + 'hr': u'Wikipedija', + 'hsb': u'Wikipedija', + 'hu': u'Wikipédia', + 'hy': u'ÕÕ«ÖÕ«ÖÕ¥Õ¤Õ«Õ¡', + 'io': u'Wikipedio', + 'iu': u'á ááá±áá/oikipitia', + 'ja': u'ã¦ã£ãããã£ã¢', + 'jbo': u'uikipedias', + 'ka': u'ááááááááá', + 'kk': u'УикипедиÑ', + 'kn': u'ವಿà²à²¿à²ªà³à²¡à²¿à²¯', + 'ko': u'ìí¤ë°±ê³¼', + 'ksh': u'Wikkipedija', + 'la': u'Vicipaedia', + 'lad': u'××ק×פ××××', + 'lt': u'Vikipedija', + 'lv': u'VikipÄdija', + 'mk': u'ÐикипедиÑа', + 'ml': u'വിà´àµà´à´¿à´ªàµà´¡à´¿à´¯', + 'mo': u'ÐикипедиÑ', + 'mr': u'विà¤à¤¿à¤ªà¤¿à¤¡à¥à¤¯à¤¾', + 'mt': u'Wikipedija', + 'nah': u'Huiquipedia', + 'ne': u'विà¤à¤¿à¤ªà¥à¤¡à¤¿à¤¯à¤¾', + 'nrm': u'Viqùipédie', + 'oc': u'Wikipèdia', + 'os': u'Ðикипеди', + 'pa': u'ਵਿà¨à¨¿à¨ªà©à¨¡à¨¿à¨', + 'pt': u'Wikipédia', + 'qu': u'Wikipidiya', + 'rmy': u'Vikipidiya', + 'ru': u'ÐикипедиÑ', + 'sco': u'Wikipaedia', + 'si': u'à·à·à¶à·à¶´à·à¶©à·à¶ºà·', + 'sk': u'Wikipédia', + 'sl': u'Wikipedija', + 'sr': u'ÐикипедиÑа', + 'su': u'Wikipédia', + 'ta': u'விà®à¯à®à®¿à®ªà¯à®à®¿à®¯à®¾', + 'tg': u'Ðикипедиа', + 'th': u'วิà¸à¸´à¸à¸µà¹à¸à¸µà¸¢', + 'tr': 
u'Vikipedi', + 'uk': u'ÐÑкÑпедÑÑ', + 'uz': u'Vikipediya', + 'yi': u'â«×°×ק×פע×××¢', + 'zh': u'ç»´åºç¾ç§', + 'zh-classical': u'ç¶åºå¤§å ¸', + 'zh-yue': u'ç¶åºç¾ç§', +} + +editsection_names = { + 'ar': u'\[عدÙ\]', + 'en': u'\[edit\]', + 'fa': u'\[ÙÛراÛØ´\]', + 'fr': u'\[modifier\]', + 'de': u'\[Bearbeiten\]', + 'es,pt': u'\[editar\]', + 'it': u'\[modifica\]', + 'is': u'\[breyti\]', + 'ja': u'\[ç·¨é\]', + 'zh': u'\[ç¼è¾\]', +} + +sections_to_skip = { + 'ar': [u'٠راجع', u'Ùراءة أخرÙ', u'Ù ÙاØظات', u'ÙصÙات خارجÙØ©'], + 'en': [u'References', u'Further reading', u'Citations', u'External links'], + 'fa': [u'Ù Ùابع', u'Ù Ùابع Ø¨Ø±Ø§Û Ù Ø·Ø§Ùع٠بÛشتر', u'ÛادکردÙا', + u'Ù¾ÛÙÙد ب٠بÛرÙÙ'], + 'es': [u'Referencias', u'Ver también', u'BibliografÃa', u'Enlaces externos', + u'Notas'], + 'fr': [u'Liens externes'], + 'it': [u'Bibliografia', u'Discografia', u'Opere bibliografiche', + u'Riferimenti bibliografici', u'Collegamenti esterni', + u'Pubblicazioni', u'Pubblicazioni principali', + u'Bibliografia parziale'], + 'is': [u'Heimildir', u'Tenglar', u'Tengt efni'], + 'ja': [u'è注', u'è注æ¬', u'è注ã»åºå ¸', u'åºå ¸', u'注é'], + 'zh': [u'åèæç»', u'åèæç®', u'åèè³æ', u'åèèµæ', u'è³æä¾æº', u'èµææ¥æº', + u'åè¦', u'åè§', u'åé±', u'åé '], +} + +if enable_color: + warn_color = '\03{%s}' % warn_color + error_color = '\03{%s}' % error_color + default_color = '\03{default}' +else: + warn_color = error_color = default_color = '' + +site = pywikibot.Site() + + +def _output(text, prefix=None, color=''): + if prefix: + pywikibot.output('%s%s: %s%s' % (color, prefix, default_color, text)) + else: + pywikibot.output('%s%s' % (color, text)) + + +def warn(text, prefix=None): + _output(text, prefix=prefix, color=warn_color) + + +def error(text, prefix=None): + _output(text, prefix=prefix, color=error_color) + + +def skip_section(text): + sect_titles = '|'.join(sections_to_skip[pywikibot.Site().lang]) + sectC = re.compile('(?mi)^==\s*(' + sect_titles + ')\s*==') + while True: + newtext = cut_section(text, sectC) + if newtext == text: + break + text = newtext + return text + + +def cut_section(text, sectC): + sectendC = re.compile('(?m)^==[^=]') + start = sectC.search(text) + if start: + end = sectendC.search(text, start.end()) + if end: + return text[:start.start()] + text[end.start():] + else: + return text[:start.start()] + return text + + +class URLExclusion: + def __init__(self): + self.URLlist = set() + self.scan() + + def pages_list(self): + for i in pages_for_exclusion_database: + path = pywikibot.config.datafilepath(appdir, i[0], i[2]) + pywikibot.config.makepath(path) + page = pywikibot.Page(pywikibot.Site(i[0]), i[1]) + yield page, path + + def download(self, force_update=False): + for page, path in self.pages_list(): + download = force_update + try: + if not os.path.exists(path): + pywikibot.output('Creating file \'%s\' (%s)' + % (pywikibot.config.shortpath(path), + page.title(asLink=True))) + download = True + else: + file_age = time.time() - os.path.getmtime(path) + if download or file_age > 24 * 60 * 60: + pywikibot.output('Updating file \'%s\' (%s)' + % (pywikibot.config.shortpath(path), + page.title(asLink=True))) + download = True + except OSError: + raise + + if download: + data = None + try: + data = page.get() + except KeyboardInterrupt: + raise + except pywikibot.IsRedirectPage: + data = page.getRedirectTarget().get() + except: + error('Getting page failed') + + if data: + f = codecs.open(path, 'w', 'utf-8') + f.write(data) + f.close() + + def update(self): + self.download(force_update=True) + self.scan() + + def 
check(self, url, verbose=False): + for entry in self.URLlist: + if entry in url: + if verbose > 1: + warn('URL Excluded: %s\nReason: %s' % (url, entry)) + elif verbose: + warn('URL Excluded: %s' % url) + return True + return False + + def scan(self): + prelist = [] + result_list = [] + self.download() + + for page, path in self.pages_list(): + if 'exclusion_list.txt' in path: + result_list += re.sub("</?pre>", "", + read_file(path, + cut_comment=True, + cut_newlines=True) + ).splitlines() + else: + data = read_file(path) + # wikipedia:en:Wikipedia:Mirrors and forks + prelist += re.findall("(?i)url\s*=\s*<nowiki>(?:http://)?(.*)</nowiki>", + data) + prelist += re.findall("(?i)\*\s*Site:\s*\[?(?:http://)?(.*)\]?", + data) + # wikipedia:it:Wikipedia:Cloni + if 'it/Cloni.txt' in path: + prelist += re.findall('(?mi)^==(?!=)\s*\[?\s*(?:<nowiki>)?\s*(?:http://)?(.*?)(?:</nowiki>)?\s*\]?\s*==', + data) + list1 = [] + for entry in prelist: + list1 += entry.split(", ") + list2 = [] + for entry in list1: + list2 += entry.split("and ") + for entry in list2: + # Remove unnecessary part of URL + entry = re.sub("(http://|www\.)", "", entry) + entry = re.sub("</?nowiki>", "", entry) + if entry: + if '/' in entry: + entry = entry[:entry.rfind('/')] + + entry = re.sub("\s.*", "", entry) + + if len(entry) > 4: + result_list.append(entry) + + result_list += read_file( + pywikibot.config.datafilepath(appdir, 'exclusion_list.txt'), + cut_comment=True, cut_newlines=True).splitlines() + + for item in result_list: + cleaned = item.strip() + if cleaned: + self.URLlist.add(cleaned) + + def sanity_check(self): + pywikibot.output("Exclusion list sanity check...") + for entry in self.URLlist: + if ('.' not in entry and '/' not in entry) or len(entry) < 5: + pywikibot.output("** " + entry) + + def dump(self): + f = open(pywikibot.config.datafilepath(appdir, 'exclusion_list.dump'), + 'w') + f.write('\n'.join(self.URLlist)) + f.close() + pywikibot.output("Exclusion list dump saved.") + + +def read_file(filename, cut_comment=False, cut_newlines=False): + text = u"" + f = codecs.open(filename, 'r', 'utf-8') + text = f.read() + f.close() + if cut_comment: + text = re.sub(" ?#.*", "", text) + if cut_newlines: + text = re.sub("(?m)^\r?\n", "", text) + return text + + +def write_log(text, filename=output_file): + f = codecs.open(filename, 'a', 'utf-8') + f.write(text) + f.close() + +# +# Ignore text that contents comma separated list, only numbers, +# punctuation... + + +def economize_query(text): + # Comma separated list + c = text.count(', ') + if c > 4: + l = len(text) + r = 100 * float(c) / l + if r >= comma_ratio: + return True + + # Numbers + if re.search('[^0-9\'*/,. +?:;-]{5}', text): + return False + return True + +# +# Set regex used in remove_wikicode() to remove [[Image:]] tags +# and regex used in check_in_source() to reject pages with +# 'Wikipedia'. 
+ + +def join_family_data(reString, namespace): + for s in site.namespaces[namespace]: + if type(s) == list: + for e in s: + reString += '|' + e + else: + reString += '|' + s + return '\s*(' + reString + ')\s*' + +reImageC = re.compile('\[\[' + join_family_data('Image', 6) + ':.*?\]\]', re.I) +reWikipediaC = re.compile('(' + '|'.join(wikipedia_names.values()) + ')', re.I) +reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')') + + +def remove_wikicode(text, re_dotall=False, remove_quote=exclude_quote, + debug=False): + if not text: + return "" + + if debug: + write_log(text + '\n', "copyright/wikicode.txt") + + text = re.sub('(?i)</?(p|u|i|b|em|div|span|font|small|big|code|tt).*?>', + '', text) + text = re.sub('(?i)<(/\s*)?br(\s*/)?>', '', text) + text = re.sub('<!--.*?-->', '', text) + + text = text.replace('<', '<') + text = text.replace('>', '>') + + # remove URL + text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text) + + # remove Image tags + text = reImageC.sub("", text) + + # replace piped wikilink + text = re.sub("\[\[[^\]]*?\|(.*?)\]\]", "\\1", text) + + # remove unicode and polytonic template + text = re.sub("(?i){{(unicode|polytonic)\|(.*?)}}", "\\1", text) + + if re_dotall: + flags = "(?xsim)" + # exclude wikitable + text = re.sub('(?s){\|.*?^\|}', '', text) + else: + flags = "(?xim)" + + text = re.sub(""" + %s + ( + <ref[^>]*?\s*/\s*> | # exclude <ref name = '' / > tags + <ref.*?>.*?</ref> | # exclude <ref> notes + ^[\ \t]*({\||[|!]).*?$ | # exclude wikitable + </*nowiki> | # remove <nowiki> tags + {{.*?}} | # remove (not nested) template + <math>.*?</math> | # remove LaTeX staff + [\[\]] | # remove [, ] + ^[*:;]+ | # remove *, :, ; in begin of line + <!-- | + --> | + ) + """ % flags, "", text) + + if remove_quote: + # '' text '' + # '' text ''. + # '' text '' (text) + # « text » + # ... + # + + italic_quoteC = re.compile("(?m)^[:*]?\s*(''.*?'')\.?\s*(\(.*?\))?\r?$") + + index = 0 + try: + import pywikiparser + except ImportError: + pywikiparser = False + + while pywikiparser: + m = italic_quoteC.search(text, index) + if not m: + break + + s = pywikiparser.Parser(m.group(1)) + + try: + xmldata = s.parse().toxml() + if '<wikipage><p><i>' in xmldata and \ + '</i></p></wikipage>' in xmldata: + if xmldata.count('<i>') == 1: + text = text[:m.start()] + text[m.end():] + except: + pass + + index = m.start() + 1 + + text = re.sub('(?m)^[:*]*\s*["][^"]+["]\.?\s*(\(.*?\))?\r?$', "", text) + text = re.sub('(?m)^[:*]*\s*[«][^»]+[»]\.?\s*(\(.*?\))?\r?$', "", text) + text = re.sub('(?m)^[:*]*\s*[â][^â]+[â]\.?\s*(\(.*?\))?\r?$', "", text) + + # remove useless spaces + text = re.sub("(?m)(^[ \t]+|[ \t]+\r?$)", "", text) + + if debug: + write_log(text + '\n', "copyright/wikicode_removed.txt") + + return text + + +def n_index(text, n, sep): + pos = 0 + while n > 0: + try: + pos = text.index(sep, pos + 1) + n -= 1 + except ValueError: + return 0 + return pos + + +def mysplit(text, dim, sep): + if sep not in text: + return [text] + t = text + l = list() + while t: + if sep in t: + n = n_index(t, dim, sep) + if n > 0: + l.append(t[:n]) + t = t[n + 1:] + continue + l.append(t) + break + return l + + +class SearchEngine: + + num_google_queries = num_yahoo_queries = num_msn_queries = 0 + + def __init__(self): + self.URLexcl = URLExclusion() + + def __del__(self): + self.print_stats() + + def query(self, lines=[], max_query_len=1300, wikicode=True): + # Google max_query_len = 1480? + # - '-Wikipedia ""' = 1467 + + # Google limit queries to 32 words. 
+ + n_query = 0 + output = unicode() + previous_group_url = 'null' + + for line in lines: + if wikicode: + line = remove_wikicode(line) + for search_words in mysplit(line, number_of_words, " "): + if len(search_words) > min_query_string_len: + if config.copyright_economize_query: + if economize_query(search_words): + warn(search_words, prefix='Text excluded') + consecutive = False + continue + n_query += 1 + #pywikibot.output(search_words) + if config.copyright_max_query_for_page and \ + n_query > config.copyright_max_query_for_page: + warn(u"Max query limit for page reached") + return output + if config.copyright_skip_query > n_query: + continue + if len(search_words) > max_query_len: + search_words = search_words[:max_query_len] + consecutive = False + if " " in search_words: + search_words = search_words[ + :search_words.rindex(" ")] + + results = self.get_results(search_words) + group_url = '' + cmp_group_url = '' + + for url, engine, comment in results: + if comment: + group_url += '\n*%s - %s (%s)' % (engine, + url, + "; ".join(comment) + ) + else: + group_url += '\n*%s - %s' % (engine, url) + cmp_group_url += '\n*%s - %s' % (engine, url) + if results: + group_url_list = group_url.splitlines() + cmp_group_url_list = cmp_group_url.splitlines() + group_url_list.sort() + cmp_group_url_list.sort() + group_url = '\n'.join(group_url_list) + cmp_group_url = '\n'.join(cmp_group_url_list) + if previous_group_url == cmp_group_url: + if consecutive: + output += ' ' + search_words + else: + output += '\n**' + search_words + else: + output += group_url + '\n**' + search_words + + previous_group_url = cmp_group_url + consecutive = True + else: + consecutive = False + else: + consecutive = False + return output + + def add_in_urllist(self, url, add_item, engine, cache_url=None): + + check_in_source = (engine == 'google' and + config.copyright_check_in_source_google or + engine == 'yahoo' and + config.copyright_check_in_source_yahoo or + engine == 'msn' and + config.copyright_check_in_source_msn) + + if check_in_source or config.copyright_show_date or \ + config.copyright_show_length: + s = None + cache = False + + # list to store date, length, cache URL + comment = list() + + try: + s = WebPage(add_item, self.URLexcl) + except URL_exclusion: + pass + except NoWebPage: + cache = True + + if s: + # Before of add url in result list, perform the check in source + if check_in_source: + if s.check_in_source(): + return + + if config.copyright_show_date: + date = s.lastmodified() + if date: + if date[:3] != time.localtime()[:3]: + comment.append("%s/%s/%s" + % (date[2], date[1], date[0])) + + unit = 'bytes' + if config.copyright_show_length: + length = s.length() + if length > 1024: + # convert in kilobyte + length /= 1024 + unit = 'KB' + if length > 1024: + # convert in megabyte + length /= 1024 + unit = 'MB' + if length > 0: + comment.append("%d %s" % (length, unit)) + if cache: + if cache_url: + if engine == 'google': + comment.append( + '[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' + % urllib.quote(short_url(add_item))) + elif engine == 'yahoo': + comment.append("''Yahoo cache''") + elif engine == 'msn': + comment.append('[%s Live cache]' + % re.sub('&lang=[^&]*', '', cache_url)) + else: + comment.append('[http://web.archive.org/*/%s archive.org]' + % short_url(add_item)) + for i in range(len(url)): + if add_item in url[i]: + if engine not in url[i][1]: + if url[i][2]: + comment = url[i][2] + url[i] = (add_item, url[i][1] + ', ' + engine, comment) + return + 
url.append((add_item, engine, comment)) + return + + def soap(self, engine, query, url, numresults=10): + pywikibot.output(" %s query..." % engine.capitalize()) + search_request_retry = config.copyright_connection_tries + query_success = False + + while search_request_retry: + try: + if engine == 'google': + import google + google.LICENSE_KEY = config.google_key + data = google.doGoogleSearch('%s "%s"' + % (no_result_with_those_words, + query)) + for entry in data.results: + self.add_in_urllist(url, entry.URL, 'google', + entry.cachedSize) + self.num_google_queries += 1 + + elif engine == 'yahoo': + import yahoo.search.web + data = yahoo.search.web.WebSearch(config.yahoo_appid, + query='"%s" %s' % ( + query.encode('utf_8'), + no_result_with_those_words + ), results=numresults) + for entry in data.parse_results(): + cacheurl = None + if entry.Cache: + cacheurl = entry.Cache.Url + self.add_in_urllist(url, entry.Url, 'yahoo', cacheurl) + + self.num_yahoo_queries += 1 + + elif engine == 'msn': + #max_query_len = 150? + from SOAPpy import WSDL + + try: + server = WSDL.Proxy( + 'http://soap.search.msn.com/webservices.asmx?wsdl') + except Exception, err: + error("Live Search Error: %s" % err) + raise + + params = {'AppID': config.msn_appid, + 'Query': '%s "%s"' % (no_result_with_those_words, + query), + 'CultureInfo': region_code, + 'SafeSearch': 'Off', + 'Requests': { + 'SourceRequest': {'Source': 'Web', + 'Offset': 0, + 'Count': 10, + 'ResultFields': 'All', + }}} + + results = '' + server_results = server.Search(Request=params) + if server_results.Responses[0].Results: + results = server_results.Responses[0].Results[0] + if results: + # list or instance? + if type(results) == list: + for entry in results: + cacheurl = None + if hasattr(entry, 'CacheUrl'): + cacheurl = entry.CacheUrl + self.add_in_urllist(url, entry.Url, 'msn', + cacheurl) + else: + cacheurl = None + if hasattr(results, 'CacheUrl'): + cacheurl = results.CacheUrl + self.add_in_urllist(url, results.Url, 'msn', + cacheurl) + self.num_msn_queries += 1 + search_request_retry = 0 + query_success = True + except KeyboardInterrupt: + raise + except Exception, err: + # Something is going wrong... + if 'Daily limit' in str(err) or \ + 'Insufficient quota for key' in str(err): + exceeded_in_queries('google') + elif 'limit exceeded' in str(err): + exceeded_in_queries('yahoo') + elif 'Invalid value for AppID in request' in str(err): + exceeded_in_queries('msn') + else: + error(err, "Got an error") + + if search_request_retry: + search_request_retry -= 1 + + if not query_success: + error('No response for: %s' % query, "Error (%s)" % engine) + + def get_results(self, query, numresults=10): + result_list = list() + query = re.sub("[()\"<>]", "", query) + pywikibot.output(query) + if config.copyright_google: + self.soap('google', query, result_list) + if config.copyright_yahoo: + self.soap('yahoo', query, result_list, numresults=numresults) + if config.copyright_msn: + self.soap('msn', query, result_list) + + offset = 0 + for i in range(len(result_list)): + if self.URLexcl.check(result_list[i + offset][0], verbose=True): + result_list.pop(i + offset) + offset += -1 + return result_list + + def print_stats(self): + pywikibot.output('\n' + 'Search engine | number of queries\n' + '---------------------------------\n' + 'Google | %s\n' + 'Yahoo! 
| %s\n' + 'Live Search | %s\n' + % (self.num_google_queries, self.num_yahoo_queries, + self.num_msn_queries)) + +source_seen = set() +positive_source_seen = set() + + +class NoWebPage(Exception): + """Web page does not exist (404)""" + + +class URL_exclusion(Exception): + """URL in exclusion list""" + + +class WebPage(object): + """ + """ + + def __init__(self, url, URLExcl): + global source_seen + self.URLexcludedlist = URLExcl.URLlist + + if url in source_seen or URLExcl.check(url): + raise URL_exclusion + + self._url = url + + try: + self._urldata = urllib2.urlopen( + urllib2.Request(self._url, None, + {'User-Agent': pywikibot.useragent})) + except urllib2.HTTPError, err: + error("HTTP error: %d / %s (%s)" % (err.code, err.msg, url)) + if err.code >= 400: + source_seen.add(self._url) + raise NoWebPage + return None + except urllib2.URLError, arg: + error("URL error: %s / %s" % (url, arg)) + return None + except Exception, err: + error("ERROR: %s" % (err)) + + self._lastmodified = self._urldata.info().getdate('Last-Modified') + self._length = self._urldata.info().getheader('Content-Length') + self._content_type = self._urldata.info().getheader('Content-Type') + + def length(self): + if hasattr(self, '_length'): + if self._length: + return int(self._length) + if hasattr(self, '_contents'): + return len(self._contents) + + def lastmodified(self): + if hasattr(self, '_lastmodified'): + return self._lastmodified + + def get(self, force=False): + # Exclude URL with listed file extension. + if self._url[-4:] in [".pdf", ".doc", ".ppt"]: + raise URL_exclusion + + # Make sure we did try to get the contents once + if not hasattr(self, '_contents'): + self._contents = self._urldata.read() + return self._contents + + def check_regexp(self, reC, text, filename=None): + m = reC.search(text) + if m: + global positive_source_seen + self.URLexcludedlist.add(self._url) + positive_source_seen.add(self._url) + if filename: + write_log("%s (%s)\n" % (self._url, m.group()), filename) + return True + + def check_in_source(self): + """ Sources may be different from search engine database and include + mentions of Wikipedia. This function avoid also errors in search results + that can occurs either with Google and Yahoo! service. + + """ + global source_seen + + if not hasattr(self, '_urldata'): + return False + if self._url in positive_source_seen: + return True + if self._url in source_seen: + return False + + try: + text = self.get() + except URL_exclusion: + return False + + # Character encoding conversion if 'Content-Type' field has + # charset attribute set to UTF-8. + + if text: + if 'utf-8' in self._content_type.lower(): + text = text.decode("utf-8", 'replace') + else: + # <META> declaration with "http-equiv" set to "Content-Type" in HTML document. 
+ if 'text/html' in self._content_type and ( + re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>", + text) or + re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>", + text)): + text = text.decode("utf-8", 'replace') + + if config.copyright_check_in_source_section_names: + if self.check_regexp(reSectionNamesC, text, + "copyright/sites_with_'[edit]'.txt"): + return True + + if self.check_regexp(reWikipediaC, text, + "copyright/sites_with_'wikipedia'.txt"): + return True + source_seen.add(self._url) + return False + + +def exceeded_in_queries(engine): + """Behavior if an exceeded error occur.""" + + # Disable search engine + if config.copyright_exceeded_in_queries == 1: + exec('config.copyright_' + engine + ' = False') + # Sleeping + if config.copyright_exceeded_in_queries == 2: + error("Got a queries exceeded error from %s. Sleeping for %d hours..." + % (engine.capitalize(), + config.copyright_exceeded_in_queries_sleep_hours)) + time.sleep(config.copyright_exceeded_in_queries_sleep_hours * 3600) + # Stop execution + if config.copyright_exceeded_in_queries == 3: + raise 'Got a queries exceeded error.' + + +def get_by_id(title, id): + return pywikibot.Site().getUrl( + "/w/index.php?title=%s&oldid=%s&action=raw" % (title, id)) + + +def checks_by_ids(self, ids): + for title, id in ids: + original_text = get_by_id(title, id) + if original_text: + pywikibot.output(original_text) + output = self.query(lines=original_text.splitlines()) + if output: + write_log( + "=== [[" + title + "]] ===\n{{botbox|%s|prev|%s|%s|00}}" + % (title.replace(" ", "_").replace("\"", "%22"), + id, "author") + + output, + pywikibot.config.datafilepath(appdir, "ID_output.txt")) + + +class CheckRobot(): + def __init__(self, generator): + self.generator = generator + self.SearchEngine = SearchEngine() + self.site = pywikibot.Site() + + def run(self): + for page in self.generator: + try: + original_text = page.get() + except pywikibot.NoPage: + pywikibot.output(u'Page %s not found' % page.title()) + continue + except pywikibot.IsRedirectPage: + newpage = page.getRedirectTarget() + pywikibot.output(u'Page %s redirects to \'%s\'' + % (page.title(asLink=True), newpage.title())) + bot = CheckRobot(iter([newpage])) + bot.run() + continue + except pywikibot.SectionError: + error("Page %s has no section %s" + % (page.title(), page.section())) + continue + + if skip_disambig: + if page.isDisambig(): + pywikibot.output(u'Page %s is a disambiguation page' + % page.title(asLink=True)) + continue + + pywikibot.output(page.title()) + + if original_text: + text = skip_section(original_text) + + if remove_wikicode_dotall: + text = remove_wikicode(text, re_dotall=True) + + output = self.SearchEngine.query( + lines=text.splitlines(), + wikicode=not remove_wikicode_dotall) + if output: + write_log('=== [[%s]] ===%s\n' % (page.title(), output), + filename=output_file) + + +def short_url(url): + return url[url.index('://') + 3:] + + +def put(page, text, comment): + while True: + try: + page.put(text, comment=comment) + break + except pywikibot.SpamfilterError, url: + warn(url, prefix="Spam filter") + text = re.sub(url[0], '<blacklist>' + short_url(url[0]), text) + except pywikibot.EditConflict: + warn("Edit conflict") + raise pywikibot.EditConflict + + +def check_config(var, license_id, license_name): + if var: + if not license_id: + warn(u"You don't have set a %s, search engine is disabled." 
+ % license_name, prefix="WARNING") + return False + return var + + +def setSavepath(path): + global output_file + output_file = path + + +def main(*args): + gen = None + # pages which will be processed when the -page parameter is used + PageTitles = [] + # IDs which will be processed when the -ids parameter is used + ids = None + # Which namespaces should be processed? + # default to [] which means all namespaces will be processed + namespaces = [] + # + repeat = False + # + text = None + # Number of pages to load at a time by Preload generator + step = 40 + # Default number of pages for NewPages generator + number = 60 + + # This factory is responsible for processing command line arguments + # that are also used by other scripts and that determine on which pages + # to work on. + genFactory = pagegenerators.GeneratorFactory() + + # Read commandline parameters. + for arg in pywikibot.handle_args(args): + if arg == '-y': + config.copyright_yahoo = True + elif arg == '-g': + config.copyright_google = True + elif arg == '-l': + config.copyright_msn = True + elif arg == '-ny': + config.copyright_yahoo = False + elif arg == '-ng': + config.copyright_google = False + elif arg == '-nl': + config.copyright_msn = False + elif arg.startswith('-output'): + if len(arg) >= 8: + setSavepath(arg[8:]) + elif arg.startswith('-maxquery'): + if len(arg) >= 10: + config.copyright_max_query_for_page = int(arg[10:]) + elif arg.startswith('-skipquery'): + if len(arg) >= 11: + config.copyright_skip_query = int(arg[11:]) + elif arg.startswith('-text'): + if len(arg) >= 6: + text = arg[6:] + elif arg.startswith('-page'): + if len(arg) == 5: + PageTitles.append(pywikibot.input( + u'Which page do you want to change?')) + else: + PageTitles.append(arg[6:]) + elif arg.startswith('-namespace:'): + try: + namespaces.append(int(arg[11:])) + except ValueError: + namespaces.append(arg[11:]) + elif arg.startswith('-forceupdate'): + URLExclusion().update() + elif arg == '-repeat': + repeat = True + elif arg.startswith('-new'): + if len(arg) >= 5: + number = int(arg[5:]) + gen = pagegenerators.NewpagesPageGenerator(number=number, + repeat=repeat) + # Preload generator work better if 'step' is not major than + # 'number', this avoid unnecessary delay. + if number < step: + step = number + else: + genFactory.handleArg(arg) + + if PageTitles: + pages = [pywikibot.Page(pywikibot.Site(), + PageTitle) for PageTitle in PageTitles] + gen = iter(pages) + + config.copyright_yahoo = check_config(config.copyright_yahoo, + config.yahoo_appid, "Yahoo AppID") + config.copyright_google = check_config(config.copyright_google, + config.google_key, + "Google Web API license key") + config.copyright_msn = check_config(config.copyright_msn, + config.msn_appid, "Live Search AppID") + + if ids: + checks_by_ids(ids) + + if not gen: + gen = genFactory.getCombinedGenerator() + if not gen and not ids and not text: + # syntax error, show help text from the top of this file + pywikibot.output(__doc__, 'utf-8') + + if text: + output = SearchEngine().query(lines=text.splitlines()) + if output: + pywikibot.output(output) + + if not gen: + return + if namespaces: + gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) + preloadingGen = pagegenerators.PreloadingGenerator(gen, step=step) + bot = CheckRobot(preloadingGen) + bot.run() + +if number_of_words > 22 and config.copyright_msn: + warn("Live Search requires a lower value for 'number_of_words' " + "variable (current value is %d, a good value may be 22)." 
+ % (number_of_words), prefix='Warning') + + +if __name__ == "__main__": + main() diff --git a/scripts/copyright/copyright_clean.py b/scripts/copyright/copyright_clean.py new file mode 100644 index 0000000..eec4947 --- /dev/null +++ b/scripts/copyright/copyright_clean.py @@ -0,0 +1,318 @@ +# -*- coding: utf-8 -*- +""" +""" + +# +# (C) Francesco Cosoleto, 2006 +# (c) Pywikibot team 2006-2015 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' +# + +import re + +import pywikibot +from pywikibot import pagegenerators +from copyright import mysplit, put, reports_cat, join_family_data + + +summary_msg = { + 'ar': u'إزاÙØ©', + 'en': u'Removing', + 'fa': u'ØØ°Ù', + 'fr': u'Retiré', + 'it': u'Rimozione', + 'ru': u'Удаление', + 'uk': u'ÐидаленнÑ', +} + +headC = re.compile( + "(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?\[\[(?::)?(.*?)\]\]") +separatorC = re.compile('(?m)^== +') +next_headC = re.compile("(?m)^=+.*?=+") + + +# {{botbox|title|newid|oldid|author|...}} +rev_templateC = re.compile( + "(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|") + + +class CopyrightCleanBot(pywikibot.Bot): + + query_results_titles = list() + query_results_revids = list() + + def __init__(self, generator): + super(CopyrightCleanBot, self).__init__() + self.generator = generator + + def query_api(self, data): + predata = { + 'action': 'query', + 'prop': 'revisions', + } + predata = self.CombineParams(predata, data) + return pywikibot.data.api.Request(**predata).submit() + + def query_old_api(self, data): + + predata = { + 'what': 'revisions', + 'rvlimit': '1', + } + predata = self.CombineParams(predata, data) + return pywikibot.data.api.Request(**predata).submit() + + def old_page_exist(self, title): + for pageobjs in self.query_results_titles: + for key in pageobjs['pages']: + if pageobjs['pages'][key]['title'] == title: + if int(key) >= 0: + return True + pywikibot.output('* ' + title) + return False + + def old_revid_exist(self, revid): + for pageobjs in self.query_results_revids: + for id in pageobjs['pages']: + for rv in range(len(pageobjs['pages'][id]['revisions'])): + if pageobjs['pages'][id]['revisions'][rv]['revid'] == \ + int(revid): + # print rv + return True + pywikibot.output('* ' + revid) + return False + + def page_exist(self, title): + for pageobjs in self.query_results_titles: + for key in pageobjs['query']['pages']: + if pageobjs['query']['pages'][key]['title'] == title: + if 'missing' in pageobjs['query']['pages'][key]: + pywikibot.output('* ' + title) + return False + return True + + def revid_exist(self, revid): + for pageobjs in self.query_results_revids: + if 'badrevids' in pageobjs['query']: + for id in pageobjs['query']['badrevids']: + if id == int(revid): + # print rv + pywikibot.output('* ' + revid) + return False + return True + + def treat(self, page): + data = page.get() + pywikibot.output(page.title(asLink=True)) + output = '' + + # + # Preserve text before of the sections + # + + m = re.search("(?m)^==\s*[^=]*?\s*==", data) + if m: + output = data[:m.end() + 1] + else: + m = re.search("(?m)^===\s*[^=]*?", data) + if m: + output = data[:m.start()] + + titles = headC.findall(data) + titles = [re.sub("#.*", "", item) for item in titles] + revids = rev_templateC.findall(data) + + # No more of 50 titles at a time using API + for s in mysplit(self.ListToParam(titles), 50, "|"): + self.query_results_titles.append(self.query_api({'titles': s})) + for s in mysplit(self.ListToParam(revids), 50, "|"): + self.query_results_revids.append(self.query_api({'revids': s})) 
+ + comment_entry = list() + add_separator = False + index = 0 + + while True: + head = headC.search(data, index) + if not head: + break + index = head.end() + title = re.sub("#.*", "", head.group(1)) + next_head = next_headC.search(data, index) + if next_head: + if separatorC.search(data[next_head.start():next_head.end()]): + add_separator = True + stop = next_head.start() + else: + stop = len(data) + + exist = True + if self.page_exist(title): + # check {{botbox}} + revid = re.search("{{(?:/box|botbox)\|.*?\|(.*?)\|", + data[head.end():stop]) + if revid: + if not self.revid_exist(revid.group(1)): + exist = False + else: + exist = False + + if exist: + ctitle = re.sub(u'(?i)=== \[\[%s:' + % join_family_data('Image', 6), + ur'=== [[:\1:', title) + ctitle = re.sub(u'(?i)=== \[\[%s:' + % join_family_data('Category', 14), + ur'=== [[:\1:', ctitle) + output += "=== [[" + ctitle + "]]" + data[head.end():stop] + else: + comment_entry.append("[[%s]]" % title) + + if add_separator: + output += data[next_head.start():next_head.end()] + '\n' + add_separator = False + + add_comment = u'%s: %s' % (pywikibot.translate(pywikibot.Site(), + summary_msg), + ", ".join(comment_entry)) + + # remove useless newlines + output = re.sub("(?m)^\n", "", output) + + if comment_entry: + pywikibot.output(add_comment) + if pywikibot.config.verbose_output: + pywikibot.showDiff(page.get(), output) + + choice = pywikibot.inputChoice(u'Do you want to clean the page?', + ['Yes', 'No'], ['y', 'n'], 'n') + if choice == 'y': + try: + put(page, output, add_comment) + except pywikibot.PageNotSaved: + raise + + pywikibot.stopme() + + # + # + # Helper utilities + # + # + + def CleanParams(self, params): + """Params may be either a tuple, a list of tuples or a dictionary. + This method will convert it into a dictionary + """ + if params is None: + return {} + pt = type(params) + if pt == dict: + return params + elif pt == tuple: + if len(params) != 2: + raise "Tuple size must be 2" + return {params[0]: params[1]} + elif pt == list: + for p in params: + if p != tuple or len(p) != 2: + raise "Every list element must be a 2 item tuple" + return dict(params) + else: + raise "Unknown param type %s" % pt + + def CombineParams(self, params1, params2): + """Merge two dictionaries. If they have the same keys, their values will + be appended one after another separated by the '|' symbol. 
+ """ + + params1 = self.CleanParams(params1) + if params2 is None: + return params1 + params2 = self.CleanParams(params2) + + for k, v2 in params2.iteritems(): + if k in params1: + v1 = params1[k] + if len(v1) == 0: + params1[k] = v2 + elif len(v2) > 0: + if str in [type(v1), type(v2)]: + raise "Both merged values must be of type 'str'" + params1[k] = v1 + '|' + v2 + # else ignore + else: + params1[k] = v2 + return params1 + + def ConvToList(self, item): + """Ensure the output is a list + """ + if item is None: + return [] + elif isinstance(item, basestring): + return [item] + else: + return item + + def ListToParam(self, list): + """Convert a list of unicode strings into a UTF8 string separated by the '|' + symbols + + """ + list = self.ConvToList(list) + if len(list) == 0: + return '' + + encList = '' + # items may not have one symbol - '|' + for item in list: + if isinstance(item, basestring): + if u'|' in item: + raise pywikibot.Error(u"item '%s' contains '|' symbol" % item) + encList += self.ToUtf8(item) + u'|' + elif type(item) == int: + encList += self.ToUtf8(item) + u'|' + elif isinstance(item, pywikibot.Page): + encList += self.ToUtf8(item.title()) + u'|' + elif item.__class__.__name__ == 'User': + # delay loading this until it is needed + encList += self.ToUtf8(item.name()) + u'|' + else: + raise pywikibot.Error(u'unknown item class %s' + % item.__class__.__name__) + + # strip trailing '|' before returning + return encList[:-1] + + def ToUtf8(self, s): + if type(s) != unicode: + try: + s = unicode(s) + except UnicodeDecodeError: + s = s.decode(pywikibot.config.console_encoding) + return s + + +def main(*args): + # Process global args and prepare generator args parser + local_args = pywikibot.handle_args(args) + genFactory = pagegenerators.GeneratorFactory() + + for arg in local_args: + genFactory.handleArg(arg) + gen = genFactory.getCombinedGenerator() + + if not gen: + cat = pywikibot.Category(pywikibot.Site(), 'Category:%s' % + pywikibot.translate(pywikibot.Site(), + reports_cat)) + gen = pagegenerators.CategorizedPageGenerator(cat, recurse=True) + bot = CopyrightCleanBot(gen) + bot.run() + +if __name__ == "__main__": + main() diff --git a/scripts/copyright/copyright_put.py b/scripts/copyright/copyright_put.py new file mode 100644 index 0000000..7b692bd --- /dev/null +++ b/scripts/copyright/copyright_put.py @@ -0,0 +1,297 @@ +# -*- coding: utf-8 -*- +""" +docuReplacements = {â¶ms;': pywikibot.pagegenerators.parameterHelp} + +""" +__version__ = '$Id$' + +# +# (C) Francesco Cosoleto, 2006 +# +# Distributed under the terms of the MIT license. +# + +import re +import codecs +import os +import time +import shutil + +import pywikibot +from pywikibot import config, date +from pywikibot import pagegenerators + +from copyright import put, join_family_data, appdir, reports_cat + +# +# Month + Year save method (e.g. User:BotName/Report_December_2007) +append_date_to_wiki_save_path = True + +# +# Append day of month to wiki save path +# e.g. 
User:BotName/Report_25_December_2007) +append_day_to_wiki_save_path = False + +# +# Add pubblication date to entries (template:botdate) +append_date_to_entries = False + +msg_table = { + 'ar': {'_default': [u'٠دخÙات جدÙدة', u'٠دخÙات جدÙدة']}, + 'en': {'_default': [u'New entries', u'New entries']}, + 'es': {'_default': [u'Entradas nuevas', u'Entradas nuevas']}, + 'fa': {'_default': [u'Ù ØتÙÛات جدÛد', u'Ù ØتÙÛات جدÛد']}, + 'it': {'_default': [u'Pagine nuove', u'Nuove voci'], + 'feed': [u'Aggiunte a voci esistenti', u'Testo aggiunto in']}, + 'ru': {'_default': [u'ÐовÑе запиÑи', u'ÐовÑе запиÑи']}, + 'uk': {'_default': [u'ÐÐ¾Ð²Ñ Ð·Ð°Ð¿Ð¸Ñи', u'ÐÐ¾Ð²Ñ Ð·Ð°Ð¿Ð¸Ñи']}, +} + +template_cat = { + '_default': [u'This template is used by copyright.py, a script part of [[:m:Using the python wikipediabot|PyWikipediaBot]].', + u''], + 'it': [u'Questo template è usato dallo script copyright.py del [[:m:Using the python wikipediabot|PyWikipediaBot]].', + u'Template usati da bot'], +} + +stat_msg = { + 'ar': [u'Ø¥Øصاءات', u'صÙØØ©', u'٠دخÙات', u'Øج٠', u'إج٠اÙÙ', 'تØدÙØ«'], + 'en': [u'Statistics', u'Page', u'Entries', u'Size', u'Total', 'Update'], + 'es': [u'EstadÃsticas', u'Página', u'Entradas', u'Tamaño', u'Total', + u'Actualizacion'], + 'fa': [u'آ٠ار', u'صÙØÙ', u'Ù ØتÙÛات', u'اÙدازÙ', u'٠ج٠Ùع', 'برÙزرساÙÛ'], + 'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Lunghezza', u'Totale', + u'Ultimo aggiornamento'], + 'ru': [u'СÑаÑиÑÑика', u'СÑÑаниÑа', u'ÐапиÑи', u'РазмеÑ', u'ÐÑего', + u'Ðзменено'], + 'uk': [u'СÑаÑиÑÑика', u'СÑоÑÑнка', u'ÐапиÑи', u'РозмÑÑ', u'Разом', + u'ÐмÑнено'], +} + +separatorC = re.compile('(?m)^== +') + + +def get_wiki_save_page(stat_page=False): + + site = pywikibot.Site() + wiki_save_path = { + '_default': u'User:%s/Report' % config.usernames[ + site.family.name][site.code], + 'es': u'Usuario:%s/Reporte' % config.usernames[ + site.family.name][site.code], + 'it': u'Utente:RevertBot/Report', + } + + save_path = pywikibot.translate(site, wiki_save_path, fallback=True) + if stat_page: + return pywikibot.Page(site, + '%s/%s' % (save_path, + pywikibot.translate(site, + stat_msg)[0])) + if append_date_to_wiki_save_path: + t = time.localtime() + day = '' + if append_day_to_wiki_save_path: + day = '_' + str(t[2]) + save_path += '%s_%s_%s' % (day, date.monthName(site.code, t[1]), + str(t[0])) + return pywikibot.Page(site, save_path) + + +def set_template(name=None): + site = pywikibot.Site() + tcat = pywikibot.translate(site, template_cat) + url = "%s://%s%s" % (site.protocol(), site.hostname(), site.path()) + botdate = u""" +<div style="text-align:right">{{{1}}}</div><noinclude>%s\n[[%s:%s]]</noinclude> +""" % (tcat[0], site.namespace(14), tcat[1]) + + botbox = """ +<div class=plainlinks style="text-align:right">[%s?title={{{1}}}&diff={{{2}}}&oldid={{{3}}} diff] - [%s?title={{{1}}}&action=history cron] - [%s?title=Special:Log&page={{{1}}} log]</div><noinclude>%s\n[[%s:%s]]</noinclude> +""" % (url, url, url, tcat[0], site.namespace(14), tcat[1]) + + if name == 'botdate': + p = pywikibot.Page(site, 'Template:botdate') + if not p.exists(): + p.put(botdate, comment='Init.') + if name == 'botbox': + p = pywikibot.Page(site, 'Template:botbox') + if not p.exists(): + p.put(botbox, comment='Init.') + + +def stat_sum(engine, text): + return len(re.findall('(?im)^\*.*?' 
+ engine + '.*?- ', text)) + + +def get_stats(): + msg = pywikibot.translate(pywikibot.Site(), stat_msg) + cat = pywikibot.Category(pywikibot.Site(), + 'Category:%s' + % pywikibot.translate(pywikibot.Site(), + reports_cat)) + gen = pagegenerators.CategorizedPageGenerator(cat, recurse=True) + output = u"""{| {{prettytable|width=|align=|text-align=left}} +! %s +! %s +! %s +! %s +! %s +! %s +|- +""" % (msg[1], msg[2], msg[3], 'Google', 'Yahoo', 'Live Search') + gnt, ynt, mnt, ent, sn, snt = 0, 0, 0, 0, 0, 0 + for page in gen: + data = page.get() + gn = stat_sum('google', data) + yn = stat_sum('yahoo', data) + mn = stat_sum('(msn|live)', data) + en = len(re.findall('=== \[\[', data)) + sn = len(data) + gnt += gn + ynt += yn + mnt += mn + ent += en + snt += sn + if en > 0: + output += u"|%s||%s||%s KB||%s||%s||%s\n|-\n" \ + % (page.title(asLink=True), en, sn / 1024, gn, yn, mn) + output += u"""| |||||||| +|- +|'''%s'''||%s||%s KB||%s||%s||%s +|- +|colspan="6" align=right style="background-color:#eeeeee;"|<small>''%s: %s''</small> +|} +""" % (msg[4], ent, snt / 1024, gnt, ynt, mnt, msg[5], + time.strftime("%d " + "%s" + % (date.monthName(pywikibot.Site().language(), + time.localtime()[1])) + " %Y")) + return output + + +def put_stats(): + page = get_wiki_save_page(stat_page=True) + page.put(get_stats(), comment=pywikibot.translate(pywikibot.Site(), + stat_msg)[0]) + + +def output_files_gen(): + for f in os.listdir(appdir): + if 'output' in f and '_pending' not in f: + m = re.search('output_(.*?)\.txt', f) + if m: + tag = m.group(1) + else: + tag = '_default' + section_name_and_summary = pywikibot.translate(pywikibot.Site(), + msg_table)[tag] + section = section_name_and_summary[0] + summary = section_name_and_summary[1] + yield os.path.join(appdir, f), section, summary + + +def read_output_file(filename): + if os.path.isfile(filename + '_pending'): + shutil.move(filename, filename + '_temp') + ap = codecs.open(filename + '_pending', 'a', 'utf-8') + ot = codecs.open(filename + '_temp', 'r', 'utf-8') + ap.write(ot.read()) + ap.close() + ot.close() + os.remove(filename + '_temp') + else: + shutil.move(filename, filename + '_pending') + f = codecs.open(filename + '_pending', 'r', 'utf-8') + data = f.read() + f.close() + return data + + +def run(send_stats=False): + page = get_wiki_save_page() + try: + wikitext = page.get() + except pywikibot.NoPage: + pywikibot.output("%s not found." % page.title(asLink=True)) + wikitext = '[[%s:%s]]\n' % (pywikibot.Site().namespace(14), + pywikibot.translate(pywikibot.Site(), + reports_cat)) + final_summary = u'' + output_files = list() + for f, section, summary in output_files_gen(): + print (page) + pywikibot.output('File: \'%s\'\nSection: %s\n' % (f, section)) + output_data = read_output_file(f) + output_files.append(f) + entries = re.findall('=== (.*?) 
===', output_data) + if entries: + if append_date_to_entries: + dt = time.strftime('%d-%m-%Y %H:%M', time.localtime()) + output_data = re.sub("(?m)^(=== \[\[.*?\]\] ===\n)", + r"\1{{botdate|%s}}\n" % dt, output_data) + m = re.search('(?m)^==\s*%s\s*==' % section, wikitext) + if m: + m_end = re.search(separatorC, wikitext[m.end():]) + if m_end: + wikitext = (wikitext[:m_end.start() + m.end()] + + output_data + wikitext[m_end.start() + m.end():]) + else: + wikitext += '\n' + output_data + else: + wikitext += '\n' + output_data + if final_summary: + final_summary += ' ' + final_summary += u'%s: %s' % (summary, ', '.join(entries)) + + if final_summary: + pywikibot.output(final_summary + '\n') + + # if a page in 'Image' or 'Category' namespace is checked then fix + # title section by adding ':' in order to avoid wiki code effects. + wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Image', 6), + ur'=== [[:\1:', wikitext) + wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Category', 14), + ur'=== [[:\1:', wikitext) + + # TODO: + # List of frequent rejected address to improve upload process. + wikitext = re.sub('http://(.*?)((forumcommunity|forumfree).net)', + r'<blacklist>\1\2', wikitext) + + if len(final_summary) >= 200: + final_summary = final_summary[:200] + final_summary = final_summary[ + :final_summary.rindex("[") - 3] + "..." + + try: + put(page, wikitext, comment=final_summary) + for f in output_files: + os.remove(f + '_pending') + pywikibot.output("\'%s\' deleted." % f) + except pywikibot.PageNotSaved: + raise + + if append_date_to_entries: + set_template(name='botdate') + if '{{botbox' in wikitext: + set_template(name='botbox') + + if send_stats: + put_stats() + + +def main(*args): + # Send statistics + send_stats = False + genFactory = pagegenerators.GeneratorFactory() + for arg in pywikibot.handle_args(args): + if arg == "-stats": + send_stats = True + else: + genFactory.handleArg(arg) + gen = genFactory.getCombinedGenerator() + run(send_stats=send_stats) + + +if __name__ == "__main__": + main() -- To view, visit https://gerrit.wikimedia.org/r/194824 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia0c3a9fe6a2c3be3cdbad517ac9dbf3249c197ab Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Prianka <priyankajayaswal...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
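
A note for anyone skimming the patch: the heart of copyright.py is its query pipeline. Article text is stripped of wikicode, split into groups of roughly number_of_words (22) words, and each group longer than min_query_string_len (120 characters) is sent to the enabled search engines, unless it looks like a comma-separated list (economize_query) or a result URL matches the exclusion list (URLExclusion.check). The sketch below only illustrates those filters under simplified assumptions; the helper names split_into_queries, looks_like_comma_list and is_excluded are invented here and are not part of the patch, which implements the same ideas with mysplit()/n_index(), economize_query() and URLExclusion.check().

    def split_into_queries(line, words_per_query=22):
        # Chop a line of plain text into chunks of roughly `words_per_query`
        # words; copyright.py does this job with mysplit()/n_index().
        words = line.split(" ")
        return [" ".join(words[i:i + words_per_query])
                for i in range(0, len(words), words_per_query)]

    def looks_like_comma_list(text, comma_ratio=5):
        # Skip query strings that are mostly a comma-separated list: more than
        # four ", " separators and a separator/length percentage at or above
        # comma_ratio, the same threshold economize_query() applies.
        commas = text.count(', ')
        return commas > 4 and 100.0 * commas / len(text) >= comma_ratio

    def is_excluded(url, exclusion_entries):
        # A hit is any exclusion entry (e.g. "wikipedia", ".ebay.") occurring
        # as a substring of the URL, which is what URLExclusion.check() tests.
        return any(entry in url for entry in exclusion_entries)

    # Illustration only: keep chunks long enough to be worth querying and
    # drop the ones that read like lists.
    queries = [q for q in split_into_queries("some long article paragraph ...")
               if len(q) > 120 and not looks_like_comma_list(q)]

For completeness, the real economize_query() also skips strings made up almost entirely of digits and punctuation, and URLExclusion builds its entry set from copyright/exclusion_list.txt plus the on-wiki mirror/fork lists it downloads, so the sketch above is a simplification of both.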