Mpaa has uploaded a new change for review. https://gerrit.wikimedia.org/r/94583
Change subject: Refactored to support all languages and timezones. Necessary info are retrieved directly from site. No use of locale should be necessary any longer. ...................................................................... Refactored to support all languages and timezones. Necessary info are retrieved directly from site. No use of locale should be necessary any longer. Change-Id: Iede5165fd36b8e5747db032183094fa11177b037 --- M archivebot.py 1 file changed, 215 insertions(+), 121 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/compat refs/changes/83/94583/1 diff --git a/archivebot.py b/archivebot.py index d0f79ee..f4afe3f 100644 --- a/archivebot.py +++ b/archivebot.py @@ -36,7 +36,7 @@ algo specifies the maximum age of a thread. Must be in the form old(<delay>) where <delay> specifies the age in hours or days like 24h or 5d. - Default ist old(24h) + Default is old(24h) counter The current value of a counter which could be assigned as variable. Will be actualized by bot. Initial value is 1. maxarchivesize The maximum archive size before incrementing the counter. @@ -70,7 +70,7 @@ # # (C) Misza13, 2006-2010 # (C) xqt, 2009-2012 -# (C) Pywikipedia bot team, 2007-2012 +# (C) Pywikipedia bot team, 2007-2013 # # Distributed under the terms of the MIT license. 
# @@ -78,10 +78,17 @@ # import wikipedia as pywikibot from pywikibot import i18n -import pagegenerators, query -Site = pywikibot.getSite() +import pagegenerators +import query +import dateutil.tz +import datetime +import os +import re +import time +import locale +import traceback -import os, re, time, locale, traceback, string, urllib, unicodedata +Site = pywikibot.getSite() try: #Get a constructor for the MD5 hash object import hashlib @@ -112,19 +119,23 @@ class ArchiveSecurityError(pywikibot.Error): """Archive is not a subpage of page being archived and key not specified (or incorrect).""" + +class TimeZoneNotFound(pywikibot.Error): + """Timezone not found by dateutil.tz.gettz(), either using tzone stripped + from text of from site.siteinfo.""" def str2time(str): """Accepts a string defining a time period: 7d - 7 days 36h - 36 hours - Returns the corresponding time, measured in seconds.""" + Returns the corresponding timedelta object.""" if str[-1] == 'd': - return int(str[:-1])*24*3600 + return datetime.timedelta(days=int(str[:-1])) elif str[-1] == 'h': - return int(str[:-1])*3600 + return datetime.timedelta(hours=int(str[:-1])) else: - return int(str) + return datetime.timedelta(seconds=int(str)) def str2size(str): """Accepts a string defining a size: @@ -133,7 +144,7 @@ 2M - 2 megabytes Returns a tuple (size,unit), where size is an integer and unit is 'B' (bytes) or 'T' (threads).""" - if str[-1] in string.digits: #TODO: de-uglify + if str[-1].isdigit(): #TODO: de-uglify return (int(str),'B') elif str[-1] in ['K', 'k']: return (int(str[:-1])*1024,'B') @@ -143,36 +154,6 @@ return (int(str[:-1]),'T') else: return (int(str[:-1])*1024,'B') - -def int2month(num): - """Returns the locale's full name of month 'num' (1-12).""" - if hasattr(locale, 'nl_langinfo'): - return locale.nl_langinfo(locale.MON_1+num-1).decode('utf-8') - Months = ['january', 'february', 'march', 'april', 'may_long', 'june', - 'july', 'august', 'september', 'october', 'november', 'december'] - 
return Site.mediawiki_message(Months[num-1]) - -def int2month_short(num): - """Returns the locale's abbreviated name of month 'num' (1-12).""" - if hasattr(locale, 'nl_langinfo'): - #filter out non-alpha characters - return ''.join([c for c in locale.nl_langinfo(locale.ABMON_1+num-1).decode('utf-8') if c.isalpha()]) - Months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', - 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] - return Site.mediawiki_message(Months[num-1]) - -def txt2timestamp(txt, format): - """Attempts to convert the timestamp 'txt' according to given 'format'. - On success, returns the time tuple; on failure, returns None.""" -## print txt, format - try: - return time.strptime(txt,format) - except ValueError: - try: - return time.strptime(txt.encode('utf8'),format) - except: - pass - return None def generateTransclusions(Site, template, namespaces=[]): pywikibot.output(u'Fetching template transclusions...') @@ -184,6 +165,165 @@ for page in gen: yield page +class Months(object): + """ + Generation of look-up dictionaries for months, used by Timestripper() and PageArchiver + """ + + def __init__(self, site=None): + if site is None: + self.site = pywikibot.getSite() + + @classmethod + def queryMonths(self): + months_long = ['january', 'february', 'march', 'april', 'may_long', 'june', + 'july', 'august', 'september', 'october', 'november', 'december'] + months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', + 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'] + + #one query instead of multiple queries using site.mediawiki_message() + #can be refactored to use site.mediawiki_message() + params = { + 'action' : 'query', + 'meta' : 'allmessages', + 'ammessages': '|'.join(months_long) + '|' + '|'.join(months_short), + 'amlang' : self.site.lang, + } + + monthsDict = query.GetData(params)['query']['allmessages'] + + #d[1:12] = {'short': 'orig_short', 'long': 'orig_long} + monthNum2origNames = {i: {'short': '', 'long': ''} for i in range(1, 13)} + origNames2monthNum = dict() 
+ + for el in monthsDict: + orig, eng = el.values() + try: + month_num = months_long.index(eng) + 1 + monthNum2origNames[month_num]['long'] = orig + except ValueError: + month_num = months_short.index(eng) + 1 + monthNum2origNames[month_num]['short'] = orig + + origNames2monthNum[orig] = month_num + + return monthNum2origNames, origNames2monthNum + + @classmethod + def updateMonths(self, site=None): + if site is None: + self.site = pywikibot.getSite() + else: + self.site = site + self.monthsDicts = self.queryMonths() + + +class TimeStripper(object): + """ + Find timetstamp in page text and returns it as timezone aware datetime object + """ + + def __init__(self): + self.monthNum2origNames, self.origNames2monthNum = Months.monthsDicts + self.site = Months.site + + self.groups = [u'year', u'month', u'hour', u'time', u'day', u'minute', u'tzinfo'] + + timeR = r'(?P<time>(?P<hour>[0-2]\d)[:\.h](?P<minute>[0-5]\d))' + timeznR = r'\((?P<tzinfo>[A-Z]+)\)' + yearR = r'(?P<year>(19|20)\d\d)' + monthR = ur'(?P<month>(%s))' % (u'|'.join(self.origNames2monthNum)) + dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))' + + self.ptimeR = re.compile(timeR) + self.timeznR = re.compile(timeznR) + self.yearR = re.compile(yearR) + self.pmonthR = re.compile(monthR, re.U) + self.pdayR = re.compile(dayR) + + #order is important to avoid mismatch when searching + self.patterns = [self.ptimeR, + self.timeznR, + self.yearR, + self.pmonthR, + self.pdayR, + ] + + def findmarker(self, text, base=u'@@', delta='@'): + # find a string which is not part of text + while base in text: + base += delta + return base + + def last_match_and_replace(self, txt, pat): + """ + Take the rightmost match, to prevent spurious earlier matches, and replace with marker + """ + m = None + for m in pat.finditer(txt): + pass + + if m: + marker = self.findmarker(txt) + txt = pat.sub(marker, txt) + return (txt, m.groupdict()) + else: + return (txt, None) + + def timestripper(self, line): + """ + Find timestamp in line and convert 
it to time zone aware datetime + """ + _line = line + #match date fields + dateDict = dict() + for pat in self.patterns: + line, matchDict = self.last_match_and_replace(line, pat) + if matchDict: + dateDict.update(matchDict) + + #all fields matched -> date valid + if all(g in dateDict for g in self.groups): + #remove 'time' key, now splitted in hour/minute and not needed by datetime + del dateDict['time'] + + #replace month name in original language with month number + try: + dateDict['month'] = self.origNames2monthNum[dateDict['month']] + except KeyError: + pywikibot.output(u'incorrect month name in page') + + #convert to integers + for k, v in dateDict.items(): + try: + dateDict[k] = int(v) + except ValueError: + pass + + #find timezone + tzoneText = dateutil.tz.gettz(dateDict['tzinfo']) + tzoneSite = dateutil.tz.gettz(self.site.siteinfo()['timezone']) + if dateDict['tzinfo'] != 'UTC': + #try timezone stripped from text not recognised by dateutil.tz.gettz + if tzoneText: + dateDict['tzinfo'] = tzoneText + #try tzone from site.siteinfo + elif tzoneSite: + dateDict['tzinfo'] = tzoneSite + #give up + else: + raise TimeZoneNotFound(u'Timezone %s or %s not found. Please submit a bug.' + % (dateDict['tzinfo'], tzoneSite)) + else: + dateDict['tzinfo'] = dateutil.tz.tzutc() + + timestamp = datetime.datetime(**dateDict) + + else: + timestamp = None + + return timestamp + class DiscussionThread(object): """An object representing a discussion thread on a page, that is something of the form: @@ -194,87 +334,31 @@ :Reply, etc. 
~~~~ """ - def __init__(self, title): + def __init__(self, title, now): self.title = title + self.now = now self.content = "" + self.ts = TimeStripper() self.timestamp = None def __repr__(self): return '%s("%s",%d bytes)' \ - % (self.__class__.__name__,self.title,len(self.content)) + % (self.__class__.__name__, self.title, len(self.content)) def feedLine(self, line): if not self.content and not line: return + self.content += line + '\n' - #Update timestamp -# nnwiki: -# 19:42, 25 mars 2008 (CET) -# enwiki -# 16:36, 30 March 2008 (UTC) -# huwiki -# 2007. december 8., 13:42 (CET) - TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\S+) (\d\d\d\d) \(.*?\)', line) - if not TM: - TM = re.search(r'(\d\d):(\d\d), (\S+) (\d\d?), (\d\d\d\d) \(.*?\)', line) - if not TM: - TM = re.search(r'(\d{4})\. (\S+) (\d\d?)\., (\d\d:\d\d) \(.*?\)', line) -# 18. apr 2006 kl.18:39 (UTC) -# 4. nov 2006 kl. 20:46 (CET) - if not TM: - TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kl\.\W*(\d\d):(\d\d) \(.*?\)', line) -#3. joulukuuta 2008 kello 16.26 (EET) - if not TM: - TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kello \W*(\d\d).(\d\d) \(.*?\)', line) - if not TM: -# 14:23, 12. Jan. 2009 (UTC) - pat = re.compile(r'(\d\d):(\d\d), (\d\d?)\. (\S+)\.? (\d\d\d\d) \((?:UTC|CES?T)\)') - TM = pat.search(line) -# ro.wiki: 4 august 2012 13:01 (EEST) - if not TM: - TM = re.search(r'(\d\d?) (\S+) (\d\d\d\d) (\d\d):(\d\d) \(.*?\)', line) - if TM: - # Strip away all diacritics in the Mn ('Mark, non-spacing') category - # NFD decomposition splits combined characters (e.g. 'ä", LATIN SMALL - # LETTER A WITH DIAERESIS) into two entities: LATIN SMALL LETTER A - # and COMBINING DIAERESIS. The latter falls in the Mn category and is - # filtered out, resuling in 'a'. 
- _TM = ''.join(c for c in unicodedata.normalize('NFD', TM.group(0)) - if unicodedata.category(c) != 'Mn') + + timestamp = self.ts.timestripper(line) + + if not self.timestamp: #first time + self.timestamp = timestamp + + if timestamp: + self.timestamp = max(self.timestamp, timestamp) - TIME = txt2timestamp(_TM,"%d. %b %Y kl. %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%Y. %B %d., %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%d. %b %Y kl.%H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '', _TM), - "%H:%M, %d %B %Y") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %d %b %Y (%Z)") - if not TIME: - TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '', _TM), - "%H:%M, %d %b %Y") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %b %d %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %B %d %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %b %d, %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%H:%M, %B %d, %Y (%Z)") - if not TIME: - TIME = txt2timestamp(_TM,"%d. %Bta %Y kello %H.%M (%Z)") - if not TIME: - TIME = txt2timestamp(_TM, "%d %B %Y %H:%M (%Z)") - if not TIME: - TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '', _TM), - "%H:%M, %d. %b. 
%Y") - if TIME: - self.timestamp = max(self.timestamp, time.mktime(TIME)) -## pywikibot.output(u'Time to be parsed: %s' % TM.group(0)) -## pywikibot.output(u'Parsed time: %s' % TIME) -## pywikibot.output(u'Newest timestamp in thread: %s' % TIME) def size(self): return len(self.title) + len(self.content) + 12 @@ -282,17 +366,20 @@ def toText(self): return "== " + self.title + ' ==\n\n' + self.content - def shouldBeArchived(self,Archiver): + def shouldBeArchived(self, Archiver): algo = Archiver.get('algo') - reT = re.search(r'^old\((.*)\)$',algo) + reT = re.search(r'^old\((.*)\)$', algo) if reT: if not self.timestamp: return '' #TODO: handle this: #return 'unsigned' maxage = str2time(reT.group(1)) - if self.timestamp + maxage < time.time(): - return message('archivebot-older-than') + ' ' + reT.group(1) + try: + if self.now - self.timestamp > maxage: + return message('archivebot-older-than') + ' ' + reT.group(1) + except: + import pdb; pdb.set_trace() return '' @@ -306,6 +393,8 @@ self.full = False self.archiver = archiver self.vars = vars + self.now = datetime.datetime.utcnow().replace(tzinfo=dateutil.tz.tzutc()) + try: self.loadPage() except pywikibot.NoPage: @@ -329,7 +418,7 @@ found = True #Reading threads now if curThread: self.threads.append(curThread) - curThread = DiscussionThread(threadHeader.group(1)) + curThread = DiscussionThread(threadHeader.group(1), self.now) else: if found: curThread.feedLine(line) @@ -363,6 +452,8 @@ if self.full: summary += ' ' + message('archivebot-archive-full') self.put(newtext, comment=summary) + with open('x.txt', 'a') as f: + f.write(newtext.encode('utf-8')) class PageArchiver(object): @@ -390,6 +481,7 @@ } self.archives = {} self.archivedThreads = 0 + self.monthNum2origNames, self.origNames2monthNum = Months.monthsDicts def get(self, attr, default=''): return self.attributes.get(attr,[default])[0] @@ -464,14 +556,13 @@ why = t.shouldBeArchived(self) if why: archive = self.get('archive') - TStuple = time.gmtime(t.timestamp) 
vars = { - 'counter' : archCounter, - 'year' : TStuple[0], - 'month' : TStuple[1], - 'monthname' : int2month(TStuple[1]), - 'monthnameshort' : int2month_short(TStuple[1]), - 'week' : int(time.strftime('%W',TStuple)), + 'counter': archCounter, + 'year': t.timestamp.year, + 'month': t.timestamp.month, + 'monthname': self.monthNum2origNames[t.timestamp.month]['long'], + 'monthnameshort': self.monthNum2origNames[t.timestamp.month]['short'], + 'week': int(time.strftime('%W', t.timestamp.timetuple())), } archive = archive % vars if self.feedArchive(archive,t,maxArchSize,vars): @@ -584,6 +675,9 @@ pywikibot.output(u'NOTE: you must specify a template to run the bot') pywikibot.showHelp('archivebot') return + + #query site for original months name and create convenience look-up dictionaries + Months.updateMonths(site=Site) for a in args: pagelist = [] -- To view, visit https://gerrit.wikimedia.org/r/94583 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iede5165fd36b8e5747db032183094fa11177b037 Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/compat Gerrit-Branch: master Gerrit-Owner: Mpaa <mpaa.w...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits