[MediaWiki-commits] [Gerrit] Refactored to support all languages and timezones. Necessary... - change (pywikibot/compat)

Mpaa (Code Review) Sat, 09 Nov 2013 14:01:53 -0800

Mpaa has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/94583



Change subject: Refactored to support all languages and timezones. Necessary 
info is retrieved directly from the site. Use of locale should no longer be 
necessary.
......................................................................

Refactored to support all languages and timezones.
Necessary info is retrieved directly from the site.
Use of locale should no longer be necessary.

Change-Id: Iede5165fd36b8e5747db032183094fa11177b037
---
M archivebot.py
1 file changed, 215 insertions(+), 121 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/compat 
refs/changes/83/94583/1

diff --git a/archivebot.py b/archivebot.py
index d0f79ee..f4afe3f 100644
--- a/archivebot.py
+++ b/archivebot.py
@@ -36,7 +36,7 @@
 algo                 specifies the maximum age of a thread. Must be in the form
                      old(<delay>) where <delay> specifies the age in hours or
                      days like 24h or 5d.
-                     Default ist old(24h)
+                     Default is old(24h)
 counter              The current value of a counter which could be assigned as
                      variable. Will be actualized by bot. Initial value is 1.
 maxarchivesize       The maximum archive size before incrementing the counter.
@@ -70,7 +70,7 @@
 #
 # (C) Misza13, 2006-2010
 # (C) xqt, 2009-2012
-# (C) Pywikipedia bot team, 2007-2012
+# (C) Pywikipedia bot team, 2007-2013
 #
 # Distributed under the terms of the MIT license.
 #
@@ -78,10 +78,17 @@
 #
 import wikipedia as pywikibot
 from pywikibot import i18n
-import pagegenerators, query
-Site = pywikibot.getSite()
+import pagegenerators
+import query
+import dateutil.tz
+import datetime
+import os
+import re
+import time
+import locale
+import traceback
 
-import os, re, time, locale, traceback, string, urllib, unicodedata
+Site = pywikibot.getSite()
 
 try: #Get a constructor for the MD5 hash object
     import hashlib
@@ -112,19 +119,23 @@
 class ArchiveSecurityError(pywikibot.Error):
     """Archive is not a subpage of page being archived and key not specified
     (or incorrect)."""
+    
+class TimeZoneNotFound(pywikibot.Error):
+    """Timezone not found by dateutil.tz.gettz(), either using tzone stripped 
+    from text or from site.siteinfo."""
 
 
 def str2time(str):
     """Accepts a string defining a time period:
     7d - 7 days
     36h - 36 hours
-    Returns the corresponding time, measured in seconds."""
+    Returns the corresponding timedelta object."""
     if str[-1] == 'd':
-        return int(str[:-1])*24*3600
+        return datetime.timedelta(days=int(str[:-1]))
     elif str[-1] == 'h':
-        return int(str[:-1])*3600
+        return datetime.timedelta(hours=int(str[:-1]))
     else:
-        return int(str)
+        return datetime.timedelta(seconds=int(str))
 
 def str2size(str):
     """Accepts a string defining a size:
@@ -133,7 +144,7 @@
     2M - 2 megabytes
     Returns a tuple (size,unit), where size is an integer and unit is
     'B' (bytes) or 'T' (threads)."""
-    if str[-1] in string.digits: #TODO: de-uglify
+    if str[-1].isdigit(): #TODO: de-uglify
         return (int(str),'B')
     elif str[-1] in ['K', 'k']:
         return (int(str[:-1])*1024,'B')
@@ -143,36 +154,6 @@
         return (int(str[:-1]),'T')
     else:
         return (int(str[:-1])*1024,'B')
-
-def int2month(num):
-    """Returns the locale's full name of month 'num' (1-12)."""
-    if hasattr(locale, 'nl_langinfo'):
-        return locale.nl_langinfo(locale.MON_1+num-1).decode('utf-8')
-    Months = ['january', 'february', 'march', 'april', 'may_long', 'june',
-              'july', 'august', 'september', 'october', 'november', 'december']
-    return Site.mediawiki_message(Months[num-1])
-
-def int2month_short(num):
-    """Returns the locale's abbreviated name of month 'num' (1-12)."""
-    if hasattr(locale, 'nl_langinfo'):
-        #filter out non-alpha characters
-        return ''.join([c for c in 
locale.nl_langinfo(locale.ABMON_1+num-1).decode('utf-8') if c.isalpha()])
-    Months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
-              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
-    return Site.mediawiki_message(Months[num-1])
-
-def txt2timestamp(txt, format):
-    """Attempts to convert the timestamp 'txt' according to given 'format'.
-    On success, returns the time tuple; on failure, returns None."""
-##    print txt, format
-    try:
-        return time.strptime(txt,format)
-    except ValueError:
-        try:
-            return time.strptime(txt.encode('utf8'),format)
-        except:
-            pass
-        return None
 
 def generateTransclusions(Site, template, namespaces=[]):
     pywikibot.output(u'Fetching template transclusions...')
@@ -184,6 +165,165 @@
     for page in gen:
         yield page
 
+class Months(object):
+    """
+    Generation of look-up dictionaries for months, used by TimeStripper and 
PageArchiver
+    """
+
+    def __init__(self, site=None):
+        if site is None:
+            self.site = pywikibot.getSite()
+
+    @classmethod
+    def queryMonths(self):
+        months_long = ['january', 'february', 'march', 'april', 'may_long', 
'june',
+                       'july', 'august', 'september', 'october', 'november', 
'december']
+        months_short = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+                        'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
+
+        #one query instead of multiple queries using site.mediawiki_message()
+        #can be refactored to use site.mediawiki_message()
+        params = {
+            'action'    : 'query',
+            'meta'      : 'allmessages',
+            'ammessages': '|'.join(months_long) + '|' + '|'.join(months_short),
+            'amlang'    : self.site.lang,
+            }
+    
+        monthsDict = query.GetData(params)['query']['allmessages']
+    
+        #d[1:12] = {'short': 'orig_short', 'long': 'orig_long'}
+        monthNum2origNames = {i: {'short': '', 'long': ''} for i in range(1, 
13)}
+        origNames2monthNum = dict()
+        
+        for el in monthsDict:
+            orig, eng = el.values()
+            try:
+                month_num = months_long.index(eng) + 1
+                monthNum2origNames[month_num]['long'] = orig
+            except ValueError:
+                month_num = months_short.index(eng) + 1
+                monthNum2origNames[month_num]['short'] = orig
+    
+            origNames2monthNum[orig] = month_num
+    
+        return monthNum2origNames, origNames2monthNum
+
+    @classmethod
+    def updateMonths(self, site=None):
+        if site is None:
+            self.site = pywikibot.getSite()
+        else:
+            self.site = site
+        self.monthsDicts = self.queryMonths()
+
+
+class TimeStripper(object):
+    """
+    Find timestamp in page text and return it as a timezone-aware datetime 
object
+    """
+    
+    def __init__(self):
+        self.monthNum2origNames, self.origNames2monthNum = Months.monthsDicts
+        self.site = Months.site
+
+        self.groups = [u'year', u'month',  u'hour',  u'time', u'day', 
u'minute', u'tzinfo']
+
+        timeR = r'(?P<time>(?P<hour>[0-2]\d)[:\.h](?P<minute>[0-5]\d))'
+        timeznR =  r'\((?P<tzinfo>[A-Z]+)\)'
+        yearR = r'(?P<year>(19|20)\d\d)'
+        monthR = ur'(?P<month>(%s))' % (u'|'.join(self.origNames2monthNum))
+        dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))'
+           
+        self.ptimeR = re.compile(timeR)
+        self.timeznR = re.compile(timeznR)        
+        self.yearR = re.compile(yearR)
+        self.pmonthR = re.compile(monthR, re.U)
+        self.pdayR = re.compile(dayR)
+
+        #order is important to avoid mismatch when searching
+        self.patterns = [self.ptimeR,
+                    self.timeznR,
+                    self.yearR,
+                    self.pmonthR,
+                    self.pdayR,
+                   ]
+
+    def findmarker(self, text, base=u'@@', delta='@'):
+        # find a string which is not part of text 
+        while base in text:
+            base += delta
+        return base
+
+    def last_match_and_replace(self, txt, pat):
+        """
+        Take the rightmost match, to prevent spurious earlier matches, and 
replace with marker
+        """
+        m = None
+        for m in pat.finditer(txt):
+            pass
+
+        if m:
+            marker = self.findmarker(txt)
+            txt = pat.sub(marker, txt)
+            return (txt, m.groupdict())
+        else:
+            return (txt, None)
+
+    def timestripper(self, line):
+        """
+        Find timestamp in line and convert it to time zone aware datetime
+        """
+        _line = line
+        #match date fields
+        dateDict = dict()
+        for pat in self.patterns:
+            line, matchDict = self.last_match_and_replace(line, pat)
+            if matchDict:
+                dateDict.update(matchDict)
+                
+        #all fields matched -> date valid
+        if all(g in dateDict for g in self.groups):
+            #remove 'time' key, now split into hour/minute and not needed by 
datetime
+            del dateDict['time']
+
+            #replace month name in original language with month number
+            try:
+                dateDict['month'] = self.origNames2monthNum[dateDict['month']]
+            except KeyError:
+                pywikibot.output(u'incorrect month name in page')
+            
+            #convert to integers
+            for k, v in dateDict.items():
+                try:
+                    dateDict[k] = int(v)
+                except ValueError:
+                    pass
+
+            #find timezone
+            tzoneText = dateutil.tz.gettz(dateDict['tzinfo'])
+            tzoneSite = dateutil.tz.gettz(self.site.siteinfo()['timezone'])
+            if dateDict['tzinfo'] != 'UTC':
+                #use timezone stripped from text if recognised by 
dateutil.tz.gettz
+                if tzoneText:
+                    dateDict['tzinfo'] = tzoneText
+                #try tzone from site.siteinfo
+                elif tzoneSite:
+                    dateDict['tzinfo'] = tzoneSite
+                #give up
+                else:
+                    raise TimeZoneNotFound(u'Timezone %s or %s not found. 
Please submit a bug.'
+                           % (dateDict['tzinfo'], tzoneSite))
+            else:
+                dateDict['tzinfo'] =  dateutil.tz.tzutc()
+            
+            timestamp = datetime.datetime(**dateDict)
+            
+        else:
+            timestamp = None
+
+        return timestamp
+
 
 class DiscussionThread(object):
     """An object representing a discussion thread on a page, that is something 
of the form:
@@ -194,87 +334,31 @@
     :Reply, etc. ~~~~
     """
 
-    def __init__(self, title):
+    def __init__(self, title, now):
         self.title = title
+        self.now = now
         self.content = ""
+        self.ts = TimeStripper()
         self.timestamp = None
 
     def __repr__(self):
         return '%s("%s",%d bytes)' \
-               % (self.__class__.__name__,self.title,len(self.content))
+               % (self.__class__.__name__, self.title, len(self.content))
 
     def feedLine(self, line):
         if not self.content and not line:
             return
+            
         self.content += line + '\n'
-        #Update timestamp
-# nnwiki:
-# 19:42, 25 mars 2008 (CET)
-# enwiki
-# 16:36, 30 March 2008 (UTC)
-# huwiki
-# 2007. december 8., 13:42 (CET)
-        TM = re.search(r'(\d\d):(\d\d), (\d\d?) (\S+) (\d\d\d\d) \(.*?\)', 
line)
-        if not TM:
-            TM = re.search(r'(\d\d):(\d\d), (\S+) (\d\d?), (\d\d\d\d) 
\(.*?\)', line)
-        if not TM:
-            TM = re.search(r'(\d{4})\. (\S+) (\d\d?)\., (\d\d:\d\d) \(.*?\)', 
line)
-# 18. apr 2006 kl.18:39 (UTC)
-# 4. nov 2006 kl. 20:46 (CET)
-        if not TM:
-            TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kl\.\W*(\d\d):(\d\d) 
\(.*?\)', line)
-#3. joulukuuta 2008 kello 16.26 (EET)
-        if not TM:
-            TM = re.search(r'(\d\d?)\. (\S+) (\d\d\d\d) kello \W*(\d\d).(\d\d) 
\(.*?\)', line)
-        if not TM:
-# 14:23, 12. Jan. 2009 (UTC)
-            pat = re.compile(r'(\d\d):(\d\d), (\d\d?)\. (\S+)\.? (\d\d\d\d) 
\((?:UTC|CES?T)\)')
-            TM = pat.search(line)
-# ro.wiki: 4 august 2012 13:01 (EEST)
-        if not TM:
-            TM = re.search(r'(\d\d?) (\S+) (\d\d\d\d) (\d\d):(\d\d) \(.*?\)', 
line)
-        if TM:
-            # Strip away all diacritics in the Mn ('Mark, non-spacing') 
category
-            # NFD decomposition splits combined characters (e.g. 'ä", LATIN 
SMALL
-            # LETTER A WITH DIAERESIS) into two entities: LATIN SMALL LETTER A
-            # and COMBINING DIAERESIS. The latter falls in the Mn category and 
is
-            # filtered out, resuling in 'a'.
-            _TM = ''.join(c for c in unicodedata.normalize('NFD', TM.group(0))
-                    if unicodedata.category(c) != 'Mn')
+        
+        timestamp = self.ts.timestripper(line)
+        
+        if not self.timestamp: #first time
+            self.timestamp = timestamp
+            
+        if timestamp:
+            self.timestamp = max(self.timestamp, timestamp)
 
-            TIME = txt2timestamp(_TM,"%d. %b %Y kl. %H:%M (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%Y. %B %d., %H:%M (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%d. %b %Y kl.%H:%M (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '', _TM),
-                                     "%H:%M, %d %B %Y")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%H:%M, %d %b %Y (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '', _TM),
-                                     "%H:%M, %d %b %Y")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%H:%M, %b %d %Y (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%H:%M, %B %d %Y (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%H:%M, %b %d, %Y (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%H:%M, %B %d, %Y (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM,"%d. %Bta %Y kello %H.%M (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(_TM, "%d %B %Y %H:%M (%Z)")
-            if not TIME:
-                TIME = txt2timestamp(re.sub(' *\([^ ]+\) *', '', _TM),
-                                     "%H:%M, %d. %b. %Y")
-            if TIME:
-                self.timestamp = max(self.timestamp, time.mktime(TIME))
-##                pywikibot.output(u'Time to be parsed: %s' % TM.group(0))
-##                pywikibot.output(u'Parsed time: %s' % TIME)
-##                pywikibot.output(u'Newest timestamp in thread: %s' % TIME)
 
     def size(self):
         return len(self.title) + len(self.content) + 12
@@ -282,17 +366,20 @@
     def toText(self):
         return "== " + self.title + ' ==\n\n' + self.content
 
-    def shouldBeArchived(self,Archiver):
+    def shouldBeArchived(self, Archiver):
         algo = Archiver.get('algo')
-        reT = re.search(r'^old\((.*)\)$',algo)
+        reT = re.search(r'^old\((.*)\)$', algo)
         if reT:
             if not self.timestamp:
                 return ''
             #TODO: handle this:
                 #return 'unsigned'
             maxage = str2time(reT.group(1))
-            if self.timestamp + maxage < time.time():
-                return message('archivebot-older-than') + ' ' + reT.group(1)
+            try:
+                if self.now - self.timestamp > maxage:
+                    return message('archivebot-older-than') + ' ' + 
reT.group(1)
+            except:
+                import pdb; pdb.set_trace()
         return ''
 
 
@@ -306,6 +393,8 @@
         self.full = False
         self.archiver = archiver
         self.vars = vars
+        self.now = 
datetime.datetime.utcnow().replace(tzinfo=dateutil.tz.tzutc())
+        
         try:
             self.loadPage()
         except pywikibot.NoPage:
@@ -329,7 +418,7 @@
                 found = True #Reading threads now
                 if curThread:
                     self.threads.append(curThread)
-                curThread = DiscussionThread(threadHeader.group(1))
+                curThread = DiscussionThread(threadHeader.group(1), self.now)
             else:
                 if found:
                     curThread.feedLine(line)
@@ -363,6 +452,8 @@
         if self.full:
             summary += ' ' + message('archivebot-archive-full')
         self.put(newtext, comment=summary)
+        with open('x.txt', 'a') as f:
+            f.write(newtext.encode('utf-8'))
 
 
 class PageArchiver(object):
@@ -390,6 +481,7 @@
                 }
         self.archives = {}
         self.archivedThreads = 0
+        self.monthNum2origNames, self.origNames2monthNum = Months.monthsDicts
 
     def get(self, attr, default=''):
         return self.attributes.get(attr,[default])[0]
@@ -464,14 +556,13 @@
             why = t.shouldBeArchived(self)
             if why:
                 archive = self.get('archive')
-                TStuple = time.gmtime(t.timestamp)
                 vars = {
-                        'counter' : archCounter,
-                        'year' : TStuple[0],
-                        'month' : TStuple[1],
-                        'monthname' : int2month(TStuple[1]),
-                        'monthnameshort' : int2month_short(TStuple[1]),
-                        'week' : int(time.strftime('%W',TStuple)),
+                        'counter': archCounter,
+                        'year': t.timestamp.year,
+                        'month': t.timestamp.month,
+                        'monthname': 
self.monthNum2origNames[t.timestamp.month]['long'],
+                        'monthnameshort': 
self.monthNum2origNames[t.timestamp.month]['short'],
+                        'week': int(time.strftime('%W', 
t.timestamp.timetuple())),
                         }
                 archive = archive % vars
                 if self.feedArchive(archive,t,maxArchSize,vars):
@@ -584,6 +675,9 @@
         pywikibot.output(u'NOTE: you must specify a template to run the bot')
         pywikibot.showHelp('archivebot')
         return
+    
+    #query site for original month names and create convenience look-up 
dictionaries
+    Months.updateMonths(site=Site)
 
     for a in args:
         pagelist = []

-- 
To view, visit https://gerrit.wikimedia.org/r/94583
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iede5165fd36b8e5747db032183094fa11177b037
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.w...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Refactored to support all languages and timezones. Necessary... - change (pywikibot/compat)

Reply via email to