[MediaWiki-commits] [Gerrit] download only when needed - change (analytics/zero-sms)

2014-09-12 Thread Yurik (Code Review)
Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/159976

Change subject: download only when needed
..

download only when needed

Change-Id: Ie36b9be8881f86d02d7d057ebffc847a48f2ab4c
---
M scripts/smslogs.py
M scripts/weblogs.py
2 files changed, 36 insertions(+), 15 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms refs/changes/76/159976/1
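
For context: the core of this change is that download() now returns a flag saying whether any new file was actually fetched, and run() uses it to skip the combine/graph steps when there is nothing new to process. A minimal sketch of that control flow, assuming the method names from the diff (the Processor class, its constructor, and the stub bodies below are hypothetical stand-ins, not the real smslogs.py code):

import os

class Processor(object):
    # Hypothetical stand-in for the smslogs.py processor class.
    def __init__(self, combinedFilePath, enableDownload=True):
        self.combinedFilePath = combinedFilePath
        self.enableDownload = enableDownload

    def download(self):
        # Fetch remote files; per the diff, return True only when at
        # least one new file was actually written to disk.
        return False  # stub: pretend nothing new arrived

    def combineDataFiles(self):
        print('combining files...')

    def generateGraphData(self):
        print('generating graphs...')

    def run(self):
        newDataFound = True  # downloads disabled => always reprocess
        if self.enableDownload:
            newDataFound = self.download()
        # Skip reprocessing only when nothing new arrived AND the
        # combined output from a previous run already exists.
        if not newDataFound and os.path.isfile(self.combinedFilePath):
            print('No new data, we are done')
        else:
            self.combineDataFiles()
            self.generateGraphData()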

diff --git a/scripts/smslogs.py b/scripts/smslogs.py
index 40cadf9..7f01c2e 100644
--- a/scripts/smslogs.py
+++ b/scripts/smslogs.py
@@ -112,6 +112,7 @@
 
         bucket = cn.get_bucket(self.settings.awsBucket)
         files = bucket.list(self.settings.awsPrefix)
+        newDataFound = False
 
         for key in files:
             filename = key.key[len(self.settings.awsPrefix):]
@@ -160,8 +161,15 @@
                 key.get_contents_to_filename(filePath)
             if fileDate and (not self.settings.lastDownloadTs or self.settings.lastDownloadTs < fileDate):
                 self.settings.lastDownloadTs = fileDate
+                newDataFound = True
 
-    def combineDataFiles(self, sourceFiles):
+        return newDataFound
+
+    def combineDataFiles(self):
+
+        sourceFiles = os.listdir(self.pathLogs)
+        # files = itertools.chain([os.path.join('pc', f) for f in os.listdir(os.path.join(self.pathLogs, 'pc'))],
+        #                         files)
 
         safePrint(u'Combining files into %s' % self.combinedFilePath)
         if self.processIfAfter:
@@ -169,11 +177,11 @@
         else:
             safePrint(u'Processing all files')
 
-        appendingDataFile = self.combinedFilePath + '.tmp'
+        tempFile = self.combinedFilePath + '.tmp'
         manualLogRe = re.compile(r'^wikipedia_application_\d+\.log\.\d+\.gz:')
 
         totalCount = 0
-        with io.open(appendingDataFile, 'w', encoding='utf8') as dst:
+        with io.open(tempFile, 'w', encoding='utf8') as dst:
             for srcFile in sourceFiles:
 
                 fileDate = self.getFileDate(srcFile)
@@ -216,7 +224,7 @@
         if os.path.exists(sortedOutputFile):
             os.remove(sortedOutputFile)
 
-        args = [self.settings.sortCmd, '-u', '-o', sortedOutputFile, appendingDataFile]
+        args = [self.settings.sortCmd, '-u', '-o', sortedOutputFile, tempFile]
         originalExists = os.path.exists(self.combinedFilePath)
         if originalExists:
             args.append(self.combinedFilePath)
@@ -239,7 +247,7 @@
         except subprocess.CalledProcessError, ex:
             raise Exception(u'Error %s running %s\nOutput:\n%s' % (ex.returncode, cmd, ex.output))
 
-        os.remove(appendingDataFile)
+        os.remove(tempFile)
 
     def generateGraphData(self, skipParsing=False):
         stats = smsgraphs.Stats(self.combinedFilePath, self.pathGraphs, self.statsFilePath, self.settings.partnerMap,
@@ -257,13 +265,15 @@
         stats.createGraphs()
 
     def run(self):
+        newDataFound = True
         if self.settings.enableDownload:
-            self.download()
-        files = os.listdir(self.pathLogs)
-        files = itertools.chain([os.path.join('pc', f) for f in os.listdir(os.path.join(self.pathLogs, 'pc'))],
-                                files)
-        self.combineDataFiles(files)
-        self.generateGraphData()
+            newDataFound = self.download()
+
+        if not newDataFound and os.path.isfile(self.combinedFilePath):
+            safePrint('No new data, we are done')
+        else:
+            self.combineDataFiles()
+            self.generateGraphData()
 
 
 if __name__ == "__main__":
diff --git a/scripts/weblogs.py b/scripts/weblogs.py
index 4abb5b0..7e5453a 100644
--- a/scripts/weblogs.py
+++ b/scripts/weblogs.py
@@ -36,6 +36,7 @@
         self.urlRe = re.compile(r'^https?://([^/]+)', re.IGNORECASE)
         self.duplUrlRe = re.compile(r'^(https?://.+)\1', re.IGNORECASE)
         self.zcmdRe = re.compile(r'zcmd=([-a-z0-9]+)', re.IGNORECASE)
+        self.combinedFile = os.path.join(self.pathGraphs, 'combined-all.tsv')
 
     def defaultSettings(self, suffix):
         s = super(WebLogProcessor, self).defaultSettings(suffix)
@@ -68,6 +69,7 @@
 
     def processLogFiles(self):
 
+        newDataFound = False
         safePrint('Processing log files')
         statFiles = {}
         for f in os.listdir(self.pathLogs):
@@ -82,6 +84,7 @@
                 fileDt = m.group(1)
                 fileDt = '-'.join([fileDt[0:4], fileDt[4:6], fileDt[6:8]])
                 self.processLogFile(logFile, statFile, fileDt)
+                newDataFound = True
 
         # Clean up older stat files (if gz file size has changed)
         removeFiles = []
@@ -96,6 +99,8 @@
                 removeFiles.append(statFile)
         for f in removeFiles:
             os.remove(f)
+
+        return newDataFound
 
     def processLogFile(self, logFile, statFile, fileDt):
         """
@@ -272,7 +277,10 @@
             stats[key] += int(count)
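
The weblogs.py half of the change follows the same pattern: processLogFiles() now reports whether any log file actually needed processing, so callers can short-circuit the same way run() does above. A rough sketch of that shape, assuming a stat file is regenerated only when missing (the directory layout and naming below are hypothetical, not the real weblogs.py logic):

import os

def processLogFiles(pathLogs, pathStats, processLogFile):
    # Process every gzipped log that lacks a stat file;
    # report whether anything new was processed.
    newDataFound = False
    for f in os.listdir(pathLogs):
        if not f.endswith('.gz'):
            continue
        logFile = os.path.join(pathLogs, f)
        statFile = os.path.join(pathStats, f + '.tsv')  # hypothetical naming
        if not os.path.exists(statFile):
            processLogFile(logFile, statFile)
            newDataFound = True
    return newDataFound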

[MediaWiki-commits] [Gerrit] download only when needed - change (analytics/zero-sms)

2014-09-12 Thread Yurik (Code Review)
Yurik has submitted this change and it was merged.

Change subject: download only when needed
..


download only when needed

Change-Id: Ie36b9be8881f86d02d7d057ebffc847a48f2ab4c
---
M scripts/smslogs.py
M scripts/weblogs.py
2 files changed, 36 insertions(+), 15 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved


