Mforns has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/200239

Change subject: [WIP] Add support for wiki explosion and others.
......................................................................

[WIP] Add support for wiki explosion and others.

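This patch set switches the report output format from CSV to TSV:
output files are now read and written with a tab delimiter and a
.tsv extension, the command-line help text is updated, and the test
fixtures and assertions are converted to match.

Thorough description coming soon...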

Bug: https://phabricator.wikimedia.org/T89251
Change-Id: Idc486cb934f3cc15c061af39c376621c5d5823a6
---
M reportupdater/update_reports.py
M reportupdater/utils.py
M reportupdater/writer.py
D test/fixtures/output/reader_test.csv
A test/fixtures/output/reader_test.tsv
D test/fixtures/output/reader_test_error.csv
A test/fixtures/output/reader_test_error.tsv
D test/fixtures/output/selector_test1.csv
A test/fixtures/output/selector_test1.tsv
D test/fixtures/output/selector_test2.csv
A test/fixtures/output/selector_test2.tsv
D test/fixtures/output/writer_test1.csv
A test/fixtures/output/writer_test1.tsv
M test/reportupdater_test.py
M test/selector_test.py
M test/writer_test.py
16 files changed, 66 insertions(+), 66 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/limn-mobile-data 
refs/changes/39/200239/1

diff --git a/reportupdater/update_reports.py b/reportupdater/update_reports.py
index afb89e8..45b9a3b 100755
--- a/reportupdater/update_reports.py
+++ b/reportupdater/update_reports.py
@@ -15,10 +15,10 @@
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Write/Update SQL reports 
into CSV files.')
+    parser = argparse.ArgumentParser(description='Write/Update SQL reports 
into TSV files.')
     parser.add_argument('config_path', help='Yaml configuration file path.')
     parser.add_argument('sql_folder', help='Folder with *.sql files.')
-    parser.add_argument('output_folder', help='Folder to write the *.csv files 
to.')
+    parser.add_argument('output_folder', help='Folder to write the TSV files 
to.')
     parser.add_argument('-l', '--log-level', 
help='(debug|info|warning|error|critical)')
     args = vars(parser.parse_args())
     if 'log_level' in args:
diff --git a/reportupdater/utils.py b/reportupdater/utils.py
index 9c76348..ba4b0fb 100644
--- a/reportupdater/utils.py
+++ b/reportupdater/utils.py
@@ -25,17 +25,17 @@
     # Reads a report file to get its results
     # and returns them in the expected dict(date->row) format.
     previous_results = {'header': [], 'data': {}}
-    output_file_path = os.path.join(output_folder, report.key + '.csv')
+    output_file_path = os.path.join(output_folder, report.key + '.tsv')
     if os.path.exists(output_file_path):
         try:
             with io.open(output_file_path, encoding='utf-8') as output_file:
-                rows = list(csv.reader(output_file))
+                rows = list(csv.reader(output_file, delimiter='\t'))
         except IOError, e:
             raise IOError('Could not read the output file (' + str(e) + ').')
         header = []
         if report.is_funnel:
             # If the report is for a funnel visualization,
-            # one same date may contain several lines in the csv.
+            # one same date may contain several lines in the tsv.
             # So, all lines for the same date, are listed in the
             # same dict entry under the date key.
             data = defaultdict(list)
diff --git a/reportupdater/writer.py b/reportupdater/writer.py
index cba2219..3327b60 100644
--- a/reportupdater/writer.py
+++ b/reportupdater/writer.py
@@ -59,7 +59,7 @@
         rows = [data[date] for date in dates]
         if report.is_funnel:
             rows = [row for sublist in rows for row in sublist]  # flatten
-        output_path = os.path.join(self.config['output_folder'], report.key + 
'.csv')
+        output_path = os.path.join(self.config['output_folder'], report.key + 
'.tsv')
         temp_output_path = output_path + '.tmp'
 
         try:
@@ -67,11 +67,11 @@
             temp_output_file = io.open(temp_output_path, 'wb')
         except Exception, e:
             raise RuntimeError('Could not open the temporary output file (' + 
str(e) + ').')
-        csv_writer = csv.writer(temp_output_file)
-        csv_writer.writerow(header)
+        tsv_writer = csv.writer(temp_output_file, delimiter='\t')
+        tsv_writer.writerow(header)
         for row in rows:
             row[0] = row[0].strftime(DATE_FORMAT)
-            csv_writer.writerow(row)
+            tsv_writer.writerow(row)
         temp_output_file.close()
         try:
             os.rename(temp_output_path, output_path)
diff --git a/test/fixtures/output/reader_test.csv 
b/test/fixtures/output/reader_test.csv
deleted file mode 100644
index 8a362e4..0000000
--- a/test/fixtures/output/reader_test.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-date,value
-2015-01-01,1
-2015-01-02,2
-2015-01-03,3
diff --git a/test/fixtures/output/reader_test.tsv 
b/test/fixtures/output/reader_test.tsv
new file mode 100644
index 0000000..e86e4af
--- /dev/null
+++ b/test/fixtures/output/reader_test.tsv
@@ -0,0 +1,4 @@
+date   value
+2015-01-01     1
+2015-01-02     2
+2015-01-03     3
diff --git a/test/fixtures/output/reader_test_error.csv 
b/test/fixtures/output/reader_test_error.csv
deleted file mode 100644
index b15c649..0000000
--- a/test/fixtures/output/reader_test_error.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-date,value
-2015-01-01,1
-badFormatedDate,2
-2015-01-03,3
diff --git a/test/fixtures/output/reader_test_error.tsv 
b/test/fixtures/output/reader_test_error.tsv
new file mode 100644
index 0000000..eb077cc
--- /dev/null
+++ b/test/fixtures/output/reader_test_error.tsv
@@ -0,0 +1,4 @@
+date   value
+2015-01-01     1
+badFormatedDate        2
+2015-01-03     3
diff --git a/test/fixtures/output/selector_test1.csv 
b/test/fixtures/output/selector_test1.csv
deleted file mode 100644
index 2f0e4f3..0000000
--- a/test/fixtures/output/selector_test1.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-date,value
-2015-01-01,a
diff --git a/test/fixtures/output/selector_test1.tsv 
b/test/fixtures/output/selector_test1.tsv
new file mode 100644
index 0000000..8bc098c
--- /dev/null
+++ b/test/fixtures/output/selector_test1.tsv
@@ -0,0 +1,2 @@
+date   value
+2015-01-01     a
diff --git a/test/fixtures/output/selector_test2.csv 
b/test/fixtures/output/selector_test2.csv
deleted file mode 100644
index d1828c0..0000000
--- a/test/fixtures/output/selector_test2.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-date,value
-2015-01-01,a
-2015-01-02,b
diff --git a/test/fixtures/output/selector_test2.tsv 
b/test/fixtures/output/selector_test2.tsv
new file mode 100644
index 0000000..6197b46
--- /dev/null
+++ b/test/fixtures/output/selector_test2.tsv
@@ -0,0 +1,3 @@
+date   value
+2015-01-01     a
+2015-01-02     b
diff --git a/test/fixtures/output/writer_test1.csv 
b/test/fixtures/output/writer_test1.csv
deleted file mode 100644
index f0e10cc..0000000
--- a/test/fixtures/output/writer_test1.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-date,val1,val2,val3
-2015-01-01,1,2,3
diff --git a/test/fixtures/output/writer_test1.tsv 
b/test/fixtures/output/writer_test1.tsv
new file mode 100644
index 0000000..5d55b2b
--- /dev/null
+++ b/test/fixtures/output/writer_test1.tsv
@@ -0,0 +1,2 @@
+date   val1    val2    val3
+2015-01-01     1       2       3
diff --git a/test/reportupdater_test.py b/test/reportupdater_test.py
index cc9e2c6..ae85358 100644
--- a/test/reportupdater_test.py
+++ b/test/reportupdater_test.py
@@ -48,7 +48,7 @@
         )
         # The report should not be computed because it has already been 
computed
         # within this hour. So the output file should not exist.
-        output_path = os.path.join(self.output_folder, 
'reportupdater_test1.csv')
+        output_path = os.path.join(self.output_folder, 
'reportupdater_test1.tsv')
         self.assertFalse(os.path.exists(output_path))
 
 
@@ -64,7 +64,7 @@
         )
         # The report should not be computed because it has already been 
computed
         # within this day. So the output file should not exist.
-        output_path = os.path.join(self.output_folder, 
'reportupdater_test2.csv')
+        output_path = os.path.join(self.output_folder, 
'reportupdater_test2.tsv')
         self.assertFalse(os.path.exists(output_path))
 
 
@@ -84,7 +84,7 @@
 
         # The first thread should execute normally and output the results.
         history_path1 = 'test/fixtures/reportupdater_test1.history'
-        output_path1 = os.path.join(self.output_folder, 
'reportupdater_test1.csv')
+        output_path1 = os.path.join(self.output_folder, 
'reportupdater_test1.tsv')
         self.paths_to_clean.extend([history_path1, output_path1])
         args1 = {
             'config_path': os.path.join(self.config_folder, 
'reportupdater_test1.yaml'),
@@ -102,7 +102,7 @@
         # the frequency control does not discard this thread.
         time.sleep(0.1)
         history_path2 = 'test/fixtures/reportupdater_test2.history'
-        output_path2 = os.path.join(self.output_folder, 
'reportupdater_test2.csv')
+        output_path2 = os.path.join(self.output_folder, 
'reportupdater_test2.tsv')
         self.paths_to_clean.extend([history_path2, output_path2])
         args2 = {
             'config_path': os.path.join(self.config_folder, 
'reportupdater_test2.yaml'),
@@ -115,10 +115,10 @@
 
         # wait for the threads to finish and assert results
         thread1.join()
-        output_path1 = os.path.join(self.output_folder, 
'reportupdater_test1.csv')
+        output_path1 = os.path.join(self.output_folder, 
'reportupdater_test1.tsv')
         self.assertTrue(os.path.exists(output_path1))
         thread2.join()
-        output_path2 = os.path.join(self.output_folder, 
'reportupdater_test2.csv')
+        output_path2 = os.path.join(self.output_folder, 
'reportupdater_test2.tsv')
         self.assertFalse(os.path.exists(output_path2))
 
 
@@ -139,7 +139,7 @@
         MySQLdb.connect = MagicMock(return_value=connection_mock)
 
         config_path = os.path.join(self.config_folder, 
'reportupdater_test1.yaml')
-        output_path = os.path.join(self.output_folder, 
'reportupdater_test1.csv')
+        output_path = os.path.join(self.output_folder, 
'reportupdater_test1.tsv')
         history_path = 'test/fixtures/reportupdater_test1.history'
         self.paths_to_clean.extend([output_path, history_path])
         reportupdater.run(
@@ -153,12 +153,12 @@
             output_lines = output_file.readlines()
         self.assertTrue(len(output_lines) > 1)
         header = output_lines.pop(0).strip()
-        self.assertEqual(header, 'date,value')
+        self.assertEqual(header, 'date\tvalue')
         # Assert that all lines hold subsequent values.
         expected_date = datetime(2015, 1, 1)
         expected_value = 1
         for line in output_lines:
-            expected_line = expected_date.strftime(DATE_FORMAT) + ',' + 
str(expected_value)
+            expected_line = expected_date.strftime(DATE_FORMAT) + '\t' + 
str(expected_value)
             self.assertEqual(line.strip(), expected_line)
             expected_date += relativedelta(days=+1)
             expected_value += 1
@@ -182,7 +182,7 @@
         MySQLdb.connect = MagicMock(return_value=connection_mock)
 
         config_path = os.path.join(self.config_folder, 
'reportupdater_test3.yaml')
-        output_path = os.path.join(self.output_folder, 
'reportupdater_test3.csv')
+        output_path = os.path.join(self.output_folder, 
'reportupdater_test3.tsv')
         history_path = 'test/fixtures/reportupdater_test3.history'
         self.paths_to_clean.extend([output_path, history_path])
         reportupdater.run(
@@ -196,12 +196,12 @@
             output_lines = output_file.readlines()
         self.assertTrue(len(output_lines) > 1)
         header = output_lines.pop(0).strip()
-        self.assertEqual(header, 'date,value')
+        self.assertEqual(header, 'date\tvalue')
         # Assert that all lines hold subsequent values.
         expected_date = datetime(2015, 1, 1)
         expected_value = 1
         for line in output_lines:
-            expected_line = expected_date.strftime(DATE_FORMAT) + ',' + 
str(expected_value)
+            expected_line = expected_date.strftime(DATE_FORMAT) + '\t' + 
str(expected_value)
             self.assertEqual(line.strip(), expected_line)
             if expected_value < 3:
                 expected_value += 1
@@ -228,10 +228,10 @@
         MySQLdb.connect = MagicMock(return_value=connection_mock)
 
         config_path = os.path.join(self.config_folder, 
'reportupdater_test2.yaml')
-        output_path = os.path.join(self.output_folder, 
'reportupdater_test2.csv')
+        output_path = os.path.join(self.output_folder, 
'reportupdater_test2.tsv')
         history_path = 'test/fixtures/reportupdater_test2.history'
         with io.open(output_path, 'w') as output_file:
-            
output_file.write(unicode('date,value\n2015-01-01,1\n2015-02-01,2\n'))
+            
output_file.write(unicode('date\tvalue\n2015-01-01\t1\n2015-02-01\t2\n'))
         self.paths_to_clean.extend([output_path, history_path])
         reportupdater.run(
             config_path=config_path,
@@ -244,12 +244,12 @@
             output_lines = output_file.readlines()
         self.assertTrue(len(output_lines) > 1)
         header = output_lines.pop(0).strip()
-        self.assertEqual(header, 'date,value')
+        self.assertEqual(header, 'date\tvalue')
         # Assert that all lines hold subsequent values.
         expected_date = datetime(2015, 1, 1)
         expected_value = 1
         for line in output_lines:
-            expected_line = expected_date.strftime(DATE_FORMAT) + ',' + 
str(expected_value)
+            expected_line = expected_date.strftime(DATE_FORMAT) + '\t' + 
str(expected_value)
             self.assertEqual(line.strip(), expected_line)
             expected_date += relativedelta(months=+1)
             expected_value += 1
diff --git a/test/selector_test.py b/test/selector_test.py
index 572f6aa..92af297 100644
--- a/test/selector_test.py
+++ b/test/selector_test.py
@@ -70,7 +70,7 @@
 
 
     def test_get_interval_reports_when_previous_results_is_empty(self):
-        # Note no previous results csv exists for default report.
+        # Note no previous results tsv exists for default report.
         now = datetime(2015, 1, 2)
         reports = list(self.selector.get_interval_reports(self.report, now))
         self.assertEqual(len(reports), 2)
@@ -82,7 +82,7 @@
 
     def test_get_interval_reports_when_previous_results_has_some_dates(self):
         self.report.key = 'selector_test1'
-        # see: test/fixtures/output/selector_test1.csv
+        # see: test/fixtures/output/selector_test1.tsv
         now = datetime(2015, 1, 2)
         reports = list(self.selector.get_interval_reports(self.report, now))
         self.assertEqual(len(reports), 1)
@@ -92,7 +92,7 @@
 
     def test_get_interval_reports_when_previous_results_has_all_dates(self):
         self.report.key = 'selector_test2'
-        # see: test/fixtures/output/selector_test2.csv
+        # see: test/fixtures/output/selector_test2.tsv
         now = datetime(2015, 1, 2)
         reports = list(self.selector.get_interval_reports(self.report, now))
         self.assertEqual(len(reports), 1)
diff --git a/test/writer_test.py b/test/writer_test.py
index 04803c1..43b6055 100644
--- a/test/writer_test.py
+++ b/test/writer_test.py
@@ -40,7 +40,7 @@
 
     def tearDown(self):
         try:
-            os.remove('test/fixtures/output/writer_test.csv')
+            os.remove('test/fixtures/output/writer_test.tsv')
         except:
             pass
         io.open = self.io_open_stash
@@ -75,11 +75,11 @@
         data = {}
         output_folder = self.config['output_folder']
         self.writer.write_results(header, data, self.report, output_folder)
-        output_path = os.path.join(output_folder, self.report.key + '.csv')
+        output_path = os.path.join(output_folder, self.report.key + '.tsv')
         self.paths_to_clean.append(output_path)
         with io.open(output_path, 'r', encoding='utf-8') as output_file:
             output = output_file.read().strip()
-        self.assertEqual(output, 'date,value')
+        self.assertEqual(output, 'date\tvalue')
 
 
     def test_write_results_with_funnel_data(self):
@@ -92,17 +92,17 @@
         }
         output_folder = self.config['output_folder']
         self.writer.write_results(header, data, self.report, output_folder)
-        output_path = os.path.join(output_folder, self.report.key + '.csv')
+        output_path = os.path.join(output_folder, self.report.key + '.tsv')
         self.paths_to_clean.append(output_path)
         with io.open(output_path, 'r', encoding='utf-8') as output_file:
             output_lines = output_file.readlines()
         self.assertEqual(len(output_lines), 6)
-        self.assertEqual(output_lines[0], 'date,value\n')
-        self.assertEqual(output_lines[1], '2015-01-01,a\n')
-        self.assertEqual(output_lines[2], '2015-01-01,b\n')
-        self.assertEqual(output_lines[3], '2015-01-02,c\n')
-        self.assertEqual(output_lines[4], '2015-01-02,d\n')
-        self.assertEqual(output_lines[5], '2015-01-03,e\n')
+        self.assertEqual(output_lines[0], 'date\tvalue\n')
+        self.assertEqual(output_lines[1], '2015-01-01\ta\n')
+        self.assertEqual(output_lines[2], '2015-01-01\tb\n')
+        self.assertEqual(output_lines[3], '2015-01-02\tc\n')
+        self.assertEqual(output_lines[4], '2015-01-02\td\n')
+        self.assertEqual(output_lines[5], '2015-01-03\te\n')
 
 
     def test_write_results(self):
@@ -114,19 +114,19 @@
         }
         output_folder = self.config['output_folder']
         self.writer.write_results(header, data, self.report, output_folder)
-        output_path = os.path.join(output_folder, self.report.key + '.csv')
+        output_path = os.path.join(output_folder, self.report.key + '.tsv')
         self.paths_to_clean.append(output_path)
         with io.open(output_path, 'r', encoding='utf-8') as output_file:
             output_lines = output_file.readlines()
         self.assertEqual(len(output_lines), 4)
-        self.assertEqual(output_lines[0], 'date,value\n')
-        self.assertEqual(output_lines[1], '2015-01-01,c\n')
-        self.assertEqual(output_lines[2], '2015-01-02,a\n')
-        self.assertEqual(output_lines[3], '2015-01-03,b\n')
+        self.assertEqual(output_lines[0], 'date\tvalue\n')
+        self.assertEqual(output_lines[1], '2015-01-01\tc\n')
+        self.assertEqual(output_lines[2], '2015-01-02\ta\n')
+        self.assertEqual(output_lines[3], '2015-01-03\tb\n')
 
 
     def test_run_when_previous_results_header_is_empty(self):
-        # self.report has no previous results csv by default setup
+        # self.report has no previous results tsv by default setup
         executed = [self.report]
         self.writer.executor.run = MagicMock(return_value=executed)
         self.writer.write_results = MagicMock()
@@ -142,7 +142,7 @@
     def 
test_run_when_previous_results_header_and_results_header_are_different(self):
         self.report.key = 'writer_test1'
         # previous header will be: ['date', 'val1', 'val2', 'val3']
-        # see: test/fixtures/output/writer_test1.csv
+        # see: test/fixtures/output/writer_test1.tsv
         self.report.results = {
             'header': ['date', 'val4', 'val5'],
             'data': {
@@ -165,12 +165,12 @@
 
     def test_run(self):
         self.report.key = 'writer_test2'
-        output_path = os.path.join(self.config['output_folder'], 
self.report.key + '.csv')
+        output_path = os.path.join(self.config['output_folder'], 
self.report.key + '.tsv')
         self.paths_to_clean.append(output_path)
         # Set up previous results.
         # File can not be a permanent fixture, because it is overwritten be 
the test.
         with io.open(output_path, 'w') as output_file:
-            
output_file.write(unicode('date,value\n2015-01-01,a\n2015-01-02,b\n'))
+            
output_file.write(unicode('date\tvalue\n2015-01-01\ta\n2015-01-02\tb\n'))
         # Set up current results.
         self.report.results['header'] = ['date', 'value']
         self.report.results['data'] = {
@@ -184,7 +184,7 @@
         with io.open(output_path, 'r', encoding='utf-8') as output_file:
             output_lines = output_file.readlines()
         self.assertEqual(len(output_lines), 4)
-        self.assertEqual(output_lines[0], 'date,value\n')
-        self.assertEqual(output_lines[1], '2015-01-01,a\n')
-        self.assertEqual(output_lines[2], '2015-01-02,b\n')
-        self.assertEqual(output_lines[3], '2015-01-03,c\n')
+        self.assertEqual(output_lines[0], 'date\tvalue\n')
+        self.assertEqual(output_lines[1], '2015-01-01\ta\n')
+        self.assertEqual(output_lines[2], '2015-01-02\tb\n')
+        self.assertEqual(output_lines[3], '2015-01-03\tc\n')

-- 
To view, visit https://gerrit.wikimedia.org/r/200239
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idc486cb934f3cc15c061af39c376621c5d5823a6
Gerrit-PatchSet: 1
Gerrit-Project: analytics/limn-mobile-data
Gerrit-Branch: master
Gerrit-Owner: Mforns <mfo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to