Mforns has uploaded a new change for review. https://gerrit.wikimedia.org/r/200239
Change subject: [WIP] Add support for wiki explosion and others. ...................................................................... [WIP] Add support for wiki explosion and others. Thorough description coming soon... Bug: https://phabricator.wikimedia.org/T89251 Change-Id: Idc486cb934f3cc15c061af39c376621c5d5823a6 --- M reportupdater/update_reports.py M reportupdater/utils.py M reportupdater/writer.py D test/fixtures/output/reader_test.csv A test/fixtures/output/reader_test.tsv D test/fixtures/output/reader_test_error.csv A test/fixtures/output/reader_test_error.tsv D test/fixtures/output/selector_test1.csv A test/fixtures/output/selector_test1.tsv D test/fixtures/output/selector_test2.csv A test/fixtures/output/selector_test2.tsv D test/fixtures/output/writer_test1.csv A test/fixtures/output/writer_test1.tsv M test/reportupdater_test.py M test/selector_test.py M test/writer_test.py 16 files changed, 66 insertions(+), 66 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/limn-mobile-data refs/changes/39/200239/1 diff --git a/reportupdater/update_reports.py b/reportupdater/update_reports.py index afb89e8..45b9a3b 100755 --- a/reportupdater/update_reports.py +++ b/reportupdater/update_reports.py @@ -15,10 +15,10 @@ def main(): - parser = argparse.ArgumentParser(description='Write/Update SQL reports into CSV files.') + parser = argparse.ArgumentParser(description='Write/Update SQL reports into TSV files.') parser.add_argument('config_path', help='Yaml configuration file path.') parser.add_argument('sql_folder', help='Folder with *.sql files.') - parser.add_argument('output_folder', help='Folder to write the *.csv files to.') + parser.add_argument('output_folder', help='Folder to write the TSV files to.') parser.add_argument('-l', '--log-level', help='(debug|info|warning|error|critical)') args = vars(parser.parse_args()) if 'log_level' in args: diff --git a/reportupdater/utils.py b/reportupdater/utils.py index 9c76348..ba4b0fb 100644 --- a/reportupdater/utils.py +++ b/reportupdater/utils.py @@ -25,17 +25,17 @@ # Reads a report file to get its results # and returns them in the expected dict(date->row) format. previous_results = {'header': [], 'data': {}} - output_file_path = os.path.join(output_folder, report.key + '.csv') + output_file_path = os.path.join(output_folder, report.key + '.tsv') if os.path.exists(output_file_path): try: with io.open(output_file_path, encoding='utf-8') as output_file: - rows = list(csv.reader(output_file)) + rows = list(csv.reader(output_file, delimiter='\t')) except IOError, e: raise IOError('Could not read the output file (' + str(e) + ').') header = [] if report.is_funnel: # If the report is for a funnel visualization, - # one same date may contain several lines in the csv. + # one same date may contain several lines in the tsv. # So, all lines for the same date, are listed in the # same dict entry under the date key. data = defaultdict(list) diff --git a/reportupdater/writer.py b/reportupdater/writer.py index cba2219..3327b60 100644 --- a/reportupdater/writer.py +++ b/reportupdater/writer.py @@ -59,7 +59,7 @@ rows = [data[date] for date in dates] if report.is_funnel: rows = [row for sublist in rows for row in sublist] # flatten - output_path = os.path.join(self.config['output_folder'], report.key + '.csv') + output_path = os.path.join(self.config['output_folder'], report.key + '.tsv') temp_output_path = output_path + '.tmp' try: @@ -67,11 +67,11 @@ temp_output_file = io.open(temp_output_path, 'wb') except Exception, e: raise RuntimeError('Could not open the temporary output file (' + str(e) + ').') - csv_writer = csv.writer(temp_output_file) - csv_writer.writerow(header) + tsv_writer = csv.writer(temp_output_file, delimiter='\t') + tsv_writer.writerow(header) for row in rows: row[0] = row[0].strftime(DATE_FORMAT) - csv_writer.writerow(row) + tsv_writer.writerow(row) temp_output_file.close() try: os.rename(temp_output_path, output_path) diff --git a/test/fixtures/output/reader_test.csv b/test/fixtures/output/reader_test.csv deleted file mode 100644 index 8a362e4..0000000 --- a/test/fixtures/output/reader_test.csv +++ /dev/null @@ -1,4 +0,0 @@ -date,value -2015-01-01,1 -2015-01-02,2 -2015-01-03,3 diff --git a/test/fixtures/output/reader_test.tsv b/test/fixtures/output/reader_test.tsv new file mode 100644 index 0000000..e86e4af --- /dev/null +++ b/test/fixtures/output/reader_test.tsv @@ -0,0 +1,4 @@ +date value +2015-01-01 1 +2015-01-02 2 +2015-01-03 3 diff --git a/test/fixtures/output/reader_test_error.csv b/test/fixtures/output/reader_test_error.csv deleted file mode 100644 index b15c649..0000000 --- a/test/fixtures/output/reader_test_error.csv +++ /dev/null @@ -1,4 +0,0 @@ -date,value -2015-01-01,1 -badFormatedDate,2 -2015-01-03,3 diff --git a/test/fixtures/output/reader_test_error.tsv b/test/fixtures/output/reader_test_error.tsv new file mode 100644 index 0000000..eb077cc --- /dev/null +++ b/test/fixtures/output/reader_test_error.tsv @@ -0,0 +1,4 @@ +date value +2015-01-01 1 +badFormatedDate 2 +2015-01-03 3 diff --git a/test/fixtures/output/selector_test1.csv b/test/fixtures/output/selector_test1.csv deleted file mode 100644 index 2f0e4f3..0000000 --- a/test/fixtures/output/selector_test1.csv +++ /dev/null @@ -1,2 +0,0 @@ -date,value -2015-01-01,a diff --git a/test/fixtures/output/selector_test1.tsv b/test/fixtures/output/selector_test1.tsv new file mode 100644 index 0000000..8bc098c --- /dev/null +++ b/test/fixtures/output/selector_test1.tsv @@ -0,0 +1,2 @@ +date value +2015-01-01 a diff --git a/test/fixtures/output/selector_test2.csv b/test/fixtures/output/selector_test2.csv deleted file mode 100644 index d1828c0..0000000 --- a/test/fixtures/output/selector_test2.csv +++ /dev/null @@ -1,3 +0,0 @@ -date,value -2015-01-01,a -2015-01-02,b diff --git a/test/fixtures/output/selector_test2.tsv b/test/fixtures/output/selector_test2.tsv new file mode 100644 index 0000000..6197b46 --- /dev/null +++ b/test/fixtures/output/selector_test2.tsv @@ -0,0 +1,3 @@ +date value +2015-01-01 a +2015-01-02 b diff --git a/test/fixtures/output/writer_test1.csv b/test/fixtures/output/writer_test1.csv deleted file mode 100644 index f0e10cc..0000000 --- a/test/fixtures/output/writer_test1.csv +++ /dev/null @@ -1,2 +0,0 @@ -date,val1,val2,val3 -2015-01-01,1,2,3 diff --git a/test/fixtures/output/writer_test1.tsv b/test/fixtures/output/writer_test1.tsv new file mode 100644 index 0000000..5d55b2b --- /dev/null +++ b/test/fixtures/output/writer_test1.tsv @@ -0,0 +1,2 @@ +date val1 val2 val3 +2015-01-01 1 2 3 diff --git a/test/reportupdater_test.py b/test/reportupdater_test.py index cc9e2c6..ae85358 100644 --- a/test/reportupdater_test.py +++ b/test/reportupdater_test.py @@ -48,7 +48,7 @@ ) # The report should not be computed because it has already been computed # within this hour. So the output file should not exist. - output_path = os.path.join(self.output_folder, 'reportupdater_test1.csv') + output_path = os.path.join(self.output_folder, 'reportupdater_test1.tsv') self.assertFalse(os.path.exists(output_path)) @@ -64,7 +64,7 @@ ) # The report should not be computed because it has already been computed # within this day. So the output file should not exist. - output_path = os.path.join(self.output_folder, 'reportupdater_test2.csv') + output_path = os.path.join(self.output_folder, 'reportupdater_test2.tsv') self.assertFalse(os.path.exists(output_path)) @@ -84,7 +84,7 @@ # The first thread should execute normally and output the results. history_path1 = 'test/fixtures/reportupdater_test1.history' - output_path1 = os.path.join(self.output_folder, 'reportupdater_test1.csv') + output_path1 = os.path.join(self.output_folder, 'reportupdater_test1.tsv') self.paths_to_clean.extend([history_path1, output_path1]) args1 = { 'config_path': os.path.join(self.config_folder, 'reportupdater_test1.yaml'), @@ -102,7 +102,7 @@ # the frequency control does not discard this thread. time.sleep(0.1) history_path2 = 'test/fixtures/reportupdater_test2.history' - output_path2 = os.path.join(self.output_folder, 'reportupdater_test2.csv') + output_path2 = os.path.join(self.output_folder, 'reportupdater_test2.tsv') self.paths_to_clean.extend([history_path2, output_path2]) args2 = { 'config_path': os.path.join(self.config_folder, 'reportupdater_test2.yaml'), @@ -115,10 +115,10 @@ # wait for the threads to finish and assert results thread1.join() - output_path1 = os.path.join(self.output_folder, 'reportupdater_test1.csv') + output_path1 = os.path.join(self.output_folder, 'reportupdater_test1.tsv') self.assertTrue(os.path.exists(output_path1)) thread2.join() - output_path2 = os.path.join(self.output_folder, 'reportupdater_test2.csv') + output_path2 = os.path.join(self.output_folder, 'reportupdater_test2.tsv') self.assertFalse(os.path.exists(output_path2)) @@ -139,7 +139,7 @@ MySQLdb.connect = MagicMock(return_value=connection_mock) config_path = os.path.join(self.config_folder, 'reportupdater_test1.yaml') - output_path = os.path.join(self.output_folder, 'reportupdater_test1.csv') + output_path = os.path.join(self.output_folder, 'reportupdater_test1.tsv') history_path = 'test/fixtures/reportupdater_test1.history' self.paths_to_clean.extend([output_path, history_path]) reportupdater.run( @@ -153,12 +153,12 @@ output_lines = output_file.readlines() self.assertTrue(len(output_lines) > 1) header = output_lines.pop(0).strip() - self.assertEqual(header, 'date,value') + self.assertEqual(header, 'date\tvalue') # Assert that all lines hold subsequent values. expected_date = datetime(2015, 1, 1) expected_value = 1 for line in output_lines: - expected_line = expected_date.strftime(DATE_FORMAT) + ',' + str(expected_value) + expected_line = expected_date.strftime(DATE_FORMAT) + '\t' + str(expected_value) self.assertEqual(line.strip(), expected_line) expected_date += relativedelta(days=+1) expected_value += 1 @@ -182,7 +182,7 @@ MySQLdb.connect = MagicMock(return_value=connection_mock) config_path = os.path.join(self.config_folder, 'reportupdater_test3.yaml') - output_path = os.path.join(self.output_folder, 'reportupdater_test3.csv') + output_path = os.path.join(self.output_folder, 'reportupdater_test3.tsv') history_path = 'test/fixtures/reportupdater_test3.history' self.paths_to_clean.extend([output_path, history_path]) reportupdater.run( @@ -196,12 +196,12 @@ output_lines = output_file.readlines() self.assertTrue(len(output_lines) > 1) header = output_lines.pop(0).strip() - self.assertEqual(header, 'date,value') + self.assertEqual(header, 'date\tvalue') # Assert that all lines hold subsequent values. expected_date = datetime(2015, 1, 1) expected_value = 1 for line in output_lines: - expected_line = expected_date.strftime(DATE_FORMAT) + ',' + str(expected_value) + expected_line = expected_date.strftime(DATE_FORMAT) + '\t' + str(expected_value) self.assertEqual(line.strip(), expected_line) if expected_value < 3: expected_value += 1 @@ -228,10 +228,10 @@ MySQLdb.connect = MagicMock(return_value=connection_mock) config_path = os.path.join(self.config_folder, 'reportupdater_test2.yaml') - output_path = os.path.join(self.output_folder, 'reportupdater_test2.csv') + output_path = os.path.join(self.output_folder, 'reportupdater_test2.tsv') history_path = 'test/fixtures/reportupdater_test2.history' with io.open(output_path, 'w') as output_file: - output_file.write(unicode('date,value\n2015-01-01,1\n2015-02-01,2\n')) + output_file.write(unicode('date\tvalue\n2015-01-01\t1\n2015-02-01\t2\n')) self.paths_to_clean.extend([output_path, history_path]) reportupdater.run( config_path=config_path, @@ -244,12 +244,12 @@ output_lines = output_file.readlines() self.assertTrue(len(output_lines) > 1) header = output_lines.pop(0).strip() - self.assertEqual(header, 'date,value') + self.assertEqual(header, 'date\tvalue') # Assert that all lines hold subsequent values. expected_date = datetime(2015, 1, 1) expected_value = 1 for line in output_lines: - expected_line = expected_date.strftime(DATE_FORMAT) + ',' + str(expected_value) + expected_line = expected_date.strftime(DATE_FORMAT) + '\t' + str(expected_value) self.assertEqual(line.strip(), expected_line) expected_date += relativedelta(months=+1) expected_value += 1 diff --git a/test/selector_test.py b/test/selector_test.py index 572f6aa..92af297 100644 --- a/test/selector_test.py +++ b/test/selector_test.py @@ -70,7 +70,7 @@ def test_get_interval_reports_when_previous_results_is_empty(self): - # Note no previous results csv exists for default report. + # Note no previous results tsv exists for default report. now = datetime(2015, 1, 2) reports = list(self.selector.get_interval_reports(self.report, now)) self.assertEqual(len(reports), 2) @@ -82,7 +82,7 @@ def test_get_interval_reports_when_previous_results_has_some_dates(self): self.report.key = 'selector_test1' - # see: test/fixtures/output/selector_test1.csv + # see: test/fixtures/output/selector_test1.tsv now = datetime(2015, 1, 2) reports = list(self.selector.get_interval_reports(self.report, now)) self.assertEqual(len(reports), 1) @@ -92,7 +92,7 @@ def test_get_interval_reports_when_previous_results_has_all_dates(self): self.report.key = 'selector_test2' - # see: test/fixtures/output/selector_test2.csv + # see: test/fixtures/output/selector_test2.tsv now = datetime(2015, 1, 2) reports = list(self.selector.get_interval_reports(self.report, now)) self.assertEqual(len(reports), 1) diff --git a/test/writer_test.py b/test/writer_test.py index 04803c1..43b6055 100644 --- a/test/writer_test.py +++ b/test/writer_test.py @@ -40,7 +40,7 @@ def tearDown(self): try: - os.remove('test/fixtures/output/writer_test.csv') + os.remove('test/fixtures/output/writer_test.tsv') except: pass io.open = self.io_open_stash @@ -75,11 +75,11 @@ data = {} output_folder = self.config['output_folder'] self.writer.write_results(header, data, self.report, output_folder) - output_path = os.path.join(output_folder, self.report.key + '.csv') + output_path = os.path.join(output_folder, self.report.key + '.tsv') self.paths_to_clean.append(output_path) with io.open(output_path, 'r', encoding='utf-8') as output_file: output = output_file.read().strip() - self.assertEqual(output, 'date,value') + self.assertEqual(output, 'date\tvalue') def test_write_results_with_funnel_data(self): @@ -92,17 +92,17 @@ } output_folder = self.config['output_folder'] self.writer.write_results(header, data, self.report, output_folder) - output_path = os.path.join(output_folder, self.report.key + '.csv') + output_path = os.path.join(output_folder, self.report.key + '.tsv') self.paths_to_clean.append(output_path) with io.open(output_path, 'r', encoding='utf-8') as output_file: output_lines = output_file.readlines() self.assertEqual(len(output_lines), 6) - self.assertEqual(output_lines[0], 'date,value\n') - self.assertEqual(output_lines[1], '2015-01-01,a\n') - self.assertEqual(output_lines[2], '2015-01-01,b\n') - self.assertEqual(output_lines[3], '2015-01-02,c\n') - self.assertEqual(output_lines[4], '2015-01-02,d\n') - self.assertEqual(output_lines[5], '2015-01-03,e\n') + self.assertEqual(output_lines[0], 'date\tvalue\n') + self.assertEqual(output_lines[1], '2015-01-01\ta\n') + self.assertEqual(output_lines[2], '2015-01-01\tb\n') + self.assertEqual(output_lines[3], '2015-01-02\tc\n') + self.assertEqual(output_lines[4], '2015-01-02\td\n') + self.assertEqual(output_lines[5], '2015-01-03\te\n') def test_write_results(self): @@ -114,19 +114,19 @@ } output_folder = self.config['output_folder'] self.writer.write_results(header, data, self.report, output_folder) - output_path = os.path.join(output_folder, self.report.key + '.csv') + output_path = os.path.join(output_folder, self.report.key + '.tsv') self.paths_to_clean.append(output_path) with io.open(output_path, 'r', encoding='utf-8') as output_file: output_lines = output_file.readlines() self.assertEqual(len(output_lines), 4) - self.assertEqual(output_lines[0], 'date,value\n') - self.assertEqual(output_lines[1], '2015-01-01,c\n') - self.assertEqual(output_lines[2], '2015-01-02,a\n') - self.assertEqual(output_lines[3], '2015-01-03,b\n') + self.assertEqual(output_lines[0], 'date\tvalue\n') + self.assertEqual(output_lines[1], '2015-01-01\tc\n') + self.assertEqual(output_lines[2], '2015-01-02\ta\n') + self.assertEqual(output_lines[3], '2015-01-03\tb\n') def test_run_when_previous_results_header_is_empty(self): - # self.report has no previous results csv by default setup + # self.report has no previous results tsv by default setup executed = [self.report] self.writer.executor.run = MagicMock(return_value=executed) self.writer.write_results = MagicMock() @@ -142,7 +142,7 @@ def test_run_when_previous_results_header_and_results_header_are_different(self): self.report.key = 'writer_test1' # previous header will be: ['date', 'val1', 'val2', 'val3'] - # see: test/fixtures/output/writer_test1.csv + # see: test/fixtures/output/writer_test1.tsv self.report.results = { 'header': ['date', 'val4', 'val5'], 'data': { @@ -165,12 +165,12 @@ def test_run(self): self.report.key = 'writer_test2' - output_path = os.path.join(self.config['output_folder'], self.report.key + '.csv') + output_path = os.path.join(self.config['output_folder'], self.report.key + '.tsv') self.paths_to_clean.append(output_path) # Set up previous results. # File can not be a permanent fixture, because it is overwritten be the test. with io.open(output_path, 'w') as output_file: - output_file.write(unicode('date,value\n2015-01-01,a\n2015-01-02,b\n')) + output_file.write(unicode('date\tvalue\n2015-01-01\ta\n2015-01-02\tb\n')) # Set up current results. self.report.results['header'] = ['date', 'value'] self.report.results['data'] = { @@ -184,7 +184,7 @@ with io.open(output_path, 'r', encoding='utf-8') as output_file: output_lines = output_file.readlines() self.assertEqual(len(output_lines), 4) - self.assertEqual(output_lines[0], 'date,value\n') - self.assertEqual(output_lines[1], '2015-01-01,a\n') - self.assertEqual(output_lines[2], '2015-01-02,b\n') - self.assertEqual(output_lines[3], '2015-01-03,c\n') + self.assertEqual(output_lines[0], 'date\tvalue\n') + self.assertEqual(output_lines[1], '2015-01-01\ta\n') + self.assertEqual(output_lines[2], '2015-01-02\tb\n') + self.assertEqual(output_lines[3], '2015-01-03\tc\n') -- To view, visit https://gerrit.wikimedia.org/r/200239 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idc486cb934f3cc15c061af39c376621c5d5823a6 Gerrit-PatchSet: 1 Gerrit-Project: analytics/limn-mobile-data Gerrit-Branch: master Gerrit-Owner: Mforns <mfo...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits