Nuria has submitted this change and it was merged. Change subject: Make coalesced report format more efficient ......................................................................
Make coalesced report format more efficient Bug: 67822 Change-Id: I662fe3c408ec36839ef4e9da58be965f7fa0805e --- M tests/test_file_manager.py M tests/test_utils/test_one_off_functions.py M wikimetrics/api/file_manager.py M wikimetrics/utils.py 4 files changed, 204 insertions(+), 15 deletions(-) Approvals: Nuria: Verified; Looks good to me, approved diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index 5a9bd18..5cddb86 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -8,8 +8,9 @@ from wikimetrics.configurables import app, db, get_absolute_path from wikimetrics.exceptions import PublicReportIOError -from wikimetrics.api import PublicReportFileManager -from wikimetrics.api import COALESCED_REPORT_FILE +from wikimetrics.api import PublicReportFileManager, COALESCED_REPORT_FILE +from wikimetrics.api.file_manager import _merge_run +from wikimetrics.enums import Aggregation class PublicReportFileMangerTest(unittest.TestCase): @@ -75,23 +76,30 @@ assert_equal(len(files_to_coalesce), 4) report_results = [] + users_reported = set() + dates = set() for f in files_to_coalesce: with open(os.sep.join((test_report_dir, f))) as json_file: try: - report_results.append(json.load(json_file)) + loaded = json.load(json_file) + for u in loaded['result'][Aggregation.IND]: + users_reported.add(u) + for d in loaded['result'][Aggregation.AVG]['edits']: + dates.add(d) + report_results.append(loaded) except ValueError: pass assert_equal(len(report_results), 3) # There are 3 valid test report files - expected_end_dates = {r['parameters']['Metric_end_date'] for r in report_results} - expected_values = [r['result'] for r in report_results] - - actual_end_dates = {k for k, v in full_report.items() if k != 'parameters'} - actual_values = [v for k, v in full_report.items() if k != 'parameters'] - - assert_equal(expected_end_dates, actual_end_dates) - assert_equal(sorted(expected_values), sorted(actual_values)) + assert_equal({k for k in full_report}, { + 'result', 'parameters' + }) + assert_equal({k for k in full_report['result']}, { + Aggregation.AVG, Aggregation.IND + }) + assert_equal({k for k in full_report['result'][Aggregation.IND]}, users_reported) + assert_equal({k for k in full_report['result'][Aggregation.AVG]['edits']}, dates) @raises(PublicReportIOError) def test_remove_recurrent_report(self): @@ -99,3 +107,101 @@ self.api.root_dir = self.test_report_path test_report_id = '0001' self.api.remove_recurrent_report(test_report_id) + + +class CoalesceTests(unittest.TestCase): + def test_coalesce_format_initial(self): + coalesced = {} + first_run = { + 'result': { + Aggregation.SUM: {'metric1': 10} + }, + 'parameters': { + 'Metric_timeseries': 'none', + 'Metric_end_date': '2014-07-01' + } + } + _merge_run(coalesced, first_run) + assert_equal(coalesced, { + 'result': { + Aggregation.SUM: {'metric1': {'2014-07-01': 10}} + }, + 'parameters': { + 'Metric_timeseries': 'none', + 'Metric_end_date': '2014-07-01' + } + }) + + def test_coalesce_format_updating(self): + coalesced = {} + new_run = { + 'result': { + Aggregation.IND: { + '123|enwiki|1': { + 'metric1': { + '2014-06-14 00:00:00': 0, + '2014-06-15 00:00:00': 1, + '2014-06-16 00:00:00': 0, + }, + 'metric2': { + '2014-06-14 00:00:00': 0, + '2014-06-15 00:00:00': 0, + '2014-06-16 00:00:00': 2, + }, + }, + '124|enwiki|1': { + 'metric1': { + '2014-06-14 00:00:00': 1, + '2014-06-15 00:00:00': 0, + '2014-06-16 00:00:00': 1, + }, + 'metric2': { + '2014-06-14 00:00:00': 0, + '2014-06-15 00:00:00': 3, + '2014-06-16 00:00:00': 0, + }, + } + }, + Aggregation.SUM: { + 'metric1': { + '2014-06-14 00:00:00': 1, + '2014-06-15 00:00:00': 1, + '2014-06-16 00:00:00': 1, + }, + 'metric2': { + '2014-06-14 00:00:00': 0, + '2014-06-15 00:00:00': 3, + '2014-06-16 00:00:00': 2, + }, + } + }, + 'parameters': { + 'param1': 1, + 'param2': 2, + 'param3': 3, + 'Metric_timeseries': 'day', + } + } + _merge_run(coalesced, new_run) + assert_equal(coalesced, new_run) + + new_date = '2014-06-17 00:00:00' + _merge_run(coalesced, { + 'parameters': {'Metric_end_date': new_date}, + 'result': { + Aggregation.SUM: {'metric1': 2, 'metric2': 3}, + Aggregation.IND: { + '123|enwiki|1': {'metric1': 3, 'metric2': 1}, + '124|enwiki|1': {'metric1': 4, 'metric2': 0}, + } + } + }) + r = coalesced['result'] + + assert_equal(r[Aggregation.SUM]['metric1'][new_date], 2) + assert_equal(r[Aggregation.SUM]['metric2'][new_date], 3) + + assert_equal(r[Aggregation.IND]['123|enwiki|1']['metric1'][new_date], 3) + assert_equal(r[Aggregation.IND]['123|enwiki|1']['metric2'][new_date], 1) + assert_equal(r[Aggregation.IND]['124|enwiki|1']['metric1'][new_date], 4) + assert_equal(r[Aggregation.IND]['124|enwiki|1']['metric2'][new_date], 0) diff --git a/tests/test_utils/test_one_off_functions.py b/tests/test_utils/test_one_off_functions.py index d2d5882..28d9064 100644 --- a/tests/test_utils/test_one_off_functions.py +++ b/tests/test_utils/test_one_off_functions.py @@ -15,6 +15,7 @@ timestamps_to_now, parse_tag, chunk, + update_dict, ) from wikimetrics.metrics import NamespaceEdits @@ -154,3 +155,28 @@ assert_equal(chunked, [[2, 3], [4, 5], [6, 7], [8]]) chunked = list(chunk(range(2, 3), 2)) assert_equal(chunked, [[2]]) + + def test_update_dict(self): + target = {} + source = { + 'deep dict': {'nested': {'one': 1}}, + 'list': [1, 2, 3], + 'value': 1, + } + update_dict(target, source) + assert_equal(target['deep dict']['nested']['one'], 1) + assert_equal(target['list'], [1, 2, 3]) + assert_equal(target['value'], 1) + + source2 = { + 'deep dict': {'nested': {'one': 9}, 'nested1': {'two': 10}}, + 'list': [4, 5], + 'value': 3, + 'value1': 2, + } + update_dict(target, source2) + assert_equal(target['deep dict']['nested']['one'], 9) + assert_equal(target['deep dict']['nested1']['two'], 10) + assert_equal(target['list'], [1, 2, 3, 4, 5]) + assert_equal(target['value'], 3) + assert_equal(target['value1'], 2) diff --git a/wikimetrics/api/file_manager.py b/wikimetrics/api/file_manager.py index f7ac02b..cbeee93 100644 --- a/wikimetrics/api/file_manager.py +++ b/wikimetrics/api/file_manager.py @@ -2,7 +2,12 @@ import os.path import json import shutil +import collections + from wikimetrics.exceptions import PublicReportIOError +from wikimetrics.utils import update_dict +from wikimetrics.enums import Aggregation +from wikimetrics.metrics import TimeseriesChoices # TODO ultils imports flask response -> fix # Illegal filename characters @@ -179,11 +184,8 @@ with open(full_path, 'r') as saved_report: try: data = json.load(saved_report) - if 'parameters' not in coalesced_reports: - coalesced_reports['parameters'] = data['parameters'] + _merge_run(coalesced_reports, data) - key = data['parameters']['Metric_end_date'] - coalesced_reports[key] = data['result'] except KeyError, e: msg = 'Key "{}" not in JSON file "{}"'.format(e, full_path) self.logger.exception(msg) @@ -196,3 +198,43 @@ msg = 'Could not concatenate public report {0}'.format(report_id) self.logger.exception(msg) raise PublicReportIOError(msg) + + +def _merge_run(coalesced, data): + """ + Helper function, handles merging of new report results into an existing dictionary + that represents the output of a recurrent report. Correctly merges both timeseries + and non-timeseries results into a timeseries-like format. + + Parameters + coalesced : the coalesced report to update, could be an empty dictionary + data : the json result of the new report + """ + coalesced.setdefault('parameters', data['parameters']) + coalesced.setdefault('result', {}) + + timeseries = data['parameters'].get('Metric_timeseries', TimeseriesChoices.NONE) + if timeseries == TimeseriesChoices.NONE: + date = data['parameters']['Metric_end_date'] + for aggregate in data['result']: + if aggregate == Aggregation.IND: + # shape the data so it all looks like timeseries + data['result'][aggregate] = { + user: { + submetric: { + date: data['result'][aggregate][user][submetric] + } + for submetric in data['result'][aggregate][user] + } + for user in data['result'][aggregate] + } + else: + # shape the data so it all looks like timeseries + data['result'][aggregate] = { + submetric: { + date: data['result'][aggregate][submetric] + } + for submetric in data['result'][aggregate] + } + + update_dict(coalesced['result'], data['result']) diff --git a/wikimetrics/utils.py b/wikimetrics/utils.py index cd661a6..7d4d658 100644 --- a/wikimetrics/utils.py +++ b/wikimetrics/utils.py @@ -264,3 +264,18 @@ """ for i in xrange(0, len(array), chunk_size): yield array[i : i + chunk_size] + + +def update_dict(target, source): + """ + Updates a target dictionary recursively from a source dictionary + """ + for key, val in source.items(): + if isinstance(val, collections.Mapping): + tmp = target.get(key, {}) + update_dict(tmp, val) + target[key] = tmp + elif isinstance(val, list): + target[key] = target.get(key, []) + val + else: + target[key] = source[key] -- To view, visit https://gerrit.wikimedia.org/r/146521 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I662fe3c408ec36839ef4e9da58be965f7fa0805e Gerrit-PatchSet: 5 Gerrit-Project: analytics/wikimetrics Gerrit-Branch: master Gerrit-Owner: Milimetric <dandree...@wikimedia.org> Gerrit-Reviewer: Nuria <nu...@wikimedia.org> Gerrit-Reviewer: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits