Nuria has submitted this change and it was merged.

Change subject: Make coalesced report format more efficient
......................................................................


Make coalesced report format more efficient

Bug: 67822
Change-Id: I662fe3c408ec36839ef4e9da58be965f7fa0805e
---
M tests/test_file_manager.py
M tests/test_utils/test_one_off_functions.py
M wikimetrics/api/file_manager.py
M wikimetrics/utils.py
4 files changed, 204 insertions(+), 15 deletions(-)

Approvals:
  Nuria: Verified; Looks good to me, approved



diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py
index 5a9bd18..5cddb86 100644
--- a/tests/test_file_manager.py
+++ b/tests/test_file_manager.py
@@ -8,8 +8,9 @@
 
 from wikimetrics.configurables import app, db, get_absolute_path
 from wikimetrics.exceptions import PublicReportIOError
-from wikimetrics.api import PublicReportFileManager
-from wikimetrics.api import COALESCED_REPORT_FILE
+from wikimetrics.api import PublicReportFileManager, COALESCED_REPORT_FILE
+from wikimetrics.api.file_manager import _merge_run
+from wikimetrics.enums import Aggregation
 
 
 class PublicReportFileMangerTest(unittest.TestCase):
@@ -75,23 +76,30 @@
         assert_equal(len(files_to_coalesce), 4)
 
         report_results = []
+        users_reported = set()
+        dates = set()
         for f in files_to_coalesce:
             with open(os.sep.join((test_report_dir, f))) as json_file:
                 try:
-                    report_results.append(json.load(json_file))
+                    loaded = json.load(json_file)
+                    for u in loaded['result'][Aggregation.IND]:
+                        users_reported.add(u)
+                    for d in loaded['result'][Aggregation.AVG]['edits']:
+                        dates.add(d)
+                    report_results.append(loaded)
                 except ValueError:
                     pass
 
        assert_equal(len(report_results), 3)  # There are 3 valid test report files
 
-        expected_end_dates = {r['parameters']['Metric_end_date'] for r in report_results}
-        expected_values = [r['result'] for r in report_results]
-
-        actual_end_dates = {k for k, v in full_report.items() if k != 'parameters'}
-        actual_values = [v for k, v in full_report.items() if k != 'parameters']
-
-        assert_equal(expected_end_dates, actual_end_dates)
-        assert_equal(sorted(expected_values), sorted(actual_values))
+        assert_equal({k for k in full_report}, {
+            'result', 'parameters'
+        })
+        assert_equal({k for k in full_report['result']}, {
+            Aggregation.AVG, Aggregation.IND
+        })
+        assert_equal({k for k in full_report['result'][Aggregation.IND]}, users_reported)
+        assert_equal({k for k in full_report['result'][Aggregation.AVG]['edits']}, dates)
 
     @raises(PublicReportIOError)
     def test_remove_recurrent_report(self):
@@ -99,3 +107,101 @@
         self.api.root_dir = self.test_report_path
         test_report_id = '0001'
         self.api.remove_recurrent_report(test_report_id)
+
+
+class CoalesceTests(unittest.TestCase):
+    def test_coalesce_format_initial(self):
+        coalesced = {}
+        first_run = {
+            'result': {
+                Aggregation.SUM: {'metric1': 10}
+            },
+            'parameters': {
+                'Metric_timeseries': 'none',
+                'Metric_end_date': '2014-07-01'
+            }
+        }
+        _merge_run(coalesced, first_run)
+        assert_equal(coalesced, {
+            'result': {
+                Aggregation.SUM: {'metric1': {'2014-07-01': 10}}
+            },
+            'parameters': {
+                'Metric_timeseries': 'none',
+                'Metric_end_date': '2014-07-01'
+            }
+        })
+
+    def test_coalesce_format_updating(self):
+        coalesced = {}
+        new_run = {
+            'result': {
+                Aggregation.IND: {
+                    '123|enwiki|1': {
+                        'metric1': {
+                            '2014-06-14 00:00:00': 0,
+                            '2014-06-15 00:00:00': 1,
+                            '2014-06-16 00:00:00': 0,
+                        },
+                        'metric2': {
+                            '2014-06-14 00:00:00': 0,
+                            '2014-06-15 00:00:00': 0,
+                            '2014-06-16 00:00:00': 2,
+                        },
+                    },
+                    '124|enwiki|1': {
+                        'metric1': {
+                            '2014-06-14 00:00:00': 1,
+                            '2014-06-15 00:00:00': 0,
+                            '2014-06-16 00:00:00': 1,
+                        },
+                        'metric2': {
+                            '2014-06-14 00:00:00': 0,
+                            '2014-06-15 00:00:00': 3,
+                            '2014-06-16 00:00:00': 0,
+                        },
+                    }
+                },
+                Aggregation.SUM: {
+                    'metric1': {
+                        '2014-06-14 00:00:00': 1,
+                        '2014-06-15 00:00:00': 1,
+                        '2014-06-16 00:00:00': 1,
+                    },
+                    'metric2': {
+                        '2014-06-14 00:00:00': 0,
+                        '2014-06-15 00:00:00': 3,
+                        '2014-06-16 00:00:00': 2,
+                    },
+                }
+            },
+            'parameters': {
+                'param1': 1,
+                'param2': 2,
+                'param3': 3,
+                'Metric_timeseries': 'day',
+            }
+        }
+        _merge_run(coalesced, new_run)
+        assert_equal(coalesced, new_run)
+
+        new_date = '2014-06-17 00:00:00'
+        _merge_run(coalesced, {
+            'parameters': {'Metric_end_date': new_date},
+            'result': {
+                Aggregation.SUM: {'metric1': 2, 'metric2': 3},
+                Aggregation.IND: {
+                    '123|enwiki|1': {'metric1': 3, 'metric2': 1},
+                    '124|enwiki|1': {'metric1': 4, 'metric2': 0},
+                }
+            }
+        })
+        r = coalesced['result']
+
+        assert_equal(r[Aggregation.SUM]['metric1'][new_date], 2)
+        assert_equal(r[Aggregation.SUM]['metric2'][new_date], 3)
+
+        assert_equal(r[Aggregation.IND]['123|enwiki|1']['metric1'][new_date], 3)
+        assert_equal(r[Aggregation.IND]['123|enwiki|1']['metric2'][new_date], 1)
+        assert_equal(r[Aggregation.IND]['124|enwiki|1']['metric1'][new_date], 4)
+        assert_equal(r[Aggregation.IND]['124|enwiki|1']['metric2'][new_date], 0)
diff --git a/tests/test_utils/test_one_off_functions.py b/tests/test_utils/test_one_off_functions.py
index d2d5882..28d9064 100644
--- a/tests/test_utils/test_one_off_functions.py
+++ b/tests/test_utils/test_one_off_functions.py
@@ -15,6 +15,7 @@
     timestamps_to_now,
     parse_tag,
     chunk,
+    update_dict,
 )
 from wikimetrics.metrics import NamespaceEdits
 
@@ -154,3 +155,28 @@
         assert_equal(chunked, [[2, 3], [4, 5], [6, 7], [8]])
         chunked = list(chunk(range(2, 3), 2))
         assert_equal(chunked, [[2]])
+
+    def test_update_dict(self):
+        target = {}
+        source = {
+            'deep dict': {'nested': {'one': 1}},
+            'list': [1, 2, 3],
+            'value': 1,
+        }
+        update_dict(target, source)
+        assert_equal(target['deep dict']['nested']['one'], 1)
+        assert_equal(target['list'], [1, 2, 3])
+        assert_equal(target['value'], 1)
+
+        source2 = {
+            'deep dict': {'nested': {'one': 9}, 'nested1': {'two': 10}},
+            'list': [4, 5],
+            'value': 3,
+            'value1': 2,
+        }
+        update_dict(target, source2)
+        assert_equal(target['deep dict']['nested']['one'], 9)
+        assert_equal(target['deep dict']['nested1']['two'], 10)
+        assert_equal(target['list'], [1, 2, 3, 4, 5])
+        assert_equal(target['value'], 3)
+        assert_equal(target['value1'], 2)
diff --git a/wikimetrics/api/file_manager.py b/wikimetrics/api/file_manager.py
index f7ac02b..cbeee93 100644
--- a/wikimetrics/api/file_manager.py
+++ b/wikimetrics/api/file_manager.py
@@ -2,7 +2,12 @@
 import os.path
 import json
 import shutil
+import collections
+
 from wikimetrics.exceptions import PublicReportIOError
+from wikimetrics.utils import update_dict
+from wikimetrics.enums import Aggregation
+from wikimetrics.metrics import TimeseriesChoices
 # TODO ultils imports flask response -> fix
 
 # Illegal filename characters
@@ -179,11 +184,8 @@
                     with open(full_path, 'r') as saved_report:
                         try:
                             data = json.load(saved_report)
-                            if 'parameters' not in coalesced_reports:
-                                coalesced_reports['parameters'] = data['parameters']
+                            _merge_run(coalesced_reports, data)
 
-                            key = data['parameters']['Metric_end_date']
-                            coalesced_reports[key] = data['result']
                         except KeyError, e:
                             msg = 'Key "{}" not in JSON file "{}"'.format(e, full_path)
                             self.logger.exception(msg)
@@ -196,3 +198,43 @@
             msg = 'Could not concatenate public report {0}'.format(report_id)
             self.logger.exception(msg)
             raise PublicReportIOError(msg)
+
+
+def _merge_run(coalesced, data):
+    """
+    Helper function, handles merging of new report results into an existing dictionary
+    that represents the output of a recurrent report.  Correctly merges both timeseries
+    and non-timeseries results into a timeseries-like format.
+
+    Parameters
+        coalesced   : the coalesced report to update, could be an empty dictionary
+        data        : the json result of the new report
+    """
+    coalesced.setdefault('parameters', data['parameters'])
+    coalesced.setdefault('result', {})
+
+    timeseries = data['parameters'].get('Metric_timeseries', TimeseriesChoices.NONE)
+    if timeseries == TimeseriesChoices.NONE:
+        date = data['parameters']['Metric_end_date']
+        for aggregate in data['result']:
+            if aggregate == Aggregation.IND:
+                # shape the data so it all looks like timeseries
+                data['result'][aggregate] = {
+                    user: {
+                        submetric: {
+                            date: data['result'][aggregate][user][submetric]
+                        }
+                        for submetric in data['result'][aggregate][user]
+                    }
+                    for user in data['result'][aggregate]
+                }
+            else:
+                # shape the data so it all looks like timeseries
+                data['result'][aggregate] = {
+                    submetric: {
+                        date: data['result'][aggregate][submetric]
+                    }
+                    for submetric in data['result'][aggregate]
+                }
+
+    update_dict(coalesced['result'], data['result'])
diff --git a/wikimetrics/utils.py b/wikimetrics/utils.py
index cd661a6..7d4d658 100644
--- a/wikimetrics/utils.py
+++ b/wikimetrics/utils.py
@@ -264,3 +264,18 @@
     """
     for i in xrange(0, len(array), chunk_size):
         yield array[i : i + chunk_size]
+
+
+def update_dict(target, source):
+    """
+    Updates a target dictionary recursively from a source dictionary
+    """
+    for key, val in source.items():
+        if isinstance(val, collections.Mapping):
+            tmp = target.get(key, {})
+            update_dict(tmp, val)
+            target[key] = tmp
+        elif isinstance(val, list):
+            target[key] = target.get(key, []) + val
+        else:
+            target[key] = source[key]

-- 
To view, visit https://gerrit.wikimedia.org/r/146521
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I662fe3c408ec36839ef4e9da58be965f7fa0805e
Gerrit-PatchSet: 5
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to