[MediaWiki-commits] [Gerrit] Add weekly aggregations - change (analytics/aggregator)

2015-01-06 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: Add weekly aggregations
..


Add weekly aggregations

Change-Id: Ia13ed04f59eb496d6014474b15b7b5a2de060c8f
---
M aggregator/projectcounts.py
M bin/aggregate_projectcounts
M tests/test_projectcounts/test_helpers.py
M tests/test_projectcounts/test_monitoring.py
A tests/test_projectcounts/test_weekly_aggregation.py
M tests/test_projectcounts/testcases.py
6 files changed, 758 insertions(+), 54 deletions(-)

Approvals:
  Nuria: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index 983ad23..f53950d 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -184,6 +184,115 @@
 header=CSV_HEADER)
 
 
+def rescale_counts(csv_data, dates, bad_dates, rescale_to):
+    """Extracts relevant dates from CSV data, sums them up, and rescales them.
+
+    If the dates only cover bad dates, None is returned.
+
+    All dates are expected to have the same number of columns. If they do
+    not, the first good date is taken as the reference. Missing columns for
+    good dates are assumed to be 0.
+
+    Upon other errors, a RuntimeError is raised.
+
+    The rescaled counts are returned as a list of integers.
+
+    :param csv_data: The data dict to get data from
+    :param dates: The dates to sum up counts for
+    :param bad_dates: List of dates considered to have bad data.
+    :param rescale_to: Rescale the good entries to this many entries.
+    """
+    ret = None
+    aggregations = 0
+    for date in dates:
+        if date in bad_dates:
+            continue
+        date_str = date.isoformat()
+        try:
+            csv_line_items = csv_data[date_str].split(',')
+        except KeyError:
+            raise RuntimeError("No data for '%s'" % (date_str))
+        del csv_line_items[0]  # getting rid of the date column
+        if ret is None:
+            ret = [0 for i in range(len(csv_line_items))]
+        for i in range(len(ret)):
+            try:
+                ret[i] += int(csv_line_items[i])
+            except IndexError:
+                # csv_line_items has fewer items than the first good row.
+                # We assume 0.
+                pass
+        aggregations += 1
+
+    if ret is not None:
+        # Since we found readings, rescale.
+        ret = [(ret[i] * rescale_to) / aggregations for i in range(len(ret))]
+    return ret
+
+
+def update_weekly_csv(target_dir_abs, dbname, csv_data_input, first_date,
+                      last_date, bad_dates=[], force_recomputation=False):
+    """Updates weekly per-project CSVs from a csv data dictionary.
+
+    The existing per-project CSV files in target_dir_abs/weekly_rescaled are
+    updated for all weeks whose Sunday is in the date interval from
+    first_date up to (and including) last_date.
+
+    For weekly aggregations, a week's total data is rescaled to 7 days.
+
+    If a week under consideration contains no good date, it is removed.
+
+    Upon any error, the function raises an exception.
+
+    :param target_dir_abs: Absolute directory. CSVs are written to the
+        'weekly_rescaled' subdirectory of target_dir_abs.
+    :param dbname: The database name of the wiki to consider (e.g. 'enwiki')
+    :param csv_data_input: The data dict to aggregate from
+    :param first_date: The first date to compute non-existing data for.
+    :param last_date: The last date to compute non-existing data for.
+    :param bad_dates: List of dates considered to have bad data. (Default: [])
+    :param force_recomputation: If True, recompute data for the given days,
+        even if it is already in the CSV. (Default: False)
+    """
+    csv_dir_abs = os.path.join(target_dir_abs, 'weekly_rescaled')
+    if not os.path.exists(csv_dir_abs):
+        os.mkdir(csv_dir_abs)
+    csv_file_abs = os.path.join(csv_dir_abs, dbname + '.csv')
+
+    csv_data = util.parse_csv_to_first_column_dict(csv_file_abs)
+
+    for date in util.generate_dates(first_date, last_date):
+        if date.weekday() == 6:  # Sunday. End of ISO week
+            date_str = date.strftime('%GW%V')
+            logging.debug("Updating csv '%s' for date '%s'" % (
+                dbname, str(date)))
+            week_dates = set(date + datetime.timedelta(days=offset)
+                             for offset in range(-6, 1))
+            expected_good_dates = len(week_dates - set(bad_dates))
+            need_recomputation = force_recomputation
+            need_recomputation |= expected_good_dates != 7
+            need_recomputation |= date_str not in csv_data
+            if need_recomputation:
+                if expected_good_dates == 0:
+                    try:
+                        del csv_data[date_str]
+                    except KeyError:
+                        # No reading was there to remove. That's ok :-)
+                        pass
+ 
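(Not part of the change: a minimal usage sketch of rescale_counts with
made-up numbers, illustrating the skip-and-rescale behaviour described in
its docstring. Three good days and one known-bad day are rescaled to a
7-day week; with the Python 2 integer division used above, the result
stays integral:)

    import datetime

    csv_data = {
        '2015-01-01': '2015-01-01,10,100',
        '2015-01-02': '2015-01-02,20,200',
        '2015-01-03': '2015-01-03,30,300',
        '2015-01-04': '2015-01-04,999,9999',  # known-bad reading
    }
    dates = [datetime.date(2015, 1, day) for day in range(1, 5)]
    bad_dates = [datetime.date(2015, 1, 4)]

    # The bad date is skipped: the sums are 60 and 600 over aggregations=3
    # good days, then each sum is multiplied by rescale_to/aggregations.
    rescale_counts(csv_data, dates, bad_dates, rescale_to=7)
    # -> [140, 1400]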
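(Likewise not part of the change: how the week window in update_weekly_csv
is built. For a Sunday, offsets -6 through 0 yield the Monday-to-Sunday ISO
week ending on that date, and expected_good_dates counts the days not
flagged as bad:)

    import datetime

    sunday = datetime.date(2015, 1, 4)   # a Sunday: weekday() == 6
    week_dates = set(sunday + datetime.timedelta(days=offset)
                     for offset in range(-6, 1))
    min(week_dates)                      # datetime.date(2014, 12, 29), Monday
    max(week_dates)                      # datetime.date(2015, 1, 4), Sunday

    bad_dates = [datetime.date(2015, 1, 1)]   # made-up bad day
    expected_good_dates = len(week_dates - set(bad_dates))
    # 6 != 7, so the week is (re)computed from the good days and
    # rescaled to 7 days by rescale_counts.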

[MediaWiki-commits] [Gerrit] Add weekly aggregations - change (analytics/aggregator)

2015-01-04 Thread QChris (Code Review)
QChris has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/182676

Change subject: Add weekly aggregations
..

Add weekly aggregations

Change-Id: Ia13ed04f59eb496d6014474b15b7b5a2de060c8f
---
M aggregator/projectcounts.py
M bin/aggregate_projectcounts
M tests/test_projectcounts/test_helpers.py
M tests/test_projectcounts/test_monitoring.py
A tests/test_projectcounts/test_weekly_aggregation.py
M tests/test_projectcounts/testcases.py
6 files changed, 758 insertions(+), 54 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator 
refs/changes/76/182676/1

diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index 983ad23..1fa39ed 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -184,6 +184,115 @@
 header=CSV_HEADER)
 
 
+def rescale_counts(csv_data, dates, bad_dates, rescale_to):
+    """Extracts relevant dates from CSV data, sums them up, and rescales them.
+
+    If the dates only cover bad dates, None is returned.
+
+    All dates are expected to have the same number of columns. If they do
+    not, the first good date is taken as the reference. Missing columns for
+    good dates are assumed to be 0.
+
+    Upon other errors, a RuntimeError is raised.
+
+    The rescaled counts are returned as a list of integers.
+
+    :param csv_data: The data dict to get data from
+    :param dates: The dates to sum up counts for
+    :param bad_dates: List of dates considered to have bad data.
+    :param rescale_to: Rescale the good entries to this many entries.
+    """
+    ret = None
+    aggregations = 0
+    for date in dates:
+        if date in bad_dates:
+            continue
+        date_str = date.isoformat()
+        try:
+            csv_line_items = csv_data[date_str].split(',')
+        except KeyError:
+            raise RuntimeError("No data for '%s'" % (date_str))
+        del csv_line_items[0]  # getting rid of the date column
+        if ret is None:
+            ret = [0 for i in range(len(csv_line_items))]
+        for i in range(len(ret)):
+            try:
+                ret[i] += int(csv_line_items[i])
+            except IndexError:
+                # csv_line_items has fewer items than the first good row.
+                # We assume 0.
+                pass
+        aggregations += 1
+
+    if ret is not None:
+        # Since we found readings, rescale.
+        ret = [(ret[i] * rescale_to) / aggregations for i in range(len(ret))]
+    return ret
+
+
+def update_weekly_csv(target_dir_abs, dbname, csv_data_input, first_date,
+                      last_date, bad_dates=[], force_recomputation=False):
+    """Updates weekly per-project CSVs from a csv data dictionary.
+
+    The existing per-project CSV files in target_dir_abs/weekly_rescaled are
+    updated for all weeks whose Sunday is in the date interval from
+    first_date up to (and including) last_date.
+
+    For weekly aggregations, a week's total data is rescaled to 7 days.
+
+    If a week under consideration contains no good date, it is removed.
+
+    Upon any error, the function raises an exception.
+
+    :param target_dir_abs: Absolute directory. CSVs are written to the
+        'weekly_rescaled' subdirectory of target_dir_abs.
+    :param dbname: The database name of the wiki to consider (e.g. 'enwiki')
+    :param csv_data_input: The data dict to aggregate from
+    :param first_date: The first date to compute non-existing data for.
+    :param last_date: The last date to compute non-existing data for.
+    :param bad_dates: List of dates considered to have bad data. (Default: [])
+    :param force_recomputation: If True, recompute data for the given days,
+        even if it is already in the CSV. (Default: False)
+    """
+    csv_dir_abs = os.path.join(target_dir_abs, 'weekly_rescaled')
+    if not os.path.exists(csv_dir_abs):
+        os.mkdir(csv_dir_abs)
+    csv_file_abs = os.path.join(csv_dir_abs, dbname + '.csv')
+
+    csv_data = util.parse_csv_to_first_column_dict(csv_file_abs)
+
+    for date in util.generate_dates(first_date, last_date):
+        if date.weekday() == 6:  # Sunday. End of ISO week
+            date_str = date.strftime('%YW%W')
+            logging.debug("Updating csv '%s' for date '%s'" % (
+                dbname, str(date)))
+            week_dates = set(date + datetime.timedelta(days=offset)
+                             for offset in range(-6, 1))
+            expected_good_dates = len(week_dates - set(bad_dates))
+            need_recomputation = force_recomputation
+            need_recomputation |= expected_good_dates != 7
+            need_recomputation |= date_str not in csv_data
+            if need_recomputation:
+                if expected_good_dates == 0:
+                    try:
+                        del csv_data[date_str]
+                    except KeyError:
+                        # No reading was there to remove. That's ok :-)
+                        pass
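(Editor's note: within the portion of the diff visible here, this initial
patchset differs from the merged version above only in the week label:
'%YW%W' here versus '%GW%V' after review. The ISO 8601 directives keep the
label consistent with the "Sunday, end of ISO week" logic at year
boundaries; under the Python 2 of this era, %G and %V pass through to the
platform's C strftime and are only documented in Python from 3.6 on:)

    import datetime

    sunday = datetime.date(2015, 1, 4)   # last day of ISO week 2015-W01
    sunday.strftime('%YW%W')             # '2015W00': calendar year plus a
                                         # Monday-based week count from 00
    sunday.strftime('%GW%V')             # '2015W01': ISO 8601 year and week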