[MediaWiki-commits] [Gerrit] Add weekly aggregations - change (analytics/aggregator)
jenkins-bot has submitted this change and it was merged. Change subject: Add weekly aggregations .. Add weekly aggregations Change-Id: Ia13ed04f59eb496d6014474b15b7b5a2de060c8f --- M aggregator/projectcounts.py M bin/aggregate_projectcounts M tests/test_projectcounts/test_helpers.py M tests/test_projectcounts/test_monitoring.py A tests/test_projectcounts/test_weekly_aggregation.py M tests/test_projectcounts/testcases.py 6 files changed, 758 insertions(+), 54 deletions(-) Approvals: Nuria: Looks good to me, approved jenkins-bot: Verified diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py index 983ad23..f53950d 100644 --- a/aggregator/projectcounts.py +++ b/aggregator/projectcounts.py @@ -184,6 +184,115 @@ header=CSV_HEADER) +def rescale_counts(csv_data, dates, bad_dates, rescale_to): +Extracts relevant dates from CSV data, sums them up, and rescales them. + +If the dates only cover bad dates, None is returned. + +All dates are expected to have the same number of columns. In case they +have not, the first good date is taken as reference. Missing columns for +good dates are assumed to be 0. + +Upon other errors, a RuntimeError is raised. + +The rescaled counts are returned as list of integers. + +:param csv_data: The data dict to get data from +:param dates: The dates to sum up counts for +:param bad_dates: List of dates considered having bad data. +:param rescale_to: Rescale the good entries to this many entries. + +ret = None +aggregations = 0 +for date in dates: +if date in bad_dates: +continue +date_str = date.isoformat() +try: +csv_line_items = csv_data[date_str].split(',') +except KeyError: +raise RuntimeError(No data for '%s' % (date_str)) +del csv_line_items[0] # getting rid of date column +if ret is None: +ret = [0 for i in range(len(csv_line_items))] +for i in range(len(ret)): +try: +ret[i] += int(csv_line_items[i]) +except IndexError: +# csv_line_items has less items than the first good row. +# We assume 0. 
+pass
+aggregations += 1 +
+if ret is not None: +# Since we found readings, rescale. +ret = [(ret[i] * rescale_to) / aggregations for i in range(len(ret))] +return ret + + +def update_weekly_csv(target_dir_abs, dbname, csv_data_input, first_date, + last_date, bad_dates=[], force_recomputation=False): +Updates weekly per project CSVs from a csv data dictionary. + +The existing per project CSV files in target_dir_abs/weekly are updated for +all weeks where Sunday is in the date interval from first_date up to (and +including) last_date. + +For weekly aggregations, a week's total data is rescaled to 7 days. + +If a week under consideration contains no good date, it is removed. + +Upon any error, the function raises an exception. + +:param target_dir_abs: Absolute directory. CSVs are getting written to the +'weekly_rescaled' subdirectory of target_dir_abs. +:param dbname: The database name of the wiki to consider (E.g.: 'enwiki') +:param csv_data_input: The data dict to aggregate from +:param first_date: The first date to compute non-existing data for. +:param last_date: The last date to compute non-existing data for. +:param bad_dates: List of dates considered having bad data. (Default: []) +:param force_recomputation: If True, recompute data for the given days, +even if it is already in the CSV. (Default: False) + +csv_dir_abs = os.path.join(target_dir_abs, 'weekly_rescaled') +if not os.path.exists(csv_dir_abs): +os.mkdir(csv_dir_abs) +csv_file_abs = os.path.join(csv_dir_abs, dbname + '.csv') + +csv_data = util.parse_csv_to_first_column_dict(csv_file_abs) + +for date in util.generate_dates(first_date, last_date): +if date.weekday() == 6: # Sunday. 
End of ISO week +date_str = date.strftime('%GW%V') +logging.debug(Updating csv '%s' for date '%s' % ( +dbname, str(date))) +week_dates = set(date + datetime.timedelta(days=offset) + for offset in range(-6, 1)) +expected_good_dates = len(week_dates - set(bad_dates)) +need_recomputation = force_recomputation +need_recomputation |= expected_good_dates != 7 +need_recomputation |= date_str not in csv_data +if need_recomputation: +if expected_good_dates == 0: +try: +del csv_data[date_str] +except KeyError: +# No reading was there to remove. That's ok :-) +pass +
[MediaWiki-commits] [Gerrit] Add weekly aggregations - change (analytics/aggregator)
QChris has uploaded a new change for review. https://gerrit.wikimedia.org/r/182676 Change subject: Add weekly aggregations .. Add weekly aggregations Change-Id: Ia13ed04f59eb496d6014474b15b7b5a2de060c8f --- M aggregator/projectcounts.py M bin/aggregate_projectcounts M tests/test_projectcounts/test_helpers.py M tests/test_projectcounts/test_monitoring.py A tests/test_projectcounts/test_weekly_aggregation.py M tests/test_projectcounts/testcases.py 6 files changed, 758 insertions(+), 54 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator refs/changes/76/182676/1 diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py index 983ad23..1fa39ed 100644 --- a/aggregator/projectcounts.py +++ b/aggregator/projectcounts.py @@ -184,6 +184,115 @@ header=CSV_HEADER) +def rescale_counts(csv_data, dates, bad_dates, rescale_to): +Extracts relevant dates from CSV data, sums them up, and rescales them. + +If the dates only cover bad dates, None is returned. + +All dates are expected to have the same number of columns. In case they +have not, the first good date is taken as reference. Missing columns for +good dates are assumed to be 0. + +Upon other errors, a RuntimeError is raised. + +The rescaled counts are returned as list of integers. + +:param csv_data: The data dict to get data from +:param dates: The dates to sum up counts for +:param bad_dates: List of dates considered having bad data. +:param rescale_to: Rescale the good entries to this many entries. 
+
+ret = None +aggregations = 0 +for date in dates: +if date in bad_dates: +continue +date_str = date.isoformat() +try: +csv_line_items = csv_data[date_str].split(',') +except KeyError: +raise RuntimeError(No data for '%s' % (date_str)) +del csv_line_items[0] # getting rid of date column +if ret is None: +ret = [0 for i in range(len(csv_line_items))] +for i in range(len(ret)): +try: +ret[i] += int(csv_line_items[i]) +except IndexError: +# csv_line_items has less items than the first good row. +# We assume 0. +pass +aggregations += 1 + +if ret is not None: +# Since we found readings, rescale. +ret = [(ret[i] * rescale_to) / aggregations for i in range(len(ret))] +return ret + + +def update_weekly_csv(target_dir_abs, dbname, csv_data_input, first_date, + last_date, bad_dates=[], force_recomputation=False): +Updates weekly per project CSVs from a csv data dictionary. + +The existing per project CSV files in target_dir_abs/weekly are updated for +all weeks where Sunday is in the date interval from first_date up to (and +including) last_date. + +For weekly aggregations, a week's total data is rescaled to 7 days. + +If a week under consideration contains no good date, it is removed. + +Upon any error, the function raises an exception. + +:param target_dir_abs: Absolute directory. CSVs are getting written to the +'weekly_rescaled' subdirectory of target_dir_abs. +:param dbname: The database name of the wiki to consider (E.g.: 'enwiki') +:param csv_data_input: The data dict to aggregate from +:param first_date: The first date to compute non-existing data for. +:param last_date: The last date to compute non-existing data for. +:param bad_dates: List of dates considered having bad data. (Default: []) +:param force_recomputation: If True, recompute data for the given days, +even if it is already in the CSV. 
(Default: False) + +csv_dir_abs = os.path.join(target_dir_abs, 'weekly_rescaled') +if not os.path.exists(csv_dir_abs): +os.mkdir(csv_dir_abs) +csv_file_abs = os.path.join(csv_dir_abs, dbname + '.csv') + +csv_data = util.parse_csv_to_first_column_dict(csv_file_abs) + +for date in util.generate_dates(first_date, last_date): +if date.weekday() == 6: # Sunday. End of ISO week +date_str = date.strftime('%YW%W') +logging.debug(Updating csv '%s' for date '%s' % ( +dbname, str(date))) +week_dates = set(date + datetime.timedelta(days=offset) + for offset in range(-6, 1)) +expected_good_dates = len(week_dates - set(bad_dates)) +need_recomputation = force_recomputation +need_recomputation |= expected_good_dates != 7 +need_recomputation |= date_str not in csv_data +if need_recomputation: +if expected_good_dates == 0: +try: +del csv_data[date_str] +except KeyError: +# No reading was there to remove. That's