Alex Monk has uploaded a new change for review. https://gerrit.wikimedia.org/r/295607
Change subject: [WIP/POC/POS] Add python version of maintain-replicas script ...................................................................... [WIP/POC/POS] Add python version of maintain-replicas script This does not include functionality that was split into maintain-meta_p.py last year. This (so far) deals with the following issues: * It was written in Perl. * Configuration was part of the script. * It tried to parse ~/.my.cnf instead of getting them from it's own configuration. * Configuration file couldn't be specified on the CLI It still needs to: * Allow specifying one database to work on only. * BE TESTED. * Be commented more * Do less work in file scope? * Pause and ask for confirmation of things * Have a "dry run" mode. * Be more verbose. * Be more helpful to the user in case of failure. Change-Id: Icc51fc9a23f2fcf4c19c7adc092662d7122471e1 --- A maintain-replicas/config.json A maintain-replicas/maintain-replicas.py 2 files changed, 312 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/software refs/changes/07/295607/1 diff --git a/maintain-replicas/config.json b/maintain-replicas/config.json new file mode 100644 index 0000000..8f9d3ff --- /dev/null +++ b/maintain-replicas/config.json @@ -0,0 +1,158 @@ +{ + "slices": [ + ["labsdb1001.eqiad.wmnet", 3306], + ["labsdb1002.eqiad.wmnet", 3306], + ["labsdb1003.eqiad.wmnet", 3306] + ], + "mysql_user": "root", + "mysql_password": "password", + "fullviews": [ + "abuse_filter_action", "abuse_filter_history", + "aft_article_answer", "aft_article_answer_text", + "aft_article_feedback_properties", "aft_article_feedback_ratings_rollup", + "aft_article_feedback_select_rollup", "aft_article_field", "aft_article_field_group", + "aft_article_field_option", "aft_article_filter_count", "aft_article_revision_feedback_ratings_rollup", + "aft_article_revision_feedback_select_rollup", "article_assessment", "article_assessment_pages", + "article_assessment_ratings", "article_feedback", "article_feedback_pages", + "article_feedback_properties", "article_feedback_ratings", "article_feedback_revisions", + "article_feedback_stats", "article_feedback_stats_types", "category", "categorylinks", "change_tag", + "ep_articles", "ep_cas", "ep_courses", "ep_events", "ep_instructors", "ep_oas", "ep_orgs", + "ep_revisions", "ep_students", "ep_users_per_course", "externallinks", "flaggedimages", + "flaggedpage_config", "flaggedpage_pending", "flaggedpages", "flaggedrevs", "flaggedrevs_promote", + "flaggedrevs_statistics", "flaggedrevs_stats", "flaggedrevs_stats2", "flaggedrevs_tracking", + "flaggedtemplates", "geo_killlist", "geo_tags", "geo_updates", "globalimagelinks", "global_block_whitelist", + "hashs", "hitcounter", "image", "imagelinks", "imagelinks_old", "interwiki", "iwlinks", + "l10n_cache", "langlinks", "links", "localisation", "localisation_file_hash", + "math", "module_deps", "msg_resource_links", "namespaces", + "page", "page_broken", "pagelinks", "page_props", "page_restrictions", "pagetriage_log", + "pagetriage_page", "pagetriage_page_tags", "pagetriage_tags", "pif_edits", "povwatch_log", + "povwatch_subscribers", "protected_titles", "pr_index", "redirect", "renameuser_status", "site_identifiers", + "sites", "site_stats", "tag_summary", "templatelinks", "transcode", "updatelog", "updates", + "user_former_groups", "user_groups", "valid_tag", "wikilove_image_log", + "wikilove_log", "global_group_permissions", "global_group_restrictions", "global_user_groups", + "globalblocks", "localuser", "wikiset", "wb_changes", "wb_changes_dispatch", "wb_changes_subscription", + "wb_entity_per_page", "wb_id_counters", "wb_items_per_site", "wb_property_info", "wb_terms", + "wbc_entity_usage", "wbs_propertypairs" + ], + "logging_whitelist": [ + "gblrights", "globalauth", "close", "lock", "gblblock", "managetags", "gather", "campus", + "institution", "instructor", "online", "merge", "massmessage", "course", "stable", "import", + "abusefilter", "student", "eparticle", "rights", "pagetriage-deletion", "protect", + "pagetriage-curation", "thanks", "renameuser", "review", "block", "upload", "move", + "delete", "patrol", "newusers" + ], + "customviews": { + "abuse_filter": { + "source": "abuse_filter", + "view": "select af_id, if(af_hidden,null,af_pattern) as af_pattern, af_user, af_user_text, af_timestamp, af_enabled, if(af_hidden,null,af_comments) as af_comments, af_public_comments, af_hidden, af_hit_count, af_throttled, af_deleted, af_actions, af_global, af_group" + }, + "abuse_filter_log": { + "source": "abuse_filter_log", + "view": "select afl_id, afl_filter, afl_user, afl_user_text, NULL as afl_ip, afl_action, afl_actions, afl_var_dump, afl_timestamp, afl_namespace, afl_title, afl_wiki, afl_deleted, afl_patrolled_by, afl_rev_id, afl_log_id" + }, + "aft_article_feedback": { + "source": "aft_article_feedback", + "view": "select af_id, af_page_id, af_user_id, NULL as af_user_ip, af_user_anon_token, af_revision_id, af_cta_id, af_link_id, af_created, af_abuse_count, af_helpful_count, af_unhelpful_count, af_oversight_count, af_is_deleted, af_is_hidden, af_net_helpfulness, af_has_comment, af_is_unhidden, af_is_undeleted, af_is_declined, af_activity_count, af_form_id, af_experiment, af_suppress_count, af_last_status, af_last_status_user_id, af_last_status_timestamp, af_is_autohide, af_is_unrequested, af_is_featured, af_is_unfeatured, af_is_resolved, af_is_unresolved, af_relevance_score, af_relevance_sort, af_last_status_notes" + }, + "archive": { + "source": "archive", + "view": "select ar_id, ar_namespace, ar_title, NULL as ar_text, NULL as ar_comment, if(ar_deleted&4,null,ar_user) as ar_user, if(ar_deleted&4,null,ar_user_text) as ar_user_text, ar_timestamp, ar_minor_edit, ar_flags, ar_rev_id, if(ar_deleted&1,null,ar_text_id) as ar_text_id, ar_deleted, if(ar_deleted&1,null,ar_len) as ar_len, ar_page_id, ar_parent_id, if(ar_deleted&1,null,ar_sha1) as ar_sha1" + }, + "archive_userindex": { + "source": "archive", + "view": "select ar_id, ar_namespace, ar_title, NULL as ar_text, NULL as ar_comment, ar_user, ar_user_text, ar_timestamp, ar_minor_edit, ar_flags, ar_rev_id, if(ar_deleted&1,null,ar_text_id) as ar_text_id, ar_deleted, if(ar_deleted&1,null,ar_len) as ar_len, ar_page_id, ar_parent_id, if(ar_deleted&1,null,ar_sha1) as ar_sha1", + "where": "(ar_deleted&4)=0" + }, + "filearchive": { + "source": "filearchive", + "view": "select fa_id, fa_name, fa_archive_name, fa_storage_group, NULL as fa_storage_key, fa_deleted_user, fa_deleted_timestamp, fa_deleted_reason, if(fa_deleted&1,null,fa_size) as fa_size, if(fa_deleted&1,null,fa_width) as fa_width, if(fa_deleted&1,null,fa_height) as fa_height, if(fa_deleted&1,null,fa_metadata) as fa_metadata, if(fa_deleted&1,null,fa_bits) as fa_bits, if(fa_deleted&1,null,fa_media_type) as fa_media_type, if(fa_deleted&1,null,fa_major_mime) as fa_major_mime, if(fa_deleted&1,null,fa_minor_mime) as fa_minor_mime, if(fa_deleted&2,null,fa_description) as fa_description, if(fa_deleted&4,null,fa_user)as fa_user, if(fa_deleted&4,null,fa_user_text) as fa_user_text, fa_timestamp, fa_deleted, if(fa_deleted&1,null,fa_sha1) as fa_sha1" + }, + "filearchive_userindex": { + "source": "filearchive", + "view": "select fa_id, fa_name, fa_archive_name, fa_storage_group, NULL as fa_storage_key, fa_deleted_user, fa_deleted_timestamp, fa_deleted_reason, if(fa_deleted&1,null,fa_size) as fa_size, if(fa_deleted&1,null,fa_width) as fa_width, if(fa_deleted&1,null,fa_height) as fa_height, if(fa_deleted&1,null,fa_metadata) as fa_metadata, if(fa_deleted&1,null,fa_bits) as fa_bits, if(fa_deleted&1,null,fa_media_type) as fa_media_type, if(fa_deleted&1,null,fa_major_mime) as fa_major_mime, if(fa_deleted&1,null,fa_minor_mime) as fa_minor_mime, if(fa_deleted&2,null,fa_description) as fa_description, fa_user, fa_user_text, fa_timestamp, fa_deleted, if(fa_deleted&1,null,fa_sha1) as fa_sha1", + "where": "(fa_deleted&4)=0" + }, + "globaluser": { + "source": "globaluser", + "view": "select gu_id, gu_name, gu_enabled, gu_enabled_method, gu_home_db, NULL as gu_email, NULL as gu_email_authenticated, NULL as gu_salt, NULL as gu_password, gu_locked, gu_hidden, gu_registration, NULL as gu_password_reset_key, NULL as gu_password_reset_expiration, NULL as gu_auth_token", + "where": "gu_hidden=''" + }, + "ipblocks": { + "source": "ipblocks", + "view": "select ipb_id, if(ipb_auto<>0,null,ipb_address) as ipb_address, ipb_user, ipb_by, ipb_reason, ipb_timestamp, ipb_auto, ipb_anon_only, ipb_create_account, ipb_expiry, if(ipb_auto<>0,null,ipb_range_start) as ipb_range_start, if(ipb_auto<>0,null,ipb_range_end) as ipb_range_end, ipb_enable_autoblock, ipb_deleted, ipb_block_email, ipb_by_text, ipb_allow_usertalk, ipb_parent_block_id", + "where": "ipb_deleted=0" + }, + "ipblocks_ipindex": { + "source": "ipblocks", + "view": "select ipb_id, ipb_address, ipb_user, ipb_by, ipb_reason, ipb_timestamp, ipb_auto, ipb_anon_only, ipb_create_account, ipb_expiry, ipb_range_start, ipb_range_end, ipb_enable_autoblock, ipb_deleted, ipb_block_email, ipb_by_text, ipb_allow_usertalk, ipb_parent_block_id", + "where": "ipb_deleted=0 and ipb_auto=0" + }, + "logging": { + "source": "logging", + "view": "select log_id, log_type, if(log_deleted&1,null,log_action) as log_action, log_timestamp, if(log_deleted&4,null,log_user) as log_user, if(log_deleted&1,null,log_namespace) as log_namespace, if(log_deleted&1,null,log_title) as log_title, if(log_deleted&2,null,log_comment) as log_comment, if(log_deleted,null,log_params) as log_params, log_deleted, if(log_deleted&4,null,log_user_text) as log_user_text, if(log_deleted&1,null,log_page) as log_page" + }, + "logging_logindex": { + "source": "logging", + "view": "select log_id, log_type, log_action, log_timestamp, if(log_deleted&4,null,log_user) as log_user, log_namespace, log_title, if(log_deleted&2,null,log_comment) as log_comment, if(log_deleted,null,log_params) as log_params, log_deleted, if(log_deleted&4,null,log_user_text) as log_user_text, log_page" + }, + "logging_userindex": { + "source": "logging", + "view": "select log_id, log_type, if(log_deleted&1,null,log_action) as log_action, log_timestamp, log_user, if(log_deleted&1,null,log_namespace) as log_namespace, if(log_deleted&1,null,log_title) as log_title, if(log_deleted&2,null,log_comment) as log_comment, if(log_deleted,null,log_params) as log_params, log_deleted, log_user_text as log_user_text, if(log_deleted&1,null,log_page) as log_page" + }, + "mark_as_helpful": { + "source": "mark_as_helpful", + "view": "select mah_id, mah_type, mah_item, mah_user_id, mah_user_editcount, mah_namespace, mah_title, mah_timestamp, NULL as mah_system_type, NULL as mah_user_agent, NULL as mah_locale" + }, + "oldimage": { + "source": "oldimage", + "view": "select oi_name, oi_archive_name, oi_size, oi_width, oi_height, oi_bits, if(oi_deleted&2,null,oi_description) as oi_description, if(oi_deleted&4,null,oi_user) as oi_user, if(oi_deleted&4,null,oi_user_text) as oi_user_text, oi_timestamp, oi_metadata, oi_media_type, oi_major_mime, oi_minor_mime, oi_deleted, oi_sha1" + }, + "oldimage_userindex": { + "source": "oldimage", + "view": "select oi_name, oi_archive_name, oi_size, oi_width, oi_height, oi_bits, if(oi_deleted&2,null,oi_description) as oi_description, oi_user, oi_user_text, oi_timestamp, oi_metadata, oi_media_type, oi_major_mime, oi_minor_mime, oi_deleted, oi_sha1", + "where": "(oi_deleted&4)=0" + }, + "recentchanges": { + "source": "recentchanges", + "view": "select rc_id, rc_timestamp, if(rc_deleted&4,null,rc_user) as rc_user, if(rc_deleted&4,null,rc_user_text) as rc_user_text, rc_namespace, rc_title, if(rc_deleted&2,null,rc_comment) as rc_comment, rc_minor, rc_bot, rc_new, rc_cur_id, rc_this_oldid, rc_last_oldid, rc_type, rc_source, rc_patrolled, null as rc_ip, rc_old_len, rc_new_len, rc_deleted, rc_logid, rc_log_type, rc_log_action, rc_params" + }, + "recentchanges_userindex": { + "source": "recentchanges", + "view": "select rc_id, rc_timestamp, rc_user, rc_user_text, rc_namespace, rc_title, if(rc_deleted&2,null,rc_comment) as rc_comment, rc_minor, rc_bot, rc_new, rc_cur_id, rc_this_oldid, rc_last_oldid, rc_type, rc_source, rc_patrolled, null as rc_ip, rc_old_len, rc_new_len, rc_deleted, rc_logid, rc_log_type, rc_log_action, rc_params", + "where": "(rc_deleted&4)=0" + }, + "revision": { + "source": "revision", + "view": "select rev_id, rev_page, if(rev_deleted&1,null,rev_text_id) as rev_text_id, if(rev_deleted&2,null,rev_comment) as rev_comment, if(rev_deleted&4,null,rev_user) as rev_user, if(rev_deleted&4,null,rev_user_text) as rev_user_text, rev_timestamp, rev_minor_edit, rev_deleted, if(rev_deleted&1,null,rev_len) as rev_len, rev_parent_id, if(rev_deleted&1,null,rev_sha1) as rev_sha1, rev_content_model, rev_content_format" + }, + "revision_userindex": { + "source": "revision", + "view": "select rev_id, rev_page, if(rev_deleted&1,null,rev_text_id) as rev_text_id, if(rev_deleted&2,null,rev_comment) as rev_comment, rev_user, rev_user_text, rev_timestamp, rev_minor_edit, rev_deleted, if(rev_deleted&1,null,rev_len) as rev_len, rev_parent_id, if(rev_deleted&1,null,rev_sha1) as rev_sha1, rev_content_model, rev_content_format", + "where": "(rev_deleted&4)=0" + }, + "user": { + "source": "user", + "view": "select user_id, user_name, user_real_name, NULL as user_password, NULL as user_newpassword, NULL as user_email, NULL as user_options, NULL as user_touched, NULL as user_token, NULL as user_email_authenticated, NULL as user_email_token, NULL as user_email_token_expires, user_registration, NULL as user_newpass_time, user_editcount, NULL as user_password_expires" + }, + "user_old": { + "source": "user_old", + "view": "select user_id, user_name, NULL as user_password, NULL as user_newpassword, NULL as user_email, NULL as user_options, NULL as user_newtalk, NULL as user_touched, user_real_name, NULL as user_token" + }, + "user_properties": { + "source": "user_properties", + "view": "select up_user, up_property, up_value", + "where": "up_property in ( 'disablemail', 'fancysig', 'gender', 'language', 'nickname', 'skin', 'timecorrection', 'variant' )" + }, + "user_properties_anon": { + "limit": 2, + "source": [ "user_properties", "user", "meta_p.properties_anon_whitelist" ], + "view": "select cast(extract(year_month from user_touched)*100+1 as date) upa_touched, up_property, up_value", + "where": "user_id=up_user and up_property like pw_property" + }, + "watchlist_counts": { + "source": "watchlist", + "view" : "select count(*) as wl_count, wl_namespace, wl_title", + "group" : "wl_namespace, wl_title having wl_count >= 30" + } + } +} \ No newline at end of file diff --git a/maintain-replicas/maintain-replicas.py b/maintain-replicas/maintain-replicas.py new file mode 100644 index 0000000..d4f30a7 --- /dev/null +++ b/maintain-replicas/maintain-replicas.py @@ -0,0 +1,154 @@ +#! /usr/bin/python3 +# -*- coding: utf-8 -*- + +# Based on work by Marc-André Pelletier, ported to Python by Alex Monk +# Copyright © 2016 Alex Monk <a...@wikimedia.org> +# Copyright © 2015 Alex Monk <kren...@gmail.com> +# Copyright © 2013 Marc-André Pelletier <mpellet...@wikimedia.org> +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +## +## This script maintains the databases containing sanitized views to +## the replicated databases (in the form <db>_p for every <db>) +## +## By default, it processes every shard but it accepts a list of +## databases to process +## +## The script expects to be invoked in a fresh copy of +## operations/mediawiki-config where it will get most of its +## information. +## + +import argparse +import json +import logging +import pymysql +import re +import subprocess +import sys + +if __name__ == "__main__": + argparser = argparse.ArgumentParser('maintain-replicas', + description='Maintain labs sanitized views of replica databases') + argparser.add_argument( + '--config-location', + help='Path to find the configuration file', + default='config.json' + ) + args = argparser.parse_args() + with open(args.config_location) as f: + config = json.load(f) + + # Hacks + safelog = "log_type IN ('" + "', '".join(config['logging_whitelist']) + "')" + customviews = config['customviews'] + customviews['logging']['where'] = safelog + customviews['logging_logindex']['where'] = "(log_deleted&1)=0 and " + safelog + customviews['logging_userindex']['where'] = "(log_deleted&4)=0 and " + safelog + + logging.info("Running git pull in mediawiki-config") + subprocess.call(["git", "pull"], cwd = "mediawiki-config") + dbs = {db : {} for db in open('mediawiki-config/dblists/all.dblist').read().splitlines()} + def read_list(list_fname, prop, val): + for db in open('mediawiki-config/dblists/' + list_fname + '.dblist').read().splitlines(): + if db in dbs: + dbs[db][prop] = val + + read_list("deleted", "deleted", True) + read_list("private", "private", True) + read_list("small", "size", 1) + read_list("medium", "size", 2) + read_list("large", "size", 3) + + logging.info("Got all necessary info, starting to connect to slices") + for dbhost, dbport in config['slices']: + dbh = pymysql.connect( + host=dbhost, + port=dbport, + user=config['mysql_user'], + passwd=config['mysql_password'], + charset='utf8' + ) + cursor = dbh.cursor() + logging.info("Connected to {}:{}...".format(dbhost, dbport)) + cursor.execute("SET NAMES 'utf8';") + for db, db_info in dbs: + if 'deleted' in db_info and db_info['deleted']: + continue + if 'private' in db_info and db_info['private']: + continue + cursor.execute("CREATE DATABASE IF NOT EXISTS {}_p;".format(db)) + + logging.info("Full views for {}:".format(db)) + for view in config['fullviews']: + cursor.execute(""" + SELECT table_name + FROM information_schema.tables + WHERE table_name='{}' AND table_schema='{}'; + """.format(view, db)) + result, = cursor.fetchone() + if result == 1: + logging.info("[{}] ".format(view)) + cursor.execute(""" + CREATE OR REPLACE + DEFINER=viewmaster + VIEW {0}_p.{1} + AS SELECT * FROM {0}.{1}; + """.format(db, view)) + + + logging.info("Custom views for {}:".format(db)) + for view_name, view_details in customviews.items(): + if view_details['limit'] > db_info['size']: + continue + + sources = list(view_details['source']) + sources_checked = [] + + for source in sources: + match = re.match(r'^(?:(.*)\.)?([^.]+)$', source) + if not match: + # TODO: Do we really want to do this? + logging.critical("Custom view source does not look valid! Source: {}, view: {}".format(source, view_name)) + sys.exit(1) + source_db, source_table = match.groups() + if source_db is None: + source_db = db + cursor.execute(""" + SELECT table_name + FROM information_schema.tables + WHERE table_name='{}' AND table_schema='{}'; + """.format(table, db)) + result, = cursor.fetchone() + if result == 1: + sources_checked.append("{}.{}".format(source_db, source_table)) + + if len(sources) != len(sources_checked): + continue + + query = """ + CREATE OR REPLACE + DEFINER=viewmaster + VIEW ${dbk}_p.$view + AS {} + FROM {} + """.format(db, customview['view'], ','.join(sources)) + + if 'where' in view_details: + query += " WHERE {}\n".format(view_details['where']) + if 'group' in view_details: + query += "GROUP BY {}\n".format(view_details['group']) + cursor.execute("{};".format(query)) + +logging.info("All done.") \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/295607 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Icc51fc9a23f2fcf4c19c7adc092662d7122471e1 Gerrit-PatchSet: 1 Gerrit-Project: operations/software Gerrit-Branch: master Gerrit-Owner: Alex Monk <kren...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits