Nuria has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/338786 )

Change subject: Add script to generate WSC abbrevs to domain map
......................................................................


Add script to generate WSC abbrevs to domain map

This script gets WMF's site matrix and generates a map of
webstatscollector abbreviations (en.zero.b, commons.m, etc.) to their
corresponding domain names (en.wikibooks.org, commons.wikimedia.org)
and access sites (zero, desktop, etc.). The output is a TSV that can
be used as the underlying data for a Hive table.
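
For illustration, a few of the resulting rows (tab-separated; the
same examples used in the Hive table's header comment):

    en          en.wikipedia.org     desktop
    de.m.b      de.wikibooks.org     mobile
    es.zero.d   es.wiktionary.org    zero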

Bug: T158330
Change-Id: I5d65ae9511d58d19497f025a8ba08b8a50475a7e
---
A bin/generate-domain-abbrev-map
A hive/domain_abbrev_map/create_domain_abbrev_map_table.hql
2 files changed, 237 insertions(+), 0 deletions(-)

Approvals:
  Nuria: Verified; Looks good to me, approved



diff --git a/bin/generate-domain-abbrev-map b/bin/generate-domain-abbrev-map
new file mode 100755
index 0000000..c0e4f42
--- /dev/null
+++ b/bin/generate-domain-abbrev-map
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Note: You should make sure to put refinery/python on your PYTHONPATH.
+#   export PYTHONPATH=$PYTHONPATH:/path/to/refinery/python
+#
+# Note: The resulting table is intended for the translation of historical
+#   data only. It therefore does not need to be updated with changes to
+#   the site matrix, and will not be run periodically.
+#
+# Adapted from Dan Andreescu's script:
+# https://github.com/wikimedia/analytics-refinery/blob/master/bin/download-project-namespace-map
+# Also, some structures and methods have been copied from:
+# https://github.com/wikimedia/analytics-aggregator/blob/master/aggregator/util.py
+
+"""
+Gets WMF's site matrix and outputs a TSV that maps webstatscollector
+abbreviations to their corresponding domain name and access site.
+
+Usage:
+  generate-domain-abbrev-map (--output-file FILE|--output-hdfs PATH)
+
+Options:
+    -h --help                           Show this help message and exit.
+    -o FILE --output-file FILE          Output the results here.
+    -x PATH --output-hdfs PATH          Output the results to HDFS here.
+"""
+
+import requests
+import json
+import csv
+import os
+
+from docopt import docopt
+from tempfile import mkstemp
+from subprocess import check_call
+
+
+WEBSTATSCOLLECTOR_WHITELISTED_WIKIMEDIA_WIKIS = [
+    'commons',
+    'meta',
+    'incubator',
+    'species',
+    'strategy',
+    'outreach',
+    'usability',
+    'quality',
+]
+
+WEBSTATSCOLLECTOR_SUFFIX_ABBREVIATIONS = [
+    # Using a list (not a dict), because order is important: we
+    # consider the first match a win.
+    ('foundationwiki', '.f'),
+    ('mediawikiwiki', '.w'),
+    ('wikidatawiki', '.wd'),
+    ('wikibooks', '.b'),
+    ('wiktionary', '.d'),
+    ('wikimedia', '.m'),
+    ('wikinews', '.n'),
+    ('wikiquote', '.q'),
+    ('wikisource', '.s'),
+    ('wikiversity', '.v'),
+    ('wikivoyage', '.voy'),
+    # Have generic wiki last
+    ('wiki', ''),
+]
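+# For example, 'wikidatawiki' ends with both 'wikidatawiki' and the
+# generic 'wiki'; listing ('wikidatawiki', '.wd') first yields the
+# abbreviation 'www.wd' rather than the incorrect 'wikidata'.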
+
+
+def get_wikis():
+    headers = {
+        'User-Agent': 'Wikimedia Foundation Analytics Bot',
+        'From': 'mfo...@wikimedia.org'
+    }
+    site_matrix_query = ''.join([
+        'https://www.mediawiki.org/w/api.php?action=sitematrix',
+        '&smsiteprop=url|dbname|code',
+        '&smstate=all',
+        '&format=json',
+    ])
+
+    matrix = requests.get(
+        site_matrix_query,
+        headers=headers,
+    ).json().get('sitematrix', {})
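+    # Sketch of the response shape, as consumed below ('count' and the
+    # per-language keys are assumptions about the sitematrix API):
+    #   {'count': N,
+    #    '0': {'code': ..., 'site': [{'url': ..., 'dbname': ..., ...}]},
+    #    ...
+    #    'specials': [{'url': ..., 'dbname': ..., ...}]}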
+
+    wikis = [
+        wiki
+        for language in matrix.values()
+        if type(language) is dict and 'site' in language
+        for wiki in language['site']
+    ] + [
+        wiki
+        for wiki in matrix.get('specials', [])
+    ]
+
+    return [
+        wiki
+        for wiki in wikis
+        if 'private' not in wiki
+    ]
+
+
+def dbname_to_webstatscollector_abbreviation(dbname, site='desktop'):
+    """
+    Gets the webstatscollector abbreviation for a site's database name
+
+    If no webstatscollector abbreviation could be found, None is returned.
+
+    :param dbname: The database name for the wiki (e.g.: 'enwiki')
+    :param site: The site to get the abbreviation for. Either 'desktop',
+        'mobile', or 'zero'. (Default: 'desktop')
+    """
+    for (dbname_ending, new_ending) in WEBSTATSCOLLECTOR_SUFFIX_ABBREVIATIONS:
+        if dbname.endswith(dbname_ending):
+            # replacing last occurrence of dbname's ending with new_ending
+            abbreviation = dbname.rsplit(dbname_ending, 1)[0] + new_ending
+
+            # dbnames use “_” where webstatscollector uses “-”.
+            abbreviation = abbreviation.replace('_', '-')
+
+            # prepend www if it is just the root project to catch things like
+            # wikidatawiki being served at www.wikidata.org
+            if abbreviation.startswith('.'):
+                abbreviation = "www" + abbreviation
+
+            # Fix-up for wikimedia.org wikis
+            if abbreviation in WEBSTATSCOLLECTOR_WHITELISTED_WIKIMEDIA_WIKIS:
+                abbreviation += ".m"
+
+            # Inject site modifier
+            if site != 'desktop':  # desktop has no modifier -> short-circuit
+                abbreviation_split = abbreviation.split('.')
+                if site == 'mobile':
+                    abbreviation_split.insert(1, 'm')
+                elif site == 'zero':
+                    abbreviation_split.insert(1, 'zero')
+
+                # fix-up mobile site where desktop site is www, like
+                # www.m.wd to m.wd
+                if abbreviation_split[0] == 'www':
+                    del abbreviation_split[0]
+
+                abbreviation = '.'.join(abbreviation_split)
+
+            return abbreviation
+    return None
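+
+# Illustrative expected mappings, hand-derived from the rules above
+# (not executed tests):
+#   dbname_to_webstatscollector_abbreviation('enwiki')                 -> 'en'
+#   dbname_to_webstatscollector_abbreviation('enwikibooks', 'zero')    -> 'en.zero.b'
+#   dbname_to_webstatscollector_abbreviation('commonswiki')            -> 'commons.m'
+#   dbname_to_webstatscollector_abbreviation('wikidatawiki', 'mobile') -> 'm.wd'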
+
+
+if __name__ == '__main__':
+    # Parse arguments
+    arguments = docopt(__doc__)
+    outfile = arguments['--output-file']
+    outhdfs = arguments['--output-hdfs']
+
+    # If we're outputting to hdfs, output to a temp file and copy up
+    output_to_hdfs = outhdfs is not None
+    if output_to_hdfs:
+        outfile = mkstemp()[1]
+
+    # Writes mapping as: (abbreviation, hostname, access_site)
+    # abbreviation  : en.m
+    # hostname      : en.wikipedia.org
+    # access_site   : mobile
+    with open(outfile, 'wb') as w:
+        tsvwriter = csv.writer(w, delimiter='\t')
+        access_sites = ['desktop', 'mobile', 'zero']
+
+        for wiki in get_wikis():
+            url = wiki.get('url', '')
+            hostname = url.replace('https://', '')
+            dbname = wiki.get('dbname', hostname)
+
+            for access_site in access_sites:
+                abbreviation = dbname_to_webstatscollector_abbreviation(
+                    dbname, access_site)
+                row = [
+                    abbreviation,
+                    hostname,
+                    access_site,
+                ]
+                tsvwriter.writerow([unicode(s).encode("utf-8") for s in row])
+
+    # Exports to HDFS if specified
+    if output_to_hdfs:
+        check_call([
+            'hdfs', 'dfs', '-mkdir', '-p',
+            outhdfs,
+        ])
+        check_call([
+            'hdfs', 'dfs', '-put', '-f',
+            outfile,
+            outhdfs + '/domain_abbrev_map.tsv',
+        ])
+        check_call([
+            'hdfs', 'dfs', '-touchz',
+            outhdfs + '/_SUCCESS',
+        ])
+        # clean up the temp file
+        os.remove(outfile)
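+
+# Resulting layout under the --output-hdfs PATH (as created above):
+#   PATH/domain_abbrev_map.tsv
+#   PATH/_SUCCESS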
diff --git a/hive/domain_abbrev_map/create_domain_abbrev_map_table.hql b/hive/domain_abbrev_map/create_domain_abbrev_map_table.hql
new file mode 100644
index 0000000..4209bd2
--- /dev/null
+++ b/hive/domain_abbrev_map/create_domain_abbrev_map_table.hql
@@ -0,0 +1,28 @@
+--
+-- Create table statement for domain abbreviation map
+--
+-- This is a helper table that permits (by joining to it) translating
+-- webstatscollector domain abbreviations into their respective full
+-- domains and access sites, i.e.:
+--
+--     en         <->  en.wikipedia.org, desktop
+--     de.m.b     <->  de.wikibooks.org, mobile
+--     es.zero.d  <->  es.wiktionary.org, zero
+--
+-- The contents of this table come from running the script
+-- bin/generate-domain-abbrev-map in this repository.
+--
+-- Usage
+--     hive -f create_domain_abbrev_map_table.hql --database wmf
+--
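+-- Example join (a sketch; 'pagecounts' below is a hypothetical table
+-- with a domain_abbrev column, not a reference to an existing one):
+--
+--     SELECT p.domain_abbrev, d.hostname, d.access_site
+--     FROM pagecounts p
+--     JOIN domain_abbrev_map d
+--       ON p.domain_abbrev = d.domain_abbrev;
+--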
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `domain_abbrev_map`(
+  `domain_abbrev`  string  COMMENT 'Webstatscollector domain abbreviation', 
+  `hostname`       string  COMMENT 'Full domain hostname (en.wikipedia.org)', 
+  `access_site`    string  COMMENT 'Accessed site (desktop|mobile|zero)'
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+LOCATION '/wmf/data/archive/domain_abbrev_map'
+;

-- 
To view, visit https://gerrit.wikimedia.org/r/338786
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I5d65ae9511d58d19497f025a8ba08b8a50475a7e
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Mforns <mfo...@wikimedia.org>
Gerrit-Reviewer: Joal <j...@wikimedia.org>
Gerrit-Reviewer: Mforns <mfo...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: Ottomata <ao...@wikimedia.org>
