EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403546 )
Change subject: Repoint spark in example_train.yaml ...................................................................... Repoint spark in example_train.yaml Mostly this makes it easier to push a branch over to stat1005 and try something out against full-sized data. Having example_train.yaml be "close enough" helps a good bit. Also update spark to 2.1.2 to match, and add a 'master' template so yarn/local can be toggled from spark.py command line Change-Id: Iccd44c0c9436287ba963a3c8b2244b3fa0a46274 --- M example_train.yaml A mjolnir/pruning.py A mjolnir/scan_es.py M mjolnir/test/fixtures/load_config/example_train.expect 4 files changed, 316 insertions(+), 100 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/46/403546/1 diff --git a/example_train.yaml b/example_train.yaml index 31ce45c..7b1749c 100644 --- a/example_train.yaml +++ b/example_train.yaml @@ -3,10 +3,11 @@ global: environment: PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: "%(HOME)s/spark-%(spark_version)s-bin-hadoop2.6" + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: "/usr/lib/spark2" template_vars: - spark_version: 2.1.0 + spark_version: 2.1.2 + master: yarn # Path to spark-submit applicatoin spark_submit: "%(SPARK_HOME)s/bin/spark-submit" # Local path to zip'd virtualenv which will be shipped to executors @@ -50,7 +51,7 @@ ? "%(spark_submit)s" ? "%(PYSPARK_PYTHON)s" spark_args: - master: yarn + master: "%(master)s" # TODO: When is this necessary? files: /usr/lib/libhdfs.so.0.0.0 # Ship the mjolnir virtualenv to executors and decompress it to ./venv diff --git a/mjolnir/pruning.py b/mjolnir/pruning.py new file mode 100644 index 0000000..c2e78f9 --- /dev/null +++ b/mjolnir/pruning.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import +import json +import math +from pyspark.sql import functions as F +from pyspark.sql.types import FloatType, StructField, StructType + + +class Split(object): + def __init__(self, left, right, feature, threshold): + self.left = left + self.right = right + self.feature = feature + self.threshold = threshold + + def isLeaf(self): + return False + + def eval(self, features): + n = self + while not n.isLeaf(): + if n.threshold > features[n.feature]: + n = n.left + else: + n = n.right + return n.output + + +class Leaf(object): + def __init__(self, output): + self.output = output + + def isLeaf(self): + return True + + +def _parse_node(json_node): + if 'leaf' in json_node: + return Leaf(json_node['leaf']) + else: + left = _parse_node(json_node['children'][0]) + right = _parse_node(json_node['children'][1]) + return Split(left, right, json_node['split'], json_node['split_condition']) + + +def parse_xgboost(json_tree): + return [_parse_node(tree) for tree in json.loads(json_tree)] + + +def ndcg_at_k(k, predicted, actual): + idcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(actual[:k])]) + if idcg == 0: + return 0. + else: + dcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(predicted[:k])]) + return dcg / idcg + + +# Horrible name ... it returns the ndcg for each removed tree +def gen_per_tree_ndcg(tree_cols, removed_trees, label_col, k=10): + def f(rows): + # Remove trees from the sum + cur_sum = [reduce(lambda acc, tree: acc - row[tree], removed_trees, row.sum) for row in rows] + data = zip(rows, cur_sum) + + # TODO: actual could be pre-calculated? Actually full idcg could be pre-calculated + actual = [x[0][label_col] for x in sorted(data, key=lambda x: x[0][label_col], reverse=True)] + # baseline ndcg + predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1], reverse=True)] + res = [ndcg_at_k(k, predicted, actual)] + # Per-tree ndcgs + for tree_pred in tree_cols: + predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1] - x[0][tree_pred], reverse=True)] + res.append(ndcg_at_k(k, predicted, actual)) + return res + fields = [StructField(name, FloatType()) for name in ['orig'] + tree_cols] + return F.udf(f, StructType(fields)) + + +def gen_eval_tree_udf(bc_trees): + def f(tree_id, features): + return bc_trees.value[tree_id].eval(features) + return F.udf(f, FloatType()) + + +def prune(df, trees, feature_col='features', label_col='label', group_cols=['wikiid', 'query']): + # Calculate per-tree scores + eval_tree_udf = gen_eval_tree_udf(df._sc.broadcast(trees)) + cols = [eval_tree_udf(F.lit(i), feature_col).alias('tree_%d' % (i)) for i in range(len(trees))] + tree_cols = ['tree_%d' % (i) for i in range(len(trees))] + + # We should iterate until it gets worse or we hit some desired # of trees + df_w_scores = ( + df + .select(feature_col, label_col, F.concat(*group_cols).alias('group_id')) + # Does the above select shrink the data size for this calculation? or would + # spark manage that anyways? + .repartition(200) + .select('group_id', label_col, *cols) + .withColumn('sum', reduce(lambda x, y: x + F.col(y), tree_cols, F.lit(0))) + .groupBy('group_id') + # This grouping makes it impossible to drop fields as we go ... + .agg(F.collect_list(F.struct('sum', 'label', *tree_cols)).alias('pages')) + .drop('group_id') + .cache()) + + try: + removed_trees = set() + results = [] + while len(removed_trees) < len(trees): + print 'Remaining steps: %d' % (len(trees) - len(removed_trees)) + if removed_trees: + remaining_tree_cols = [x for x in tree_cols if x not in removed_trees] + else: + remaining_tree_cols = tree_cols + + per_tree_ndcg_udf = gen_per_tree_ndcg(remaining_tree_cols, removed_trees, label_col) + + scores = ( + df_w_scores + .select(per_tree_ndcg_udf('pages').alias('per_tree_scores')) + # Average together each query group + .agg(*[F.avg('per_tree_scores.%s' % (tree_col)).alias('%s_ndcg' % (tree_col)) + for tree_col in ['orig'] + remaining_tree_cols]) + .collect())[0] + + # Record score before any mucking around + if len(results) == 0: + results.append(('original', scores.orig_ndcg)) + worst_tree, best_score = sorted(zip(remaining_tree_cols, scores[1:]), key=lambda x: x[1], reverse=True)[0] + removed_trees.add(worst_tree) + results.append((worst_tree, best_score)) + return results + finally: + df_w_scores.unpersist() diff --git a/mjolnir/scan_es.py b/mjolnir/scan_es.py new file mode 100644 index 0000000..67f22cc --- /dev/null +++ b/mjolnir/scan_es.py @@ -0,0 +1,81 @@ +from __future__ import absolute_import +import json +import mjolnir.spark +import numpy as np +from pyspark.ml.linalg import Vectors, VectorUDT +import pyspark.sql.types +from pyspark.sql import functions as F +import random +import requests + + +def read_fields(sc, n_slices, fields, size=500): + def read_slice(id): + hosts = ['elastic%d.codfw.wmnet' % i for i in range(2001, 2037)] + host = random.choice(hosts) + url = 'http://%s:9200/enwiki_content/page/_search?scroll=1m' % (host) + scroll_url = 'http://%s:9200/_search/scroll' % (host) + query = { + "query": { + "match_all": {}, + }, + "sort": ["_doc"], + "size": size, + "_source": fields, + } + if n_slices > 1: + query['slice'] = { + "id": id, + "max": n_slices + } + session = requests.Session() + res = session.get(url, data=json.dumps(query)).json() + while len(res['hits']['hits']) > 0: + for hit in res['hits']['hits']: + yield [hit['_id']] + [hit['_source'][field] if field in hit['_source'] else [] for field in fields] + res = session.get(scroll_url, data=json.dumps({ + "scroll": "1m", + "scroll_id": res['_scroll_id'], + })).json() + + fields_schema = [ + pyspark.sql.types.StructField("page_id", pyspark.sql.types.StringType()) + ] + for field in fields: + fields_schema.append(pyspark.sql.types.StructField( + field, pyspark.sql.types.ArrayType(pyspark.sql.types.StringType()))) + + return sc.parallelize(range(n_slices), n_slices).flatMap(read_slice).toDF( + pyspark.sql.types.StructType(fields_schema)) + + +def take_imporant(df_feat, df_es, field, min_count=1000): + rows = ( + df_es + .select(F.explode(field).alias(field)) + .groupBy(field) + .agg(F.count(F.lit(1)).alias('count')) + .where(F.col('count') > min_count) + .collect()) + row_map = {row[field]: i for i, row in enumerate(rows)} + + df_es_feat = ( + df_es + .rdd.map(lambda row: ( + row.page_id, + Vectors.sparse( + len(row_map), + [[row_map[item], True] for item in row[field] if item in row_map]) + )) + .toDF(['hit_page_id', 'es_features'])) + + merge_feats = F.udf(lambda a, b: Vectors.dense(np.append(a.toArray(), b.toArray())), VectorUDT()) + + return ( + df_feat + .join(df_es_feat, how='inner', on=['hit_page_id']) + .withColumn('features', merge_feats('features', 'es_features')) + .drop('es_features') + .withColumn('features', mjolnir.spark.add_meta(df_feat._sc, F.col('features'), { + 'features': df_feat.schema['features'].metadata['features'] + [row[field] for row in rows], + }))) diff --git a/mjolnir/test/fixtures/load_config/example_train.expect b/mjolnir/test/fixtures/load_config/example_train.expect index c56d132..52bea1b 100644 --- a/mjolnir/test/fixtures/load_config/example_train.expect +++ b/mjolnir/test/fixtures/load_config/example_train.expect @@ -10,20 +10,20 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: data_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null dir_not_exist: !!set /mnt/hdfs/user/pytest/mjolnir/marker: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -31,9 +31,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -43,16 +43,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -60,9 +60,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -72,16 +72,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -89,9 +89,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -104,19 +104,19 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: training_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null /home/pytest/training_size: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -125,9 +125,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -149,20 +149,20 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: data_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null dir_not_exist: !!set /mnt/hdfs/user/pytest/mjolnir/marker: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -170,9 +170,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -182,16 +182,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -199,9 +199,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -211,16 +211,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -228,9 +228,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -247,19 +247,19 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: training_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null /home/pytest/training_size: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -268,9 +268,9 @@ executor-memory: 4G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -294,20 +294,20 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: data_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null dir_not_exist: !!set /mnt/hdfs/user/pytest/mjolnir/marker: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -315,9 +315,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -327,16 +327,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -344,9 +344,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -356,16 +356,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -373,9 +373,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -392,19 +392,19 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: training_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null /home/pytest/training_size: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -413,9 +413,9 @@ executor-memory: 3G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -441,20 +441,20 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: data_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null dir_not_exist: !!set /mnt/hdfs/user/pytest/mjolnir/marker: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -462,9 +462,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -474,16 +474,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -491,9 +491,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -503,16 +503,16 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -520,9 +520,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/pyspark + spark_command: /usr/lib/spark2/bin/pyspark spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 @@ -539,19 +539,19 @@ environment: HOME: /home/pytest PYSPARK_PYTHON: venv/bin/python - SPARK_CONF_DIR: /etc/spark/conf - SPARK_HOME: /home/pytest/spark-2.1.0-bin-hadoop2.6 + SPARK_CONF_DIR: /etc/spark2/conf + SPARK_HOME: /usr/lib/spark2 USER: pytest mjolnir_utility: training_pipeline mjolnir_utility_path: /srv/mjolnir/venv/bin/mjolnir-utilities.py paths: dir_exist: !!set - /etc/spark/conf: null + /etc/spark2/conf: null /home/pytest/training_size: null file_exist: !!set - /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit: null /srv/mjolnir/mjolnir_venv.zip: null /srv/mjolnir/venv/bin/mjolnir-utilities.py: null + /usr/lib/spark2/bin/spark-submit: null venv/bin/python: null spark_args: archives: /srv/mjolnir/mjolnir_venv.zip#venv @@ -560,9 +560,9 @@ executor-memory: 2G files: /usr/lib/libhdfs.so.0.0.0 master: yarn - packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0 + packages: ml.dmlc:xgboost4j-spark:0.7-wmf-1,org.wikimedia.search:mjolnir:0.2,org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.2 repositories: https://archiva.wikimedia.org/repository/releases,https://archiva.wikimedia.org/repository/snapshots,https://archiva.wikimedia.org/repository/mirrored - spark_command: /home/pytest/spark-2.1.0-bin-hadoop2.6/bin/spark-submit + spark_command: /usr/lib/spark2/bin/spark-submit spark_conf: spark.driver.extraJavaOptions: -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080 -- To view, visit https://gerrit.wikimedia.org/r/403546 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iccd44c0c9436287ba963a3c8b2244b3fa0a46274 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits