[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Fixup xgboost training
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406068 ) Change subject: Fixup xgboost training .. Fixup xgboost training * Use tree_method = hist when training on a single worker. This is significantly faster than the approx method used by default. * We've always trained with dense feature matrices, and the ltr plugin only supports dense evaluation, but the DataWriter was writing out sparse matrices. This caused a degradation in ndcg. * The txt file emitted by DataWriter has to be read by lightgbm and xgboost. As such it starts features at idx 1 to make lightgbm happy (which stores the label at idx 0). This broke XGBoostModel.eval because it was not providing the empty feature at index 0 that training sees. * XGBoostModel.loadModelFrom* always failed because the summary method on the jvm side throws an exception (metrics are not serialized). Wrap in try/except and set summary to None when not available. Change-Id: I48d7bf96a300313b6f62e3f60742345e8bd1a83f --- M jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala M jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala M mjolnir/training/xgboost.py M mjolnir/utilities/make_folds.py 4 files changed, 10 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/68/406068/1 diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala index 8d6976b..01fcf6d 100644 --- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala @@ -28,7 +28,7 @@ ) extends Serializable { // Accepting JavaSparkContext for py4j compatability - def this(sc: JavaSparkContext) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))) + def this(sc: JavaSparkContext, sparse: Boolean) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)), sparse) private def asHDFSPath(path: String): HDFSPath = if (path.charAt(0) == '/') { new HDFSPath(s"file://$path") diff --git a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala index ca85260..06004a7 100644 --- a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala +++ b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala @@ -44,7 +44,7 @@ try { val df = makeData() val pattern = s"$testDir/%s-fold-%s-partition-%d" -val writer = new DataWriter(spark.sparkContext) +val writer = new DataWriter(spark.sparkContext, true) val folds = writer.write(df, numWorkers, pattern, foldCol) assert(folds.length == expectedFolds) diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py index abeaabf..4d6bb9d 100644 --- a/mjolnir/training/xgboost.py +++ b/mjolnir/training/xgboost.py @@ -4,6 +4,7 @@ import mjolnir.training.hyperopt from mjolnir.training.tuning import make_cv_objective, ModelSelection import numpy as np +import py4j import pyspark import pyspark.sql from pyspark.sql import functions as F @@ -114,6 +115,8 @@ # ints, so this gets all the types right for Java. Also makes # a copy of params so we don't modifying the incoming dict. params = _coerce_params(params) +# Histogram doesn't work with distributed training +params['tree_method'] = 'hist' if len(fold) == 1 else 'approx' # TODO: Maybe num_rounds should just be external? 
But it's easier # to do hyperparameter optimization with a consistent dict interface kwargs = { @@ -158,7 +161,10 @@ class XGBoostModel(object): def __init__(self, j_xgb_model): self._j_xgb_model = j_xgb_model -self.summary = XGBoostSummary(self._j_xgb_model.summary()) +try: +self.summary = XGBoostSummary(self._j_xgb_model.summary()) +except py4j.protocol.Py4JJavaError: +self.summary = None @staticmethod def trainWithFiles(fold, train_matrix, params, num_rounds=100, diff --git a/mjolnir/utilities/make_folds.py b/mjolnir/utilities/make_folds.py index c7ac04d..5cbd682 100644 --- a/mjolnir/utilities/make_folds.py +++ b/mjolnir/utilities/make_folds.py @@ -64,7 +64,7 @@ write_xgb(local_input, local_output.name) # Write out as text files from scala, much faster than shuffling to python -writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc) +writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc, False) j_paths = writer.write(df._jdf, num_workers, path_format, fold_col) # Convert everything to python objects -- To view, visit https://gerrit.wikimedia.org/r/406068 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
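The feature-offset problem called out in the third bullet above is worth spelling out: the emitted txt files start real features at index 1 (index 0 is effectively reserved so lightgbm can keep the label there), so a model trained from those files expects every in-memory vector to carry a dummy feature at index 0. A minimal sketch of the adjustment in plain Python, with a hypothetical helper name (the real fix shifts Spark ML vectors on the Scala side):

def shift_features_right(vec):
    # Prepend a placeholder so in-memory feature i lines up with the
    # feature index i + 1 used by the emitted txt files.
    return [0.0] + list(vec)

# A row written as "2 qid:1 1:0.3 2:0.7" has its first real feature at index 1,
# so the vector [0.3, 0.7] must be scored as [0.0, 0.3, 0.7] to match training.
assert shift_features_right([0.3, 0.7]) == [0.0, 0.3, 0.7]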
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Pull make_cv_objective outside tuner
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406067 ) Change subject: Pull make_cv_objective outside tuner .. Pull make_cv_objective outside tuner This really had no business in tuner, it's function is independant and it didn't require any of the state. Adds a test that verifies the function works roughly as expected. Also drop the 'condition' argument from tuner stages. A standard if condition should be used when building the stage list. Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a --- M mjolnir/test/training/test_tuning.py M mjolnir/training/tuning.py M mjolnir/training/xgboost.py M mjolnir/utils.py 4 files changed, 72 insertions(+), 63 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/67/406067/1 diff --git a/mjolnir/test/training/test_tuning.py b/mjolnir/test/training/test_tuning.py index 22402f1..15389d7 100644 --- a/mjolnir/test/training/test_tuning.py +++ b/mjolnir/test/training/test_tuning.py @@ -46,7 +46,7 @@ } tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages) -train_func = tuner.make_cv_objective(f, folds, num_cv_jobs, **kwargs) +train_func = mjolnir.training.tuning.make_cv_objective(f, folds, num_cv_jobs, **kwargs) trials_pool = tuner.build_pool(folds, num_cv_jobs) result = tuner(train_func, trials_pool) return result, stats['called'] @@ -80,39 +80,14 @@ assert result['params']['baz'] == 0 -def test_ModelSelection_stage_condition(): -num_iterations = 3 -result, called = run_model_selection([ -('a', { -'condition': lambda: False, -'iterations': num_iterations, -'space': { -'foo': hyperopt.hp.uniform('foo', 1, 9), -} -}), -('b', { -'iterations': num_iterations, -'space': { -'bar': hyperopt.hp.uniform('bar', 1, 9), -} -}), -]) -# iterations * folds -assert called == num_iterations * 2 -assert result['params']['foo'] == 10 -assert 1 <= result['params']['bar'] <= 9 -assert result['params']['baz'] == 0 - - def test_ModelSelection_kwargs_pass_thru(): -tuner = mjolnir.training.tuning.ModelSelection(None, None) expected_kwargs = {'hi': 5, 'there': 'test'} def f(fold, params, **kwargs): assert kwargs == expected_kwargs return {'test': [fold[0]], 'train': [fold[0]]} -obj = tuner.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs) +obj = mjolnir.training.tuning.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs) res = obj(None) assert res == [ @@ -144,3 +119,23 @@ folds = [[1] * num_workers for i in range(num_folds)] pool = tuner.build_pool(folds, num_cv_jobs) assert (pool is not None) == expect_pool + + +def test_ModelSelection_transformer(): +stats = {'called': 0} + +def transformer(result, params): +assert 'foo' in result +assert result['foo'] == 'bar' +assert params == 'some params' +stats['called'] += 1 +return 'baz' + +def f(fold, params): +assert params == 'some params' +return {'foo': 'bar'} + +folds = [[1, 2, 3], [4, 5, 6]] +obj = mjolnir.training.tuning.make_cv_objective(f, folds, 1, transformer) +assert obj('some params') == ['baz', 'baz'] +assert stats['called'] == 2 diff --git a/mjolnir/training/tuning.py b/mjolnir/training/tuning.py index 7d2df68..81bfafe 100644 --- a/mjolnir/training/tuning.py +++ b/mjolnir/training/tuning.py @@ -133,11 +133,48 @@ return with_retry +def make_cv_objective(train_func, folds, num_cv_jobs, transformer=None, **kwargs): +"""Create a cross-validation objective function + +Parameters +-- +train_func : callable +Function accepting a fold and hyperparameters to perform training +num_cv_jobs : int +The total number of folds to train in parallel 
+transformer : callable or None, optional +Function accepting output of train_func and hyperparameters to +return stats about the individual fold train/test performance + +Returns +--- +callable +Accepts a set of hyperparameters as only argument and returns +list of per-fold train/test performance. +""" +train_func = _py4j_retry(train_func, None) +if num_cv_jobs > 1: +cv_pool = Pool(num_cv_jobs) +cv_mapper = cv_pool.map +else: +cv_mapper = map + +def f(params): +def inner(fold): +return train_func(fold, params, **kwargs) + +return cv_mapper(inner, folds) + +if transformer is None: +return f +else: +return lambda params: [transformer(scores, params) for scores in f(params)] + + class ModelSelection(object): -def __init__(self, initial_space, tune_stages,
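The make_cv_objective function pulled out of the tuner here is small enough to summarize in a standalone sketch. This is a simplified model of the same pattern (it omits the py4j retry wrapper and pool cleanup that the real code adds): a train function is wrapped so that one hyperparameter dict gets evaluated on every fold, optionally through a thread pool, optionally post-processed by a transformer.

from multiprocessing.dummy import Pool  # thread pool; workers mostly block on Spark

def make_cv_objective(train_func, folds, num_cv_jobs, transformer=None, **kwargs):
    mapper = Pool(num_cv_jobs).map if num_cv_jobs > 1 else map

    def objective(params):
        results = list(mapper(lambda fold: train_func(fold, params, **kwargs), folds))
        return results if transformer is None else [transformer(r, params) for r in results]

    return objective

# Each call to the returned objective trains once per fold with the same params.
obj = make_cv_objective(lambda fold, params: {'test': [sum(fold)]}, [[1, 2], [3, 4]], 1)
assert obj({'max_depth': 5}) == [{'test': [3]}, {'test': [7]}]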
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: add python interface to scala dbn
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406069 ) Change subject: add python interface to scala dbn .. add python interface to scala dbn It turns out only the driver has a py4j connection to the jvm, executors talk to spark directly through sockets. To use jvm implementations in the executors we need to trigger that from jvm. Added an implementation and some basic tests. Change-Id: Iee7f79662e89bcf64cdb447aac0df5b68ee1170c --- M jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala M jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala M jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala M mjolnir/dbn.py M mjolnir/test/conftest.py M mjolnir/test/training/test_xgboost.py M mjolnir/utilities/data_pipeline.py M setup.py 8 files changed, 164 insertions(+), 197 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/69/406069/1 diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala index faac7dc..d051c7d 100644 --- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala @@ -13,7 +13,12 @@ * implementation was ported from python clickmodels by Aleksandr Chuklin and the * notes on math were added in an attempt to understand why the implementation works. */ +import org.apache.spark.rdd.RDD + import scala.collection.mutable +import org.apache.spark.sql.{DataFrame, Row, functions => F} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.{types => T} import org.json4s.{JArray, JBool, JString} import org.json4s.jackson.JsonMethods @@ -29,7 +34,7 @@ def urlToId(queryId: Int, url: String): Int = { val urlToIdMap = queryIdToUrlToIdMap.getOrElseUpdate(queryId, { mutable.Map() }) urlToIdMap.getOrElseUpdate(url, { - var nextUrlId = queryIdToNextUrlId.getOrElse(queryId, 0) + val nextUrlId = queryIdToNextUrlId.getOrElse(queryId, 0) queryIdToNextUrlId(queryId) = nextUrlId + 1 nextUrlId }) @@ -79,7 +84,7 @@ c } -val hasClicks = allClicks.take(n).exists { x => x} +val hasClicks = allClicks.exists { x => x } if (urls.length < minDocsPerQuery || (discardNoClicks && !hasClicks) ) { @@ -185,20 +190,14 @@ // attractiveness and satisfaction values for each position class PositionRel(var a: Array[Double], var s: Array[Double]) -case class SessionEstimate( - a: (Double, Double), s: (Double, Double), - e: Array[(Double, Double)], C: Double, - clicks: Array[Double]) - - class DbnModel(gamma: Double, config: Config) { val invGamma: Double = 1D - gamma def train(sessions: Seq[SessionItem]): Array[Array[UrlRel]] = { // This is basically a multi-dimensional array with queryId in the first -// dimension and urlId in the second dimension. Because queries only reference -// a subset of the known urls we use a map at the second level instead of -// creating the entire matrix. +// dimension and urlId in the second dimension. InputReader guarantees +// that queryId starts at 0 and is continuous, and that per-query id urlId +// also starts at 0 and is continuous, allowing static sized arrays to be used. 
val urlRelevances: Array[Array[UrlRel]] = (0 to config.maxQueryId).map { queryId => (0 to config.maxUrlIds(queryId)).map { _ => new UrlRel(config.defaultRel, config.defaultRel) }.toArray }.toArray @@ -267,7 +266,7 @@ val queryUrlRelFrac = urlRelFractions(s.queryId) i = 0 while (i < N) { -var urlId = s.urlIds(i) +val urlId = s.urlIds(i) // update attraction val rel = queryUrlRelFrac(urlId) val estA = sessionEstimate.a(i) @@ -410,7 +409,7 @@ // (alpha, beta) } - var sessionEstimate = new PositionRel(new Array[Double](config.serpSize), new Array[Double](config.serpSize)) + val sessionEstimate = new PositionRel(new Array[Double](config.serpSize), new Array[Double](config.serpSize)) // Returns // a: P(A_i|C_i,G) - Probability of attractiveness at position i conditioned on clicked and gamma // s: P(S_i|C_i,G) - Probability of satisfaction at position i conditioned on clicked and gamma @@ -461,4 +460,95 @@ } } +private class DbnHitPage(val hitPageId: Int, val hitPosition: Double, val clicked: Boolean) +/** + * Predict relevance of query/page pairs from individual user search sessions. + */ +object DBN { + // TODO: These should all be configurable? Perhaps + // also simplified somehow... + private val CLICKED = "clicked" + private val HITS = "hits" + private val HIT_PAGE_ID = "hit_page_id" + private val HIT_POSITION = "hit_position" + private val NORM_QUERY_ID = "norm_query_id" + private val RELEVANCE = "relevance" +
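The driver-only py4j constraint described in this change is the reason the Python side hands whole DataFrames over to the JVM instead of calling JVM code from executors. The pattern, as already used for DataWriter in make_folds.py (assuming an active SparkContext sc and a DataFrame df; the new DBN entry point is driven the same way, though its Python-side wrapper is not shown in this excerpt):

# Driver side only: sc._jvm is the py4j gateway, which executors do not have.
writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc, False)
# The JVM object receives the underlying Java handles and distributes its own
# work through Spark, so executors never need a py4j connection at all.
j_paths = writer.write(df._jdf, num_workers, path_format, fold_col)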
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add end-to-end integration test
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406070 ) Change subject: Add end-to-end integration test .. Add end-to-end integration test A basic end to end run through of the training pipeline. It's of course a bit slow, but worthwhile to see the whole operation run from end to end. * verifies that the general premise works * outputs from one stage are expected by the input from the next * models are in expected places and loadable * evaluations run against the models match train time metrics Change-Id: I8ad5fe1dbbbd50b897362b44411cfc19650b0390 --- M jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala M mjolnir/test/conftest.py A mjolnir/test/fixtures/requests/test_integration.sqlite3 M mjolnir/test/training/test_xgboost.py M mjolnir/training/xgboost.py M mjolnir/utilities/data_pipeline.py M mjolnir/utilities/make_folds.py M mjolnir/utilities/training_pipeline.py 8 files changed, 63 insertions(+), 20 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/70/406070/1 diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala index 8c6929e..9200fe9 100644 --- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala @@ -1,7 +1,7 @@ package org.wikimedia.search.mjolnir import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} -import org.apache.spark.ml.linalg.{Vector => MLVector} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors, Vector => MLVector} import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} @@ -13,22 +13,38 @@ * pyspark. */ object PythonUtils { + private def shiftVector(vec: MLVector): MLVector = vec match { +case y: DenseVector => Vectors.dense(Array(0D) ++ y.toArray) +case y: SparseVector => Vectors.sparse(y.size + 1, y.indices.map(_ + 1), y.values) + } + /** * There is no access to LabeledPoint from pyspark, but various methods such as * trainWithRDD and eval require an RDD[MLLabeledPoint]. This offers a bridge to * convert a Dataset into the required format. * + * @deprecated * @param ds Input dataframe containing features and label * @param featureCol Name of the column containing feature vectors * @param labelCol Name of the column containing numeric labels + * @param shiftRight Shift all features to index + 1. This is a disapointing hack, + * but due to the way data files are created feature indices start + * at 1 and the 0 feature is empty. This allows to shift to match + * when evaluating a dataframe againts a model trained that way. 
*/ - def toLabeledPoints(ds: Dataset[_], featureCol: String, labelCol: String): RDD[MLLabeledPoint] = { + def toLabeledPoints(ds: Dataset[_], featureCol: String, labelCol: String, shiftRight: Boolean): RDD[MLLabeledPoint] = { ds.select(col(featureCol), col(labelCol).cast(DoubleType)).rdd.map { case Row(feature: MLVector, label: Double) => +val shiftedFeature = if (shiftRight) shiftVector(feature) else feature MLLabeledPoint(label, feature) } } + def toLabeledPoints(ds: Dataset[_], featureCol: String, labelCol: String): RDD[MLLabeledPoint] = { +toLabeledPoints(ds, featureCol, labelCol, shiftRight = false) + } + + /** * Training/evaluating a ranking model in XGBoost requires rows for the same * query to be provided sequentially, and it needs to know for each partition diff --git a/mjolnir/test/conftest.py b/mjolnir/test/conftest.py index efc8441..c4c3d77 100644 --- a/mjolnir/test/conftest.py +++ b/mjolnir/test/conftest.py @@ -72,7 +72,8 @@ .set('spark.jars.packages', ','.join([ 'ml.dmlc:xgboost4j-spark:0.8-wmf-1', 'org.wikimedia.search:mjolnir:0.4-SNAPSHOT', -'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0'])) +'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0', +'org.wikimedia.analytics.refinery.hive:refinery-hive:0.0.57'])) # By default spark will shuffle to 200 partitions, which is # way too many for our small test cases. This cuts execution # time of the tests in half. diff --git a/mjolnir/test/fixtures/requests/test_integration.sqlite3 b/mjolnir/test/fixtures/requests/test_integration.sqlite3 new file mode 100644 index 000..44957bf --- /dev/null +++ b/mjolnir/test/fixtures/requests/test_integration.sqlite3 Binary files differ diff --git a/mjolnir/test/training/test_xgboost.py b/mjolnir/test/training/test_xgboost.py index 100a3e8..ba8dc48 100644 --- a/mjolnir/test/training/test_xgboost.py +++ b/mjolnir/test/training/test_xgboost.py @@
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Support LXC in Vagrantfile
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/405209 ) Change subject: Support LXC in Vagrantfile .. Support LXC in Vagrantfile LXC, from a linux host, can be significantly more responsive than virtualbox. Support it in the Vagrantfile. Change-Id: I2b1e2d41beea97dd216c8c75395735a472104208 --- M Vagrantfile 1 file changed, 14 insertions(+), 9 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/09/405209/1 diff --git a/Vagrantfile b/Vagrantfile index d8d71ad..1005dca 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -1,18 +1,23 @@ Vagrant.configure("2") do |config| +config.vm.provider :lxc do |_lxc, override| +override.vm.box = 'LEAP/jessie' +end + config.vm.provider :virtualbox do |vb, override| override.vm.box = 'debian/contrib-jessie64' vb.customize ['modifyvm', :id, '--memory', '2048'] + +root_share_options = { id: 'vagrant-root' } +root_share_options[:type] = :nfs +root_share_options[:mount_options] = ['noatime', 'rsize=32767', 'wsize=3267', 'async', 'nolock'] +override.nfs.map_uid = Process.uid +override.nfs.map_gid = Process.gid +override.vm.synced_folder ".", "/vagrant", root_share_options + +override.vm.hostname = "MjoLniR" +override.vm.network "private_network", type: "dhcp" end -root_share_options = { id: 'vagrant-root' } -root_share_options[:type] = :nfs -root_share_options[:mount_options] = ['noatime', 'rsize=32767', 'wsize=3267', 'async', 'nolock'] -config.nfs.map_uid = Process.uid -config.nfs.map_gid = Process.gid -config.vm.synced_folder ".", "/vagrant", root_share_options - -config.vm.hostname = "MjoLniR" -config.vm.network "private_network", type: "dhcp" config.vm.provision "shell", path: "bootstrap-vm.sh" end -- To view, visit https://gerrit.wikimedia.org/r/405209 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2b1e2d41beea97dd216c8c75395735a472104208 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Switch wiktionary sister search on enwiki to title only
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/405206 ) Change subject: Switch wiktionary sister search on enwiki to title only .. Switch wiktionary sister search on enwiki to title only Bug: T185250 Change-Id: I43ff74472e4cdd2a925cf284905e70c318eb6468 --- M wmf-config/CirrusSearch-common.php M wmf-config/InitialiseSettings.php 2 files changed, 19 insertions(+), 8 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/06/405206/1 diff --git a/wmf-config/CirrusSearch-common.php b/wmf-config/CirrusSearch-common.php index e5e45ac..572a692 100644 --- a/wmf-config/CirrusSearch-common.php +++ b/wmf-config/CirrusSearch-common.php @@ -241,14 +241,7 @@ $wgCirrusSearchFetchConfigFromApi = true; // Override sister search profiles for specific projects -$wgCirrusSearchCrossProjectProfiles = [ - // full text wikivoyage results are often irrelevant, filter the - // search with title matches to improve relevance. - 'voy' => [ - 'ftbuilder' => 'perfield_builder_title_filter', - 'rescore' => 'wsum_inclinks', - ], -]; +$wgCirrusSearchCrossProjectProfiles = $wmgCirrusSearchCrossProjectProfiles; $wgCirrusSearchCrossProjectSearchBlackList = $wmgCirrusSearchCrossProjectSearchBlackList; $wgCirrusSearchCrossProjectShowMultimedia = $wmgCirrusSearchCrossProjectShowMultimedia; diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index 39dccc7..2c9b460 100755 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18437,6 +18437,24 @@ 'itwikivoyage' => false, ], +'wmgCirrusSearchCrossProjectProfiles' => [ + 'default' => [ + // full text wikivoyage results are often irrelevant, filter the + // search with title matches to improve relevance + 'voy' => [ + 'ftbuilder' => 'perfield_builder_title_filter', + 'rescore' => 'wsum_inclinks', + ], + ], + '+enwiki' => [ + // T185250 + 'wikt' => [ + 'ftbuilder' => 'perfield_builder_title_filter', + 'rescore' => 'wsum_inclinks', + ], + ], +], + 'wmgCirrusSearchIgnoreOnWikiBoostTemplates' => [ 'default' => false, // on wiki boost templates have to high boosts for enwiki -- To view, visit https://gerrit.wikimedia.org/r/405206 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I43ff74472e4cdd2a925cf284905e70c318eb6468 Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
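The '+enwiki' key works because of how these wmg settings are resolved: a '+'-prefixed wiki key is merged on top of 'default' rather than replacing it, so enwiki keeps the wikivoyage override and additionally gets the wiktionary one. A rough Python model of that resolution (the real logic lives in MediaWiki's SiteConfiguration and also understands dblist tags):

def resolve(setting, dbname):
    value = dict(setting.get('default', {}))
    if dbname in setting:             # plain key: replaces the default outright
        return setting[dbname]
    if '+' + dbname in setting:       # '+' key: merged on top of the default
        value.update(setting['+' + dbname])
    return value

profiles = {
    'default': {'voy': {'ftbuilder': 'perfield_builder_title_filter'}},
    '+enwiki': {'wikt': {'ftbuilder': 'perfield_builder_title_filter'}},
}
assert set(resolve(profiles, 'enwiki')) == {'voy', 'wikt'}
assert set(resolve(profiles, 'dewiki')) == {'voy'}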
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.17]: Turn off cirrus AB test on hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/404595 ) Change subject: Turn off cirrus AB test on hewiki .. Turn off cirrus AB test on hewiki The test has run its course. Time to turn off and reset hewiki sampling back to the old (default) rates. Bug: T182616 Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee (cherry picked from commit b2495fcacf3ae7325f34f37705c2c108eb29e513) --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 1 insertion(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/95/404595/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index de78f11..b818261 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,9 +114,7 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = mw.config.get( 'wgDBname' ) === 'hewiki' ? - [ 'control', 'ltr-1024', 'ltr-1024-i' ] : - [], + validBuckets = [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -188,10 +186,6 @@ zhwiki: { test: 100, subTest: null - }, - hewiki: { - test: 0.8112, - subTest: 0.8767 } }; if ( subTests[ dbName ] ) { -- To view, visit https://gerrit.wikimedia.org/r/404595 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: wmf/1.31.0-wmf.17 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.16]: Turn off cirrus AB test on hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/404594 ) Change subject: Turn off cirrus AB test on hewiki .. Turn off cirrus AB test on hewiki The test has run its course. Time to turn off and reset hewiki sampling back to the old (default) rates. Bug: T182616 Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee (cherry picked from commit b2495fcacf3ae7325f34f37705c2c108eb29e513) --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 1 insertion(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/94/404594/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index de78f11..b818261 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,9 +114,7 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = mw.config.get( 'wgDBname' ) === 'hewiki' ? - [ 'control', 'ltr-1024', 'ltr-1024-i' ] : - [], + validBuckets = [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -188,10 +186,6 @@ zhwiki: { test: 100, subTest: null - }, - hewiki: { - test: 0.8112, - subTest: 0.8767 } }; if ( subTests[ dbName ] ) { -- To view, visit https://gerrit.wikimedia.org/r/404594 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: wmf/1.31.0-wmf.16 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Remove cirrus AB test config for hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/404592 ) Change subject: Remove cirrus AB test config for hewiki .. Remove cirrus AB test config for hewiki This test is complete and the configuration is no longer necessary. Bug: T182616 Change-Id: Ibb55c90ecf7fc562860fb69a06d2d1f2babf49aa --- M wmf-config/InitialiseSettings.php 1 file changed, 0 insertions(+), 24 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/92/404592/1 diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index 9767eb5..c76cecc 100644 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18702,30 +18702,6 @@ 'wmgCirrusSearchUserTesting' => [ 'default' => [], - 'hewiki' => [ - 'ltr' => [ - 'globals' => [], - 'buckets' => [ - 'control' => [ - 'trigger' => 'control', - ], - 'ltr-1024' => [ - 'trigger' => 'ltr-1024', - 'globals' => [ - 'wgCirrusSearchRescoreProfile' => 'mlr-1024rs', - ] - ], - 'ltr-1024-i' => [ - 'trigger' => 'ltr-1024-i', - 'globals' => [ - 'wgCirrusSearchInterleaveConfig' => [ - 'CirrusSearchRescoreProfile' => 'mlr-1024rs' - ], - ], - ], - ], - ], - ], ], 'wmgCirrusSearchLanguageDetectors' => [ -- To view, visit https://gerrit.wikimedia.org/r/404592 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibb55c90ecf7fc562860fb69a06d2d1f2babf49aa Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Turn off hewiki AB test
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/404591 ) Change subject: Turn off hewiki AB test .. Turn off hewiki AB test The test has run its course. Time to turn off and reset hewiki sampling back to the old (default) rates. Bug: T182616 Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 1 insertion(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/91/404591/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index de78f11..b818261 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,9 +114,7 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = mw.config.get( 'wgDBname' ) === 'hewiki' ? - [ 'control', 'ltr-1024', 'ltr-1024-i' ] : - [], + validBuckets = [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -188,10 +186,6 @@ zhwiki: { test: 100, subTest: null - }, - hewiki: { - test: 0.8112, - subTest: 0.8767 } }; if ( subTests[ dbName ] ) { -- To view, visit https://gerrit.wikimedia.org/r/404591 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Generalize tuning pipeline
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403869 ) Change subject: Generalize tuning pipeline .. Generalize tuning pipeline This pipeline was pretty convoluted. Push most of the complexity up out of the pipeline into the single ModelSelection object leaving the rest of the model selection code (cross validation, tuning, parameter selection, etc) clearer and more directly implemented. This also provides a reusable tuning implementation to share between xgboost and lightgbm. Change-Id: I8f2a2f3aeca85fe86cb6d466622a2e83dd249172 --- M mjolnir/test/training/test_hyperopt.py M mjolnir/test/training/test_tuning.py M mjolnir/training/hyperopt.py M mjolnir/training/lightgbm.py M mjolnir/training/tuning.py M mjolnir/training/xgboost.py M mjolnir/utilities/training_pipeline.py 7 files changed, 289 insertions(+), 276 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/69/403869/1 diff --git a/mjolnir/test/training/test_hyperopt.py b/mjolnir/test/training/test_hyperopt.py index c4c4782..1dec547 100644 --- a/mjolnir/test/training/test_hyperopt.py +++ b/mjolnir/test/training/test_hyperopt.py @@ -1,27 +1,19 @@ from __future__ import absolute_import import hyperopt import mjolnir.training.hyperopt -from pyspark.ml.linalg import Vectors -import pytest -def _make_q(query, n=4): -"Generates single feature queries" -return [('foowiki', query, query, float(f), Vectors.dense([float(f)])) for f in range(n)] - - -@pytest.fixture -def df_train(spark_context, hive_context): -# TODO: Use some fixture dataset representing real-ish data? But -# it needs to be pretty small -return spark_context.parallelize( -_make_q('abc') + _make_q('def') + _make_q('ghi') + _make_q('jkl') -+ _make_q('mno') + _make_q('pqr') + _make_q('stu') -).toDF(['wikiid', 'norm_query_id', 'query', 'label', 'features']) - - -def test_minimize(folds_b): +def test_maximize(folds_b): "Not an amazing test...basically sees if the happy path doesnt blow up" +def f(params): +assert isinstance(params, dict) +assert 'max_depth' in params +assert params['num_rounds'] == 50 +return [{ +'train': [0.80], +'test': [0.79], +}] + space = { 'num_rounds': 50, 'max_depth': hyperopt.hp.quniform('max_depth', 1, 20, 1) @@ -30,33 +22,11 @@ # mostly hyperopt just calls cross_validate, of which the integration with # xgboost is separately tested. Instead of going all the way into xgboost # mock it out w/MockModel. -best_params, trails = mjolnir.training.hyperopt.minimize( -folds_b, MockModel, space, max_evals=5) +best_params, trails = mjolnir.training.hyperopt.maximize( +f, space, max_evals=5) assert isinstance(best_params, dict) # num_rounds should have been unchanged assert 'num_rounds' in best_params assert best_params['num_rounds'] == 50 # should have max_evals evaluations assert len(trails.trials) == 5 - - -class MockSummary(object): -def train(self): -return [1.] - -def test(self): -return [1.] 
- - -class MockModel(object): -def __init__(self, df, params, train_matrix=None): -# Params that were passed to hyperopt -assert isinstance(params, dict) -assert 'max_depth' in params -assert params['num_rounds'] == 50 - -def eval(self, df_test, j_groups=None, feature_col='features', label_col='label'): -return 1.0 - -def summary(self): -return MockSummary() diff --git a/mjolnir/test/training/test_tuning.py b/mjolnir/test/training/test_tuning.py index e14982b..22402f1 100644 --- a/mjolnir/test/training/test_tuning.py +++ b/mjolnir/test/training/test_tuning.py @@ -1,8 +1,8 @@ from __future__ import absolute_import +import hyperopt import mjolnir.training.tuning import mjolnir.training.xgboost from pyspark.sql import functions as F -from pyspark.ml.linalg import Vectors import pytest @@ -32,27 +32,115 @@ assert len(queries_in_0.intersection(queries_in_1)) == 0 -def _make_q(query, n=4): -"Generates single feature queries" -return [('foowiki', query, query, float(f), Vectors.dense([float(f)])) for f in range(n)] +def run_model_selection(tune_stages, f=None, num_cv_jobs=1, **kwargs): +stats = {'called': 0} +initial_space = {'foo': 10, 'bar': 20, 'baz': 0} +folds = [[1, 2, 3], [4, 5, 6]] +if not f: +def f(fold, params, **kwargs): +stats['called'] += 1 +factor = 1.0 / (6 * params['foo']) +return { +'test': [v * factor * 0.9 for v in fold], +'train': [v * factor for v in fold], +} + +tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages) +train_func = tuner.make_cv_objective(f, folds, num_cv_jobs,
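The tune_stages structure exercised above — a list of (name, {'iterations': ..., 'space': ...}) pairs — encodes a greedy, stage-by-stage search: each stage tunes only its own parameters with hyperopt while holding the winners of earlier stages fixed. A compact sketch of that idea with a toy loss function (not the ModelSelection implementation itself):

import hyperopt

def run_stages(initial_space, stages, objective):
    params = dict(initial_space)
    for name, stage in stages:
        space = dict(params)
        space.update(stage['space'])     # only this stage's parameters vary
        best = hyperopt.fmin(objective, space, algo=hyperopt.tpe.suggest,
                             max_evals=stage['iterations'], trials=hyperopt.Trials())
        params.update(best)              # carry the winners into later stages
    return params

stages = [
    ('depth', {'iterations': 10, 'space': {
        'max_depth': hyperopt.hp.quniform('max_depth', 1, 20, 1)}}),
    ('eta', {'iterations': 10, 'space': {
        'eta': hyperopt.hp.uniform('eta', 0.01, 0.5)}}),
]
print(run_stages({'max_depth': 6, 'eta': 0.3}, stages,
                 lambda p: abs(p['max_depth'] - 6) + p['eta']))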
[MediaWiki-commits] [Gerrit] mediawiki/core[wmf/1.31.0-wmf.16]: Deprecate old interwiki search result widget
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403719 ) Change subject: Deprecate old interwiki search result widget .. Deprecate old interwiki search result widget Update the flag for new interwiki sidebar from unset means disabled to unset means enabled. Deprecate the old rendering widgets to be removed at a later date per deprecation policy. Change-Id: I80d8375bbd3e1fabc9b2432b6875d17a96aee099 Related: I9a488438 (cherry picked from commit d95f644e80fb894ca4f22a9fcdeab53cde9dedc9) --- M includes/specials/SpecialSearch.php M includes/widget/search/SimpleSearchResultSetWidget.php M includes/widget/search/SimpleSearchResultWidget.php 3 files changed, 8 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/19/403719/1 diff --git a/includes/specials/SpecialSearch.php b/includes/specials/SpecialSearch.php index b3a58cb..f826844 100644 --- a/includes/specials/SpecialSearch.php +++ b/includes/specials/SpecialSearch.php @@ -394,7 +394,8 @@ $linkRenderer = $this->getLinkRenderer(); $mainResultWidget = new FullSearchResultWidget( $this, $linkRenderer ); - if ( $search->getFeatureData( 'enable-new-crossproject-page' ) ) { + // Default (null) on. Can be explicitly disabled. + if ( $search->getFeatureData( 'enable-new-crossproject-page' ) !== false ) { $sidebarResultWidget = new InterwikiSearchResultWidget( $this, $linkRenderer ); $sidebarResultsWidget = new InterwikiSearchResultSetWidget( $this, diff --git a/includes/widget/search/SimpleSearchResultSetWidget.php b/includes/widget/search/SimpleSearchResultSetWidget.php index d6583a3..d0c259f 100644 --- a/includes/widget/search/SimpleSearchResultSetWidget.php +++ b/includes/widget/search/SimpleSearchResultSetWidget.php @@ -13,6 +13,8 @@ * Renders one or more SearchResultSets into a sidebar grouped by * interwiki prefix. Includes a per-wiki header indicating where * the results are from. + * + * @deprecated since 1.31. Use InterwikiSearchResultSetWidget */ class SimpleSearchResultSetWidget implements SearchResultSetWidget { /** @var SpecialSearch */ @@ -32,6 +34,7 @@ LinkRenderer $linkRenderer, InterwikiLookup $iwLookup ) { + wfDeprecated( __METHOD__, '1.31' ); $this->specialSearch = $specialSearch; $this->resultWidget = $resultWidget; $this->linkRenderer = $linkRenderer; diff --git a/includes/widget/search/SimpleSearchResultWidget.php b/includes/widget/search/SimpleSearchResultWidget.php index fa07563..552cbaf 100644 --- a/includes/widget/search/SimpleSearchResultWidget.php +++ b/includes/widget/search/SimpleSearchResultWidget.php @@ -9,6 +9,8 @@ /** * Renders a simple one-line result + * + * @deprecated since 1.31. Use other result widgets. */ class SimpleSearchResultWidget implements SearchResultWidget { /** @var SpecialSearch */ @@ -17,6 +19,7 @@ protected $linkRenderer; public function __construct( SpecialSearch $specialSearch, LinkRenderer $linkRenderer ) { + wfDeprecated( __METHOD__, '1.31' ); $this->specialSearch = $specialSearch; $this->linkRenderer = $linkRenderer; } -- To view, visit https://gerrit.wikimedia.org/r/403719 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I80d8375bbd3e1fabc9b2432b6875d17a96aee099 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: wmf/1.31.0-wmf.16 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Repoint spark in example_train.yaml
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403546 ) Change subject: Repoint spark in example_train.yaml .. Repoint spark in example_train.yaml Mostly this makes it easier to push a branch over to stat1005 and try something out against full-sized data. Having example_train.yaml be "close enough" helps a good bit. Also update spark to 2.1.2 to match, and add a 'master' template so yarn/local can be toggled from spark.py command line Change-Id: Iccd44c0c9436287ba963a3c8b2244b3fa0a46274 --- M example_train.yaml A mjolnir/pruning.py A mjolnir/scan_es.py M mjolnir/test/fixtures/load_config/example_train.expect 4 files changed, 316 insertions(+), 100 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/46/403546/1 diff --git a/example_train.yaml b/example_train.yaml index 31ce45c..7b1749c 100644 --- a/example_train.yaml +++ b/example_train.yaml @@ -3,10 +3,11 @@ global: environment: PYSPARK_PYTHON: venv/bin/python -SPARK_CONF_DIR: /etc/spark/conf -SPARK_HOME: "%(HOME)s/spark-%(spark_version)s-bin-hadoop2.6" +SPARK_CONF_DIR: /etc/spark2/conf +SPARK_HOME: "/usr/lib/spark2" template_vars: -spark_version: 2.1.0 +spark_version: 2.1.2 +master: yarn # Path to spark-submit applicatoin spark_submit: "%(SPARK_HOME)s/bin/spark-submit" # Local path to zip'd virtualenv which will be shipped to executors @@ -50,7 +51,7 @@ ? "%(spark_submit)s" ? "%(PYSPARK_PYTHON)s" spark_args: -master: yarn +master: "%(master)s" # TODO: When is this necessary? files: /usr/lib/libhdfs.so.0.0.0 # Ship the mjolnir virtualenv to executors and decompress it to ./venv diff --git a/mjolnir/pruning.py b/mjolnir/pruning.py new file mode 100644 index 000..c2e78f9 --- /dev/null +++ b/mjolnir/pruning.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import +import json +import math +from pyspark.sql import functions as F +from pyspark.sql.types import FloatType, StructField, StructType + + +class Split(object): +def __init__(self, left, right, feature, threshold): +self.left = left +self.right = right +self.feature = feature +self.threshold = threshold + +def isLeaf(self): +return False + +def eval(self, features): +n = self +while not n.isLeaf(): +if n.threshold > features[n.feature]: +n = n.left +else: +n = n.right +return n.output + + +class Leaf(object): +def __init__(self, output): +self.output = output + +def isLeaf(self): +return True + + +def _parse_node(json_node): +if 'leaf' in json_node: +return Leaf(json_node['leaf']) +else: +left = _parse_node(json_node['children'][0]) +right = _parse_node(json_node['children'][1]) +return Split(left, right, json_node['split'], json_node['split_condition']) + + +def parse_xgboost(json_tree): +return [_parse_node(tree) for tree in json.loads(json_tree)] + + +def ndcg_at_k(k, predicted, actual): +idcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(actual[:k])]) +if idcg == 0: +return 0. +else: +dcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(predicted[:k])]) +return dcg / idcg + + +# Horrible name ... it returns the ndcg for each removed tree +def gen_per_tree_ndcg(tree_cols, removed_trees, label_col, k=10): +def f(rows): +# Remove trees from the sum +cur_sum = [reduce(lambda acc, tree: acc - row[tree], removed_trees, row.sum) for row in rows] +data = zip(rows, cur_sum) + +# TODO: actual could be pre-calculated? 
Actually full idcg could be pre-calculated +actual = [x[0][label_col] for x in sorted(data, key=lambda x: x[0][label_col], reverse=True)] +# baseline ndcg +predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1], reverse=True)] +res = [ndcg_at_k(k, predicted, actual)] +# Per-tree ndcgs +for tree_pred in tree_cols: +predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1] - x[0][tree_pred], reverse=True)] +res.append(ndcg_at_k(k, predicted, actual)) +return res +fields = [StructField(name, FloatType()) for name in ['orig'] + tree_cols] +return F.udf(f, StructType(fields)) + + +def gen_eval_tree_udf(bc_trees): +def f(tree_id, features): +return bc_trees.value[tree_id].eval(features) +return F.udf(f, FloatType()) + + +def prune(df, trees, feature_col='features', label_col='label', group_cols=['wikiid', 'query']): +# Calculate per-tree scores +eval_tree_udf = gen_eval_tree_udf(df._sc.broadcast(trees)) +
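The ndcg_at_k helper added in pruning.py uses the exponential-gain form of DCG, (2^label - 1) / log2(position + 2), normalised by the ideal ordering. A tiny worked example makes the arithmetic concrete (the function body is the one from the patch, stated standalone):

import math

def ndcg_at_k(k, predicted, actual):
    # 'predicted': labels in the order the model ranked them;
    # 'actual': labels in ideal (descending) order for the same query.
    idcg = sum(((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(actual[:k]))
    if idcg == 0:
        return 0.0
    dcg = sum(((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(predicted[:k]))
    return dcg / idcg

# Ideal order [3, 2, 0]; the model ranked the labels [2, 3, 0]:
#   DCG  = 3/log2(2) + 7/log2(3) + 0 = 3.000 + 4.416 = 7.416
#   IDCG = 7/log2(2) + 3/log2(3) + 0 = 7.000 + 1.893 = 8.893
print(ndcg_at_k(10, [2, 3, 0], [3, 2, 0]))   # ~0.834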
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: JVM components to support file-based training
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403545 ) Change subject: JVM components to support file-based training .. JVM components to support file-based training An upcoming refactor changes training_pipeline.py from dataframe based training to file based training, where we emit partitioned and formatted folds/splits to hdfs and load them into training by copying to a local file and pointing c++ as it. This is a separate patch so we can release a new version of the MjoLniR jar. Due to how our CI works python cannot test against new jvm code until it has been released. The entry points that python will be using are: * DataWriter.write * MlrXGBoost.trainWithFiles Change-Id: Ib5e8cd9d3e87e724f05b5ec0941c140aa5077d71 --- M .gitignore D jvm/mjolnir.iml M jvm/pom.xml A jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala A jvm/src/main/scala/org/wikimedia/search/mjolnir/MlrXGBoost.scala A jvm/src/test/resources/fixtures/datasets/test.txt A jvm/src/test/resources/fixtures/datasets/test.txt.query A jvm/src/test/resources/fixtures/datasets/train.txt A jvm/src/test/resources/fixtures/datasets/train.txt.query M jvm/src/test/scala/org/wikimedia/search/mjolnir/PythonUtilsSuite.scala 12 files changed, 749 insertions(+), 163 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/45/403545/1 diff --git a/.gitignore b/.gitignore index f2a9cf7..83930f6 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ # Editor temporary files .*.sw[po] /jvm/.idea +/jvm/mjolnir.iml # Vagrant, and cdh stuff in vagrant .vagrant diff --git a/jvm/mjolnir.iml b/jvm/mjolnir.iml deleted file mode 100644 index b341014..000 --- a/jvm/mjolnir.iml +++ /dev/null @@ -1,162 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/jvm/pom.xml b/jvm/pom.xml index 479b4ec..d1fdc13 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -14,7 +14,7 @@ 2.1.0 2.11.8 2.11 -0.7-wmf-1 +0.8-wmf-1-SNAPSHOT @@ -146,6 +146,16 @@ jackson-module-scala_${scala.binary.version} 2.6.5 + +ml.dmlc +xgboost4j-spark +${xgboost.version} + + +ml.dmlc +xgboost4j +${xgboost.version} + diff --git a/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala new file mode 100644 index 000..45a00af --- /dev/null +++ b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala @@ -0,0 +1,28 @@ +package ml.dmlc.xgboost4j.scala.spark + +import ml.dmlc.xgboost4j.java.IRabitTracker +import ml.dmlc.xgboost4j.scala.Booster +import ml.dmlc.xgboost4j.scala.rabit.RabitTracker + +/** + * Provide access to package-private constructs of xgboost4j-spark + */ +object MjolnirUtils { + def model(booster: Booster, metrics: Map[String, Array[Float]], trainMatrix: String): XGBoostModel = { +// Arbitrarily take an 'other' matrix if available +val xgMetrics = metrics.keys.find(!_.equals(trainMatrix)).map{ name => Map( + "train" -> metrics(trainMatrix), + "test" -> metrics(name) +) }.getOrElse(Map( + "train" -> metrics(trainMatrix) +)) + +val model = new 
XGBoostRegressionModel(booster) +model.setSummary(XGBoostTrainingSummary(xgMetrics)) +model + } + + def scalaRabitTracker(nWorkers: Int): IRabitTracker = { +new RabitTracker(nWorkers) + } +} diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala new file mode 100644 index 000..9962b3a --- /dev/null +++
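The file-based flow this change enables works in two steps: DataWriter emits each fold and split as text files on HDFS, then every training task copies its split down to local disk so the native XGBoost code can be pointed at a plain file path. A rough Python sketch of the "copy to local, hand over the path" step (hypothetical helper using the hdfs CLI; the patch implements the real thing in AsLocalFile.scala, and a Python-side as_local_paths util appears in later changes):

import contextlib
import os
import shutil
import subprocess
import tempfile

@contextlib.contextmanager
def as_local_path(hdfs_path):
    # Copy one HDFS file into a throwaway local directory and yield the local
    # path; the native learner only understands local filesystem paths.
    tmpdir = tempfile.mkdtemp()
    try:
        subprocess.check_call(['hdfs', 'dfs', '-copyToLocal', hdfs_path, tmpdir])
        yield os.path.join(tmpdir, os.path.basename(hdfs_path))
    finally:
        shutil.rmtree(tmpdir)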
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add lightgbm support
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403335 ) Change subject: Add lightgbm support .. Add lightgbm support Only support single-executor training at the moment. Distributed training is left for another day. Change-Id: Ia9a188ef87afc86985ac9c3e269b6665dcceca10 --- A mjolnir/training/lightgbm.py M mjolnir/utilities/training_pipeline.py M setup.py 3 files changed, 248 insertions(+), 6 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/35/403335/1 diff --git a/mjolnir/training/lightgbm.py b/mjolnir/training/lightgbm.py new file mode 100644 index 000..cbc8883 --- /dev/null +++ b/mjolnir/training/lightgbm.py @@ -0,0 +1,221 @@ +from __future__ import absolute_import +import contextlib +import functools +import hyperopt +import json +import lightgbm as lgb +import math +import mjolnir.training.hyperopt +from mjolnir.utils import as_local_paths +from multiprocessing.dummy import Pool +import numpy as np +import pyspark + + +def _overrideParamsAccordingToTaskCpus(sc, params): +n_cpus = int(sc.getConf().get("spark.task.cpus", "1")) +if 'num_threads' not in params: +params['num_threads'] = n_cpus +elif params['num_threads'] > n_cpus: +raise Exception( +"the num_threads param %d must be no larger than spark.task.cpus (%d)" % ( +params['num_threads'], n_cpus)) + + +@contextlib.contextmanager +def load_datasets(fold, train_matrix): +with as_local_paths(*fold.values) as local_paths: +datasets = dict(zip(fold.keys(), local_paths)) +try: +yield datasets +finally: +for ds in datasets.values(): +ds._free_handle() + + +def build_distributed_boosters(rdd, params, train_matrix): +def build_partition(rows): +fold = rows.next() +try: +rows.next() +raise Exception("Expected single row in partition but received more.") +except StopIteration: +pass + +num_rounds = 100 +if 'num_rounds' in params: +num_rounds = params['num_rounds'] +del params['num_rounds'] + +# TODO: Generalize +with load_datasets(fold) as datasets: +eval_results = {} +gbm = lgb.train( +params, datasets[train_matrix], +num_boost_round=num_rounds, +valid_sets=datasets.values(), valid_names=datasets.keys(), +early_stopping_rounds=None, evals_result=eval_results) +gbm.free_dataset() +yield (gbm, eval_results) + +return rdd.mapPartitions(build_partition).cache() + + +def _coerce_params(params): +types = { +'min_data_in_leaf': int, +'num_leaves': int, +} +for k, val_type in types.items(): +if k in params: +params[k] = val_type(params[k]) + + +def train(fold, paramOverrides, train_matrix=None): +sc = pyspark.SparkContext.getOrCreate() +params = { +'boosting_type': 'gbdt', +'objective': 'lambdarank', +'metric': 'ndcg', +'ndcg_eval_at': '1,3,5,10', +'is_training_metric': True, +'num_rounds': 100, +'max_bin': 255, +'num_leaves': 63, +'learning_rate': 0.1, +'feature_fraction': 1.0, +'bagging_fraction': 0.9, +'bagging_freq': 1, +'verbose': 0, +} +params.update(paramOverrides) +_overrideParamsAccordingToTaskCpus(sc, params) +_coerce_params(params) + +if (len(fold) > 1): +rdd = sc.parallelize(list(enumerate(fold)), 1).partitionBy(len(fold), lambda x: x).map(lambda x: x[1]) +raise Exception("TODO: Distributed Training") +else: +rdd = sc.parallelize(fold, 1) + +if train_matrix is None: +train_matrix = "all" if "all" in fold else "train" + +booster, metrics = build_distributed_boosters(rdd, params, train_matrix).collect()[0] +return LightGBMModel(booster, metrics) + + +class LightGBMSummary(object): +def __init__(self, metrics): +self._metrics = metrics + +def train(self): +return 
self._metrics['train']['ndcg@10'] + +def test(self): +return self._metrics['test']['ndcg@10'] + + +class LightGBMModel(object): +def __init__(self, booster, metrics): +self._booster = booster +self.metrics = metrics + +def summary(self): +return LightGBMSummary(self.metrics) + +def dump(self, features=None): +# TODO: lightgbm needs features provided when creating the dataset +return json.dumps(self._booster.dump_model()) + +def saveModelAsLocalFile(self, path): +self._booster.save_model(path) + + +def tune(folds, stats, train_matrix, num_cv_jobs=5, num_workers=5, initial_num_trees=100, final_num_trees=500): +cv_pool = None +if num_cv_jobs > 1: +cv_pool
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Simplify hyperparameter tuning
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/40 ) Change subject: Simplify hyperparameter tuning .. Simplify hyperparameter tuning I tested letting all the tuning happen at once instead of the iterative approach we were using, it went quicker and gave comparable results. This will also make it easier to add in lightgbm as an alternate training algo. Also removed use_external_memory parameter from xgboost. This is specialized and won't be necessary anymore after an upcoming refactor for file based training. Change-Id: I8cc4ee504d0e49bc61ffc5d2781e131fabe4372c --- M example_train.yaml A mjolnir/pruning.py A mjolnir/scan_es.py M mjolnir/test/fixtures/load_config/example_train.expect M mjolnir/test/training/test_hyperopt.py M mjolnir/test/training/test_tuning.py M mjolnir/training/hyperopt.py M mjolnir/training/tuning.py M mjolnir/training/xgboost.py M mjolnir/utilities/training_pipeline.py 10 files changed, 306 insertions(+), 330 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/33/40/1 diff --git a/example_train.yaml b/example_train.yaml index 183ea6e..31ce45c 100644 --- a/example_train.yaml +++ b/example_train.yaml @@ -138,7 +138,6 @@ cv-jobs: 22 folds: 3 final-trees: 100 -use-external-memory: yes medium: # 4M to 12M observations per executor. diff --git a/mjolnir/pruning.py b/mjolnir/pruning.py new file mode 100644 index 000..c2e78f9 --- /dev/null +++ b/mjolnir/pruning.py @@ -0,0 +1,134 @@ +from __future__ import absolute_import +import json +import math +from pyspark.sql import functions as F +from pyspark.sql.types import FloatType, StructField, StructType + + +class Split(object): +def __init__(self, left, right, feature, threshold): +self.left = left +self.right = right +self.feature = feature +self.threshold = threshold + +def isLeaf(self): +return False + +def eval(self, features): +n = self +while not n.isLeaf(): +if n.threshold > features[n.feature]: +n = n.left +else: +n = n.right +return n.output + + +class Leaf(object): +def __init__(self, output): +self.output = output + +def isLeaf(self): +return True + + +def _parse_node(json_node): +if 'leaf' in json_node: +return Leaf(json_node['leaf']) +else: +left = _parse_node(json_node['children'][0]) +right = _parse_node(json_node['children'][1]) +return Split(left, right, json_node['split'], json_node['split_condition']) + + +def parse_xgboost(json_tree): +return [_parse_node(tree) for tree in json.loads(json_tree)] + + +def ndcg_at_k(k, predicted, actual): +idcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(actual[:k])]) +if idcg == 0: +return 0. +else: +dcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in enumerate(predicted[:k])]) +return dcg / idcg + + +# Horrible name ... it returns the ndcg for each removed tree +def gen_per_tree_ndcg(tree_cols, removed_trees, label_col, k=10): +def f(rows): +# Remove trees from the sum +cur_sum = [reduce(lambda acc, tree: acc - row[tree], removed_trees, row.sum) for row in rows] +data = zip(rows, cur_sum) + +# TODO: actual could be pre-calculated? 
Actually full idcg could be pre-calculated +actual = [x[0][label_col] for x in sorted(data, key=lambda x: x[0][label_col], reverse=True)] +# baseline ndcg +predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1], reverse=True)] +res = [ndcg_at_k(k, predicted, actual)] +# Per-tree ndcgs +for tree_pred in tree_cols: +predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1] - x[0][tree_pred], reverse=True)] +res.append(ndcg_at_k(k, predicted, actual)) +return res +fields = [StructField(name, FloatType()) for name in ['orig'] + tree_cols] +return F.udf(f, StructType(fields)) + + +def gen_eval_tree_udf(bc_trees): +def f(tree_id, features): +return bc_trees.value[tree_id].eval(features) +return F.udf(f, FloatType()) + + +def prune(df, trees, feature_col='features', label_col='label', group_cols=['wikiid', 'query']): +# Calculate per-tree scores +eval_tree_udf = gen_eval_tree_udf(df._sc.broadcast(trees)) +cols = [eval_tree_udf(F.lit(i), feature_col).alias('tree_%d' % (i)) for i in range(len(trees))] +tree_cols = ['tree_%d' % (i) for i in range(len(trees))] + +# We should iterate until it gets worse or we hit some desired # of trees +df_w_scores = ( +df +.select(feature_col, label_col, F.concat(*group_cols).alias('group_id')) +
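The Split/Leaf classes in pruning.py rebuild a dumped xgboost tree so individual trees can be re-scored (and candidates for pruning evaluated) without going back through xgboost. Condensed, self-contained versions of those classes plus a hand-built two-level tree show the traversal rule — go left while the split threshold exceeds the feature value:

class Leaf(object):
    def __init__(self, output):
        self.output = output
    def is_leaf(self):
        return True

class Split(object):
    def __init__(self, left, right, feature, threshold):
        self.left, self.right = left, right
        self.feature, self.threshold = feature, threshold
    def is_leaf(self):
        return False
    def eval(self, features):
        node = self
        while not node.is_leaf():
            node = node.left if node.threshold > features[node.feature] else node.right
        return node.output

# Feature 0 splits at 1.0; the right branch splits again on feature 1 at 2.0.
tree = Split(Leaf(-0.5), Split(Leaf(0.1), Leaf(0.9), 1, 2.0), 0, 1.0)
assert tree.eval([0.2, 5.0]) == -0.5   # 1.0 > 0.2  -> left leaf
assert tree.eval([3.0, 1.5]) == 0.1    # right, then 2.0 > 1.5  -> left leaf
assert tree.eval([3.0, 5.0]) == 0.9    # right, then 2.0 <= 5.0 -> right leaf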
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: [WIP] distributed training for lightgbm
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403336 ) Change subject: [WIP] distributed training for lightgbm .. [WIP] distributed training for lightgbm untested. The daemon never closes right Change-Id: Id50f4f53b221003a89555e870bb771ba26faad21 --- M mjolnir/training/lightgbm.py 1 file changed, 267 insertions(+), 81 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/36/403336/1 diff --git a/mjolnir/training/lightgbm.py b/mjolnir/training/lightgbm.py index cbc8883..460740b 100644 --- a/mjolnir/training/lightgbm.py +++ b/mjolnir/training/lightgbm.py @@ -9,7 +9,11 @@ from mjolnir.utils import as_local_paths from multiprocessing.dummy import Pool import numpy as np +import Pyro4 import pyspark +import socket +import threading +import time def _overrideParamsAccordingToTaskCpus(sc, params): @@ -33,7 +37,9 @@ ds._free_handle() -def build_distributed_boosters(rdd, params, train_matrix): +def build_distributed_boosters(rdd, params, train_matrix, client): +num_partitions = rdd.getNumPartitions() + def build_partition(rows): fold = rows.next() try: @@ -47,7 +53,11 @@ num_rounds = params['num_rounds'] del params['num_rounds'] -# TODO: Generalize +if client is not None: +machines, listen_port = client.request_machine_list(num_partitions) +params['machines'] = machines +params['local_listen_port'] = listen_port + with load_datasets(fold) as datasets: eval_results = {} gbm = lgb.train( @@ -71,7 +81,7 @@ params[k] = val_type(params[k]) -def train(fold, paramOverrides, train_matrix=None): +def train(fold, paramOverrides, train_matrix=None, client=None): sc = pyspark.SparkContext.getOrCreate() params = { 'boosting_type': 'gbdt', @@ -95,13 +105,15 @@ if (len(fold) > 1): rdd = sc.parallelize(list(enumerate(fold)), 1).partitionBy(len(fold), lambda x: x).map(lambda x: x[1]) raise Exception("TODO: Distributed Training") +if client is None: +raise Exception("client required for distributed training") else: rdd = sc.parallelize(fold, 1) if train_matrix is None: train_matrix = "all" if "all" in fold else "train" -booster, metrics = build_distributed_boosters(rdd, params, train_matrix).collect()[0] +booster, metrics = build_distributed_boosters(rdd, params, train_matrix, client).collect()[0] return LightGBMModel(booster, metrics) @@ -132,90 +144,264 @@ self._booster.save_model(path) +DAEMON_PORT = 6827 + + def tune(folds, stats, train_matrix, num_cv_jobs=5, num_workers=5, initial_num_trees=100, final_num_trees=500): cv_pool = None if num_cv_jobs > 1: cv_pool = Pool(num_cv_jobs) -# Configure the trials pool large enough to keep cv_pool full -num_folds = len(folds) -num_workers = len(folds[0]) -trials_pool_size = int(math.floor(num_cv_jobs / (num_workers * num_folds))) -if trials_pool_size > 1: -trials_pool = Pool(trials_pool_size) -else: -trials_pool = None +with Daemon(socket.gethostname(), DAEMON_PORT) as daemon: +while not daemon.ready: +time.sleep(1) -train_func = functools.partial(train, train_matrix=train_matrix) +# Configure the trials pool large enough to keep cv_pool full +num_folds = len(folds) +num_workers = len(folds[0]) +trials_pool_size = int(math.floor(num_cv_jobs / (num_workers * num_folds))) +if trials_pool_size > 1: +trials_pool = Pool(trials_pool_size) +else: +trials_pool = None -def eval_space(space, max_evals): -max_evals = 2 # TODO: remove -best, trials = mjolnir.training.hyperopt.minimize( -folds, train_func, space, max_evals=max_evals, -cv_pool=cv_pool, trials_pool=trials_pool) -for k, v in space.items(): -if not 
np.isscalar(v): -print 'best %s: %f' % (k, best[k]) -return best, trials +kwargs = {'train_matrix': train_matrix} +if num_workers > 1: +kwargs['client'] = Client(daemon.url) +train_func = functools.partial(train, **kwargs) -space = { -'boosting_type': 'gbdt', -'objective': 'lambdarank', -'metric': 'ndcg', -'ndcg_eval_at': '1,3,10', -'is_training_metric': True, -'num_rounds': initial_num_trees, -'max_bin': 255, -'num_leaves': 63, -'learning_rate': 0.1, -'feature_fraction': 1.0, -'bagging_fraction': 0.9, -'bagging_freq': 1, -} -tune_spaces = [ -('initial', { -'iterations': 5, -'space': { -
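For context on the machines/local_listen_port handshake the lightgbm change above needs: the Daemon and Client classes it references are not included in this hunk, so the following is only a sketch, under assumptions, of how a Pyro4-based coordinator could serve client.request_machine_list(). The class and helper names, the listen port, and the explicit worker_host argument are all assumptions (the patch's method takes only the partition count, so its host discovery must happen elsewhere).

import threading
import Pyro4

LIGHTGBM_LISTEN_PORT = 12400  # assumed port for lightgbm worker-to-worker traffic

@Pyro4.expose
class MachineListCoordinator(object):
    """Collects worker hosts until every partition has checked in, then hands
    all of them the same comma separated machine list lightgbm expects."""

    def __init__(self):
        self._cond = threading.Condition()
        self._hosts = []

    def request_machine_list(self, worker_host, num_partitions):
        with self._cond:
            if worker_host not in self._hosts:
                self._hosts.append(worker_host)
                self._cond.notify_all()
            while len(self._hosts) < num_partitions:
                self._cond.wait()
            machines = ','.join('%s:%d' % (h, LIGHTGBM_LISTEN_PORT) for h in self._hosts)
        return machines, LIGHTGBM_LISTEN_PORT

def start_coordinator(host, port):
    # Run the Pyro daemon in a background thread; workers connect via the returned URI.
    daemon = Pyro4.Daemon(host=host, port=port)
    uri = daemon.register(MachineListCoordinator(), objectId='mjolnir.coordinator')
    thread = threading.Thread(target=daemon.requestLoop)
    thread.daemon = True
    thread.start()
    return str(uri)

# On an executor the client side would then look roughly like:
#   machines, port = Pyro4.Proxy(uri).request_machine_list(my_hostname, num_partitions)
#   params['machines'] = machines
#   params['local_listen_port'] = port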
[MediaWiki-commits] [Gerrit] search/xgboost[master]: [WIP] Merge remote-tracking branch 'upstream/master'
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/402114 ) Change subject: [WIP] Merge remote-tracking branch 'upstream/master' .. [WIP] Merge remote-tracking branch 'upstream/master' Conflicts: jvm-packages/pom.xml jvm-packages/xgboost4j-spark/pom.xml jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala jvm-packages/xgboost4j/pom.xml Change-Id: I1ae675ee924579623f2cf5d5fc4b797c84e56d0c --- M jvm-packages/pom.xml M jvm-packages/xgboost4j-spark/pom.xml M jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala M jvm-packages/xgboost4j/pom.xml 4 files changed, 15 insertions(+), 101 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/xgboost refs/changes/14/402114/1 diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 130505d..0fab33d 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -4,11 +4,7 @@ ml.dmlc xgboost-jvm -<<< HEAD (9bdbdc Add unique tag to log instances in RabitTracker) -0.7-wmf-2-SNAPSHOT -=== -0.8-SNAPSHOT ->>> BRANCH (14c639 [jvm-packages] add dev script to update version and update v) +0.8-wmf-1-SNAPSHOT pom UTF-8 diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index 3532a91..5f02dd7 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -4,11 +4,7 @@ ml.dmlc xgboost-jvm -<<< HEAD (9bdbdc Add unique tag to log instances in RabitTracker) -0.7-wmf-2-SNAPSHOT -=== -0.8-SNAPSHOT ->>> BRANCH (14c639 [jvm-packages] add dev script to update version and update v) +0.8-wmf-1-SNAPSHOT xgboost4j-spark diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 053fbbb..2ff1ddf 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -16,17 +16,12 @@ package ml.dmlc.xgboost4j.scala.spark -<<< HEAD (9bdbdc Add unique tag to log instances in RabitTracker) import java.io.ByteArrayInputStream import java.util.concurrent.TimeUnit - -import scala.collection.mutable -import scala.concurrent.duration.Duration -=== import java.io.File import scala.collection.mutable ->>> BRANCH (14c639 [jvm-packages] add dev script to update version and update v) +import scala.concurrent.duration.Duration import scala.util.Random import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker} import ml.dmlc.xgboost4j.scala.rabit.RabitTracker @@ -38,7 +33,6 @@ import org.apache.spark.sql.Dataset import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext} - /** @@ -121,23 +115,11 @@ obj: ObjectiveTrait, eval: EvalTrait, useExternalMemory: Boolean, -<<< HEAD (9bdbdc Add unique tag to log instances in RabitTracker) - missing: Float): RDD[Array[Byte]] = { -val partitionedData = if (data.getNumPartitions != numWorkers) { - logger.info(s"repartitioning training set to $numWorkers partitions") - data.repartition(numWorkers) -} else { - data -} -val partitionedBaseMargin = partitionedData.map(_.baseMargin) -val appName = partitionedData.context.appName -=== missing: Float, prevBooster: Booster -): RDD[(Booster, Map[String, Array[Float]])] = { +): RDD[(Int, Array[Byte], Map[String, Array[Float]])] = { val 
partitionedBaseMargin = data.map(_.baseMargin) ->>> BRANCH (14c639 [jvm-packages] add dev script to update version and update v) // to workaround the empty partitions in training dataset, // this might not be the best efficient implementation, see // (https://github.com/dmlc/xgboost/issues/1277) @@ -157,42 +139,28 @@ } else { None } -<<< HEAD (9bdbdc Add unique tag to log instances in RabitTracker) // Yes it's odd to access this but not do anything. We are ensuring the lazily // initialized resource monitor is setup before we enter training. monitor - rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString) -=== rabitEnv.put("DMLC_TASK_ID", taskId) ->>> BRANCH (14c639 [jvm-packages] add dev script to update version and update v) Rabit.init(rabitEnv) val watches = Watches(params, -<<< HEAD (9bdbdc Add unique tag to log instances in RabitTracker) -fromDenseToSparseLabeledPoints(labeledPoints, missing), -
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Revert "Add backend support for the new crossproject result ...
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/401795 ) Change subject: Revert "Add backend support for the new crossproject result page A/B test" .. Revert "Add backend support for the new crossproject result page A/B test" This test is over, we don't need the ability to request but throw away interwiki results to estimate recall and such anymore. The related feature-marker, enable-new-crossproject-page, is removed from core in the related patch. This reverts commit b43efa1dbe8d257c6d31a4e350dbcc6d1723e9fb. Related: I80d8375b Change-Id: I9a4884386a5c15852af15942ff51de60d9355858 --- M CirrusSearch.php M includes/CirrusSearch.php 2 files changed, 0 insertions(+), 31 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/95/401795/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index a2d3461..3357283 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -1062,22 +1062,6 @@ $wgCirrusSearchCrossProjectProfiles = []; /** - * When wgCirrusSearchEnableCrossProjectSearch is true - * Setting wgCirrusSearchHideCrossProjectResults will - * tell SpecialSearch to run normally without displaying - * interwiki results. - * Useful to report how many results we could have been - * displayed (For analytics purpose). - */ -$wgCirrusSearchHideCrossProjectResults = false; - -/** - * Informs SpeciaSearch in core that we want - * to use the new cross project result page - */ -$wgCirrusSearchNewCrossProjectPage = false; - -/** * Enables the explore similar feature for search results * which adds links to related pages (morelike), categories and * languages beside each search result on the SERP. diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php index 40e759b..f41c9b3 100644 --- a/includes/CirrusSearch.php +++ b/includes/CirrusSearch.php @@ -407,34 +407,19 @@ $iwSearch = new InterwikiSearcher( $this->connection, $config, $this->namespaces, null, $highlightingConfig ); $iwSearch->setOptionsFromRequest( $this->request ); $interwikiResults = $iwSearch->getInterwikiResults( $term ); - if ( $interwikiResults !== null ) { // If we are dumping we need to convert into an array that can be appended to - $recallMetrics = []; if ( $iwSearch->isReturnRaw() ) { $result = [ $result ]; } foreach ( $interwikiResults as $interwiki => $interwikiResult ) { - $recallMetrics[$interwiki] = "$interwiki:0"; if ( $iwSearch->isReturnRaw() ) { $result[] = $interwikiResult; } elseif ( $interwikiResult && $interwikiResult->numRows() > 0 ) { - $recallMetrics[$interwiki] = "$interwiki:" . 
$interwikiResult->getTotalHits(); - // Hide the search results, we are only - // running the query for analytic purposes - if ( $this->config->get( 'CirrusSearchHideCrossProjectResults' ) ) { - continue; - } $result->addInterwikiResults( $interwikiResult, SearchResultSet::SECONDARY_RESULTS, $interwiki ); } - } - $this->extraSearchMetrics['wgCirrusSearchCrossProjectRecall'] = implode( '|', $recallMetrics ); - if ( $this->config->get( 'CirrusSearchNewCrossProjectPage' ) && - !$this->config->get( 'CirrusSearchHideCrossProjectResults' ) ) { - $this->features['enable-new-crossproject-page'] = true; - $this->features['show-multimedia-search-results'] = $this->config->get( 'CirrusSearchCrossProjectShowMultimedia' ); } } } -- To view, visit https://gerrit.wikimedia.org/r/401795 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I9a4884386a5c15852af15942ff51de60d9355858 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson
[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Remove old interwiki search result widget
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/401794 ) Change subject: Remove old interwiki search result widget .. Remove old interwiki search result widget This code is no longer necessary, if interwiki search is on it should use the new default which includes highlights. Change-Id: I80d8375bbd3e1fabc9b2432b6875d17a96aee099 Related: I9a488438 --- M includes/specials/SpecialSearch.php D includes/widget/search/SimpleSearchResultSetWidget.php D includes/widget/search/SimpleSearchResultWidget.php 3 files changed, 8 insertions(+), 210 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/94/401794/1 diff --git a/includes/specials/SpecialSearch.php b/includes/specials/SpecialSearch.php index b3a58cb..789ab4d 100644 --- a/includes/specials/SpecialSearch.php +++ b/includes/specials/SpecialSearch.php @@ -28,8 +28,6 @@ use MediaWiki\Widget\Search\FullSearchResultWidget; use MediaWiki\Widget\Search\InterwikiSearchResultWidget; use MediaWiki\Widget\Search\InterwikiSearchResultSetWidget; -use MediaWiki\Widget\Search\SimpleSearchResultWidget; -use MediaWiki\Widget\Search\SimpleSearchResultSetWidget; /** * implements Special:Search - Run text & title search and display the output @@ -394,24 +392,14 @@ $linkRenderer = $this->getLinkRenderer(); $mainResultWidget = new FullSearchResultWidget( $this, $linkRenderer ); - if ( $search->getFeatureData( 'enable-new-crossproject-page' ) ) { - $sidebarResultWidget = new InterwikiSearchResultWidget( $this, $linkRenderer ); - $sidebarResultsWidget = new InterwikiSearchResultSetWidget( - $this, - $sidebarResultWidget, - $linkRenderer, - MediaWikiServices::getInstance()->getInterwikiLookup(), - $search->getFeatureData( 'show-multimedia-search-results' ) - ); - } else { - $sidebarResultWidget = new SimpleSearchResultWidget( $this, $linkRenderer ); - $sidebarResultsWidget = new SimpleSearchResultSetWidget( - $this, - $sidebarResultWidget, - $linkRenderer, - MediaWikiServices::getInstance()->getInterwikiLookup() - ); - } + $sidebarResultWidget = new InterwikiSearchResultWidget( $this, $linkRenderer ); + $sidebarResultsWidget = new InterwikiSearchResultSetWidget( + $this, + $sidebarResultWidget, + $linkRenderer, + MediaWikiServices::getInstance()->getInterwikiLookup(), + $search->getFeatureData( 'show-multimedia-search-results' ) + ); $widget = new BasicSearchResultSetWidget( $this, $mainResultWidget, $sidebarResultsWidget ); diff --git a/includes/widget/search/SimpleSearchResultSetWidget.php b/includes/widget/search/SimpleSearchResultSetWidget.php deleted file mode 100644 index d6583a3..000 --- a/includes/widget/search/SimpleSearchResultSetWidget.php +++ /dev/null @@ -1,130 +0,0 @@ -specialSearch = $specialSearch; - $this->resultWidget = $resultWidget; - $this->linkRenderer = $linkRenderer; - $this->iwLookup = $iwLookup; - } - - /** -* @param string $term User provided search term -* @param SearchResultSet|SearchResultSet[] $resultSets List of interwiki -* results to render. 
-* @return string HTML -*/ - public function render( $term, $resultSets ) { - if ( !is_array( $resultSets ) ) { - $resultSets = [ $resultSets ]; - } - - $this->loadCustomCaptions(); - - $iwResults = []; - foreach ( $resultSets as $resultSet ) { - $result = $resultSet->next(); - while ( $result ) { - if ( !$result->isBrokenTitle() ) { - $iwResults[$result->getTitle()->getInterwiki()][] = $result; - } - $result = $resultSet->next(); - } - } - - $out = ''; - foreach ( $iwResults as $iwPrefix => $results ) { - $out .= $this->headerHtml( $iwPrefix, $term ); - $out .= ""; - // TODO: Assumes interwiki results are never paginated - $position = 0; - foreach ( $results as $result ) { -
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Bring back human search relevance survey
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/401631 ) Change subject: Bring back human search relevance survey .. Bring back human search relevance survey This survey was temporarily turned off while evaluating the results of the first round. We are getting ready to run a second survey, this time with queries we don't know the relevance of (the final goal of the surveys), and need this code available to be updated before shipping out to users. Bug: T184019 Change-Id: I036c611d93e0f3f20f992ab89ab79b8e8d738826 --- A modules/all/ext.wikimediaEvents.humanSearchRelevance.css A modules/all/ext.wikimediaEvents.humanSearchRelevance.js 2 files changed, 150 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/31/401631/1 diff --git a/modules/all/ext.wikimediaEvents.humanSearchRelevance.css b/modules/all/ext.wikimediaEvents.humanSearchRelevance.css new file mode 100644 index 000..41594a0 --- /dev/null +++ b/modules/all/ext.wikimediaEvents.humanSearchRelevance.css @@ -0,0 +1,10 @@ +/* Needs some extra specificity to override `.mw-body p` */ +.mw-notification-content .mw-wme-humanrel-question { + margin: 0; +} + +.mw-wme-humanrel-question > span, +.mw-wme-humanrel-question > small { + display: inline-block; + margin-top: 0.5em; +} diff --git a/modules/all/ext.wikimediaEvents.humanSearchRelevance.js b/modules/all/ext.wikimediaEvents.humanSearchRelevance.js new file mode 100644 index 000..5803354 --- /dev/null +++ b/modules/all/ext.wikimediaEvents.humanSearchRelevance.js @@ -0,0 +1,140 @@ +( function ( mw, $ ) { + 'use strict'; + + var config; + + function sample( acceptPercentage ) { + var rand = mw.user.generateRandomSessionId(), + // take the first 52 bits of the rand value to match js + // integer precision + parsed = parseInt( rand.slice( 0, 13 ), 16 ); + if ( acceptPercentage >= 1 ) { + return true; + } + return parsed / Math.pow( 2, 52 ) < acceptPercentage; + } + + function chooseOne( options ) { + var rand = mw.user.generateRandomSessionId(), + parsed = parseInt( rand.slice( 0, 13 ), 16 ), + step = Math.pow( 2, 52 ) / options.length; + return options[ Math.floor( parsed / step ) ]; + } + + // See https://developer.mozilla.org/en-US/docs/Web/API/Navigator/doNotTrack + // Taken from https://www.npmjs.com/package/dnt-polyfill + if ( window.doNotTrack === '1' || + navigator.doNotTrack === '1' || + navigator.doNotTrack === 'yes' || + navigator.msDoNotTrack === '1' + ) { + return; + } + + // Page is not part of this test + if ( !mw.config.exists( 'wgWMESearchRelevancePages' ) ) { + return; + } + + // The config value is coded into the page output and cached in varnish. + // That means any changes to sampling rates or pages chosen will take up to + // a week to propogate into the wild. 
+ config = mw.config.get( 'wgWMESearchRelevancePages' ); + + // bad configuration + if ( !config.hasOwnProperty( 'sampleRate' ) || !config.hasOwnProperty( 'queries' ) ) { + return; + } + + // This page view not chosen for sampling + if ( !sample( config.sampleRate ) ) { + return; + } + + function askQuestion() { + mw.loader.using( [ + 'oojs-ui-core', + 'mediawiki.notification', + 'ext.wikimediaEvents.humanSearchRel' + ] ).then( function () { + var notification, originalClose, + closed = false, + query = chooseOne( config.queries ), + question = 'wikimediaevents-humanrel-question-' + chooseOne( [ 'a', 'b', 'c', 'd' ] ), + logEvent = function ( choice ) { + if ( !closed ) { + closed = true; + notification.close(); + } + mw.loader.using( [ 'schema.HumanSearchRelevance' ] ).then( function () { + mw.eventLog.logEvent( 'HumanSearchRelevance', { + articleId: mw.config.get( 'wgArticleId' ), + query: query, + choice: choice, +
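As a worked illustration of the sampling helpers in the JavaScript above (this Python is not part of the patch): both helpers treat the first 13 hex digits of a random session id as a 52-bit uniform draw, matching JavaScript's float precision.

import random

def _uniform_from_hex(session_id):
    # First 13 hex chars = 52 bits, divided by 2**52 gives a value in [0, 1).
    return int(session_id[:13], 16) / float(2 ** 52)

def sample(accept_percentage, session_id):
    """Accept this page view with probability accept_percentage (0..1)."""
    if accept_percentage >= 1:
        return True
    return _uniform_from_hex(session_id) < accept_percentage

def choose_one(options, session_id):
    """Deterministically map the session id onto one of the options."""
    step = 2 ** 52 / float(len(options))
    return options[int(int(session_id[:13], 16) // step)]

# e.g. sample(0.01, '%032x' % random.getrandbits(128)) accepts roughly 1% of sessions.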
[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Push pagination decision for prefix search into SearchEngine
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/399312 ) Change subject: Push pagination decision for prefix search into SearchEngine .. Push pagination decision for prefix search into SearchEngine Various code using the search engine shouldn't need to implement it's own methods, such as over-fetching, to determine if there are more results available. This should be knowledge internal to search that is exposed by a boolean. Full-text search unfortunately does the same thing, but fixing it is delegated to some future patch. Change-Id: Ica094428700637dfdedb723b03f6aeadfe12b9f4 --- M includes/api/ApiQueryPrefixSearch.php M includes/api/SearchApi.php M includes/search/SearchEngine.php M includes/search/SearchSuggestionSet.php M tests/phpunit/includes/search/SearchEnginePrefixTest.php 5 files changed, 116 insertions(+), 47 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/12/399312/1 diff --git a/includes/api/ApiQueryPrefixSearch.php b/includes/api/ApiQueryPrefixSearch.php index 2fbc518..aaf81b0 100644 --- a/includes/api/ApiQueryPrefixSearch.php +++ b/includes/api/ApiQueryPrefixSearch.php @@ -51,7 +51,12 @@ $offset = $params['offset']; $searchEngine = $this->buildSearchEngine( $params ); - $titles = $searchEngine->extractTitles( $searchEngine->completionSearchWithVariants( $search ) ); + $suggestions = $searchEngine->completionSearchWithVariants( $search ); + $titles = $searchEngine->extractTitles( $suggestions ); + + if ( $suggestions->hasMoreResults() ) { + $this->setContinueEnumParameter( 'offset', $offset + $limit ); + } if ( $resultPageSet ) { $resultPageSet->setRedirectMergePolicy( function ( array $current, array $new ) { @@ -60,10 +65,6 @@ } return $current; } ); - if ( count( $titles ) > $limit ) { - $this->setContinueEnumParameter( 'offset', $offset + $limit ); - array_pop( $titles ); - } $resultPageSet->populateFromTitles( $titles ); foreach ( $titles as $index => $title ) { $resultPageSet->setGeneratorData( $title, [ 'index' => $index + $offset + 1 ] ); @@ -72,10 +73,6 @@ $result = $this->getResult(); $count = 0; foreach ( $titles as $title ) { - if ( ++$count > $limit ) { - $this->setContinueEnumParameter( 'offset', $offset + $limit ); - break; - } $vals = [ 'ns' => intval( $title->getNamespace() ), 'title' => $title->getPrefixedText(), diff --git a/includes/api/SearchApi.php b/includes/api/SearchApi.php index f7c6471..fb6b635 100644 --- a/includes/api/SearchApi.php +++ b/includes/api/SearchApi.php @@ -157,15 +157,7 @@ $searchEngine = MediaWikiServices::getInstance()->getSearchEngineFactory()->create( $type ); $limit = $params['limit']; $searchEngine->setNamespaces( $params['namespace'] ); - $offset = null; - if ( isset( $params['offset'] ) ) { - // If the API supports offset then it probably - // wants to fetch limit+1 so it can check if - // more results are available to properly set - // the continue param - $offset = $params['offset']; - $limit += 1; - } + $offset = isset( $params['offset'] ) ? $params['offset'] : null; $searchEngine->setLimitOffset( $limit, $offset ); // Initialize requested search profiles. 
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php index 3c8fe60..6876099 100644 --- a/includes/search/SearchEngine.php +++ b/includes/search/SearchEngine.php @@ -517,7 +517,15 @@ return SearchSuggestionSet::emptySuggestionSet(); // Return empty result } $search = $this->normalizeNamespaces( $search ); - return $this->processCompletionResults( $search, $this->completionSearchBackend( $search ) ); + // Over-fetch results so we can determine if pagination is possible in + //
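The over-fetch pattern the commit message above describes, sketched in Python with assumed names (the real code is the PHP hunk): the engine privately asks its backend for limit+1 results, trims back to limit, and exposes the overflow as a boolean so API callers no longer have to over-fetch themselves.

def completion_search(backend, query, limit, offset=0):
    # Ask for one extra row purely to detect whether another page exists.
    rows = backend.search(query, limit=limit + 1, offset=offset)
    has_more = len(rows) > limit
    return rows[:limit], has_more

class FakeBackend(object):
    """Stand-in backend returning canned suggestions, for demonstration only."""
    def __init__(self, titles):
        self._titles = titles

    def search(self, query, limit, offset):
        matches = [t for t in self._titles if t.startswith(query)]
        return matches[offset:offset + limit]

if __name__ == '__main__':
    backend = FakeBackend(['Main Page', 'Main Street', 'Main article', 'Maine'])
    titles, has_more = completion_search(backend, 'Main', limit=2)
    # has_more is True here, so an API layer would emit a continue/offset parameter.
    print(titles, has_more)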
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Don't discount file searches on commonswiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/398394 ) Change subject: Dont discount file searches on commonswiki .. Dont discount file searches on commonswiki Change-Id: I5c1d7493ce68c5afe4d6f21c25b25112283924be --- M wmf-config/InitialiseSettings.php 1 file changed, 3 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/94/398394/1 diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index c355b31..81bab5d 100644 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18679,6 +18679,9 @@ 104 => 0.9, 106 => 0.9, ], + 'commonswiki' => [ + 6 => 1.0, + ], 'wikisource' => [ 'author' => 1, ], -- To view, visit https://gerrit.wikimedia.org/r/398394 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5c1d7493ce68c5afe4d6f21c25b25112283924be Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
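A hedged illustration of what the config hunk above does, assuming (as the setting's shape suggests) that per-namespace weights scale the score of pages in that namespace; this is not CirrusSearch's actual scoring code. Namespace 6 is File, so giving it a weight of 1.0 on commonswiki stops discounting file pages there.

DEFAULT_WEIGHTS = {104: 0.9, 106: 0.9}  # values taken from the surrounding hunk
COMMONSWIKI_WEIGHTS = dict(DEFAULT_WEIGHTS)
COMMONSWIKI_WEIGHTS[6] = 1.0  # File namespace no longer discounted

def weighted_score(base_score, namespace, weights):
    # Namespaces absent from the map keep their unmodified score.
    return base_score * weights.get(namespace, 1.0)

# weighted_score(10.0, 6, COMMONSWIKI_WEIGHTS) -> 10.0 with the override above.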
[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Silently drop unknown titles in completion search
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/398192 ) Change subject: Silently drop unknown titles in completion search .. Silently drop unknown titles in completion search This mimics how full text works by silenty dropping results returned from search that no longer exist. This could be because the search index is slightly out of sync with reality, or the search engine could simply be broken. Only silent from the users perspective. We maintain a count in statsd of the number of titles dropped. This can be monitored over time to recognize any increases. Bug: T115756 Change-Id: I2f29d73e258cd448a14d35a2b4902a4fb6f61c68 --- M includes/search/SearchEngine.php M includes/search/SearchSuggestionSet.php 2 files changed, 31 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/92/398192/1 diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php index 3c8fe60..94e0d80 100644 --- a/includes/search/SearchEngine.php +++ b/includes/search/SearchEngine.php @@ -580,6 +580,16 @@ $lb->setCaller( __METHOD__ ); $lb->execute(); + $before = $suggestions->count(); + $suggestions = $suggestions->filter( function ( SearchSuggestion $sugg ) { + return $sugg->getSuggestedTitle()->isKnown(); + } ); + $after = $suggestions->count(); + if ( $before !== $after ) { + MediaWikiServices::getInstance()->getStatsdDataFactory() + ->updateCount( 'search.completion.missing', $before - $after ); + } + $results = $suggestions->map( function ( SearchSuggestion $sugg ) { return $sugg->getSuggestedTitle()->getPrefixedText(); } ); diff --git a/includes/search/SearchSuggestionSet.php b/includes/search/SearchSuggestionSet.php index aced5e1..7c4b484 100644 --- a/includes/search/SearchSuggestionSet.php +++ b/includes/search/SearchSuggestionSet.php @@ -23,7 +23,7 @@ * A set of search suggestions. * The set is always ordered by score, with the best match first. */ -class SearchSuggestionSet { +class SearchSuggestionSet implements Countable { /** * @var SearchSuggestion[] */ @@ -73,6 +73,19 @@ return array_map( $callback, $this->suggestions ); } + /** +* Filter the suggestions array +* @param callback $callback +* @return self +*/ + public function filter( $callback ) { + $suggestions = array_filter( $this->suggestions, $callback ); + if ( count( $suggestions ) === count( $this->suggestions ) ) { + return $this; + } else { + return new self( $suggestions ); + } + } /** * Add a new suggestion at the end. * If the score of the new suggestion is greater than the worst one, @@ -171,6 +184,13 @@ } /** +* @return int The number of suggestions held +*/ + public function count() { + return count( $this->suggestions ); + } + + /** * Builds a new set of suggestion based on a title array. * Useful when using a backend that supports only Titles. * -- To view, visit https://gerrit.wikimedia.org/r/398192 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2f29d73e258cd448a14d35a2b4902a4fb6f61c68 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
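The pattern the change above applies, restated as a small Python sketch with assumed names: drop suggestions whose titles are no longer known, and record how many were dropped so the rate can be watched for regressions instead of surfacing errors to users.

def filter_known(suggestions, title_is_known, stats):
    before = len(suggestions)
    kept = [s for s in suggestions if title_is_known(s)]
    dropped = before - len(kept)
    if dropped:
        # Mirrors the statsd counter bump; here stats is just a dict.
        stats['search.completion.missing'] = stats.get('search.completion.missing', 0) + dropped
    return kept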
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Remove duplicate uploads in integration tests
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/398129 ) Change subject: Remove duplicate uploads in integration tests .. Remove duplicate uploads in integration tests The same files were being uploaded, in slightly different ways, from two different hooks. Unify them into a single hook. I think the recent errors on cindy were due to the ordering of the execution of these two hooks, if one came before the other things would fail. I'm not entirely sure why though, there is something odd with file uploads but it's not clear to me what it is ... Change-Id: I5b780033ff61b1c06016d9b9c8840cdf0a0b9fbb --- M tests/integration/features/support/hooks.js 1 file changed, 20 insertions(+), 34 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/29/398129/1 diff --git a/tests/integration/features/support/hooks.js b/tests/integration/features/support/hooks.js index c6cf80c..fa7d0ac 100644 --- a/tests/integration/features/support/hooks.js +++ b/tests/integration/features/support/hooks.js @@ -280,21 +280,27 @@ } } ) ); - BeforeOnce( { tags: "@filesearch" }, Promise.coroutine( function* () { - // Unfortunatly the current deduplication between wikis requires a file - // be uploaded to commons before it's uploaded to any other wiki, or the - // other wiki isn't tagged. - yield runBatch( this, 'commons', [ - job.upload( "DuplicatedLocally.svg", "File stored on commons and duplicated locally" ), - job.upload( "OnCommons.svg", "File stored on commons for test purposes" ), - ] ); + BeforeOnce( { tags: "@filesearch or @setup_main or @commons" }, Promise.coroutine( function* () { + yield runBatch( this, 'commons', { + delete: [ + 'File:OnCommons.svg', + 'File:DuplicatedLocally.svg', + ] + } ); + yield runBatch( this, false, { + delete: [ 'File:DuplicatedLocally.svg' ] + } ); + yield runBatch( this, 'commons', [ + // TODO: Why is overwrite necessary here? Otherwise the upload is rejected + // with was-deleted or some such? + job.uploadOverwrite( 'OnCommons.svg', "File stored on commons for test purposes" ), + job.uploadOverwrite( 'DuplicatedLocally.svg', 'File stored on commons and duplicated locally' ), + ] ); + // For duplications to track correctly commons has to be uploaded first. This is a bug + // in cirrus, but no current plans to fix. yield runBatch( this, false, [ - job.upload( 'No_SVG.svg', "[[Category:Red circle with left slash]]" ), - job.upload( 'Somethingelse_svg_SVG.svg', "[[Category:Red circle with left slash]]" ), - job.upload( 'Savepage-greyed.png', "Screenshot, for test purposes, associated with https://bugzilla.wikimedia.org/show_bug.cgi?id=52908 ." ), - job.upload( 'DuplicatedLocally.svg', "Locally stored file duplicated on commons" ), - job.delete( 'File:Frozen.svg' ), + job.uploadOverwrite( 'DuplicatedLocally.svg','Locally stored file duplicated on commons' ) ] ); } ) ); @@ -490,6 +496,7 @@ yield runBatch(this, false, { delete: [ 'File:Linux_Distribution_Timeline_text_version.pdf', + 'File:Frozen.svg', ] }); yield runBatch(this, false, [ @@ -635,27 +642,6 @@ } ) ); BeforeOnce( { tags: "@setup_main or @commons" }, Promise.coroutine( function* () { - yield runBatch( this, 'commons', { - delete: [ - 'File:OnCommons.svg', - 'File:DuplicatedLocally.svg', - ] - } ); - yield runBatch( this, false, { - delete: [ 'File:DuplicatedLocally.svg' ] - } ); - - yield runBatch( this, 'commons', [ - // TODO: Why is overwrite necessary here? Otherwise the upload is rejected - // with was-deleted or some such? 
- job.uploadOverwrite( 'OnCommons.svg', "File stored on commons for test purposes" ), - job.uploadOverwrite( 'DuplicatedLocally.svg', 'File stored on commons and duplicated locally' ), - ] ); - // For duplications to
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Resolve redirect namespaces from source docs in fancy title ...
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/398124 ) Change subject: Resolve redirect namespaces from source docs in fancy title results type .. Resolve redirect namespaces from source docs in fancy title results type Rather than figuring out an appropriate namespace for redirects we used the namespace of the document redirected to which is regularly wrong, especially for 'shorthand' redirects on wikis such as CAT:PROD on enwiki in NS_MAIN which redirects to Category:Proposed Deletion (any many other similar shortcuts). Dig through the redirects stored with the document and figure out what the likely namespace of the document is. Change-Id: I8de5c9d35ed709ee100cc3ad8093e49a7a5476d3 --- M includes/CompletionSuggester.php M includes/Search/ResultsType.php M tests/unit/Search/ResultsTypeTest.php 3 files changed, 171 insertions(+), 17 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/24/398124/1 diff --git a/includes/CompletionSuggester.php b/includes/CompletionSuggester.php index e553aca..bfcf045 100644 --- a/includes/CompletionSuggester.php +++ b/includes/CompletionSuggester.php @@ -283,9 +283,10 @@ // they'll be forgotten in client response $score = $collector->getMinScore() !== null ? $collector->getMinScore() - 1 : count( $prefixResults->getResults() ); + $namespaces = $this->searchContext->getNamespaces(); foreach ( $prefixResults->getResults() as $res ) { $pageId = $this->config->makePageId( $res->getId() ); - $title = FancyTitleResultsType::chooseBestTitleOrRedirect( $rType->transformOneElasticResult( $res ) ); + $title = FancyTitleResultsType::chooseBestTitleOrRedirect( $rType->transformOneElasticResult( $res, $namespaces ) ); if ( $title === false ) { continue; } diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php index 372ef36..39faeff 100644 --- a/includes/Search/ResultsType.php +++ b/includes/Search/ResultsType.php @@ -128,6 +128,10 @@ $this->matchedAnalyzer = $matchedAnalyzer; } + public function getSourceFiltering() { + return [ 'namespace', 'title', 'namespace_text', 'wiki', 'redirect' ]; + } + /** * @param array $highlightSource * @return array|null @@ -223,11 +227,12 @@ * Transform a result from elastic into an array of Titles. * * @param \Elastica\Result $r +* @param int[] $namespaces Prefer * @return \Title[] with the following keys : * titleMatch => a title if the title matched * redirectMatches => an array of redirect matches, one per matched redirect */ - public function transformOneElasticResult( \Elastica\Result $r ) { + public function transformOneElasticResult( \Elastica\Result $r, array $namespaces = [] ) { $title = TitleHelper::makeTitle( $r ); $highlights = $r->getHighlights(); $resultForTitle = []; @@ -251,21 +256,16 @@ $highlights["redirect.title.{$this->matchedAnalyzer}_asciifolding"] ); } if ( count( $redirectHighlights ) !== 0 ) { - foreach ( $redirectHighlights as $redirectTitle ) { - // The match was against a redirect so we should replace the $title with one that - // represents the redirect. - // The first step is to strip the actual highlighting from the title. - $redirectTitle = str_replace( [ Searcher::HIGHLIGHT_PRE, Searcher::HIGHLIGHT_POST ], - '', $redirectTitle ); - - // Instead of getting the redirect's real namespace we're going to just use the namespace - // of the title. This is not great but OK given that we can't find cross namespace - // redirects properly any way. 
- // TODO: ask the highlighter to return the namespace for this kind of matches - // this would perhaps help to partially fix T115756 - $redirectTitle = - TitleHelper::makeRedirectTitle( $r, $redirectTitle, $r->namespace ); - $resultForTitle['redirectMatches'][] = $redirectTitle; + $source = $r->getSource(); + $docRedirects = []; + if ( isset( $source['redirect'] ) ) { + foreach (
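A rough Python equivalent of the lookup the patch above adds (field names inside each redirect entry are assumptions; the 'redirect' source field itself is shown in the hunk): strip the highlight markers from the matched redirect text, then search the document's stored redirects for a title match to recover its real namespace, falling back to the result's own namespace when nothing matches.

HIGHLIGHT_PRE = u'\ue000'   # placeholders; the real markers are Searcher::HIGHLIGHT_PRE/POST
HIGHLIGHT_POST = u'\ue001'

def resolve_redirect_namespace(highlighted, source, default_namespace):
    # Remove the highlighting wrappers added by elasticsearch.
    title = highlighted.replace(HIGHLIGHT_PRE, u'').replace(HIGHLIGHT_POST, u'')
    for redirect in source.get('redirect', []):
        if redirect.get('title') == title:
            return redirect.get('namespace', default_namespace), title
    return default_namespace, title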
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Enable Cirrus MLR for 4 more wikis
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/398093 ) Change subject: Enable Cirrus MLR for 4 more wikis .. Enable Cirrus MLR for 4 more wikis Change-Id: I26051ee0b2356c2dc7a9f3959a596f7aaa32028f --- M tests/cirrusTest.php M wmf-config/InitialiseSettings.php 2 files changed, 5 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/93/398093/1 diff --git a/tests/cirrusTest.php b/tests/cirrusTest.php index 2c8edb3..8a57bda 100644 --- a/tests/cirrusTest.php +++ b/tests/cirrusTest.php @@ -241,7 +241,7 @@ 'zhwiki' => [ 'zhwiki', 'wiki', [ 'wmgCirrusSearchSimilarityProfile' => 'wmf_defaults', - 'wmgCirrusSearchRescoreProfile' => 'classic', + 'wmgCirrusSearchRescoreProfile' => 'mlr-1024rs', 'wmgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder', 'wmgCirrusSearchMaxPhraseTokens' => 10, ], diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index c61f97b..c355b31 100644 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18297,7 +18297,7 @@ 'default' => 'wsum_inclinks', 'commonswiki' => 'classic_noboostlinks', 'enwiki' => 'mlr-1024rs', - // 'arwiki' => 'mlr-1024rs', + 'arwiki' => 'mlr-1024rs', 'dewiki' => 'mlr-1024rs', 'fawiki' => 'mlr-1024rs', 'fiwiki' => 'mlr-1024rs', @@ -18306,14 +18306,14 @@ 'itwiki' => 'mlr-1024rs', 'jawiki' => 'mlr-1024rs', 'kowiki' => 'mlr-1024rs', - // 'nlwiki' => 'mlr-1024rs', + 'nlwiki' => 'mlr-1024rs', 'nowiki' => 'mlr-1024rs', - // 'plwiki' => 'mlr-1024rs', + 'plwiki' => 'mlr-1024rs', 'ptwiki' => 'mlr-1024rs', 'ruwiki' => 'mlr-1024rs', 'svwiki' => 'mlr-1024rs', 'viwiki' => 'mlr-1024rs', - // 'zhwiki' => 'mlr-1024rs', + 'zhwiki' => 'mlr-1024rs', // Uses the lang tag, list of spaceless languages // (see https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Spaceless_Writing_Systems_and_Wiki-Projects) "bo" => "classic", -- To view, visit https://gerrit.wikimedia.org/r/398093 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I26051ee0b2356c2dc7a9f3959a596f7aaa32028f Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Turn off a couple search ranking models that aren't ready
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397988 ) Change subject: Turn off a couple search ranking models that arnt ready .. Turn off a couple search ranking models that arnt ready It seems a few of the wikis ranking models havn't been uploaded to the search clusters yet. Turn those off, enabling only those with live ranking models. Change-Id: I28a269c940290518467f99e6003c6eb13774b633 --- M wmf-config/InitialiseSettings.php 1 file changed, 4 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/88/397988/1 diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index ccf3f56..652fbb9 100644 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18253,7 +18253,7 @@ 'default' => 'wsum_inclinks', 'commonswiki' => 'classic_noboostlinks', 'enwiki' => 'mlr-1024rs', - 'arwiki' => 'mlr-1024rs', + //'arwiki' => 'mlr-1024rs', 'dewiki' => 'mlr-1024rs', 'fawiki' => 'mlr-1024rs', 'fiwiki' => 'mlr-1024rs', @@ -18262,14 +18262,14 @@ 'itwiki' => 'mlr-1024rs', 'jawiki' => 'mlr-1024rs', 'kowiki' => 'mlr-1024rs', - 'nlwiki' => 'mlr-1024rs', + //'nlwiki' => 'mlr-1024rs', 'nowiki' => 'mlr-1024rs', - 'plwiki' => 'mlr-1024rs', + //'plwiki' => 'mlr-1024rs', 'ptwiki' => 'mlr-1024rs', 'ruwiki' => 'mlr-1024rs', 'svwiki' => 'mlr-1024rs', 'viwiki' => 'mlr-1024rs', - 'zhwiki' => 'mlr-1024rs', + //'zhwiki' => 'mlr-1024rs', // Uses the lang tag, list of spaceless languages // (see https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Spaceless_Writing_Systems_and_Wiki-Projects) "bo" => "classic", -- To view, visit https://gerrit.wikimedia.org/r/397988 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I28a269c940290518467f99e6003c6eb13774b633 Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.12]: Turn on second mlr test for hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397985 ) Change subject: Turn on second mlr test for hewiki .. Turn on second mlr test for hewiki Re-uses the same sampling rates as last time around on hewiki. Depends on I681e1e724 being deployed first to setup the appropriate backend triggers. Bug: T182616 Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39 (cherry picked from commit 4d6d5e905de945647fc24795d172c787bbb33128) --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 7 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/85/397985/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index 7517df7..de78f11 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,7 +114,9 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = [], + validBuckets = mw.config.get( 'wgDBname' ) === 'hewiki' ? + [ 'control', 'ltr-1024', 'ltr-1024-i' ] : + [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -135,9 +137,6 @@ test: 350, subTest: null }, - // .0005 works out to ~2.7k sessions per week. - // .15 increases that to 810k per week. Giving - // 160k sessions per bucket per week. enwiki: { test: 2000, subTest: null @@ -189,6 +188,10 @@ zhwiki: { test: 100, subTest: null + }, + hewiki: { + test: 0.8112, + subTest: 0.8767 } }; if ( subTests[ dbName ] ) { -- To view, visit https://gerrit.wikimedia.org/r/397985 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: wmf/1.31.0-wmf.12 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.11]: Turn on second mlr test for hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397984 ) Change subject: Turn on second mlr test for hewiki .. Turn on second mlr test for hewiki Re-uses the same sampling rates as last time around on hewiki. Depends on I681e1e724 being deployed first to setup the appropriate backend triggers. Bug: T182616 Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39 --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 7 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/84/397984/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index 7517df7..de78f11 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,7 +114,9 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = [], + validBuckets = mw.config.get( 'wgDBname' ) === 'hewiki' ? + [ 'control', 'ltr-1024', 'ltr-1024-i' ] : + [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -135,9 +137,6 @@ test: 350, subTest: null }, - // .0005 works out to ~2.7k sessions per week. - // .15 increases that to 810k per week. Giving - // 160k sessions per bucket per week. enwiki: { test: 2000, subTest: null @@ -189,6 +188,10 @@ zhwiki: { test: 100, subTest: null + }, + hewiki: { + test: 0.8112, + subTest: 0.8767 } }; if ( subTests[ dbName ] ) { -- To view, visit https://gerrit.wikimedia.org/r/397984 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: wmf/1.31.0-wmf.11 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Turn on second mlr test for hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397973 ) Change subject: Turn on second mlr test for hewiki .. Turn on second mlr test for hewiki Resets enwiki to its default sampling of 1:2000. Re-uses the same sampling rates as last time around on hewiki. Depends on I681e1e724 being deployed first to setup the appropriate backend triggers. Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39 --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 8 insertions(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/73/397973/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index a48355d..de78f11 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,8 +114,8 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = mw.config.get( 'wgDBname' ) === 'enwiki' ? - [ 'control', 'dbn20', 'dbn20-i', 'dbn35', 'dbn35-i' ] : + validBuckets = mw.config.get( 'wgDBname' ) === 'hewiki' ? + [ 'control', 'ltr-1024', 'ltr-1024-i' ] : [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), @@ -137,12 +137,9 @@ test: 350, subTest: null }, - // .0005 works out to ~2.7k sessions per week. - // .15 increases that to 810k per week. Giving - // 160k sessions per bucket per week. enwiki: { - test: 0.15, - subTest: 0.996 + test: 2000, + subTest: null }, enwiktionary: { test: 40, @@ -191,6 +188,10 @@ zhwiki: { test: 100, subTest: null + }, + hewiki: { + test: 0.8112, + subTest: 0.8767 } }; if ( subTests[ dbName ] ) { -- To view, visit https://gerrit.wikimedia.org/r/397973 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
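Not the extension's code, but a small sketch of how bucket assignment of the kind configured above can be made sticky per session: a stable per-session uniform draw maps every sampled session into exactly one of the valid buckets. The hewiki numbers (test: 0.8112, subTest: 0.8767) are population-size knobs consumed by code not shown in this hunk, so they are not interpreted here.

def assign_bucket(session_uniform, buckets):
    """session_uniform is a stable per-session draw in [0, 1); each sampled
    session lands in exactly one bucket, uniformly across the list."""
    index = int(session_uniform * len(buckets))
    return buckets[min(index, len(buckets) - 1)]

# assign_bucket(0.42, ['control', 'ltr-1024', 'ltr-1024-i']) -> 'ltr-1024'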
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Turn on MLR for most wikis with >1% of search traffic
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397970 ) Change subject: Turn on MLR for most wikis with >1% of search traffic .. Turn on MLR for most wikis with >1% of search traffic Based on the results of our AB test showing improved search relevance for machine learned ranking, turn them on. Also updates to models trained in the last week. Change-Id: I657d80bd1fbda61d5fd84fcdd0e29383a1d857cd --- M tests/cirrusTest.php M wmf-config/InitialiseSettings.php 2 files changed, 36 insertions(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/70/397970/1 diff --git a/tests/cirrusTest.php b/tests/cirrusTest.php index 5bccd87..8a57bda 100644 --- a/tests/cirrusTest.php +++ b/tests/cirrusTest.php @@ -241,7 +241,7 @@ 'zhwiki' => [ 'zhwiki', 'wiki', [ 'wmgCirrusSearchSimilarityProfile' => 'wmf_defaults', - 'wmgCirrusSearchRescoreProfile' => 'wsum_inclinks', + 'wmgCirrusSearchRescoreProfile' => 'mlr-1024rs', 'wmgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder', 'wmgCirrusSearchMaxPhraseTokens' => 10, ], diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index 9ab29bf..ccf3f56 100644 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18253,6 +18253,23 @@ 'default' => 'wsum_inclinks', 'commonswiki' => 'classic_noboostlinks', 'enwiki' => 'mlr-1024rs', + 'arwiki' => 'mlr-1024rs', + 'dewiki' => 'mlr-1024rs', + 'fawiki' => 'mlr-1024rs', + 'fiwiki' => 'mlr-1024rs', + 'frwiki' => 'mlr-1024rs', + 'idwiki' => 'mlr-1024rs', + 'itwiki' => 'mlr-1024rs', + 'jawiki' => 'mlr-1024rs', + 'kowiki' => 'mlr-1024rs', + 'nlwiki' => 'mlr-1024rs', + 'nowiki' => 'mlr-1024rs', + 'plwiki' => 'mlr-1024rs', + 'ptwiki' => 'mlr-1024rs', + 'ruwiki' => 'mlr-1024rs', + 'svwiki' => 'mlr-1024rs', + 'viwiki' => 'mlr-1024rs', + 'zhwiki' => 'mlr-1024rs', // Uses the lang tag, list of spaceless languages // (see https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Spaceless_Writing_Systems_and_Wiki-Projects) "bo" => "classic", @@ -19595,25 +19612,25 @@ 'wmgCirrusSearchMLRModel' => [ 'default' => false, - 'enwiki' => '20171101_enwiki_v1', - 'arwiki' => '20170905_arwiki_v1', - 'fawiki' => '20170905_fawiki_v1', - 'jawiki' => '20170905_jawiki_v1', - 'svwiki' => '20170905_svwiki_v1', - 'frwiki' => '20170905_frwiki_v1', - 'itwiki' => '20170905_itwiki_v1', - 'ptwiki' => '20170905_ptwiki_v1', - 'ruwiki' => '20170905_ruwiki_v1', - 'dewiki' => '20170905_dewiki_v1', - 'fiwiki' => '20170908_fiwiki_v1', + 'enwiki' => '20171130_enwiki_v1', + 'arwiki' => '20171130_arwiki_v1', + 'fawiki' => '20171130_fawiki_v1', + 'jawiki' => '20171130_jawiki_v1', + 'svwiki' => '20171130_svwiki_v1', + 'frwiki' => '20171130_frwiki_v1', + 'itwiki' => '20171130_itwiki_v1', + 'ptwiki' => '20171130_ptwiki_v1', + 'ruwiki' => '20171130_ruwiki_v1', + 'dewiki' => '20171130_dewiki_v1', + 'fiwiki' => '20171130_fiwiki_v1', 'hewiki' => '20171130_hewiki_v1', - 'idwiki' => '20170908_idwiki_v1', - 'kowiki' => '20170908_kowiki_v1', - 'nlwiki' => '20170908_nlwiki_v1', - 'nowiki' => '20170908_nowiki_v1', - 'plwiki' => '20170908_plwiki_v1', - 'viwiki' => '20170908_viwiki_v1', - 'zhwiki' => '20170908_zhwiki_v1', + 'idwiki' => '20171130_idwiki_v1', + 'kowiki' => '20171130_kowiki_v1', + 'nlwiki' => '20171130_nlwiki_v1', + 'nowiki' => '20171130_nowiki_v1', + 'plwiki' => '20171130_plwiki_v1', + 'viwiki' => '20171130_viwiki_v1', + 'zhwiki' => '20171130_zhwiki_v1', ], 'wmgWMESearchRelevancePages' => [ -- To view, visit 
https://gerrit.wikimedia.org/r/397970 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I657d80bd1fbda61d5fd84fcdd0e29383a1d857cd Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] wikimedia...relevanceForge[master]: Add sanity checks for more languages
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397968 ) Change subject: Add sanity checks for more languages .. Add sanity checks for more languages I'm not completely sure these are very good. I didn't really try and figure out what any of the pages were, just chose random pages from the bottom 2/3 of the top 100 most popular pages (by page views) of the last month. Then chose sometimes a redirect and sometimes the primary title as the query string. We can probably iterate on these if we get any complaints about search results that users expect at the top but are not. Change-Id: I39b47b47f03d02907f5d291050465c14ae596287 --- M sanityCheck.py A sanityCheck/arwiki.json A sanityCheck/dewiki.json A sanityCheck/fawiki.json A sanityCheck/fiwiki.json A sanityCheck/frwiki.json A sanityCheck/hewiki.json A sanityCheck/idwiki.json A sanityCheck/itwiki.json A sanityCheck/jawiki.json A sanityCheck/kowiki.json A sanityCheck/nlwiki.json A sanityCheck/nowiki.json A sanityCheck/plwiki.json A sanityCheck/ptwiki.json A sanityCheck/ruwiki.json A sanityCheck/svwiki.json A sanityCheck/viwiki.json A sanityCheck/zhwiki.json 19 files changed, 256 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge refs/changes/68/397968/1 diff --git a/sanityCheck.py b/sanityCheck.py old mode 100644 new mode 100755 index 19aae72..341f476 --- a/sanityCheck.py +++ b/sanityCheck.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python from __future__ import print_function import argparse import json @@ -37,6 +38,9 @@ print('') else: print("PASSED\n") + +print("OVERALL: %s" % ("PASSED" if ok else "FAILED")) + return ok diff --git a/sanityCheck/arwiki.json b/sanityCheck/arwiki.json new file mode 100644 index 000..1f27498 --- /dev/null +++ b/sanityCheck/arwiki.json @@ -0,0 +1,11 @@ +{ +"api": "https://ar.wikipedia.org/w/api.php;, +"queries": { +"ابن_الهيثم": [ +"ابن الهيثم" +], +"مؤسسة محمد الخامس": [ +"مؤسسة محمد الخامس للتضامن" +] +} +} diff --git a/sanityCheck/dewiki.json b/sanityCheck/dewiki.json new file mode 100644 index 000..4480dc3 --- /dev/null +++ b/sanityCheck/dewiki.json @@ -0,0 +1,14 @@ +{ +"api": "https://de.wikipedia.org/w/api.php;, +"queries": { +"Bundestagswahl 2017": [ +"Bundestagswahl 2017" +], +"FIFA WM 2018": [ +"Fußball-Weltmeisterschaft 2018" +], +"Rechtswesen Österreichs": [ +"Österreich" +] +} +} diff --git a/sanityCheck/fawiki.json b/sanityCheck/fawiki.json new file mode 100644 index 000..7397c81 --- /dev/null +++ b/sanityCheck/fawiki.json @@ -0,0 +1,14 @@ +{ +"api": "https://fa.wikipedia.org/w/api.php;, +"queries": { +"سید محمدحسین شهریار": [ +"سید محمدحسین شهریار" +], +"شادمهر عقيلي": [ +"شادمهر عقیلی" +], +"ساكر": [ +"آمیزش جنسی دهانی" +] +} +} diff --git a/sanityCheck/fiwiki.json b/sanityCheck/fiwiki.json new file mode 100644 index 000..2be778b --- /dev/null +++ b/sanityCheck/fiwiki.json @@ -0,0 +1,14 @@ +{ +"api": "https://fi.wikipedia.org/w/api.php;, +"queries": { +"Happamuus": [ +"Happamuus" +], +"Veljessota": [ +"Suomen sisällissota" +], +"Mannerheim": [ +"Carl Gustaf Emil Mannerheim" +] +} +} diff --git a/sanityCheck/frwiki.json b/sanityCheck/frwiki.json new file mode 100644 index 000..4528194 --- /dev/null +++ b/sanityCheck/frwiki.json @@ -0,0 +1,14 @@ +{ +"api": "https://fr.wikipedia.org/w/api.php;, +"queries": { +"U Arena": [ +"U Arena" +], +"DALS": [ +"Danse avec les stars" +], +"Tableau périodique des éléments": [ +"Tableau périodique des éléments" +] +} +} diff --git a/sanityCheck/hewiki.json b/sanityCheck/hewiki.json new 
file mode 100644 index 000..c3a014d --- /dev/null +++ b/sanityCheck/hewiki.json @@ -0,0 +1,14 @@ +{ +"api": "https://he.wikipedia.org/w/api.php;, +"queries": { +"שלמה ארצי": [ +"שלמה ארצי" +], +"רחל בלובשטיין סלע": [ +"רחל המשוררת" +], +"Jerusalem": [ +"ירושלים" +] +} +} diff --git a/sanityCheck/idwiki.json b/sanityCheck/idwiki.json new file mode 100644 index 000..5e9380e --- /dev/null +++ b/sanityCheck/idwiki.json @@ -0,0 +1,14 @@ +{ +"api": "https://id.wikipedia.org/w/api.php;, +"queries": { +"Pemerintahan daerah di Indonesia": [ +"Pemerintahan daerah di Indonesia" +], +"Undang-undang dasar": [ +"Konstitusi" +], +"Srivijaya": [ +"Sriwijaya" +
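A sketch, under assumptions, of the kind of check these json fixtures feed (relevanceForge's sanityCheck.py is the real consumer and may differ in how it ranks a pass): for each query, ask the wiki's search API for the top few full text results and verify every expected title appears among them.

import json
import requests

def check(config_path, top_n=3):
    with open(config_path) as f:
        config = json.load(f)
    ok = True
    for query, expected_titles in config['queries'].items():
        resp = requests.get(config['api'], params={
            'action': 'query', 'list': 'search', 'srsearch': query,
            'srlimit': top_n, 'format': 'json',
        }).json()
        found = [hit['title'] for hit in resp['query']['search']]
        for title in expected_titles:
            if title not in found:
                ok = False
                print('FAILED %r: expected %r in top %d, got %r' % (query, title, top_n, found))
    print('OVERALL: %s' % ('PASSED' if ok else 'FAILED'))
    return ok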
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: [WIP] Additional integration test features for cindy
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397735 ) Change subject: [WIP] Additional integration test features for cindy .. [WIP] Additional integration test features for cindy WIP because the full integration with the barry script to automate running hasn't been finished, so there is probably still more to do. * Cindy didn't like that we were using npm dependencies that weren't explicitly called out. Perhaps due to a difference in npm version of something. Regardless it's good practice to call out our dependencies explicitly. * Add a --tag=... option to request only specific tags are run. This can, for example, specify an or of tags with '@foo or @bar'. All tests except those marked frozen can be selected with "not @frozen". See cucumber-tag-expressions lib for more details. * Add a configuration file specifically for mwv in labs that generates appropriate urls. Triggered by setting MWV_LABS_HOSTNAME environment variable to the hostname of the machine. ex: MWV_LABS_HOTSNAME=cirrus-browser-bot * Increase max parallelism of chrome to 8. This will still be limited by the top level maxInstances to 1 by default (necessary when mixing frozen index tests with the others). * Add a grunt cli parameter to set parallelism from the command line. Change-Id: I4d8837b2c56b018f682429756a2ba6efd106969d --- M Gruntfile.js M package.json M tests/integration/config/wdio.conf.js A tests/integration/config/wdio.conf.mwvlabs.js A tests/integration/log/.gitkeep 5 files changed, 50 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/35/397735/1 diff --git a/Gruntfile.js b/Gruntfile.js index b3841f7..16637ca 100644 --- a/Gruntfile.js +++ b/Gruntfile.js @@ -19,6 +19,8 @@ if ( process.env.JENKINS_HOME ) { WebdriverIOconfigFile = './tests/integration/config/wdio.conf.jenkins.js'; + } else if ( process.env.MWV_LABS_HOSTNAME ) { + WebdriverIOconfigFile = './tests/integration/config/wdio.conf.mwvlabs.js'; } else { WebdriverIOconfigFile = './tests/integration/config/wdio.conf.js'; } @@ -59,6 +61,15 @@ webdriver: { test: { configFile: WebdriverIOconfigFile, + cucumberOpts: { + tagExpression: ( () => { + return grunt.option( 'tags' ); + )() + }, + maxInstances: ( () => { + let max = grunt.option( 'maxInstances' ); + return max ? parseInt( max, 10 ) : 1; + } )(), spec: ( () => { let spec = grunt.option( 'spec' ); if ( !spec ) { diff --git a/package.json b/package.json index cf4a6b3..b6855f5 100644 --- a/package.json +++ b/package.json @@ -8,8 +8,10 @@ "selenium": "killall -0 chromedriver 2>/dev/null || chromedriver --url-base=/wd/hub --port= & grunt webdriver:test; killall chromedriver" }, "devDependencies": { +"bluebird": "3.5.1", "chai": "^4.1.2", "cucumber": "^3.0.1", +"deepmerge": "2.0.1", "grunt": "1.0.1", "grunt-banana-checker": "0.5.0", "grunt-contrib-jshint": "1.0.0", @@ -21,8 +23,11 @@ "stylelint-config-wikimedia": "0.4.1", "wdio-cucumber-framework": "^1.0.1", "webdriverio": "^4.8.0", +"wdio-spec-reporter": "1.2.0", +"wdio-junit-reporter": "1.1.3", "restify": "^6.3.4", "request": "^2.83.0", -"request-promise-native": "^1.0.5" +"request-promise-native": "^1.0.5", +"semlog": "0.6.10" } } diff --git a/tests/integration/config/wdio.conf.js b/tests/integration/config/wdio.conf.js index 82601f7..89bb9a3 100644 --- a/tests/integration/config/wdio.conf.js +++ b/tests/integration/config/wdio.conf.js @@ -124,7 +124,7 @@ // maxInstances can get overwritten per capability. 
So if you have an in-house Selenium // grid with only 5 firefox instances available you can make sure that not more than // 5 instances get started at a time. - maxInstances: 1, + maxInstances: 8, // browserName: 'chrome', // Since Chrome v57 https://bugs.chromium.org/p/chromedriver/issues/detail?id=1625 diff --git a/tests/integration/config/wdio.conf.mwvlabs.js b/tests/integration/config/wdio.conf.mwvlabs.js new file mode 100644 index 000..7b3a7a7 --- /dev/null +++ b/tests/integration/config/wdio.conf.mwvlabs.js @@ -0,0 +1,32 @@ +/*jshint esversion: 6, node:true */ + +/* eslint
[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Setup MLR AB test for hewiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397582 ) Change subject: Setup MLR AB test for hewiki .. Setup MLR AB test for hewiki Last time around we trained the hewiki model on an analysis chain that was different than the one used when running the test so the results were invalid. Re-run the test with a new model trained against the new analysis chain. This reverts commit 9bfa5214657476b410399ecb90a62a6b8afd3196. Change-Id: I681e1e724201337d73867e518fe806cbf5f89636 --- M wmf-config/CirrusSearch-common.php M wmf-config/InitialiseSettings.php 2 files changed, 13 insertions(+), 60 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/82/397582/1 diff --git a/wmf-config/CirrusSearch-common.php b/wmf-config/CirrusSearch-common.php index 5ead601..35ae235 100644 --- a/wmf-config/CirrusSearch-common.php +++ b/wmf-config/CirrusSearch-common.php @@ -296,40 +296,7 @@ ], ], ]; - - $wgCirrusSearchRescoreProfiles['dbn20'] = $wgCirrusSearchRescoreProfiles['mlr-1024rs']; - $wgCirrusSearchRescoreProfiles['dbn20']['rescore'][2]['model'] = 'dbn20_enwiki_v1'; - - $wgCirrusSearchRescoreProfiles['dbn35'] = $wgCirrusSearchRescoreProfiles['mlr-1024rs']; - $wgCirrusSearchRescoreProfiles['dbn35']['rescore'][2]['model'] = 'dbn35_enwiki_v1'; - } - -# needed for recall A/B test (T177502) -$wgCirrusSearchFullTextQueryBuilderProfiles['rec_3t_80_66'] = $wgCirrusSearchFullTextQueryBuilderProfiles['perfield_builder']; -$wgCirrusSearchFullTextQueryBuilderProfiles['rec_3t_80_66']['settings']['filter'] = [ - 'type' => 'default', - 'settings' => [ - 'all' => [ - 'minimum_should_match' => '3<80%' - ], - 'all.plain' => [ - 'minimum_should_match' => '3<66%' - ], - ] -]; -$wgCirrusSearchFullTextQueryBuilderProfiles['rec_4t_80_66'] = $wgCirrusSearchFullTextQueryBuilderProfiles['perfield_builder']; -$wgCirrusSearchFullTextQueryBuilderProfiles['rec_4t_80_66']['settings']['filter'] = [ - 'type' => 'default', - 'settings' => [ - 'all' => [ - 'minimum_should_match' => '4<80%' - ], - 'all.plain' => [ - 'minimum_should_match' => '4<66%' - ], - ] -]; $wgCirrusSearchUserTesting = $wmgCirrusSearchUserTesting; diff --git a/wmf-config/InitialiseSettings.php b/wmf-config/InitialiseSettings.php index 8da9e21..9ab29bf 100644 --- a/wmf-config/InitialiseSettings.php +++ b/wmf-config/InitialiseSettings.php @@ -18623,44 +18623,30 @@ 'wmgCirrusSearchUserTesting' => [ 'default' => [], - // DBN sizing AB test - 'enwiki' => [ - 'dbn_sizing' => [ + 'hewiki' => [ + 'ltr' => [ + 'globals' => [], 'buckets' => [ 'control' => [ 'trigger' => 'control', ], - 'dbn20' => [ - 'trigger' => 'dbn20', + 'ltr-1024' => [ + 'trigger' => 'ltr-1024', 'globals' => [ - 'wgCirrusSearchRescoreProfile' => 'dbn20', + 'wgCirrusSearchRescoreProfile' => 'mlr-1024rs', ] ], - 'dbn20-i' => [ - 'trigger' => 'dbn20-i', + 'ltr-1024-i' => [ + 'trigger' => 'ltr-1024-i', 'globals' => [ 'wgCirrusSearchInterleaveConfig' => [ - 'CirrusSearchRescoreProfile' => 'dbn20', + 'CirrusSearchRescoreProfile' => 'mlr-1024rs' ], - ] - ], - 'dbn35' => [ - 'trigger' => 'dbn35', - 'globals' => [ - 'wgCirrusSearchRescoreProfile' => 'dbn35', - ] - ], - 'dbn35-i' => [ - 'trigger' => 'dbn35-i', - 'globals' => [ -
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[wmf/1.31.0-wmf.11]: Simple hack to override mlr model from query string
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/397569 ) Change subject: Simple hack to override mlr model from query string .. Simple hack to override mlr model from query string This is a rather naive attempt to allow us to do a sanity check on an MLR model before we roll it out. With the new cirrusMLRModel query parameter we can upload a model to elasticsearch and try a couple of queries by specifying the model name before we ship a config change to enable the model for everyone. Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a (cherry picked from commit 8a51602c1e92f32e736a6f101986fd79f9fa0bad) --- M includes/Search/RescoreBuilders.php 1 file changed, 11 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/69/397569/1 diff --git a/includes/Search/RescoreBuilders.php b/includes/Search/RescoreBuilders.php index 76a4fac..3119a01 100644 --- a/includes/Search/RescoreBuilders.php +++ b/includes/Search/RescoreBuilders.php @@ -127,6 +127,17 @@ * @return AbstractQuery */ private function buildLtrQuery( $model ) { + // This is a bit fragile, and makes the bold assumption + // only a single level of rescore will be used. This is + // strictly for debugging/testing before shipping a model + // live so shouldn't be a big deal. + $override = \RequestContext::getMain() + ->getRequest() + ->getVal( 'cirrusMLRModel' ); + if ( $override ) { + $model = $override; + } + $bool = new \Elastica\Query\BoolQuery(); // the ltr query can return negative scores, which mucks with elasticsearch // sorting as that will put these results below documents set to 0. Fix -- To view, visit https://gerrit.wikimedia.org/r/397569 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: wmf/1.31.0-wmf.11 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
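For illustration, once a model has been uploaded to the elasticsearch LTR plugin, this override would be exercised by tacking the parameter onto a normal search request, e.g. Special:Search?search=some+query&cirrusMLRModel=enwiki_test_v1 (the model name here is purely hypothetical); without the parameter the model from the configured rescore profile is used exactly as before.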
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port remaining dump_* features to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/396553 ) Change subject: Port remaining dump_* features to nodejs .. Port remaining dump_* features to nodejs I changed these up a bit to make them simpler. Basically each one now has a specific Then( ... ) test for it. The previous tests were just string includes against the stringified json, this seemed perhaps better at verifying we have somewhat reasonable output. Change-Id: I4d2a0789ae17e880304cc42baeb58735e9b2c66b --- A tests/integration/features/dump_config.feature A tests/integration/features/dump_mapping.feature A tests/integration/features/dump_query.feature A tests/integration/features/dump_settings.feature M tests/integration/features/step_definitions/page_steps.js 5 files changed, 120 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/53/396553/1 diff --git a/tests/integration/features/dump_config.feature b/tests/integration/features/dump_config.feature new file mode 100644 index 000..fc85cf1 --- /dev/null +++ b/tests/integration/features/dump_config.feature @@ -0,0 +1,8 @@ +@clean @dump_config @phantomjs +Feature: You can dump CirrusSearch's configuration + Scenario: You can dump CirrusSearch's configuration +When I dump the cirrus config +Then the config dump contains CirrusSearchPhraseSuggestMaxErrors + And the config dump contains CirrusSearchNamespaceWeights + And the config dump text does not contain Password + And the config dump text does not contain password diff --git a/tests/integration/features/dump_mapping.feature b/tests/integration/features/dump_mapping.feature new file mode 100644 index 000..a0f1776 --- /dev/null +++ b/tests/integration/features/dump_mapping.feature @@ -0,0 +1,5 @@ +@clean @dump_mapping @phantomjs +Feature: You can dump the mapping CirrusSearch set on Elasticsearch's indexes + Scenario: You can dump the mapping CirrusSearch set on Elasticsearch's indexes +When I dump the cirrus mapping +Then A valid mapping dump is produced diff --git a/tests/integration/features/dump_query.feature b/tests/integration/features/dump_query.feature new file mode 100644 index 000..49aa588 --- /dev/null +++ b/tests/integration/features/dump_query.feature @@ -0,0 +1,5 @@ +@clean @dump_quer @phantomjs +Feature: Can dump the query syntax + Scenario: Can dump the query syntax +Given I request a query dump for main page + Then A valid query dump for main page is produced diff --git a/tests/integration/features/dump_settings.feature b/tests/integration/features/dump_settings.feature new file mode 100644 index 000..d0d9c43 --- /dev/null +++ b/tests/integration/features/dump_settings.feature @@ -0,0 +1,5 @@ +@clean @dump_settings @phantomjs +Feature: You can dump the settings CirrusSearch set on Elasticsearch's indexes + Scenario: You can dump the settings CirrusSearch set on Elasticsearch's indexes +When I dump the cirrus settings +Then A valid settings dump is produced diff --git a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index bb3eec6..dd1a56d 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ b/tests/integration/features/step_definitions/page_steps.js @@ -496,4 +496,101 @@ } ); } ); } ); + + When( /^I dump the cirrus config$/, Promise.coroutine( function* () { + let client = yield this.onWiki(); + try { + let response = yield client.request( { + action: 'cirrus-config-dump', + } ); + this.setApiResponse( response ); + } 
catch ( err ) { + this.setApiError( err ); + } + } ) ); + + Then( /^the config dump contains (.+)$/, function ( key ) { + return withApi( this, () => { + expect( this.apiResponse ).to.have.any.keys( key ); + } ); + } ); + + Then( /^the config dump text does not contain (.+)$/, function ( key ) { + return withApi( this, () => { + let text = JSON.stringify( this.apiResponse ); + expect( text ).to.not.include( key ); + } ); + } ); + + When( /^I dump the cirrus mapping$/, Promise.coroutine( function* () { + let client = yield this.onWiki(); + try { + let response = yield client.request( { + action: 'cirrus-mapping-dump', + } ); + this.setApiResponse( response ); + } catch ( err ) { + this.setApiError( err ); + } + } ) ); +
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port update_weight_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/396552 ) Change subject: Port update_weight_api.feature to nodejs .. Port update_weight_api.feature to nodejs Replace 'within ...' calls with a new step that waits for incoming_links to be appropriately updated. This makes our expectations explicit instead of waiting on a secondary thing that hopefully updates based on the first. Additionally to save some time (this test is pretty slow) skip waiting on most of the edits and just do the final wait for incoming_links. I'm not really sure these tests even really need to be performing searches, it looks like mostly they are checking that incoming links are updated and counted appropriately. While working up this patch i noticed multiple steps are all now utilizing a 'wait for elasticsearch document to have some value' type step so moved the implementation into stepHelpers and adjusted the uses to all have similar wording and use the same implementation with different check functions. Change-Id: I20b13236e2139026d542de5e376392d6c5a67e47 --- M tests/integration/features/commons.feature M tests/integration/features/step_definitions/page_step_helpers.js M tests/integration/features/step_definitions/page_steps.js M tests/integration/features/update_redirect_api.feature A tests/integration/features/update_weight_api.feature 5 files changed, 142 insertions(+), 43 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/52/396552/1 diff --git a/tests/integration/features/commons.feature b/tests/integration/features/commons.feature index b36867b..764f74d 100644 --- a/tests/integration/features/commons.feature +++ b/tests/integration/features/commons.feature @@ -5,7 +5,7 @@ Then File:OnCommons.svg is the first api search result Scenario: A file that exists on commons and the local wiki returns the local result -When within 20 seconds File:DuplicatedLocally.svg has cirrustestwiki as local_sites_with_dupe +When I wait for File:DuplicatedLocally.svg on commons to include cirrustestwiki in local_sites_with_dupe Then I api search in namespace 6 for duplicated Then File:DuplicatedLocally.svg is the first api search result And Locally stored file *duplicated* on commons is the highlighted snippet of the first api search result diff --git a/tests/integration/features/step_definitions/page_step_helpers.js b/tests/integration/features/step_definitions/page_step_helpers.js index b5f39cc..0979e90 100644 --- a/tests/integration/features/step_definitions/page_step_helpers.js +++ b/tests/integration/features/step_definitions/page_step_helpers.js @@ -171,6 +171,29 @@ } ).call( this ); } + waitForDocument( title, check ) { + return Promise.coroutine( function* () { + let timeoutMs = 2; + let start = new Date(); + let lastError; + while ( true ) { + let doc = yield this.getCirrusIndexedContent( title ); + if ( doc.cirrusdoc && doc.cirrusdoc.length > 0 ) { + try { + check( doc.cirrusdoc[0] ); + break; + } catch ( err ) { + lastError = err; + } + } + if ( new Date() - start >= timeoutMs ) { + throw lastError || new Error( `Timeout out waiting for ${title}` ); + } + yield this.waitForMs( 200 ); + } + } ).call( this ); + } + waitForMs( ms ) { return new Promise( ( resolve ) => setTimeout( resolve, ms ) ); } diff --git a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index f3d7a83..bb3eec6 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ 
b/tests/integration/features/step_definitions/page_steps.js @@ -386,27 +386,6 @@ return stepHelpers.uploadFile( title, fileName, description ); } ); - Then(/^within (\d+) seconds (.+) has (.+) as local_sites_with_dupe$/, function (seconds, title, value) { - return Promise.coroutine( function* () { - let stepHelpers = this.stepHelpers.onWiki( 'commons' ); - let time = new Date(); - let found = false; - main: do { - let page = yield stepHelpers.getCirrusIndexedContent( title ); -
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port update_redirect_loop.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/396472 ) Change subject: port update_redirect_loop.feature to nodejs .. port update_redirect_loop.feature to nodejs Almost a straight copy from ruby, just changed the steps to not use the new code that waits for updates to hit elasticsearch. This is necessary because we don't ever index redirect loops. Change-Id: I8463ef4e4aef9e272554bcf8bc316c4f3df486a8 --- A tests/integration/features/update_redirect_loop.feature 1 file changed, 12 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/72/396472/1 diff --git a/tests/integration/features/update_redirect_loop.feature b/tests/integration/features/update_redirect_loop.feature new file mode 100644 index 000..893646d --- /dev/null +++ b/tests/integration/features/update_redirect_loop.feature @@ -0,0 +1,12 @@ +@clean @phantomjs @update @redirect_loop +Feature: Search backend updates containing redirect loops + Scenario: Pages that redirect to themself don't throw errors +Then I don't wait for a page named IAmABad RedirectSelf%{epoch} to exist with contents #REDIRECT [[IAmABad RedirectSelf%{epoch}]] + + # The actual creation of the pages will fails if redirect loops fails + Scenario: Pages that form a redirect chain don't throw errors +When I don't wait for a page named IAmABad RedirectChain%{epoch} A to exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} B]] + And I don't wait for a page named IAmABad RedirectChain%{epoch} B to exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} C]] + And I don't wait for a page named IAmABad RedirectChain%{epoch} C to exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} D]] +Then I don't wait for a page named IAmABad RedirectChain%{epoch} D to exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} A]] + And I don't wait for a page named IAmABad RedirectChain%{epoch} B to exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} D]] -- To view, visit https://gerrit.wikimedia.org/r/396472 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I8463ef4e4aef9e272554bcf8bc316c4f3df486a8 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port update_redirect_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/396471 ) Change subject: Port update_redirect_api.feature to nodejs .. Port update_redirect_api.feature to nodejs Mostly a straight copy from ruby. While most of the 'within ...' steps could be removed unfortunately the one that waits for a redirect converted to not a redirect couldn't be removed. This has to wait for the previously redirected-to page to be updated. Rather than indirectly waiting on search results added a new step that directly waits for the redirect to be removed from the previously redirected to document. Change-Id: I1febfdad9eac1f8e3577b545274378dd50ae5de0 --- M tests/integration/features/step_definitions/page_step_helpers.js M tests/integration/features/step_definitions/page_steps.js A tests/integration/features/update_redirect_api.feature 3 files changed, 44 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/71/396471/1 diff --git a/tests/integration/features/step_definitions/page_step_helpers.js b/tests/integration/features/step_definitions/page_step_helpers.js index 7281da6..b5f39cc 100644 --- a/tests/integration/features/step_definitions/page_step_helpers.js +++ b/tests/integration/features/step_definitions/page_step_helpers.js @@ -296,7 +296,8 @@ // Is the requested page and the returned document dont have the same // title that means we have a redirect. In that case the revision id // wont match, but the backend api ensures the redirect is now contained - // within the document. + // within the document. Unfortunately if the page was just edited to + // now be a redirect anymore this is wrong ... if ( isOk && revisionId && content[0].source.title === page.title ) { isOk = parseInt( content[0].source.version, 10 ) === revisionId; } diff --git a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index 9db07e3..522c8ac 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ b/tests/integration/features/step_definitions/page_steps.js @@ -484,4 +484,24 @@ let found = snippets.reduce( ( a, b ) => a || b.indexOf( within ) > -1, false ); expect( found ).to.equal( !should_not ); } ); + + Then( /^I wait for (.+) to not be included in the redirects of (.+)$/, function ( source, redirect ) { + return Promise.coroutine( function* () { + let timeoutMs = 2; + let start = new Date(); + while (true) { + let doc = yield this.stepHelpers.getCirrusIndexedContent( redirect ); + if ( doc.cirrusdoc.length > 0 ) { + let exists = doc.cirrusdoc[0].source.redirect.reduce( ( a, b ) => a || b.title === source, false ); + if ( !exists ) { + break; + } + } + if (new Date() - start >= timeoutMs) { + throw new Error( `Timed out waiting for ${source} to not exist in document of ${redirect}` ); + } + yield this.stepHelpers.waitForMs( 200 ); + } + } ).call( this ); + } ); }); diff --git a/tests/integration/features/update_redirect_api.feature b/tests/integration/features/update_redirect_api.feature new file mode 100644 index 000..50171ea --- /dev/null +++ b/tests/integration/features/update_redirect_api.feature @@ -0,0 +1,22 @@ +@clean @api @redirect @update +Feature: Updating a page from or to a redirect + Scenario: Turning a page into a redirect removes it from the search index +Given a page named RedirectTarget exists + When a page named ToBeRedirect%{epoch} exists + And I api search for ToBeRedirect%{epoch} + Then ToBeRedirect%{epoch} is the first api search 
result + When a page named ToBeRedirect%{epoch} exists with contents #REDIRECT [[RedirectTarget]] + And I api search for ToBeRedirect%{epoch} + Then RedirectTarget is the first api search result + And ToBeRedirect%{epoch} is not in the api search results + + Scenario: Turning a page from a redirect to a regular page puts it in the index +Given a page named RedirectTarget exists + When a page named StartsAsRedirect%{epoch} exists with contents #REDIRECT [[RedirectTarget]] + And I api search for StartsAsRedirect%{epoch} +
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port update_non_existent_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/396470 ) Change subject: port update_non_existent_api.feature to nodejs .. port update_non_existent_api.feature to nodejs Change-Id: I04b7f5a75c05aa8c3ff59d081ffe085c49d0a601 --- M tests/integration/features/step_definitions/page_steps.js A tests/integration/features/update_non_existent_api.feature 2 files changed, 76 insertions(+), 24 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/70/396470/1 diff --git a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index a22f2f8..9db07e3 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ b/tests/integration/features/step_definitions/page_steps.js @@ -43,28 +43,32 @@ // TODO: We might need to share this epoch between wdio runner processes? const epoch = +new Date(); const searchVars = {}; -defineParameterType( { - // Quite annoyingly this isn't a regexp to match in the step name, rather - // it is a string literal to match a capture group of the step definition. - // So basically this only replaces epochs in parameters defined as (.+). - regexp: /.+/, - transformer: (s) => { - if ( s === undefined ) { - return s; - } - if ( s === 'the empty string' ) { - return ''; - } - s = s.replace( /%{epoch}/g, epoch ); - s = s.replace( /%ideographic_whitspace%/g, "\u3000" ); +// These expressions are string matches against capture groups in steps. Yes, .+) +// is intentional. Cucumber's matching of capture groups is broken so (?:foo (.+)) +// has to be matched as .+). That broken matching also means (?:foo (.+) bar) +// would have to be matched as '.+) bar' but we don't bother. +let expressions = [ '.+', '.+?', '.+)' ]; +for ( let expression of expressions ) { + defineParameterType( { + regexp: expression, + transformer: (s) => { + if ( s === undefined ) { + return s; + } + if ( s === 'the empty string' ) { + return ''; + } + s = s.replace( /%{epoch}/g, epoch ); + s = s.replace( /%ideographic_whitspace%/g, "\u3000" ); - // Replace %{\u}% with the appropriate unicode code point - s = s.replace(/%\{\\u([\dA-Fa-f]{4,6})\}%/g, ( match, codepoint ) => JSON.parse( `"\\u${codepoint}"` ) ); - s = Object.keys(searchVars).reduce( ( str, pattern ) => str.replace( pattern, searchVars[pattern] ), s ); - return s.replace( /%{exact:([^}]*)}/g, '$1' ); - }, - name: 'replacements', -} ); + // Replace %{\u}% with the appropriate unicode code point + s = s.replace(/%\{\\u([\dA-Fa-f]{4,6})\}%/g, ( match, codepoint ) => JSON.parse( `"\\u${codepoint}"` ) ); + s = Object.keys(searchVars).reduce( ( str, pattern ) => str.replace( pattern, searchVars[pattern] ), s ); + return s.replace( /%{exact:([^}]*)}/g, '$1' ); + }, + typeName: 'replacements_' + expression , + } ); +} defineSupportCode( function( {Given, When, Then} ) { @@ -244,19 +248,19 @@ return stepHelpers.searchFor( search, options ); } ); - Then( /there are no errors reported by the api/, function () { + Then( /^there are no errors reported by the api$/, function () { return withApi( this, () => { expect( this.apiError ).to.equal(undefined); } ); } ); - Then( /there is an api search result/, function () { + Then( /^there is an api search result$/, function () { return withApi( this, () => { expect( this.apiResponse.query.search ).to.not.have.lengthOf( 0 ); } ); } ); - Then( /there are no api search results/, function () { + Then( /^there are no api search results$/, function () { return withApi( this, 
() => { expect( this.apiResponse.query.search ).to.have.lengthOf( 0 ); } ); @@ -474,4 +478,10 @@ Then ( /^the page text contains (.+)$/, function( text ) { expect(browser.getSource()).to.contains(text); } ); + + Then( /^there are( no)? api search results with (.+) in the data$/, function ( should_not, within ) { + let snippets = this.apiResponse.query.search.map( ( result ) => result.snippet ); + let found = snippets.reduce( ( a, b ) => a
[MediaWiki-commits] [Gerrit] search/xgboost[master]: Specialize single-node training
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/396098 ) Change subject: Specialize single-node training .. Specialize single-node training XGBoost has a faster training method, tree_method -> hist, which is currently not implemented for distributed training. We actually train quite a few models on a single node (but with many models being trained in parallel) so it would be nice to be able to utilize this where possible. This is perhaps not implemented in the most optimal way if we were going to upstream the patch, but upstreaming is unlikely as upstream does not support training multiple models in parallel (we do through a custom hack). Rather than refactoring existing code this mostly adds new functions for specialized single-node training so that pulling in upstream changes will be as pain free as possible. Change-Id: I2760127edd2c3c4ad26abd23e621059ac9609950 --- M jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala M jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala 2 files changed, 87 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/xgboost refs/changes/98/396098/1 diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index ea18ff2..bc052e2 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -105,6 +105,57 @@ } } + private[spark] def buildLocalBoosters( + data: RDD[XGBLabeledPoint], + params: Map[String, Any], + round: Int, + obj: ObjectiveTrait, + eval: EvalTrait, + useExternalMemory: Boolean, + missing: Float): RDD[Array[Byte]] = { +val partitionedData = if (data.getNumPartitions != 1) { + logger.info(s"repartitioning training set to 1 partitions") + data.coalesce(1) +} else { + data +} +val partitionedBaseMargin = partitionedData.map(_.baseMargin) +val appName = partitionedData.context.appName +partitionedData.zipPartitions(partitionedBaseMargin) { (labeledPoints, baseMargins) => + if (labeledPoints.isEmpty) { +throw new XGBoostError( + s"detected an empty partition in the training data, partition ID:" + +s" ${TaskContext.getPartitionId()}") + } + val cacheFileName = if (useExternalMemory) { +s"$appName-${TaskContext.get().stageId()}-" + + s"dtrain_cache-${TaskContext.getPartitionId()}" + } else { +null + } + + // Yes it's odd to access this but not do anything. We are ensuring the lazily + // initialized resource monitor is setup before we enter training. 
+ monitor + + val watches = Watches(params, +fromDenseToSparseLabeledPoints(labeledPoints, missing), +fromBaseMarginsToArray(baseMargins), cacheFileName) + try { +val numEarlyStoppingRounds = params.get("numEarlyStoppingRounds") + .map(_.toString.toInt).getOrElse(0) +val booster = SXGBoost.train(watches.train, params, round, + watches = watches.toMap, obj = obj, eval = eval, + earlyStoppingRound = numEarlyStoppingRounds) +val bytes = booster.toByteArray +booster.dispose +Iterator(bytes) + } finally { +watches.delete() + } +} + } + private[spark] def buildDistributedBoosters( data: RDD[XGBLabeledPoint], params: Map[String, Any], @@ -302,8 +353,40 @@ val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, features) => features.asXGB.copy(label = label.toFloat) } -trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval, - useExternalMemory, missing) +if (nWorkers == 1) { + trainLocal(xgbTrainingData, params, round, obj, eval, useExternalMemory, missing) +} else { + trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval, +useExternalMemory, missing) +} + } + + @throws(classOf[XGBoostError]) + private[spark] def trainLocal( + trainingData: RDD[XGBLabeledPoint], + params: Map[String, Any], + round: Int, + obj: ObjectiveTrait = null, + eval: EvalTrait = null, + useExternalMemory: Boolean = false, + missing: Float = Float.NaN): XGBoostModel = { +if (obj != null) { + require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," + +" you have to specify the objective type as classification or regression with a" + +" customized objective function") +} +val overriddenParams = overrideParamsAccordingToTaskCPUs(params, trainingData.sparkContext) +
[MediaWiki-commits] [Gerrit] operations...cdh[master]: Enable more accurate smaps based rss checking
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395923 ) Change subject: Enable more accurate smaps based rss checking .. Enable more accurate smaps based rss checking Training xgboost models in the hadoop cluster is running into some issues where yarn regularly kills containers, but only some of them. Based on review of yarn's code it appears this is because we are using the default RSS calculation, which is documented as less accurate. Specifically it includes pages that the kernel is free to evict, and double (triple, etc.) counts read-only memory shared by many processes. A custom implementation of the more accurate algorithm was injected into a background task while training mlr models, and it showed a constant memory usage. Enabling this will allow us to stop over-allocating memory to account for this discrepancy, and require 250GB less memory for the 9-hour training process. Bug: T182276 Change-Id: I0f8223db4d4abc26eb9d04ff106b7e49602f504e --- M templates/hadoop/yarn-site.xml.erb 1 file changed, 6 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet/cdh refs/changes/23/395923/1 diff --git a/templates/hadoop/yarn-site.xml.erb b/templates/hadoop/yarn-site.xml.erb index 5657577..913a028 100644 --- a/templates/hadoop/yarn-site.xml.erb +++ b/templates/hadoop/yarn-site.xml.erb @@ -169,6 +169,12 @@ org.apache.spark.network.yarn.YarnShuffleService + +RSS usage of a process computed via /proc/pid/stat is not very accurate as it includes shared pages of a process. /proc/pid/smaps provides useful information like Private_Dirty, Private_Clean, Shared_Dirty, Shared_Clean which can be used for computing more accurate RSS. When this flag is enabled, RSS is computed as Min(Shared_Dirty, Pss) + Private_Clean + Private_Dirty. It excludes read-only shared mappings in RSS computation. + yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled +true + + <% if @datanode_mounts -%> List of directories to store localized files in. -- To view, visit https://gerrit.wikimedia.org/r/395923 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0f8223db4d4abc26eb9d04ff106b7e49602f504e Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet/cdh Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
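To make the computation described in that property concrete, here is a minimal Python sketch (not part of the patch, and only an approximation of yarn's actual Java implementation) that applies the same formula to /proc/<pid>/smaps, skipping read-only shared mappings:

import re

def smaps_rss_kb(pid):
    # Sum Min(Shared_Dirty, Pss) + Private_Clean + Private_Dirty over all
    # mappings, excluding read-only shared mappings (r--s, r-xs), as the
    # yarn-site.xml description above lays out. Values are in kB.
    wanted = ('Pss', 'Shared_Dirty', 'Private_Clean', 'Private_Dirty')
    total, current, perm = 0, {}, None

    def mapping_total():
        if current and perm not in ('r--s', 'r-xs'):
            return (min(current.get('Shared_Dirty', 0), current.get('Pss', 0))
                    + current.get('Private_Clean', 0)
                    + current.get('Private_Dirty', 0))
        return 0

    with open('/proc/%d/smaps' % pid) as f:
        for line in f:
            header = re.match(r'^[0-9a-f]+-[0-9a-f]+\s+([rwxps-]{4})', line)
            if header:
                total += mapping_total()
                perm, current = header.group(1), {}
            else:
                key, _, value = line.partition(':')
                if key in wanted:
                    current[key] = int(value.split()[0])
    return total + mapping_total()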
[MediaWiki-commits] [Gerrit] search/xgboost[master]: [DNM] more debugging of RssFile explosion
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395915 ) Change subject: [DNM] more debugging of RssFile explosion .. [DNM] more debugging of RssFile explosion Change-Id: Ia5cdb36f31dd4512b048de74f2b2d769d2fa7acf --- M jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala M jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala 2 files changed, 117 insertions(+), 8 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/xgboost refs/changes/15/395915/1 diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala index 22ed87c..e8f55b4 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala @@ -16,13 +16,15 @@ package ml.dmlc.xgboost4j.scala.spark +import java.io.{BufferedReader, File, FileReader} import java.lang.management.ManagementFactory import java.util.concurrent.atomic.AtomicBoolean +import java.util.regex.Pattern import org.apache.commons.logging.LogFactory import scala.concurrent.duration.Duration -import scala.io.Source +import scala.io.{BufferedSource, Source} class ResourceMonitorThread(reportEvery: Duration) extends Thread { super.setDaemon(true) @@ -37,18 +39,120 @@ return } while (keepChecking.get()) { - report() + report().foreach(logger.info) Thread.sleep(reportEvery.toMillis) } } def stopChecking(): Unit = keepChecking.set(false) - private def report(): Unit = { + def report(): Seq[String] = { val rss = Source.fromFile(s"/proc/$pid/status").getLines() .filter(_.startsWith("Rss")) .mkString(", ") -logger.info(rss) -logger.info(memoryBean.getHeapMemoryUsage) +Seq(rss, + memoryBean.getHeapMemoryUsage.toString, + // 5 largest contributors to RSSFile + collectSMapInfo().take(5).map({ info => +s"${info.mem()}: ${info.name}" + }).mkString("\n") +).filter(_.length > 0) } + + private val ADDRESS_PATTERN = raw"^([a-f0-9]*)-([a-f0-9]*)(\s)*([rxwps\-]*).*".r + private val MEM_INFO_PATTERN = raw"^([A-Z].*):[\s ]*(.*).*".r + private val KB = "kB" + private val READ_ONLY_WITH_SHARED_PERMISSION = "r--s" + private val READ_EXECUTE_WITH_SHARED_PERMISSION = "r-xs" + + val file = new File(s"/proc/$pid/smaps") + + private def collectSMapInfo(): List[ProcessSmapMemoryInfo] = { +if (!file.exists()) { + return Nil +} +val lines = Source.fromFile(s"/proc/$pid/smaps").getLines() +lines.map(_.trim).foldLeft(List[ProcessSmapMemoryInfo]()) { (acc, line) => + line match { +case ADDRESS_PATTERN(startAddr, endAddr, space, permission) => + new ProcessSmapMemoryInfo(line, permission) :: acc +case MEM_INFO_PATTERN(key, value) => + acc match { +case memInfo :: xs => memInfo.setMemInfo(key.trim, value.replace(KB, "").trim) :: xs +case Nil => Nil + } +case _ => acc + } +}.filter { memInfo => + !memInfo.permission.trim.equalsIgnoreCase(READ_ONLY_WITH_SHARED_PERMISSION) && + !memInfo.permission.trim.equalsIgnoreCase(READ_EXECUTE_WITH_SHARED_PERMISSION) +}.sortBy(_.mem()).reverse + } +} + +class ProcessSmapMemoryInfo(val name: String, val permission: String) { + var size: Int = 0 + var rss: Int = 0 + var pss: Int = 0 + var sharedClean: Int = 0 + var sharedDirty: Int = 0 + var privateClean: Int = 0 + var privateDirty: Int = 0 + var referenced: Int = 0 + var 
regionName: String = "" + + def setMemInfo(key: String, value: String): ProcessSmapMemoryInfo = { +try { + val intval = value.trim.toInt + MemInfo(key) match { +case MemInfo.SIZE => size = intval +case MemInfo.RSS => rss = intval +case MemInfo.PSS => pss = intval +case MemInfo.SHARED_CLEAN => sharedClean = intval +case MemInfo.SHARED_DIRTY => sharedDirty = intval +case MemInfo.PRIVATE_CLEAN => privateClean = intval +case MemInfo.PRIVATE_DIRTY => privateDirty = intval +case MemInfo.REFERENCED => referenced = intval +case _ => None + } +} catch { + case e: NumberFormatException => Nil +} +this + } + + def mem(): Int = { +// Math.min(sharedDirty, pss) + privateDirty + privateClean +rss + } +} + +object MemInfo { + sealed abstract class MemInfoVal(val name: String) { +override def toString: String = name + } + + def apply(name: String): MemInfoVal = { +values.collectFirst { case i if i.name.equalsIgnoreCase(name.trim) => i
[MediaWiki-commits] [Gerrit] search/xgboost[master]: [DNM] Test xgboost4j-spark with fast hist tree maker
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395856 ) Change subject: [DNM] Test xgboost4j-spark with fast hist tree maker .. [DNM] Test xgboost4j-spark with fast hist tree maker Change-Id: If8cda596953182a62485df9d8f370f7e6d800b51 --- M jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala 1 file changed, 4 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/xgboost refs/changes/56/395856/1 diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index ea18ff2..a0024a7 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -316,10 +316,10 @@ eval: EvalTrait = null, useExternalMemory: Boolean = false, missing: Float = Float.NaN): XGBoostModel = { -if (params.contains("tree_method")) { - require(params("tree_method") != "hist", "xgboost4j-spark does not support fast histogram" + - " for now") -} +// if (params.contains("tree_method")) { +// require(params("tree_method") != "hist", "xgboost4j-spark does not support fast" + +// " histogram for now") +// } require(nWorkers > 0, "you must specify more than 0 workers") if (obj != null) { require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not defined," + -- To view, visit https://gerrit.wikimedia.org/r/395856 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: If8cda596953182a62485df9d8f370f7e6d800b51 Gerrit-PatchSet: 1 Gerrit-Project: search/xgboost Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Simple hack to override mlr model from query string
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395826 ) Change subject: Simple hack to override mlr model from query string .. Simple hack to override mlr model from query string This is a rather naive attempt to allow us to do a sanity check on an MLR model before we roll it out. With the new cirrusMLRModel query parameter we can upload a model to elasticsearch and try a couple of queries by specifying the model name before we ship a config change to enable the model for everyone. Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a --- M includes/Search/RescoreBuilders.php 1 file changed, 11 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/26/395826/1 diff --git a/includes/Search/RescoreBuilders.php b/includes/Search/RescoreBuilders.php index 76a4fac..3119a01 100644 --- a/includes/Search/RescoreBuilders.php +++ b/includes/Search/RescoreBuilders.php @@ -127,6 +127,17 @@ * @return AbstractQuery */ private function buildLtrQuery( $model ) { + // This is a bit fragile, and makes the bold assumption + // only a single level of rescore will be used. This is + // strictly for debugging/testing before shipping a model + // live so shouldn't be a big deal. + $override = \RequestContext::getMain() + ->getRequest() + ->getVal( 'cirrusMLRModel' ); + if ( $override ) { + $model = $override; + } + $bool = new \Elastica\Query\BoolQuery(); // the ltr query can return negative scores, which mucks with elasticsearch // sorting as that will put these results below documents set to 0. Fix -- To view, visit https://gerrit.wikimedia.org/r/395826 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: AtLeastNDistinct returns wrong value on merge
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395617 ) Change subject: AtLeastNDistinct returns wrong value on merge .. AtLeastNDistinct returns wrong value on merge The merge operation wasn't correctly taking buf2 into account. Add some tests to verify how this should work and update merge to correctly integrate buf2 into buf1. Change-Id: Ib37b60e4f4ae2354d1d1181460e1b511c0c13cc2 --- M jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala A jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala 2 files changed, 72 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/17/395617/1 diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala index 1b49e25..74d7467 100644 --- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala @@ -57,7 +57,9 @@ } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { -if (!buffer1.getAs[Boolean](buffer_reached)) { +if (buffer2.getAs[Boolean](buffer_reached)) { + buffer1(buffer_reached) = true +} else if (!buffer1.getAs[Boolean](buffer_reached)) { getSet(buffer1) ++= getSet(buffer2) checkReached(buffer1) } diff --git a/jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala b/jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala new file mode 100644 index 000..962dbb9 --- /dev/null +++ b/jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala @@ -0,0 +1,69 @@ +package org.wikimedia.search.mjolnir + +import org.apache.spark.sql.expressions.MutableAggregationBuffer +import org.scalatest.FunSuite + +class DummyBuffer(init: Array[Any]) extends MutableAggregationBuffer { + val values: Array[Any] = init + def update(i: Int, value: Any): Unit = values(i) = value + def get(i: Int) = values(i) + def length: Int = init.length + def copy() = new DummyBuffer(init.clone()) +} + +class AtLeastNDistinctSuite extends FunSuite { + import org.scalatest.prop.TableDrivenPropertyChecks._ + + test("basic operation") { +val udaf = new AtLeastNDistinct +val buf = new DummyBuffer(new Array(udaf.bufferSchema.length)) +val row = new DummyBuffer(new Array(udaf.inputSchema.length)) + +forAll(Table( + ("limit", "expected", "values"), + (1, false, Seq()), + (1, true, Seq("zomg")), + (1, true, Seq("hi", "hi", "hi")), + (2, false, Seq("hi", "hi", "hi")), + (2, true, Seq("hi", "there", "hi")) +)) { (limit: Int, expect: Boolean, values: Seq[String]) => + udaf.initialize(buf) + row(udaf.input_limit) = limit + values.foreach { value => +row(udaf.input_value) = value +udaf.update(buf, row) + } + assert(udaf.evaluate(buf) == expect) +} + } + + test("merge") { +val udaf = new AtLeastNDistinct +val buf1 = new DummyBuffer(new Array(udaf.bufferSchema.length)) +val buf2 = new DummyBuffer(new Array(udaf.bufferSchema.length)) +val row = new DummyBuffer(new Array(udaf.inputSchema.length)) + +forAll(Table( + ("limit", "expected", "a", "b"), + (1, true, Set("a"), Set[String]()), + (1, true, Set[String](), Set("a")), + (2, false, Set("a"), Set("a")), + (2, true, Set("a"), Set("b")) +)) { (limit: Int, expect: Boolean, a: Set[String], b: Set[String]) => + udaf.initialize(buf1) + udaf.initialize(buf2) + row(udaf.input_limit) = limit + a.foreach { value => +row(udaf.input_value) = value +udaf.update(buf1, row) + } + 
b.foreach { value => +row(udaf.input_value) = value +udaf.update(buf2, row) + } + + udaf.merge(buf1, buf2) + assert(udaf.evaluate(buf1) == expect) +} + } +} -- To view, visit https://gerrit.wikimedia.org/r/395617 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ib37b60e4f4ae2354d1d1181460e1b511c0c13cc2 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
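To spell out the corrected merge semantics outside of Spark, here is a tiny Python model (illustrative only; the buffer layout and the meaning of the 'reached' flag are assumptions based on the Scala above and its test cases):

def merge(buf1, buf2, limit):
    # Mirrors the fixed Scala merge(): if the right-hand partial aggregation
    # already saw at least `limit` distinct values, the merged result has too;
    # otherwise union the distinct values into buf1 and re-check the limit.
    if buf2['reached']:
        buf1['reached'] = True
    elif not buf1['reached']:
        buf1['values'] |= buf2['values']
        buf1['reached'] = len(buf1['values']) >= limit

buf1 = {'reached': False, 'values': {'a'}}
buf2 = {'reached': False, 'values': {'b'}}
merge(buf1, buf2, limit=2)
print(buf1['reached'])  # True, matching the (2, true, Set("a"), Set("b")) test case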
[MediaWiki-commits] [Gerrit] search/xgboost[master]: Add unique tag to log instances in RabitTracker
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395611 ) Change subject: Add unique tag to log instances in RabitTracker .. Add unique tag to log instances in RabitTracker We often have between 15 and 100 separate RabitTracker instances running at the same time and it's incredibly difficult to figure out when one errors out what other logs are related to the one that failed. This doesn't completely solve the problem of associating non-tracker logs (like executor kills by yarn), but it at least helps distinguish the output from the separate trackers. Change-Id: Ic4189ae318316be405b3be499d95b2849b0e6f61 --- M jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java 1 file changed, 9 insertions(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/xgboost refs/changes/11/395611/1 diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java index 888d501..4927466 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java @@ -5,7 +5,7 @@ import java.io.*; import java.util.HashMap; import java.util.Map; -import java.util.concurrent.TimeUnit; +import java.util.Random; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.logging.Log; @@ -23,8 +23,10 @@ * The tracker must be started on driver node before running distributed jobs. */ public class RabitTracker implements IRabitTracker { - // Maybe per tracker logger? - private static final Log logger = LogFactory.getLog(RabitTracker.class); + private static final Log classLogger = LogFactory.getLog(RabitTracker.class); + private static final Random random = new Random(); + private final String logTag = Integer.toHexString(random.nextInt()); + private final Log logger = LogFactory.getLog(RabitTracker.class.getName() + '@' + logTag); // tracker python file. private static String tracker_py = null; // environment variable to be pased. @@ -37,8 +39,8 @@ try { initTrackerPy(); } catch (IOException ex) { - logger.error("load tracker library failed."); - logger.error(ex); + classLogger.error("load tracker library failed."); + classLogger.error(ex); } } @@ -48,7 +50,7 @@ private class TrackerProcessLogger implements Runnable { public void run() { - Log trackerProcessLogger = LogFactory.getLog(TrackerProcessLogger.class); + Log trackerProcessLogger = LogFactory.getLog(TrackerProcessLogger.class.getName() + '@' + logTag); BufferedReader reader = new BufferedReader(new InputStreamReader( trackerProcess.get().getErrorStream())); String line; @@ -73,7 +75,7 @@ try { tracker_py = NativeLibLoader.createTempFileFromResource("/tracker.py"); } catch (IOException ioe) { - logger.trace("cannot access tracker python script"); + classLogger.trace("cannot access tracker python script"); throw ioe; } } -- To view, visit https://gerrit.wikimedia.org/r/395611 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic4189ae318316be405b3be499d95b2849b0e6f61 Gerrit-PatchSet: 1 Gerrit-Project: search/xgboost Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search/xgboost[master]: Add background resource monitor task to training
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395592 ) Change subject: Add background resource monitor task to training .. Add background resource monitor task to training We have executors getting killed by overrunning their memory allocations, but no clue why that is happening. Training an entire 35M observation set on a single jvm (local spark mode) works, and training that 35M observation set in yarn split between three executors usually works, but sometimes yarn comes out and kills our process. Add a thread on executors that perform training to regularly report both heap usage and Rss info from /proc/$pid/status. While this won't tell us exactly what is happening, it will at least hopefully give some insight into how memory usage develops over time up to the point that yarn decides to kill our executors. This is intentionally implemented in a "once per jvm" way, which is a bit odd but provides us the most information. Basically the first time an executor performs training the thread is spun up, and that thread keeps running after the current task is complete, up until the executor itself exits. Change-Id: I71c121055ea94b997bc018da4fc0d4d86d63bf66 --- A jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala M jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala 2 files changed, 64 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/xgboost refs/changes/92/395592/1 diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala new file mode 100644 index 000..64309e6 --- /dev/null +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala @@ -0,0 +1,52 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+ */ + +package ml.dmlc.xgboost4j.scala.spark + +import java.lang.management.ManagementFactory +import java.util.concurrent.atomic.AtomicBoolean + +import org.apache.commons.logging.LogFactory + +import scala.concurrent.duration.Duration +import scala.io.Source + +class ResourceMonitorThread(reportEvery: Duration) extends Thread { + private val keepChecking = new AtomicBoolean(true) + private val pid = ManagementFactory.getRuntimeMXBean.getName.split('@')(0).toInt + private val memoryBean = ManagementFactory.getMemoryMXBean + private val logger = LogFactory.getLog(this.getClass) + + override def run(): Unit = { +if (!logger.isInfoEnabled) { + return +} +while (keepChecking.get()) { + report() + Thread.sleep(reportEvery.toMillis) +} + } + + def stopChecking(): Unit = keepChecking.set(false) + + private def report(): Unit = { +val rss = Source.fromFile(s"/proc/$pid/status").getLines() + .filter(_.startsWith("Rss")) + .mkString(", ") +logger.info(rss) +logger.info(memoryBean.getHeapMemoryUsage) + } +} diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 2f64e15..cce063d 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -17,8 +17,10 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.ByteArrayInputStream +import java.util.concurrent.TimeUnit import scala.collection.mutable +import scala.concurrent.duration.Duration import scala.util.Random import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker} import ml.dmlc.xgboost4j.scala.rabit.RabitTracker @@ -30,6 +32,7 @@ import org.apache.spark.sql.Dataset import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint} import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext} + object TrackerConf { def apply(): TrackerConf = TrackerConf(0L, "python") @@ -51,6 +54,10 @@ object XGBoost extends Serializable { private val logger = LogFactory.getLog("XGBoostSpark") + + // By using a lazy val on an object (singleton) we ensure this is only performed + // once per-jvm. It is
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add option to train using external memory
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395062 ) Change subject: Add option to train using external memory .. Add option to train using external memory I'm not sure what exactly has changed, but I'm unable to complete a full round of training on wikis with large (~35M) numbers of observations; training keeps getting killed by spark. I tried increasing memory overhead from 9G to 12G but it still keeps dying. I'm wary of allocating even more memory than that, as we are asking for a significant % of cluster memory. Take advantage of xgboost's external memory implementation to prevent the memory explosion. This basically writes out the feature matrix to disk and memory maps it, depending on the kernel disk cache to keep it in memory where possible. This is likely a little slower, but still faster than killing executors and regularly restarting training. Change-Id: Ie283c1c58d8395054164f1c0157e1a709d14 --- M example_train.yaml M mjolnir/test/fixtures/load_config/example_train.expect M mjolnir/training/xgboost.py M mjolnir/utilities/training_pipeline.py 4 files changed, 18 insertions(+), 3 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/62/395062/1 diff --git a/example_train.yaml b/example_train.yaml index 5784421..9cfb8a3 100644 --- a/example_train.yaml +++ b/example_train.yaml @@ -138,6 +138,7 @@ cv-jobs: 22 folds: 3 final-trees: 100 +use-external-memory: yes medium: # 4M to 12M observations per executor. diff --git a/mjolnir/test/fixtures/load_config/example_train.expect b/mjolnir/test/fixtures/load_config/example_train.expect index 23e536f..75233f6 100644 --- a/mjolnir/test/fixtures/load_config/example_train.expect +++ b/mjolnir/test/fixtures/load_config/example_train.expect @@ -243,6 +243,7 @@ folds: '3' input: hdfs://analytics-hadoop/user/pytest/mjolnir/marker output: /home/pytest/training_size/marker_large + use-external-memory: 'True' workers: '3' environment: HOME: /home/pytest diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py index 6d1f70b..03e8599 100644 --- a/mjolnir/training/xgboost.py +++ b/mjolnir/training/xgboost.py @@ -108,7 +108,7 @@ return retval -def train(df, params, num_workers=None): +def train(df, params, num_workers=None, use_external_memory=False): """Train a single xgboost ranking model.
df : pyspark.sql.DataFrame @@ -168,6 +168,7 @@ try: return XGBoostModel.trainWithDataFrame(df_grouped, params, num_rounds, num_workers, feature_col='features', + use_external_memory=use_external_memory, label_col='label') finally: if unpersist: diff --git a/mjolnir/utilities/training_pipeline.py b/mjolnir/utilities/training_pipeline.py index 3ee6bd2..dae13ab 100644 --- a/mjolnir/utilities/training_pipeline.py +++ b/mjolnir/utilities/training_pipeline.py @@ -51,7 +51,7 @@ def run_pipeline(sc, sqlContext, input_dir, output_dir, wikis, initial_num_trees, final_num_trees, - num_workers, num_cv_jobs, num_folds, test_dir, zero_features): + num_workers, num_cv_jobs, num_folds, test_dir, zero_features, use_external_memory): for wiki in wikis: print 'Training wiki: %s' % (wiki) df_hits_with_features = ( @@ -98,7 +98,8 @@ df_grouped, j_groups = mjolnir.training.xgboost.prep_training( df_hits_with_features, num_workers) best_params['groupData'] = j_groups -model = mjolnir.training.xgboost.train(df_grouped, best_params) +model = mjolnir.training.xgboost.train( +df_grouped, best_params, use_external_memory=use_external_memory) tune_results['metrics']['train'] = model.eval(df_grouped, j_groups) df_grouped.unpersist() @@ -142,6 +143,14 @@ print 'Wrote xgboost binary model to %s' % (xgb_model_output) print '' +def str_to_bool(value): +if value.lower() in ['true', 'yes', '1']: +return True +elif value.lower() in ['false', 'no', '0']: +return False +else: +raise ValueError("Unknown boolean string: " + value) + def parse_arguments(argv): parser = argparse.ArgumentParser(description='Train XGBoost ranking models') @@ -168,6 +177,9 @@ '--initial-trees', dest='initial_num_trees', default=100, type=int, help='Number of trees to perform hyperparamter tuning with. (Default: 100)') parser.add_argument( +'-e', '--use-external-memory', dest='use_external_memory', default=False, +type=str_to_bool, help='Use external memory for feature matrix') +
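As a small aside on the new --use-external-memory flag, the str_to_bool argparse type shown above accepts the usual spellings (illustrative calls only):

str_to_bool('yes')    # True
str_to_bool('FALSE')  # False; the comparison is case-insensitive
str_to_bool('maybe')  # raises ValueError('Unknown boolean string: maybe')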
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Allow spark to keep the full data pipeline in memory
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/395063 ) Change subject: Allow spark to keep the full data pipeline in memory .. Allow spark to keep the full data pipeline in memory Something with the upgrade to spark 2.1.2 has caused us to recompute lots of data over and over again in the data pipeline. This is particularly egregious for normalization and feature collection steps which take an hour each on a full run of data. I tested out simply not unpersisting our data and everything seems to work fine. We have ~100G of memory available for caching and only end up using 50G by not unpersisting. Figuring out what data is available is also much easier if we don't have to think about when to unpersist what data. Change-Id: Iedf259e481055444f369c528a56bee372e57595e --- M mjolnir/sampling.py M mjolnir/utilities/data_pipeline.py 2 files changed, 1 insertion(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/63/395063/1 diff --git a/mjolnir/sampling.py b/mjolnir/sampling.py index d7d1f2b..88a93c7 100644 --- a/mjolnir/sampling.py +++ b/mjolnir/sampling.py @@ -196,8 +196,5 @@ df .join(df_queries_sampled, how='inner', on=['wikiid', 'norm_query_id']) .cache()) -df_sampled.count() -df.unpersist() -df_queries_unique.unpersist() return hit_page_id_counts, df_sampled diff --git a/mjolnir/utilities/data_pipeline.py b/mjolnir/utilities/data_pipeline.py index 20b06f0..5d26065 100644 --- a/mjolnir/utilities/data_pipeline.py +++ b/mjolnir/utilities/data_pipeline.py @@ -69,11 +69,6 @@ seed=54321, samples_per_wiki=samples_per_wiki) -# This should already be cached from sample, but lets be explicit -# to prevent future problems with refactoring. -df_sampled_raw.cache().count() -df_norm.unpersist() - # Transform our dataframe into the shape expected by the DBN df_sampled = ( df_sampled_raw @@ -85,13 +80,11 @@ .drop('click_page_ids') .cache()) -# materialize df_sampled and unpersist df_norm -nb_samples = df_sampled.count() -df_sampled_raw.unpersist() # Target around 125k rows per partition. Note that this isn't # how many the dbn will see, because it gets collected up. Just # a rough guess. +nb_samples = df_sampled.count() dbn_partitions = int(max(200, min(2000, nb_samples / 125000))) # Learn relevances @@ -114,10 +107,6 @@ .join(df_rel, how='inner', on=['wikiid', 'norm_query_id', 'hit_page_id']) .cache()) -# materialize df_all_hits and drop df_sampled, df_norm -df_all_hits.count() -df_sampled.unpersist() - # TODO: Training is per-wiki, should this be as well? weightedNdcgAt10 = mjolnir.metrics.ndcg(df_all_hits, 10, query_cols=['wikiid', 'query', 'session_id']) print 'weighted ndcg@10: %.4f' % (weightedNdcgAt10) @@ -133,10 +122,6 @@ F.first('label').alias('label'), F.first('relevance').alias('relevance')) .cache()) - -# materialize df_hits and drop df_all_hits -df_hits.count() -df_all_hits.unpersist() actual_samples_per_wiki = df_hits.groupby('wikiid').agg(F.count(F.lit(1)).alias('n_obs')).collect() actual_samples_per_wiki = {row.wikiid: row.n_obs for row in actual_samples_per_wiki} -- To view, visit https://gerrit.wikimedia.org/r/395063 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iedf259e481055444f369c528a56bee372e57595e Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
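A rough PySpark sketch of the pattern this change moves to: cache every intermediate DataFrame and let Spark's LRU block eviction reclaim memory, rather than unpersisting by hand. The paths and filter are made up for illustration:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Cache each intermediate result; downstream stages reuse the cached
# blocks instead of recomputing normalization / feature collection.
df_norm = spark.read.parquet('hdfs:///some/clicks/data').cache()
df_sampled = df_norm.where(F.col('wikiid') == 'enwiki').cache()

# Materialize once so the cache is actually populated. With enough cache
# memory there is no need for df_norm.unpersist() afterwards; Spark
# evicts least-recently-used blocks if space runs out.
df_sampled.count()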
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: [WIP] Bad ideas for improved DBN performance
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394741 ) Change subject: [WIP] Bad ideas for improved DBN performance .. [WIP] Bad ideas for improved DBN performance I'm not sure this is a particularly great idea, but I wanted to explore the performance limits of the JVM based DBN implementation. This brings the original benchmark (90s in java, 3-4s in prior patch) to ~900ms. To get a better idea of performance I increased the size of the benchmark: * python: 616s - only ran once * orig jvm: min: 21.7, max: 24.1 mean: 23.5s - 5 runs - 25x - 28x faster than python * optimized jvm: min: 5.0s max: 5.3s mean: 5.2s - 5 runs - 116x - 123x faster than python - 4x - 5x faster than orig jvm The improvements made were guided by profiling in visualvm and aren't all that numerous: * We were thrashing memory pretty hard at >1GB/sec. To reduce this add caches of our intermediate arrays. We are still thrashing memory pretty hard, but not as bad. * The caches of the intermediate arrays in scala Maps brought those maps up high in the profiler. Replace with arrays of queues. The backing linked list still shows up in profiling, but not as bad. * DefaultMap.apply gets hit *a lot* and was showing up in profiling. Replacing inner scala maps with java maps helped some. Further replacing java maps with trove4j primitive maps helped significantly. * Find places where we were repeatedly hitting an array for the same item (for example getting something by s.queryId in a loop on the urls) and fetch it into a local var. Not sure this made much difference. visualvm now reports 80% of cpu time is spent in our own functions, whereas before it was significantly lower. Mostly I just kept looking for places where the supporting machinery was taking up cpu instead of our calculations and kept replacing them until it was better. Change-Id: I08b72b98f515a820675e1ef9b45dd8724cbd070e --- M jvm/pom.xml M jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala M jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala 3 files changed, 246 insertions(+), 58 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/41/394741/1 diff --git a/jvm/pom.xml b/jvm/pom.xml index b2a7f71..f405975 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -141,6 +141,11 @@ 3.0.1 test + +net.sf.trove4j +trove4j +3.0.3 + diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala index 12ef975..cda6778 100644 --- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala @@ -9,6 +9,9 @@ * A Dynamic Bayesian Network Click Model for Web Search Ranking - Olivier Chapelle and * Ya Zang - http://olivier.chapelle.cc/pub/DBN_www2009.pdf */ +import gnu.trove.iterator.TIntObjectIterator +import gnu.trove.map.hash.TIntObjectHashMap + import scala.collection.mutable import scala.util.parsing.json.JSON @@ -19,15 +22,22 @@ // This bit maps input queryies/results to array indexes to be used while calculating private var currentUrlId: Int = 0 // TODO: Why is first returned value 1 instead of 0? 
+ private val urlToIdMap: mutable.Map[String, Int] = mutable.Map() + def urlToId(key: String): Int = { +urlToIdMap.getOrElseUpdate(key, { + currentUrlId += 1 + currentUrlId +}) + } + private var currentQueryId: Int = -1 - private val urlToId: DefaultMap[String, Int] = new DefaultMap({ _ => -currentUrlId += 1 -currentUrlId - }) - private val queryToId: DefaultMap[(String, String), Int] = new DefaultMap({ _ => -currentQueryId += 1 -currentQueryId - }) + private val queryToIdMap: mutable.Map[(String, String), Int] = mutable.Map() + def queryToId(key: (String, String)): Int = { +queryToIdMap.getOrElseUpdate(key, { + currentQueryId += 1 + currentQueryId +}) + } def maxQueryId: Int = currentQueryId + 2 @@ -91,8 +101,8 @@ } def toRelevances(urlRelevances: Array[Map[Int, UrlRel]]): Seq[RelevanceResult] = { -val idToUrl = urlToId.asMap.map(_.swap) -val idToQuery = queryToId.asMap.map(_.swap) +val idToUrl = urlToIdMap.map(_.swap) +val idToQuery = queryToIdMap.map(_.swap) urlRelevances.zipWithIndex.flatMap { case (d, queryId) => val (query, region) = idToQuery(queryId) @@ -101,6 +111,127 @@ RelevanceResult(query, region, url, urlRel.a * urlRel.s) } } + } +} + +class ArrayCache { + val QUEUE_1D_MAX = 20 + private val queueMap1d: Array[mutable.Queue[Array[Double]]] = Array.fill(QUEUE_1D_MAX + 1){ mutable.Queue() } + + def get1d(n: Int): Array[Double] = { +if (n > QUEUE_1D_MAX) { + new Array[Double](n) +} else { +
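The array-cache idea above is easier to see stripped of the Scala/trove4j details. A purely illustrative Python sketch of pooling fixed-size intermediate buffers to cut allocation churn (not a translation of the patch):

from collections import defaultdict

class ArrayPool:
    """Hand out reusable buffers keyed by length instead of allocating
    a fresh array on every inner-loop iteration."""

    def __init__(self, max_pooled_len=20):
        self.max_pooled_len = max_pooled_len
        self.pools = defaultdict(list)

    def acquire(self, n):
        if n > self.max_pooled_len or not self.pools[n]:
            # Over-sized (or first) requests simply allocate.
            return [0.0] * n
        return self.pools[n].pop()

    def release(self, buf):
        if len(buf) <= self.max_pooled_len:
            # Zero the buffer before returning it to the pool.
            for i in range(len(buf)):
                buf[i] = 0.0
            self.pools[len(buf)].append(buf)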
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Port DBN from clickmodels to scala
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394509 ) Change subject: Port DBN from clickmodels to scala .. Port DBN from clickmodels to scala DBN takes quite some time when running against a full run of wikis. On one run that resulted in 90M observations across 20 wikis it took over 20 minutes. In the naive benchmark (included) this scala version is > 20x faster than the python implementation, and is perhaps easier to follow since we remove unused functionality ( the intents and layouts). This does not yet include the python side of calling this, because the way the CI works we will need to publish a new mjolnir jar with this code and wanted to let it get reviewed first. For performance reasons this almost exclusively uses arrays, and most inner loops are using while instead of more idiomatic map or fold. This conversion gave an ~3x speedup, which seems worthwhile. This is probably very allocation heavy, but optimizing out the allocations seemed like a big pain. Change-Id: I7231590a18b7f8fe2552997bc4c702ee635d06e5 --- A jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala A jvm/src/test/resources/dbn.data A jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala 3 files changed, 463 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/09/394509/1 diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala new file mode 100644 index 000..51ea651 --- /dev/null +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala @@ -0,0 +1,312 @@ +package org.wikimedia.search.mjolnir + +import scala.collection.mutable +import scala.util.parsing.json.JSON + +case class SessionItem(queryId: Int, urlIds: Array[Int], clicks: Array[Boolean]) +case class RelevanceResult(query: String, region: String, url: String, relevance: Double) + +class InputReader( + minDocsPerQuery: Int, maxDocsPerQuery: Int, serpSize: Int, + discardNoClicks: Boolean +) { + + private val urlToId: mutable.Map[String, Int] = mutable.Map() + private val queryToId: mutable.Map[(String, String), Int] = mutable.Map() + private var currentUrlId: Int = 1 + private var currentQueryId: Int = 0 + + def maxQueryId: Int = currentQueryId + 1 + + private def getQueryId(query: String, region: String): Int = { +val key = (query, region) +queryToId.get(key) match { + case Some(queryId) => queryId + case None => +val queryId = currentQueryId +currentQueryId += 1 +queryToId.put(key, queryId) +queryId +} + } + + private def getUrlId(url: String): Int = { +urlToId.get(url) match { + case Some(urlId) => urlId + case None => +val urlId = currentUrlId +currentUrlId += 1 +urlToId.put(url, urlId) +urlId +} + } + + + private def parseJsonBooleanArray(json: String): Option[Array[Boolean]] = { +JSON.parseFull(json) match { + case Some(x: List[Any]) => +if (x.forall(_.isInstanceOf[Boolean])) { + Some(x.asInstanceOf[List[Boolean]].toArray) +} else { + None +} + case _ => None +} + } + + private def parseJsonStringArray(json: String): Option[Array[String]] = { +JSON.parseFull(json) match { + case Some(x: List[Any]) => +if (x.forall(_.isInstanceOf[String])) { + Some(x.asInstanceOf[List[String]].toArray) +} else { + None +} + case _ => None +} + } + + def makeSessionItem(query: String, region: String, urls: Array[String], clicks: Array[Boolean]): Option[SessionItem] = { + +val n = math.min(serpSize, urls.length) +val hasClicks = clicks.take(n).foldLeft(false)(_ || _) +if (urls.length < 
minDocsPerQuery || +(discardNoClicks && !hasClicks) +) { + None +} else { + val queryId = getQueryId(query, region) + val urlIds = urls.map(getUrlId) + Some(SessionItem(queryId, urlIds, clicks.take(n))) +} + } + + val PIECE_HASH_DIGEST = 0 + val PIECE_QUERY = 1 + val PIECE_REGION = 2 + val PIECE_INTENT_WEIGHT = 3 + val PIECE_URLS = 4 + val PIECE_LAYOUT = 5 + val PIECE_CLICKS = 6 + + // TODO: Ideally dont use this and make session items directly without extra ser/deser overhead + def read(f: Iterator[String]): Seq[SessionItem] = { +f.flatMap { line => { + val pieces = line.split("\t") + val query: String = pieces(PIECE_QUERY) + val region = pieces(PIECE_REGION) + val urls = parseJsonStringArray(pieces(PIECE_URLS)) match { +case Some(x: Array[String]) => x +case None => Array[String]() + } + val clicks = parseJsonBooleanArray(pieces(PIECE_CLICKS)) match { +case Some(x: Array[Boolean]) => x +case None => Array[Boolean]() + } + + makeSessionItem(query, region, urls, clicks) +
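A rough Python rendering of the read/makeSessionItem path above, mainly to document the expected input format: one tab-separated line per session, with JSON-encoded url and click arrays at the PIECE_URLS and PIECE_CLICKS columns. The threshold defaults here are illustrative, not the ones mjolnir uses:

import json

SERP_SIZE = 20

def make_session_item(line, min_docs_per_query=2, discard_no_clicks=True):
    pieces = line.rstrip('\n').split('\t')
    query, region = pieces[1], pieces[2]
    urls = json.loads(pieces[4])    # e.g. ["Page A", "Page B", ...]
    clicks = json.loads(pieces[6])  # e.g. [true, false, ...]

    n = min(SERP_SIZE, len(urls))
    has_clicks = any(clicks[:n])
    if len(urls) < min_docs_per_query or (discard_no_clicks and not has_clicks):
        return None
    return (query, region), urls, clicks[:n]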
[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Include highlight snippets when using search as api generator
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394120 ) Change subject: Include highlight snippets when using search as api generator .. Include highlight snippets when using search as api generator It turned out when using ApiQuerySearch in generator mode important highlighting information was never returned. This makes it hard to figure out why a search for intitle:park returned 'Capilano Suspension Bridge' (because there is a redirect with the word park). Add all requested gsrprop to the generator result. Change-Id: Iea48937662492445783104077666ab1f1b30da2d --- M includes/api/ApiQuerySearch.php 1 file changed, 10 insertions(+), 3 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/20/394120/1 diff --git a/includes/api/ApiQuerySearch.php b/includes/api/ApiQuerySearch.php index f0c4180..832d84f 100644 --- a/includes/api/ApiQuerySearch.php +++ b/includes/api/ApiQuerySearch.php @@ -145,9 +145,11 @@ // Add the search results to the result $terms = $wgContLang->convertForSearchResult( $matches->termMatches() ); $titles = []; + $metadata = []; $count = 0; $result = $matches->next(); $limit = $params['limit']; + $offset = $params['offset'] + 1; while ( $result ) { if ( ++$count > $limit ) { @@ -175,6 +177,12 @@ } } else { $titles[] = $result->getTitle(); + $metadata[] = [ + 'title' => $result->getTitle(), + 'data' => $this->getSearchResultData( $result, $prop, $terms ) + [ + 'index' => $count - 1 + $offset, + ], + ]; } $result = $matches->next(); @@ -209,9 +217,8 @@ return $current; } ); $resultPageSet->populateFromTitles( $titles ); - $offset = $params['offset'] + 1; - foreach ( $titles as $index => $title ) { - $resultPageSet->setGeneratorData( $title, [ 'index' => $index + $offset ] ); + foreach ( $metadata as $data ) { + $resultPageSet->setGeneratorData( $data['title'], $data['data'] ); } } } -- To view, visit https://gerrit.wikimedia.org/r/394120 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iea48937662492445783104077666ab1f1b30da2d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
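An illustrative request against the generator path this patch touches: with the change, whatever gsrprop values were requested (snippet, titlesnippet, ...) come back attached to each generated page alongside the existing index. The endpoint and property names here are just an example, not part of the patch:

import requests

resp = requests.get('https://en.wikipedia.org/w/api.php', params={
    'action': 'query',
    'generator': 'search',
    'gsrsearch': 'intitle:park',
    'gsrprop': 'snippet|titlesnippet',
    'prop': 'info',
    'format': 'json',
    'formatversion': 2,
}).json()

for page in resp['query']['pages']:
    # 'index' preserves the search ranking; the snippet fields help explain
    # why a seemingly unrelated title matched (e.g. via a redirect).
    print(page['index'], page['title'], page.get('titlesnippet'))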
[MediaWiki-commits] [Gerrit] search...deploy[master]: Copy spark config into place on deploy
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394117 ) Change subject: Copy spark config into place on deploy .. Copy spark config into place on deploy Putting this configuration into a standard place allows calling spark commands without having to explicitly point out where mjolnir is installed to and shortens command lines. Also bumps mjolnir sub module to master which has support for this location. Before this can be deployed a puppet patch must be shipped to create the /etc/mjolnir directory and set its ownership to the deploy-service so the copy works. Change-Id: I7c69481156f543a8258a2e9b2c90f8e15984caaa --- M scap/checks.yaml M src 2 files changed, 7 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/17/394117/1 diff --git a/scap/checks.yaml b/scap/checks.yaml index 411e399..63662bc 100644 --- a/scap/checks.yaml +++ b/scap/checks.yaml @@ -11,4 +11,9 @@ timeout: 300 group: analytics command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh - +spark_config: +type: command +stage: promote +timeout: 10 +group: analytics +command: cp /srv/deployment/search/mjolnir/deploy/spark.yaml /etc/mjolnir/spark.yaml diff --git a/src b/src index 5799ac9..c2236ad 16 --- a/src +++ b/src @@ -1 +1 @@ -Subproject commit 5799ac99ccb095964d6550a9e45ee7abca768b55 +Subproject commit c2236adfd04280feef29b288ffc113355df83fe1 -- To view, visit https://gerrit.wikimedia.org/r/394117 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7c69481156f543a8258a2e9b2c90f8e15984caaa Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Better error message if wiki missing in data_pipeline
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394108 ) Change subject: Better error message if wiki missing in data_pipeline .. Better error message if wiki missing in data_pipeline Not sure how this happens, but on one run through viwiki was in the input data but didn't make it to the check that we have expected sampling. This change will still error, but will give better messages about what went wrong. Change-Id: If7af29be5022c0b374c9b8836322ccf074467575 --- M mjolnir/utilities/data_pipeline.py 1 file changed, 6 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/08/394108/1 diff --git a/mjolnir/utilities/data_pipeline.py b/mjolnir/utilities/data_pipeline.py index 20b06f0..ee479c8 100644 --- a/mjolnir/utilities/data_pipeline.py +++ b/mjolnir/utilities/data_pipeline.py @@ -145,8 +145,12 @@ for wiki in wikis: # We cant have more samples than we started with expected = min(samples_per_wiki, hit_page_id_counts[wiki]) -actual = actual_samples_per_wiki[wiki] -if expected / float(actual) < samples_size_tolerance: +try: +actual = actual_samples_per_wiki[wiki] +except KeyError: +# This will probably still error, but give better messages. +actual = 0 +if actual == 0 or expected / float(actual) < samples_size_tolerance: not_enough_samples.append( 'Collected %d samples from %s which is less than %d%% of the requested sample size %d' % (actual, wiki, samples_size_tolerance*100, expected)) -- To view, visit https://gerrit.wikimedia.org/r/394108 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: If7af29be5022c0b374c9b8836322ccf074467575 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] wikimedia...relevanceForge[master]: Rename counter variables to i to make tox happy
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394099 ) Change subject: Rename counter variables to i to make tox happy .. Rename counter variables to i to make tox happy Tox run by CI has decided that l is an ambiguous name for a variable. Switching it to i, which is a fairly standard counter variable, makes tox happy, and i did not appear to already be in use. Change-Id: Iea9dd0ea2e900c9b0452795d8c732c539511da10 --- M other_tools/augmentdump.py M other_tools/metastats.py 2 files changed, 7 insertions(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge refs/changes/99/394099/1 diff --git a/other_tools/augmentdump.py b/other_tools/augmentdump.py index 5acd27c..1b8a8b4 100755 --- a/other_tools/augmentdump.py +++ b/other_tools/augmentdump.py @@ -112,12 +112,12 @@ def read_dump(inputf, outputf, data, fieldname): -l = 0 +i = 0 pageId = -1 for line in inputf: -l += 1 +i += 1 page = {} -if l % 2 == 1: +if i % 2 == 1: outputf.write(line) page = json.loads(line) pageId = -1 diff --git a/other_tools/metastats.py b/other_tools/metastats.py index 5de85fe..1c01c79 100755 --- a/other_tools/metastats.py +++ b/other_tools/metastats.py @@ -43,17 +43,17 @@ FNULL = open(os.devnull, 'w') p = subprocess.Popen('curl -L ' + url + ' | gzip -cd', shell=True, stdout=subprocess.PIPE, stderr=FNULL) -l = 0 +i = 0 for line in p.stdout: -l += 1 +i += 1 page = json.loads(line) -if(l % 2 == 1): +if(i % 2 == 1): pageId = page['index']['_id'] continue try: int(pageId) except ValueError: -print("*** line:" + str(l) + " is not a valid id : '" + str(pageId) + "'") +print("*** line:" + str(i) + " is not a valid id : '" + str(pageId) + "'") continue callback(pageId, page) -- To view, visit https://gerrit.wikimedia.org/r/394099 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iea9dd0ea2e900c9b0452795d8c732c539511da10 Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/discovery/relevanceForge Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] wikimedia...relevanceForge[master]: Add basic pre-deployment sanity check for MLR
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394011 ) Change subject: Add basic pre-deployment sanity check for MLR .. Add basic pre-deployment sanity check for MLR Implements a very simple configuration-driven sanity checker that ensures some set of urls is in the top 3 results of a given query. The intention of this script is to build up a small list of queries and results for each wiki we deploy MLR to and use that list as a smoke check before pushing a mediawiki-config change to move a new model to full production usage. Not sure relforge is the best place for this, or where the configuration should really go, but I couldn't think of a better place. Change-Id: Ie29ef99d2e404fe97e3b2e42b17df22b836385d8 --- A sanityCheck.py A sanityCheck/enwiki.json 2 files changed, 75 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge refs/changes/11/394011/1 diff --git a/sanityCheck.py b/sanityCheck.py new file mode 100644 index 000..bcf7136 --- /dev/null +++ b/sanityCheck.py @@ -0,0 +1,64 @@ +from __future__ import print_function +import argparse +import functools +import json +import requests +import sys +import urlparse + + +def check(model, config): +ok = True +query_params = { +'action': 'query', +'list': 'search', +'srlimit': 3, +'cirrusMLRModel': model, +'format': 'json', +'formatversion': 2, +} +if 'query' in config: +# Apply overrides from config if requested. This might +# apply a specific cirrusUserTesting param or some such. +query_params.update(config['query']) + +print('Running sanity check against %s' % (config['api'])) +for query, expected in config['queries'].items(): +print("Query: %s" % (query)) +query_params['srsearch'] = query +r = requests.get(config['api'], params=query_params) +results = [x['title'] for x in r.json()['query']['search']] +diff = set(expected).difference(results) +if diff: +ok = False +print("Results:\n\t" + '\n\t'.join(results)) +print("Expected:") +for title in expected: +marker = '+' if title in results else '-' +print('\t%s %s' % (marker, title)) +print('') +else: +print("PASSED\n") +return ok + + +def parse_arguments(argv): +parser = argparse.ArgumentParser(description='mlr sanity check') +parser.add_argument( + 'config', type=lambda x: json.load(open(x)), +help='json file containing queries to check and results expected in top 3') +parser.add_argument( +'model', help='MLR model to use for ranking') +args = parser.parse_args(argv) +return dict(vars(args)) + + +def main(argv=None): +args = parse_arguments(argv) +return check(**args) + + +if __name__ == "__main__": +ok = main() +sys.exit(0 if ok else 1) + diff --git a/sanityCheck/enwiki.json b/sanityCheck/enwiki.json new file mode 100644 index 000..a1902cb --- /dev/null +++ b/sanityCheck/enwiki.json @@ -0,0 +1,11 @@ +{ +"api": "https://en.wikipedia.org/w/api.php;, +"queries": { +"example": [ +"Example" +], +"JFK": [ +"John F. Kennedy" +] +} +} -- To view, visit https://gerrit.wikimedia.org/r/394011 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie29ef99d2e404fe97e3b2e42b17df22b836385d8 Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/discovery/relevanceForge Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
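The script is meant to be run with a config file and a model name; a short example of driving it from Python, with a made-up model name:

import sanityCheck

# Equivalent to: python sanityCheck.py sanityCheck/enwiki.json 20171201_enwiki_v1
ok = sanityCheck.main(['sanityCheck/enwiki.json', '20171201_enwiki_v1'])
print('sanity check %s' % ('passed' if ok else 'FAILED'))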
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add default path for spark utility config file
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394003 ) Change subject: Add default path for spark utility config file .. Add default path for spark utility config file This defaults the config to /etc/mjolnir/spark.yaml. The separate deploy repo will install appropriate configuration there so spark commands can be called with little fuss. Change-Id: I9ef11aa00f237ee2486fde0049c88ed568b0f51a --- M mjolnir/utilities/spark.py 1 file changed, 1 insertion(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/03/394003/1 diff --git a/mjolnir/utilities/spark.py b/mjolnir/utilities/spark.py index 1ac7114..5954579 100644 --- a/mjolnir/utilities/spark.py +++ b/mjolnir/utilities/spark.py @@ -448,6 +448,7 @@ parser.add_argument( '-c', '--config', dest='config', type=str, required=True, +default='/etc/mjolnir/spark.yaml', help='Path to yaml configuration file.') parser.add_argument( '-t', '--template-var', dest='template_vars', action=KeyValueAction, -- To view, visit https://gerrit.wikimedia.org/r/394003 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I9ef11aa00f237ee2486fde0049c88ed568b0f51a Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] operations/puppet[production]: Revert "Revert "Deploy MjoLniR with new deploy repository""
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/394002 ) Change subject: Revert "Revert "Deploy MjoLniR with new deploy repository"" .. Revert "Revert "Deploy MjoLniR with new deploy repository"" The problems with mjolnir vs MjoLniR have been resolved, it required adjustments to scap.cfg in the repo. This is a standard revert with one addition, we now create an empty directory /etc/mjolnir owned by deploy-service. This gives the deploy repo a sane place to install a configuration script to that can be auto-magicaly found by mjolnir. This reverts commit 6a7753a14ac3cb66593eabbad30e8ac72e184751. Change-Id: I599341bd16ecba0a2b8d8132fde6fe3d1443d754 --- M hieradata/role/common/deployment_server.yaml A modules/mjolnir/manifests/init.pp M modules/profile/manifests/mjolnir/kafka_daemon.pp M modules/profile/templates/mjolnir/kafka-daemon.service.erb M modules/role/manifests/elasticsearch/analytics.pp 5 files changed, 28 insertions(+), 14 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/02/394002/1 diff --git a/hieradata/role/common/deployment_server.yaml b/hieradata/role/common/deployment_server.yaml index 7c55e05..7444810 100644 --- a/hieradata/role/common/deployment_server.yaml +++ b/hieradata/role/common/deployment_server.yaml @@ -173,8 +173,8 @@ # Netbox software netbox/deploy: repository: operations/software/netbox-deploy - relforge/mjolnir: -repository: search/MjoLniR + search/mjolnir/deploy: +repository: search/MjoLniR/deploy statsv/statsv: repository: analytics/statsv "docker-pkg/deploy": diff --git a/modules/mjolnir/manifests/init.pp b/modules/mjolnir/manifests/init.pp new file mode 100644 index 000..e4bb89f --- /dev/null +++ b/modules/mjolnir/manifests/init.pp @@ -0,0 +1,21 @@ +# = Class: mjolnir +# +# This class installs the MjoLniR (Machine Learned Ranking) data +# processing package. +# +class mjolnir { +require_package('virtualenv', 'zip') + +file { '/etc/mjolnir': +ensure => 'directory', +user => 'deploy-service', +group => 'deploy-service', +mode => 0755 +} + +scap::target { 'search/mjolnir/deploy': +deploy_user => 'deploy-service', +} +} + + diff --git a/modules/profile/manifests/mjolnir/kafka_daemon.pp b/modules/profile/manifests/mjolnir/kafka_daemon.pp index c1dcaf9..479187c 100644 --- a/modules/profile/manifests/mjolnir/kafka_daemon.pp +++ b/modules/profile/manifests/mjolnir/kafka_daemon.pp @@ -9,18 +9,11 @@ # it is named just 'eqiad'. $kafka_config = kafka_config('eqiad'), ) { -scap::target { 'relforge/mjolnir': - deploy_user => 'deploy-service', -} - -# This is a limited subset of what the full mjolnir package requires because -# the daemon is a small part of the overall application. The daemon only needs -# to read/write kafka topics and send requests to localhost. 
-require_package('python-kafka', 'python-requests') +class { 'mjolnir': } systemd::service { 'mjolnir-kafka-daemon': content => template('profile/mjolnir/kafka-daemon.service.erb'), -require => Scap::Target['relforge/mjolnir'], +require => Scap::Target['search/mjolnir/deploy'], } } diff --git a/modules/profile/templates/mjolnir/kafka-daemon.service.erb b/modules/profile/templates/mjolnir/kafka-daemon.service.erb index b6947ba..7e40bd8 100644 --- a/modules/profile/templates/mjolnir/kafka-daemon.service.erb +++ b/modules/profile/templates/mjolnir/kafka-daemon.service.erb @@ -5,9 +5,7 @@ [Service] User=nobody Group=nogroup -WorkingDirectory=/srv/deployment/relforge/mjolnir -Environment=PYTHONPATH=/srv/deployment/relforge/mjolnir -ExecStart=/usr/bin/python2 /srv/deployment/relforge/mjolnir/mjolnir/cli/kafka_daemon.py --brokers <%= @kafka_config['brokers']['string'] %> +ExecStart=/srv/deployment/search/mjolnir/venv/bin/mjolnir-utilities.py kafka_daemon --brokers <%= @kafka_config['brokers']['string'] %> StandardInput=null StandardOutput=journal StandardError=journal diff --git a/modules/role/manifests/elasticsearch/analytics.pp b/modules/role/manifests/elasticsearch/analytics.pp index a46391e..66d7789 100644 --- a/modules/role/manifests/elasticsearch/analytics.pp +++ b/modules/role/manifests/elasticsearch/analytics.pp @@ -1,5 +1,7 @@ # Supports CirrusSearch usage on the analytics cluster class role::elasticsearch::analytics { +class { 'mjolnir': } + # wikimedia/discovery/analytics will be deployed to this node scap::target { 'wikimedia/discovery/analytics': deploy_user => 'deploy-service', -- To view, visit https://gerrit.wikimedia.org/r/394002 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I599341bd16ecba0a2b8d8132fde6fe3d1443d754 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet
[MediaWiki-commits] [Gerrit] search...deploy[master]: bump mjolnir dependency to master
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/393997 ) Change subject: bump mjolnir dependency to master .. bump mjolnir dependency to master Change-Id: I4890f19f29827d8858248044322a1266871ad8be --- M src 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/src b/src index 0d7fdcf..5799ac9 16 --- a/src +++ b/src @@ -1 +1 @@ -Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5 +Subproject commit 5799ac99ccb095964d6550a9e45ee7abca768b55 -- To view, visit https://gerrit.wikimedia.org/r/393997 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I4890f19f29827d8858248044322a1266871ad8be Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: bump mjolnir dependency to master
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/393997 ) Change subject: bump mjolnir dependency to master .. bump mjolnir dependency to master Change-Id: I4890f19f29827d8858248044322a1266871ad8be --- M src 1 file changed, 1 insertion(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/97/393997/1 diff --git a/src b/src index 0d7fdcf..5799ac9 16 --- a/src +++ b/src @@ -1 +1 @@ -Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5 +Subproject commit 5799ac99ccb095964d6550a9e45ee7abca768b55 -- To view, visit https://gerrit.wikimedia.org/r/393997 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I4890f19f29827d8858248044322a1266871ad8be Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port prefer_recent_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/393693 ) Change subject: Port prefer_recent_api.feature to nodejs .. Port prefer_recent_api.feature to nodejs * Remove @expect_failure tag from scenario outlines * Drop test in final scenario with settings `.4,.0001`. It doesn't pass, and not really sure why or what it's supposed to do. * Tune down pause between first and second stage of the hook from 20s to 5s. At least locally this seems to still work. * Drop final pause in hook and replace with deletes at top of the hook. Best i can tell the pause was to ensure the final edit made it into elasticsearch. We can check the edit, but since we dont check revision ids we need to pre-delete so the check actually waits. Change-Id: I7fbf7b9945f71b0e46a769ec5b2ebec6f338af14 --- A tests/integration/features/prefer_recent_api.feature M tests/integration/features/support/hooks.js 2 files changed, 38 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/93/393693/1 diff --git a/tests/integration/features/prefer_recent_api.feature b/tests/integration/features/prefer_recent_api.feature new file mode 100644 index 000..553b026 --- /dev/null +++ b/tests/integration/features/prefer_recent_api.feature @@ -0,0 +1,24 @@ +@clean @api @prefer_recent +Feature: Searches with prefer-recent + Scenario Outline: Recently updated articles are prefered if prefer-recent: is specified +When I api search for PreferRecent First OR Second OR Third +Then PreferRecent Second Second is the first api search result +When I api search for prefer-recent: PreferRecent First OR Second OR Third +Then PreferRecent Third is the first api search result + Examples: +| options | +| 1,.001 | +| 1,0.001 | +| 1,.0001 | +| .99,.0001 | +| .99,.001| + + Scenario Outline: You can specify prefer-recent: in such a way that being super recent isn't enough +When I api search for prefer-recent: PreferRecent First OR Second OR Third +Then PreferRecent Second Second is the first api search result + Examples: +| options | +| | +| 1 | +| 1,1 | +| 1,.1 | diff --git a/tests/integration/features/support/hooks.js b/tests/integration/features/support/hooks.js index 2c78666..ea9ad93 100644 --- a/tests/integration/features/support/hooks.js +++ b/tests/integration/features/support/hooks.js @@ -399,6 +399,17 @@ } ) ); BeforeOnce( { tags: "@prefer_recent", timeout: 6 }, Promise.coroutine( function* () { + // Deleting the pages first ensures we actually wait around for the edits to + // make it into the DB. Better might be if runBatch() could wait for revision id's, + // but it doesn't (yet). + yield runBatch( this, false, { + delete: [ + 'PreferRecent First', + 'PreferRecent Second Second', + 'PreferRecent Third', + ] + } ); + yield runBatch( this, false, { edit: { // Using epochs as content ensures the page is edited. @@ -407,17 +418,15 @@ } } ); - // We need to wait around to ensure the next page has enough time difference - // for prefer-recent to reorder things - yield this.stepHelpers.waitForMs( 2 ); + // We need to wait around to ensure the next page has enough time + // difference for prefer-recent to reorder things. + yield this.stepHelpers.waitForMs( 5000 ); yield runBatch( this, false, { edit: { 'PreferRecent Third': "" + ( new Date() / 1 ) } } ); - // TODO: Why are we waiting here? 
- yield this.stepHelpers.waitForMs( 1 ); } ) ); BeforeOnce( { tags: "@hastemplate" }, runBatchFn( { -- To view, visit https://gerrit.wikimedia.org/r/393693 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7fbf7b9945f71b0e46a769ec5b2ebec6f338af14 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port update_general_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/393694 ) Change subject: Port update_general_api.feature to nodejs .. Port update_general_api.feature to nodejs * Rework cirrusdoc api query to source page ids from archive and page table, whichever is more recent. This allows figuring out when deletes have made it into elasticsearch. * Rewrite all the 'within ...' clauses to direct query/check after waiting for the previous operation to go through * The only operation we can't directly wait for is the template update, which for unrelated reasons is broken on my MWV so commented out * The archive search is only exposed via browser, so its test uses the browser. As Special:Undelete requires special rights this meant rigging up a login method for the browser. * Changed baseurl from dev.wiki -> cirrustest.wiki. This probably needs to be handled more generically though to support browser with multiple wikis. * Adjust waitForOperation to take a revision id, and make step helpers editPage method pass the new revision id into waitForOperation. Without this an edit to page that already exists is not waited for and fails. * Implement step helpers movePage(). Waiting for the move to make it into cirrus required adding an additional check to the cirrusdoc query that the requested page matches the elastic page. This probably still has issues if a redirect points to the moved page, but we don't test that. * Support %{epoch} transformation in steps. This required normalizing all parameters that should support this to (.+), removing uses of (.*) as cucumber-js doesn't have a generic transformation step, only one on individual capture patterns. Change-Id: I99c0ef1e3453fedea5f3afbe29e5e8f9dd73d7e4 --- M includes/Api/QueryCirrusDoc.php M tests/integration/config/wdio.conf.js M tests/integration/features/step_definitions/page_step_helpers.js M tests/integration/features/step_definitions/page_steps.js M tests/integration/features/support/hooks.js M tests/integration/features/support/pages/page.js A tests/integration/features/support/pages/special_undelete.js M tests/integration/features/support/world.js A tests/integration/features/update_general_api.feature 9 files changed, 373 insertions(+), 66 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/94/393694/1 diff --git a/includes/Api/QueryCirrusDoc.php b/includes/Api/QueryCirrusDoc.php index 27e2216..e03ee1c 100644 --- a/includes/Api/QueryCirrusDoc.php +++ b/includes/Api/QueryCirrusDoc.php @@ -3,7 +3,7 @@ namespace CirrusSearch\Api; use CirrusSearch\Searcher; -use CirrusSearch\Updater; +use PageArchive; use Title; /** @@ -31,51 +31,139 @@ class QueryCirrusDoc extends \ApiQueryBase { use ApiTrait; + private $config; + private $searcher; + public function __construct( \ApiQuery $query, $moduleName ) { parent::__construct( $query, $moduleName, 'cd' ); } public function execute() { $conn = $this->getCirrusConnection(); - $config = $this->getSearchConfig(); - $updater = new Updater( $conn, $config ); - $searcher = new Searcher( $conn, 0, 0, $config, [], $this->getUser() ); - $result = []; + $this->config = $this->getSearchConfig(); + $this->searcher = new Searcher( $conn, 0, 0, $this->config, [], $this->getUser() ); foreach ( $this->getPageSet()->getGoodTitles() as $origPageId => $title ) { - list( $page, $redirects ) = $updater->traceRedirects( $title ); - - $result = []; - if ( $page ) { - $docId = $config->makeId( $page->getId() ); - // could be optimized by implementing 
multi-get but not - // expecting much usage except debugging/tests. - $esSources = $searcher->get( [ $docId ], true ); - if ( $esSources->isOK() ) { - foreach ( $esSources->getValue() as $i => $esSource ) { - // If we have followed redirects only report the - // article dump if the redirect has been indexed. If it - // hasn't been indexed this document does not represent - // the original title. - if ( count( $redirects ) && - !$this->hasRedirect( $esSource->getData(), $title ) - ) { - continue; -
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port frozen_index_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392547 ) Change subject: port frozen_index_api.feature to nodejs .. port frozen_index_api.feature to nodejs * Deleted test marked @expect_failure * Converted `within` to plain search/check steps * Had to add 3 second pauses for that to work. Not sure why :S * Implemented missing steps Change-Id: Ib93a3859334920a0363e1498b124c857c2632d24 --- A tests/integration/features/frozen_index_api.feature M tests/integration/features/step_definitions/page_step_helpers.js M tests/integration/features/step_definitions/page_steps.js 3 files changed, 76 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/47/392547/1 diff --git a/tests/integration/features/frozen_index_api.feature b/tests/integration/features/frozen_index_api.feature new file mode 100644 index 000..53657a1 --- /dev/null +++ b/tests/integration/features/frozen_index_api.feature @@ -0,0 +1,31 @@ +@frozen +Feature: Mutations to frozen indexes are properly delayed + Scenario: Updates to frozen indexes are delayed + Given I delete FrozenTest + And a page named FrozenTest exists with contents foobarbaz + And I wait 3 seconds + And I api search for foobarbaz + And FrozenTest is the first api search result + And I globally freeze indexing + And a page named FrozenTest exists with contents superduperfrozen + And I wait 10 seconds + And I api search for superduperfrozen + And FrozenTest is not in the api search results +When I globally thaw indexing + And I wait 10 seconds +Then I api search for superduperfrozen yields FrozenTest as the first result + + Scenario: Deletes to frozen indexes are delayed + Given a page named FrozenDeleteTest exists with contents bazbarfoo + And I wait 3 seconds + And I api search for bazbarfoo + And FrozenDeleteTest is the first api search result + And I globally freeze indexing + And I delete FrozenDeleteTest + And a page named FrozenDeleteTest exists with contents mrfreeze recreated this page to work around mediawiki's behavior of not showing deleted pages in search results. mrfreeze is surprisingly helpful. 
+ And I wait 10 seconds + And I api search for bazbarfoo + And FrozenDeleteTest is the first api search result +When I globally thaw indexing + And I wait 10 seconds +Then I api search for bazbarfoo yields no results diff --git a/tests/integration/features/step_definitions/page_step_helpers.js b/tests/integration/features/step_definitions/page_step_helpers.js index ca4a92b..07c1c71 100644 --- a/tests/integration/features/step_definitions/page_step_helpers.js +++ b/tests/integration/features/step_definitions/page_step_helpers.js @@ -13,7 +13,8 @@ const expect = require( 'chai' ).expect, fs = require( 'fs' ), path = require( 'path' ), - Promise = require( 'bluebird' ); // jshint ignore:line + Promise = require( 'bluebird' ), // jshint ignore:line + articlePath = path.dirname(path.dirname(path.dirname(__dirname))) + '/browser/articles/'; class StepHelpers { constructor( world, wiki ) { @@ -39,12 +40,23 @@ } ); } + uploadFile( title, fileName, description ) { + return Promise.coroutine( function* () { + let client = yield this.apiPromise; + let filePath = path.join( articlePath, fileName ); + yield client.batch( [ + [ 'upload', fileName, filePath, '', { text: description } ] + ] ); + yield this.waitForOperation( 'upload', fileName ); + } ).call( this ); + } + editPage( title, text, append = false ) { return Promise.coroutine( function* () { let client = yield this.apiPromise; if ( text[0] === '@' ) { - text = fs.readFileSync( path.join( __dirname, 'articles', text.substr( 1 ) ) ).toString(); + text = fs.readFileSync( path.join( articlePath, text.substr( 1 ) ) ).toString(); } let fetchedText = yield this.getWikitext( title ); if ( append ) { diff --git a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index d7881db..f5349a7 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ b/tests/integration/features/step_definitions/page_steps.js @@ -309,4 +309,35 @@ this.searchVars[varname] = yield this.stepHelpers.pageIdOf( title ); } ).call( this ); } ); + + Then( /^I wait (\d+) seconds/, function ( seconds )
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port relevancy_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392546 ) Change subject: port relevancy_api.feature to nodejs .. port relevancy_api.feature to nodejs * Removed 'within' steps. These are unnecessary now that we wait for pages to exist in cirrus after edits * Removed one test marked @expect_failure Change-Id: I1804bd4f13c110f960b8b7b04552beecb21658b8 --- A tests/integration/features/relevancy_api.feature 1 file changed, 104 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/46/392546/1 diff --git a/tests/integration/features/relevancy_api.feature b/tests/integration/features/relevancy_api.feature new file mode 100644 index 000..4f40a5a --- /dev/null +++ b/tests/integration/features/relevancy_api.feature @@ -0,0 +1,104 @@ +@clean @api @relevancy +Feature: Results are ordered from most relevant to least. + Scenario: Words in order are worth more then words out of order +When I api search for Relevancytwo Wordtest +Then Relevancytwo Wordtest is the first api search result + And Wordtest Relevancytwo is the second api search result + + Scenario: Results are sorted based on namespace: main, talk, file, help, file talk, etc +When I api search for all:Relevancynamespacetest +Then Relevancynamespacetest is the first api search result + And Talk:Relevancynamespacetest is the second api search result + And File:Relevancynamespacetest is the third api search result + And Help:Relevancynamespacetest is the fourth api search result + And File talk:Relevancynamespacetest is the fifth api search result + And User talk:Relevancynamespacetest is the sixth api search result + And Template:Relevancynamespacetest is the seventh api search result + + Scenario: When the user doesn't set a language are sorted with wiki language ahead of other languages +When I api search for Relevancylanguagetest +Then Relevancylanguagetest/en is the first api search result + + Scenario: Redirects count as incoming links +Given a page named Relevancyredirecttest Smaller exists with contents Relevancyredirecttest A text text text text text text text text text text text text text + And a page named Relevancyredirecttest Smaller/A exists with contents [[Relevancyredirecttest Smaller]] + And a page named Relevancyredirecttest Smaller/B exists with contents [[Relevancyredirecttest Smaller]] + And a page named Relevancyredirecttest Larger exists with contents Relevancyredirecttest B text text text text text text text text text text text text text + And a page named Relevancyredirecttest Larger/Redirect exists with contents #REDIRECT [[Relevancyredirecttest Larger]] + And a page named Relevancyredirecttest Larger/A exists with contents [[Relevancyredirecttest Larger]] + And a page named Relevancyredirecttest Larger/B exists with contents [[Relevancyredirecttest Larger/Redirect]] + And a page named Relevancyredirecttest Larger/C exists with contents [[Relevancyredirecttest Larger/Redirect]] + And I api search for Relevancyredirecttest + Then Relevancyredirecttest Larger is the first api search result + And Relevancyredirecttest Smaller is the second api search result +# Note that this test can fail spuriously in two ways: +# 1. If the required pages are created as part of the hook for @relevancy its quite possible for the large influx +# of jobs to cause the counting jobs to not pick up all the counts. I'm not super sure why that is but moving the +# creation into its own section makes it pretty consistent. +# 2. 
Its quite possible for the second result to be deeper in the result list for a few seconds after the pages are +# created. It gets its position updated by the link counting job which has to wait for refreshing and undelaying. + + # Last two tests use "sixth or seventh" because the current implementation of the all field + # and the copy_to hack will copy the content only one time for both text and auxiliary_text + # auxiliary_text is set to 0.5 but will be approximated to 1 (similar to text) + # phrase freq will be identical for both fields making length norms the sole discriminating + # criteria. + Scenario: Results are sorted based on what part of the page matches: title, redirect, category, etc +When I api search with query independent profile classic_noboostlinks for "Relevancytestphrase phrase" +Then Relevancytestphrase phrase is the first api search result + And Relevancytestphraseviaredirect is the second api search result + And Relevancytestphraseviacategory is the third api search result + And Relevancytestphraseviaheading is the fourth api search result + And Relevancytestphraseviaopening is the fifth api search result + And Relevancytestphraseviatext
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port linksto and more_like tests to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392538 ) Change subject: Port linksto and more_like tests to nodejs .. Port linksto and more_like tests to nodejs Change-Id: I9edd45add51bfad56dd87650de520bfeb08b9a20 --- A tests/integration/features/linksto_api.feature A tests/integration/features/more_like_api.feature M tests/integration/features/step_definitions/page_steps.js M tests/integration/features/support/hooks.js 4 files changed, 104 insertions(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/38/392538/1 diff --git a/tests/integration/features/linksto_api.feature b/tests/integration/features/linksto_api.feature new file mode 100644 index 000..2503cc0 --- /dev/null +++ b/tests/integration/features/linksto_api.feature @@ -0,0 +1,23 @@ +@clean @filters @linksto @api +Feature: Searches with the linksto filter + Scenario: linksto only includes pages with the links +When I api search for linksto:"LinksToTest Target" +Then LinksToTest Plain is in the api search results + And LinksToTest OtherText is in the api search results + + Scenario: linksto can be combined with other text +When I api search for linksto:"LinksToTest Target" text +Then LinksToTest OtherText is the first api search result + + Scenario: -linksto excludes pages with the link +When I api search for -linksto:"LinksToTest Target" LinksToTest +Then LinksToTest No Link is in the api search results + But LinksToTest Plain is not in the api search results + + Scenario: linksto works on links from templates +When I api search for linksto:"LinksToTest Target" Using Template +Then LinksToTest Using Template is the first api search result + + Scenario: linksto finds links in non-main namespace +When I api search for linksto:"Template:LinksToTest Template" +Then LinksToTest LinksToTemplate is the first api search result diff --git a/tests/integration/features/more_like_api.feature b/tests/integration/features/more_like_api.feature new file mode 100644 index 000..c04a321 --- /dev/null +++ b/tests/integration/features/more_like_api.feature @@ -0,0 +1,29 @@ +@clean @more_like_this @api +Feature: More like an article + Scenario: Searching for morelike: returns no results +When I api search for morelike:IDontExist +Then there are no api search results + + Scenario: Searching for morelike: returns pages that are "like" that page +When I api search for morelike:More Like Me 1 +Then More Like Me is in the first api search result + But More Like Me 1 is not in the api search results + + Scenario: Searching for morelike: returns pages that are "like" the page that it is a redirect to +When I api search for morelike:More Like Me Rdir +Then More Like Me is in the first api search result + But More Like Me 1 is not in the api search results + + @redirect_loop + Scenario: Searching for morelike: returns no results +When I api search for morelike:Redirect Loop +Then there are no api search results + + Scenario: Searching for morelike:|| returns pages that are "like" all those pages +When I api search for morelike:More Like Me 1|More Like Me Set 2 Page 1|More Like Me Set 3 Page 1 +Then More Like Me is part of the api search result + And More Like Me Set 2 is part of the api search result + And More Like Me Set 3 is part of the api search result + But More Like Me 1 is not in the api search results + And More Like Me Set 2 Page 1 is not in the api search results + And More Like Me Set 3 Page 1 is not in the api search results diff --git 
a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index 6763049..d7881db 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ b/tests/integration/features/step_definitions/page_steps.js @@ -147,9 +147,12 @@ } } ); if ( in_ok ) { - // What exactly does this do? - // expect(found).to include(include(title)) - throw new Error( 'Not Implemented' ); + // Asserts that title is found within the strings that make up found. + // ex: found = ['foo bar baz'], title = 'bar' should pass. + // Chai doesnt (yet) have a native assertion for this: + // https://github.com/chaijs/chai/issues/858 + let ok = found.reduce( ( a, b ) => a || b.indexOf( title ) > -1, false ); + expect( ok, `expected ${JSON.stringify(found)} to include "${title}"` ).to.be.true; // jshint ignore:line
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Fixup unicode literals in feature files
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392537 ) Change subject: Fixup unicode literals in feature files .. Fixup unicode literals in feature files And add all the tests that weren't passing because of it. Change-Id: Ie371e2fca42cc3298cf06ccb5b29f71af5af108f --- A tests/integration/features/incategory_api.feature A tests/integration/features/insource_api.feature A tests/integration/features/intitle_api.feature A tests/integration/features/phrase_prefix_api.feature M tests/integration/features/step_definitions/page_step_helpers.js M tests/integration/features/step_definitions/page_steps.js 6 files changed, 297 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/37/392537/1 diff --git a/tests/integration/features/incategory_api.feature b/tests/integration/features/incategory_api.feature new file mode 100644 index 000..aa31bfe --- /dev/null +++ b/tests/integration/features/incategory_api.feature @@ -0,0 +1,86 @@ +@clean @filters @incategory @api +Feature: Searches with the incategory filter + + Scenario: incategory: only includes pages with the category +When I api search for incategory:weaponry +Then Catapult is in the api search results + And Amazing Catapult is in the api search results + But Two Words is not in the api search results + + Scenario: incategory: splits on | to create an OR query +When I api search for incategory:weaponry|nothing +Then Catapult is in the api search results + And Amazing Catapult is in the api search results + But Two Words is not in the api search results + + Scenario Outline: incategory: does not fail when the category is unknown +When I api search for incategory:<category> +Then there are no api search results + Examples: +| category | +| doesnotexistatleastihopenot | +| id:2147483600 | + + Scenario: incategory: finds categories by page id +When I locate the page id of Category:Weaponry and store it as %weaponry_id% + And I api search for incategory:id:%weaponry_id% +Then Catapult is in the api search results + And Amazing Catapult is in the api search results + But Two Words is not in the api search results + + Scenario: incategory: works on categories from templates +When I api search for incategory:templatetagged incategory:twowords +Then Two Words is the first api search result + + Scenario: incategory works with multi word categories +When I api search for incategory:"Categorywith Twowords" +Then Two Words is the first api search result + + Scenario: incategory can find categories containing quotes if the quote is escaped +When I api search for incategory:"Categorywith \" Quote" +Then Two Words is the first api search result + + Scenario: incategory can be repeated +When I api search for incategory:"Categorywith \" Quote" incategory:"Categorywith Twowords" +Then Two Words is the first api search result + + Scenario: incategory works with can find two word categories with spaces +When I api search for incategory:Categorywith_Twowords +Then Two Words is the first api search result + + Scenario: incategory: when passed a quoted category that doesn't exist finds nothing even though there is a category that matches one of the words +When I api search for incategory:"Dontfindme Weaponry" +Then there are no api search results + + Scenario: incategory when passed a single word category doesn't find a two word category that contains that word +When I api search for incategory:ASpace +Then there are no api search results + + Scenario: incategory: finds a multiword category when it is surrounded by quotes +When I api search for incategory:"CategoryWith ASpace" +Then IHaveATwoWordCategory is the first api search result + + Scenario: incategory: can be combined with other text +When I api search for incategory:weaponry amazing +Then Amazing Catapult is the first api search result + + Scenario: -incategory: excludes pages with the category +When I api search for -incategory:weaponry incategory:twowords +Then Two Words is the first api search result + + Scenario: incategory: can handle a space after the : +When I api search for incategory: weaponry +Then Catapult is in the api search results + And Amazing Catapult is in the api search results + But Two Words is not in the api search results + + Scenario Outline: incategory: can handle multiple spaces between clauses +When I api search for incategory:weaponry<spaces>incategory:weaponry +Then Catapult is in the api search results + And Amazing Catapult is in the api search results + And Two Words is not in the api search results + Examples: +| spaces | +|%{\u0020}%%{\u0020}%| +
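The incategory scenarios above all go through the standard MediaWiki search API (action=query&list=search) rather than the UI. As a rough illustration of what an "I api search for ..." step boils down to, here is a small Python sketch; the wiki URL is an assumption for illustration and is not part of this change:

import requests

API_URL = "http://localhost:8080/w/api.php"  # assumed test wiki, not from the patch

def api_search(query):
    # list=search is the same module the feature steps exercise
    r = requests.get(API_URL, params={
        "action": "query",
        "list": "search",
        "srsearch": query,
        "format": "json",
    })
    r.raise_for_status()
    return [hit["title"] for hit in r.json()["query"]["search"]]

# First scenario: pages in the category match, unrelated pages do not
titles = api_search("incategory:weaponry")
assert "Catapult" in titles
assert "Two Words" not in titles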
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: [WIP] Add word count statistic for articles
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392471 ) Change subject: [WIP] Add word count statistic for articles .. [WIP] Add word count statistic for articles The community survey asked for this feature, and it was pretty straightforward to add to cirrus. Change-Id: I847f696405b447ab04972ad0215c09d0012c2098 --- M CirrusSearch.php M autoload.php M includes/CirrusSearch.php M includes/Hooks.php A includes/Query/CountContentWordsBuilder.php M includes/Search/ResultsType.php M includes/Search/SearchContext.php M includes/Search/SearchRequestBuilder.php M includes/Searcher.php 9 files changed, 131 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/71/392471/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index b7a8682..094d656 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -1302,6 +1302,7 @@ $wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearch\Hooks::onSoftwareInfo'; $wgHooks[ 'SpecialSearchResults' ][] = 'CirrusSearch\Hooks::onSpecialSearchResults'; $wgHooks[ 'SpecialSearchResultsAppend' ][] = 'CirrusSearch\Hooks::onSpecialSearchResultsAppend'; +$wgHooks[ 'SpecialStatsAddExtra'][] = 'CirrusSearch\Hooks::onSpecialStatsAddExtra'; $wgHooks[ 'TitleMove' ][] = 'CirrusSearch\Hooks::onTitleMove'; $wgHooks[ 'TitleMoveComplete' ][] = 'CirrusSearch\Hooks::onTitleMoveComplete'; $wgHooks[ 'UnitTestsList' ][] = 'CirrusSearch\Hooks::onUnitTestsList'; diff --git a/autoload.php b/autoload.php index 094cff9..de0770f 100644 --- a/autoload.php +++ b/autoload.php @@ -115,6 +115,7 @@ 'CirrusSearch\\Query\\BoostTemplatesFeature' => __DIR__ . '/includes/Query/BoostTemplatesFeature.php', 'CirrusSearch\\Query\\CompSuggestQueryBuilder' => __DIR__ . '/includes/Query/CompSuggestQueryBuilder.php', 'CirrusSearch\\Query\\ContentModelFeature' => __DIR__ . '/includes/Query/ContentModelFeature.php', + 'CirrusSearch\\Query\\CountContentWordsBuilder' => __DIR__ . '/includes/Query/CountContentWordsBuilder.php', 'CirrusSearch\\Query\\FileNumericFeature' => __DIR__ . '/includes/Query/FileNumericFeature.php', 'CirrusSearch\\Query\\FileTypeFeature' => __DIR__ . '/includes/Query/FileTypeFeature.php', 'CirrusSearch\\Query\\FullTextQueryBuilder' => __DIR__ . '/includes/Query/FullTextQueryBuilder.php', @@ -195,6 +196,7 @@ 'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ . '/includes/Search/SearchMetricsProvider.php', 'CirrusSearch\\Search\\SearchRequestBuilder' => __DIR__ . '/includes/Search/SearchRequestBuilder.php', 'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ . '/includes/Search/ShortTextIndexField.php', + 'CirrusSearch\\Search\\SingleAggResultsType' => __DIR__ . '/includes/Search/ResultsType.php', 'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ . '/includes/Search/SourceTextIndexField.php', 'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ . '/includes/Search/CrossProjectBlockScorer.php', 'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ . 
'/includes/Search/TeamDraftInterleaver.php', diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php index 089589a..52b8e3f 100644 --- a/includes/CirrusSearch.php +++ b/includes/CirrusSearch.php @@ -798,10 +798,7 @@ return Status::newGood( [] ); } - $searcher = new Searcher( $this->connection, $this->offset, $this->limit, $this->config, $this->namespaces, - null, $this->indexBaseName ); - $searcher->setOptionsFromRequest( $this->request ); - + $searcher = $this->makeSearcher(); $status = $searcher->searchArchive( $term ); if ( $status->isOK() && $searcher->isReturnRaw() ) { $status->setResult( true, @@ -810,4 +807,22 @@ return $status; } + public function countContentWords() { + $this->limit = 1; + $searcher = $this->makeSearcher(); + $status = $searcher->countContentWords(); + + if ( $status->isOK() && $searcher->isReturnRaw() ) { + $status->setResult( true, + $searcher->processRawReturn( $status->getValue(), $this->request, $this->dumpAndDie ) ); + } + return $status; + } + + private function makeSearcher() { + $searcher = new Searcher( $this->connection, $this->offset, $this->limit, $this->config, $this->namespaces, + null, $this->indexBaseName ); + $searcher->setOptionsFromRequest( $this->request ); + return $searcher; + } } diff --git a/includes/Hooks.php b/includes/Hooks.php index 44cbc9d..8b783f8 100644 ---
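The message is cut off before the new CountContentWordsBuilder and SingleAggResultsType, but the underlying idea is a single sum aggregation over a per-document word count stored in the search index, returned without any hits. A hedged sketch of what the equivalent raw Elasticsearch request could look like (the index name and the text.word_count field are assumptions for illustration, not taken from this patch):

import requests

ES_URL = "http://localhost:9200"  # assumed local Elasticsearch

resp = requests.post(ES_URL + "/wiki_content/_search", json={
    "size": 0,  # no hits needed, only the aggregation
    "aggs": {
        "word_count": {"sum": {"field": "text.word_count"}},
    },
}).json()

print(int(resp["aggregations"]["word_count"]["value"]))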
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Handle errors better in the tag tracker
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392472 ) Change subject: Handle errors better in the tag tracker .. Handle errors better in the tag tracker When working on tests, if you broke a tag all future uses of the tag would just wait until cucumber timed them out, which is very painful. Rework the tracking so it remembers failures. Change-Id: I236780e30cab37884a569f0c6d27d11751fc4ee6 --- M tests/integration/features/support/hooks.js M tests/integration/features/support/world.js M tests/integration/lib/tracker.js 3 files changed, 46 insertions(+), 26 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/72/392472/1 diff --git a/tests/integration/features/support/hooks.js b/tests/integration/features/support/hooks.js index 19fa8d0..45f18c1 100644 --- a/tests/integration/features/support/hooks.js +++ b/tests/integration/features/support/hooks.js @@ -12,9 +12,20 @@ const BeforeOnce = function ( options, fn ) { Before( options, Promise.coroutine( function* () { const status = yield this.tags.check( options.tags ); - if ( status === 'new' ) { - yield fn.call( this ); + if ( status === 'complete' ) { + return; + } else if ( status === 'new' ) { + try { + yield fn.call( this ); + } catch ( err ) { + yield this.tags.reject( options.tags ); + return; + } yield this.tags.complete( options.tags ); + } else if ( status === 'reject' ) { + throw new Error( 'Tag failed to initialize previously' ); + } else { + throw new Error( 'Unknown tag check status: ' + status ); } } ) ); }; diff --git a/tests/integration/features/support/world.js b/tests/integration/features/support/world.js index 7e8c9c2..9df4f45 100644 --- a/tests/integration/features/support/world.js +++ b/tests/integration/features/support/world.js @@ -12,6 +12,7 @@ */ const {defineSupportCode} = require( 'cucumber' ), net = require( 'net' ), + log = require( 'semlog' ).log, Bot = require( 'mwbot' ), StepHelpers = require( '../step_definitions/page_step_helpers' ), Page = require( './pages/page' ), @@ -29,7 +30,7 @@ this.pendingResponses = {}; this.connection.on( 'data', ( data ) => { let parsed = JSON.parse( data ); - console.log( `received response for request ${parsed.requestId}: ${data}` ); + log( `received response for request ${parsed.requestId}: ${data}` ); if ( parsed && this.pendingResponses[parsed.requestId] ) { this.pendingResponses[parsed.requestId]( parsed ); delete this.pendingResponses[parsed.requestId]; @@ -41,7 +42,7 @@ req.requestId = this.nextRequestId++; return new Promise( ( resolve ) => { let data = JSON.stringify( req ); - console.log( `Issuing request: ${data}` ); + log( `Issuing request: ${data}` ); this.pendingResponses[req.requestId] = resolve; this.connection.write( data ); } ); @@ -50,17 +51,27 @@ check( tag ) { return Promise.coroutine( function* () { if ( this.tags[tag] ) { - return 'complete'; + return this.tags[tag]; } let response = yield this.request( { check: tag } ); - this.tags[tag] = true; + if ( response.status === 'complete' || response.status === 'reject' ) { + this.tags[tag] = response.status; + } return response.status; } ).call( this ); } + reject( tag ) { + this.tags[tag] = 'reject'; + return this.request( { + reject: tag + } ); + } + + complete( tag ) { + this.tags[tag] = 'complete'; return this.request( { complete: tag } ); @@ -157,10 +168,10 @@ if ( !tmpUrl ) {
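The BeforeOnce rework above is easiest to read as a small state machine: a tag starts out 'new', the first scenario to claim it runs the setup and records either 'complete' or 'reject', and every later scenario reuses that result instead of waiting for cucumber's timeout. A minimal Python sketch of the same idea (the names here are illustrative, not part of the patch):

class TagTracker:
    """Run a tag's setup once and remember failures so later uses fail fast."""

    def __init__(self):
        self._status = {}  # tag -> 'complete' or 'reject'

    def run_once(self, tag, setup):
        status = self._status.get(tag, 'new')
        if status == 'complete':
            return  # already initialized by an earlier scenario
        if status == 'reject':
            raise RuntimeError('Tag failed to initialize previously: ' + tag)
        try:
            setup()  # only the first caller pays for the expensive setup
        except Exception:
            self._status[tag] = 'reject'
            raise
        self._status[tag] = 'complete'


tracker = TagTracker()
tracker.run_once('@suggest', lambda: None)  # stand-in for building the suggester index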
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add API action for dumping cirrus articles
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392469 ) Change subject: Add API action for dumping cirrus articles .. Add API action for dumping cirrus articles This is particularly convenient for the browser tests to use, so they can ping the api to see if an article that it created/updated is now in cirrussearch. Change-Id: I5dbd02592eebb166362c7cb9dabcd2b93bae66c5 --- M CirrusSearch.php M autoload.php A includes/Api/ArticleDump.php M includes/Connection.php M tests/integration/features/step_definitions/page_step_helpers.js 5 files changed, 133 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/69/392469/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index bf53382..b7a8682 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -1336,6 +1336,7 @@ $wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump'; $wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump'; $wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump'; +$wgAPIModules['cirrus-article-dump'] = 'CirrusSearch\Api\ArticleDump'; /** * Configs diff --git a/autoload.php b/autoload.php index 1cb17de..094cff9 100644 --- a/autoload.php +++ b/autoload.php @@ -6,6 +6,7 @@ $wgAutoloadClasses += [ 'CirrusSearch' => __DIR__ . '/includes/CirrusSearch.php', 'CirrusSearch\\Api\\ApiBase' => __DIR__ . '/includes/Api/ApiBase.php', + 'CirrusSearch\\Api\\ArticleDump' => __DIR__ . '/includes/Api/ArticleDump.php', 'CirrusSearch\\Api\\ConfigDump' => __DIR__ . '/includes/Api/ConfigDump.php', 'CirrusSearch\\Api\\FreezeWritesToCluster' => __DIR__ . '/includes/Api/FreezeWritesToCluster.php', 'CirrusSearch\\Api\\MappingDump' => __DIR__ . '/includes/Api/MappingDump.php', diff --git a/includes/Api/ArticleDump.php b/includes/Api/ArticleDump.php new file mode 100644 index 000..5840eb9 --- /dev/null +++ b/includes/Api/ArticleDump.php @@ -0,0 +1,93 @@ +http://www.gnu.org/copyleft/gpl.html + */ +class ArticleDump extends ApiBase { + public function execute() { + $conn = $this->getCirrusConnection(); + $config = $conn->getConfig(); + $searcher = new Searcher( $conn, 0, 0, $config, [], $this->getUser() ); + + $params = $this->extractRequestParams(); + $title = Title::newFromText( $params['title'] ); + if ( !$title->exists() ) { + $this->dieWithError( 'apierror-missingtitle' ); + } + + // Reuse updater to find the final target post-redirect + $updater = new Updater( $conn, $config ); + list( $page, $redirects ) = $updater->traceRedirects( $title ); + + if ( !$page ) { + // Slight lie .. the title itself exists but not the redirect target. + // Use custom error message? 
+ $this->dieWithError( 'apierror-missingtitle' ); + } + + $docId = $config->makeId( $page->getId() ); + $esSources = $searcher->get( [ $docId ], true ); + $result = []; + if ( $esSources->isOK() ) { + foreach ( $esSources->getValue() as $i => $esSource ) { + $result[] = [ + 'index' => $esSource->getIndex(), + 'type' => $esSource->getType(), + 'id' => $esSource->getId(), + 'version' => $esSource->getVersion(), + 'source' => $esSource->getData(), + ]; + } + } + $this->getResult()->addValue( null, 'cirrus-article-dump', $result ); + } + + public function getAllowedParams() { + return [ + 'title' => [ + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_REQUIRED => true, + ], + ]; + } + + /** +* @deprecated since MediaWiki core 1.25 +*/ + public function getDescription() { + return 'Dump stored CirrusSearch document for article.'; + } + + /** +* @see ApiBase::getExamplesMessages +* @return array +*/ + protected function getExamplesMessages() { + return [ + 'action=cirrus-article-dump' => + 'apihelp-cirrus-article-dump-example' + ]; + } + +} diff --git a/includes/Connection.php b/includes/Connection.php index a17ce88..8e2e389 100644 --- a/includes/Connection.php +++ b/includes/Connection.php
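A quick way to see what the new module returns is to call it directly. A small Python sketch (the wiki URL and page title are assumptions; the module name and its required 'title' parameter come from the patch above):

import requests

API_URL = "http://localhost:8080/w/api.php"  # assumed local MediaWiki install

resp = requests.get(API_URL, params={
    "action": "cirrus-article-dump",
    "title": "Main Page",
    "format": "json",
}).json()

# One entry per index the document lives in, mirroring the $result array above
for doc in resp.get("cirrus-article-dump", []):
    print(doc["index"], doc["type"], doc["id"], doc["version"])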
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Get all nodejs tests passing from empty database
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392470 ) Change subject: Get all nodejs tests passing from empty database .. Get all nodejs tests passing from empty database * Use the new cirrus-article-dump api to wait for edits to make it into elastic. Failures from an empty database seem almost entirely tied to tests running before the articles have made it into cirrus. * Convert one-off batch calls in hooks.js to use a single function so we don't duplicate checking the batch has made it into elastic * While at it reduce some promise spaghetti by converting things over to bluebird coroutines. If we require nodejs >= 7.6 we could use async/await directly, but coroutines allow us to support node 6 which is default on many distributions. * Swap config over to headless while we are here. * Put the @suggest hook, that builds the completion suggester, at the end of the hooks file. Cucumberjs seems to run these hooks in the order they are defined, so this ensures all the other tags from prefix_search_api.feature have run already * Merge suggest_api.feature with prefix_search_api.feature, as they both use the @suggest tag. Change-Id: Ie2f3142d8af9036a6a6e473a2a7d2fd557abeaca --- M tests/integration/config/wdio.conf.js M tests/integration/features/prefix_search_api.feature M tests/integration/features/step_definitions/page_step_helpers.js M tests/integration/features/step_definitions/page_steps.js D tests/integration/features/suggest_api.feature M tests/integration/features/support/hooks.js M tests/integration/features/support/world.js 7 files changed, 363 insertions(+), 370 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/70/392470/1 diff --git a/tests/integration/config/wdio.conf.js b/tests/integration/config/wdio.conf.js index 179e302..2cd2fb5 100644 --- a/tests/integration/config/wdio.conf.js +++ b/tests/integration/config/wdio.conf.js @@ -122,7 +122,7 @@ browserName: 'chrome', // Since Chrome v57 https://bugs.chromium.org/p/chromedriver/issues/detail?id=1625 chromeOptions: { - args: [ '--enable-automation' ] + args: [ '--enable-automation', '--headless' ] } } ], // diff --git a/tests/integration/features/prefix_search_api.feature b/tests/integration/features/prefix_search_api.feature index e84e830..e524c1b 100644 --- a/tests/integration/features/prefix_search_api.feature +++ b/tests/integration/features/prefix_search_api.feature @@ -45,7 +45,6 @@ Scenario: Searching for a bare namespace finds everything in the namespace Given a page named Template talk:Foo exists - And within 20 seconds api searching for Template talk:Foo yields Template talk:Foo as the first result When I get api suggestions for template talk: Then Template talk:Foo is in the api suggestions @@ -155,3 +154,85 @@ # And there are 1000 redirects to IHaveTonsOfRedirects of the form TonsOfRedirects%s # When I type TonsOfRedirects into the search box # Then suggestions should appear + + Scenario: Search suggestions +When I ask suggestion API for main + Then the API should produce list containing Main Page + + Scenario: Created pages suggestions +When I ask suggestion API for x-m + Then the API should produce list containing X-Men + + Scenario: Nothing to suggest +When I ask suggestion API for jabberwocky + Then the API should produce empty list + + Scenario: Ordering +When I ask suggestion API for x-m + Then the API should produce list starting with X-Men + + Scenario: Fuzzy +When I ask suggestion API for xmen + Then the API should produce 
list starting with X-Men + + Scenario: Empty tokens +When I ask suggestion API for はー + Then the API should produce list starting with はーい + And I ask suggestion API for はい + Then the API should produce list starting with はーい + + Scenario Outline: Search redirects shows the best redirect +When I ask suggestion API for + Then the API should produce list containing + Examples: +| term |suggested | +| eise| Eisenhardt, Max | +| max | Max Eisenhardt| +| magnetu | Magneto | + + Scenario Outline: Search prefers exact match over fuzzy match and ascii folded +When I ask suggestion API for + Then the API should produce list starting with + Examples: +| term |suggested | +| max | Max Eisenhardt| +| mai | Main Page | +| eis | Eisenhardt, Max | +| ele | Elektra | +| éle | Électricité | + + Scenario Outline: Search prefers exact db match over partial prefix match +When I ask suggestion API at
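The key trick in this change is waiting, via the cirrus-article-dump module from the previous patch, for freshly created pages to actually reach Elasticsearch before any assertions run. The hooks do this with mwbot and bluebird coroutines; a hedged Python sketch of the same polling loop (the URL, timeout and interval are assumptions):

import time
import requests

API_URL = "http://localhost:8080/w/api.php"  # assumed local MediaWiki install

def wait_for_document(title, timeout=20, interval=1):
    # Poll until CirrusSearch has indexed `title`, or give up after `timeout` seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        resp = requests.get(API_URL, params={
            "action": "cirrus-article-dump",
            "title": title,
            "format": "json",
        }).json()
        if resp.get("cirrus-article-dump"):
            return resp["cirrus-article-dump"]
        time.sleep(interval)
    raise RuntimeError("%s never showed up in the search index" % title)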
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Speed up DBN evaluation.
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391728 ) Change subject: Speed up DBN evaluation. .. Speed up DBN evaluation. The toDF() call in dbn.py causes us to evaluate one partition on its own for spark to figure out what the field types are. Later spark will evaluate the other 199 partitions. In a test with a dataframe containing enwiki and dewiki a single partition can take up to 15 minutes. Avoid this by defining the schema explicitly instead of making spark figure it out. 15 minutes is also a long time for a single partition to run. Use a heuristic to increase the number of partitions from 200 up to 2000 when we have more data. In tests this patch cut the total dbn time from 23 minutes to 8. Change-Id: I14d663f49a54b7bd130186aebfbeffde1e1a6d82 --- M mjolnir/dbn.py M mjolnir/utilities/data_pipeline.py 2 files changed, 19 insertions(+), 6 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/28/391728/1 diff --git a/mjolnir/dbn.py b/mjolnir/dbn.py index 536064e..8c4ab76 100644 --- a/mjolnir/dbn.py +++ b/mjolnir/dbn.py @@ -9,6 +9,7 @@ import json import pyspark.sql from pyspark.sql import functions as F +from pyspark.sql import types as T import mjolnir.spark @@ -179,7 +180,7 @@ model.train(sessions) return _extract_labels_from_dbn(model, reader) -return ( +rdd_rel = ( df # group and collect up the hits for individual (wikiid, norm_query_id, # session_id) tuples to match how the dbn expects to receive data. @@ -192,7 +193,14 @@ # of grouping into python, but that could just as well end up worse? .repartition(num_partitions, 'wikiid', 'norm_query_id') # Run each partition through the DBN to generate relevance scores. -.rdd.mapPartitions(train_partition) -# Convert the rdd of tuples back into a DataFrame so the fields all -# have a name. -.toDF(['wikiid', 'norm_query_id', 'hit_page_id', 'relevance'])) +.rdd.mapPartitions(train_partition)) + +# Using toDF() is very slow as it has to run some of the partitions to check their +# types, and then run all the partitions later to get the actual data. To prevent +# running twice specify the schema we expect. +return df.sql_ctx.createDataFrame(rdd_rel, T.StructType([ +T.StructField('wikiid', T.StringType(), False), +T.StructField('norm_query_id', T.LongType(), False), +T.StructField('hit_page_id', T.LongType(), False), +T.StructField('relevance', T.DoubleType(), False) +])) diff --git a/mjolnir/utilities/data_pipeline.py b/mjolnir/utilities/data_pipeline.py index a5c37d1..c8e676f 100644 --- a/mjolnir/utilities/data_pipeline.py +++ b/mjolnir/utilities/data_pipeline.py @@ -85,9 +85,14 @@ print 'Fetched a total of %d samples for %d wikis' % (nb_samples, len(wikis)) df_norm.unpersist() +# Target around 125k rows per partition. Note that this isn't +# how many the dbn will see, because it gets collected up. Just +# a rough guess. 
+dbn_partitions = int(max(200, min(2000, nb_samples / 125000 ) )) + # Learn relevances df_rel = ( -mjolnir.dbn.train(df_sampled, { +mjolnir.dbn.train(df_sampled, num_partitions=dbn_partitions, dbn_config={ 'MAX_ITERATIONS': 40, 'DEBUG': False, 'PRETTY_LOG': True, -- To view, visit https://gerrit.wikimedia.org/r/391728 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I14d663f49a54b7bd130186aebfbeffde1e1a6d82 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
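The difference that matters here is between letting Spark infer an RDD's schema (toDF evaluates partitions just to work out the types, then the data is computed again for real) and handing createDataFrame an explicit StructType so the job only runs once. A minimal sketch of the pattern, assuming an existing SparkSession named spark:

from pyspark.sql import types as T

schema = T.StructType([
    T.StructField('wikiid', T.StringType(), False),
    T.StructField('norm_query_id', T.LongType(), False),
    T.StructField('hit_page_id', T.LongType(), False),
    T.StructField('relevance', T.DoubleType(), False),
])

# Stand-in for the RDD produced by mapPartitions(train_partition)
rdd_rel = spark.sparkContext.parallelize([
    ('enwiki', 1, 42, 0.7),
    ('enwiki', 1, 43, 0.2),
])

# rdd_rel.toDF([...]) would evaluate partitions up front to infer types;
# with an explicit schema the data is only computed once.
df = spark.createDataFrame(rdd_rel, schema)
df.show()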
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Repair ability to collect data for undersized wikis
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391729 ) Change subject: Repair ability to collect data for undersized wikis .. Repair ability to collect data for undersized wikis When attempting to collect data for small wikis that have much less than the provided samples_per_wiki in data_pipeline.py we would fail, because the collected data was much less than expected. Rework this code to allow for wikis that start with much less data than available. While digging into this I realized that this check was being done much too early. It was calculating against data that was not of the same shape, so not the same counts, as the final data we feed into feature collection. Everything between sampling and feature collection is relatively cheap (compared to sending millions of queries to elasticsearch) so move the check down to just before feature collection where we know exactly how many observations we have. Change-Id: Ib9f8d9b6204d7568e02356c1062cf3263d8eedd6 --- M mjolnir/sampling.py M mjolnir/test/test_sampling.py M mjolnir/utilities/data_pipeline.py 3 files changed, 48 insertions(+), 24 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/29/391729/1 diff --git a/mjolnir/sampling.py b/mjolnir/sampling.py index 50f527a..50722b5 100644 --- a/mjolnir/sampling.py +++ b/mjolnir/sampling.py @@ -165,15 +165,18 @@ .agg(F.sum('num_hit_page_ids').alias('num_hit_page_ids')) .collect()) +hit_page_id_counts = {row.wikiid: row.num_hit_page_ids for row in hit_page_id_counts} + wiki_percents = {} needs_sampling = False -for row in hit_page_id_counts: -wiki_percents[row.wikiid] = min(1., float(samples_per_wiki) / row.num_hit_page_ids) -if wiki_percents[row.wikiid] < 1.: + +for wikiid, num_hit_page_ids in hit_page_id_counts.items(): +wiki_percents[wikiid] = min(1., float(samples_per_wiki) / num_hit_page_ids) +if wiki_percents[wikiid] < 1.: needs_sampling = True if not needs_sampling: -return df +return hit_page_id_counts, df # Aggregate down into a unique set of (wikiid, norm_query_id) and add in a # count of the number of unique sessions per pair. We will sample per-strata @@ -187,11 +190,11 @@ # Spark will eventually throw this away in an LRU fashion. .cache()) -# materialize df_queries_unique so we can unpersist the input df -df_queries_unique.count() -df.unpersist() - df_queries_sampled = _sample_queries(df_queries_unique, wiki_percents, seed=seed) # Select the rows chosen by sampling from the input df -return df.join(df_queries_sampled, how='inner', on=['wikiid', 'norm_query_id']) +df_sampled = df.join(df_queries_sampled, how='inner', on=['wikiid', 'norm_query_id']) +df_sampled.cache().count() +df.unpersist() + +return hit_page_id_counts, df_sampled diff --git a/mjolnir/test/test_sampling.py b/mjolnir/test/test_sampling.py index 2feeb29..66a4605 100644 --- a/mjolnir/test/test_sampling.py +++ b/mjolnir/test/test_sampling.py @@ -20,8 +20,9 @@ ('foo', 'e', 5, 'eee', list(range(3))), ]).toDF(['wikiid', 'query', 'norm_query_id', 'session_id', 'hit_page_ids']) -sampled = mjolnir.sampling.sample(df, samples_per_wiki=100, - seed=12345).collect() +hit_page_id_counts, df_sampled = mjolnir.sampling.sample( +df, samples_per_wiki=100, seed=12345) +sampled = df_sampled.collect() # The sampling rate should have been chosen as 1.0, so we should have all data # regardless of probabilities. assert len(sampled) == 5 @@ -60,8 +61,8 @@ # Using a constant seed ensures deterministic testing. 
Because this code # actually relies on the law of large numbers, and we do not have large # numbers here, many seeds probably fail. -df_sampled = mjolnir.sampling.sample(df, samples_per_wiki=samples_per_wiki, - seed=12345) +hit_page_id_counts, df_sampled = mjolnir.sampling.sample( +df, samples_per_wiki=samples_per_wiki, seed=12345) sampled = ( df_sampled .select('wikiid', 'query', F.explode('hit_page_ids').alias('hit_page_id')) diff --git a/mjolnir/utilities/data_pipeline.py b/mjolnir/utilities/data_pipeline.py index c8e676f..3827e08 100644 --- a/mjolnir/utilities/data_pipeline.py +++ b/mjolnir/utilities/data_pipeline.py @@ -64,11 +64,17 @@ min_sessions_per_query=min_sessions_per_query) # Sample to some subset of queries per wiki +hit_page_id_counts, df_sampled_raw = mjolnir.sampling.sample( +df_norm, +seed=54321, +samples_per_wiki=samples_per_wiki) + +df_sampled_raw.count() +df_norm.unpersist() + +# Transform our dataframe into the shape expected by the DBN df_sampled = ( -
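The heart of the fix is that the per-wiki sampling fraction is simply clamped to 1.0 when a wiki already has fewer hits than samples_per_wiki, rather than treating the shortfall as an error. A tiny worked example of that calculation (the counts are made up):

samples_per_wiki = 1000

# Shape of the aggregation over num_hit_page_ids (made-up numbers)
hit_page_id_counts = {'enwiki': 250000, 'svwiki': 800}

wiki_percents = {
    wikiid: min(1.0, float(samples_per_wiki) / num_hits)
    for wikiid, num_hits in hit_page_id_counts.items()
}

# enwiki is downsampled to 0.004; svwiki keeps everything (1.0) instead of failing
print(wiki_percents)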
[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Replace custom array_contains with Column.isin
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391622 ) Change subject: Replace custom array_contains with Column.isin .. Replace custom array_contains with Column.isin Not sure if this is new or I just wasn't aware of it at the time, but spark has a native Column.isin that does the same as our usage of the custom _array_contains method (checking a column has a value one of a provided array of values). Change-Id: I504492070c7cde4a4d93f2ff9c104b3f127b2757 --- M mjolnir/sampling.py M mjolnir/utilities/data_pipeline.py 2 files changed, 1 insertion(+), 32 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/22/391622/1 diff --git a/mjolnir/sampling.py b/mjolnir/sampling.py index e65281c..9ba3cf8 100644 --- a/mjolnir/sampling.py +++ b/mjolnir/sampling.py @@ -15,37 +15,6 @@ from pyspark.sql.column import Column, _to_java_column -def _array_contains(array, value): -"""Generic version of pyspark.sql.functions.array_contains - -array_contains provided by pyspark only allow checking if a value is inside -a column, but the value has to be a literal and not a column from the row. -This generalizes the function to allow the value to be a column, checking -if a column is within a provided literal array. - ->>> df = sc.parallelize([['foo'], ['bar']]).toDF(['id']) ->>> df.select(_array_contains(F.array(map(F.lit, ['this', 'is', 'foo'])), F.col('id'))).collect() -[Row(array_contains(array(this,is,foo),id)=True), Row(array_contains(array(this,is,foo),id)=False)] - -Parameters --- -array : pyspark.sql.Column -value : pyspark.sql.Column - -Returns ---- -pyspark.sql.Column -Column representing the array_contains expression -""" -j_array_expr = _to_java_column(array).expr() -j_value_expr = _to_java_column(value).expr() - -sql = pyspark.SparkContext._active_spark_context._jvm.org.apache.spark.sql -j_expr = sql.catalyst.expressions.ArrayContains(j_array_expr, j_value_expr) -jc = sql.Column(j_expr) -return Column(jc) - - def _calc_splits(df, num_buckets=100): """Calculate the right edge of num_session buckets diff --git a/mjolnir/utilities/data_pipeline.py b/mjolnir/utilities/data_pipeline.py index 62ec121..a5c37d1 100644 --- a/mjolnir/utilities/data_pipeline.py +++ b/mjolnir/utilities/data_pipeline.py @@ -40,7 +40,7 @@ df_clicks = ( sqlContext.read.parquet(input_dir) # Limit to the wikis we are working against -.where(mjolnir.sampling._array_contains(F.array(map(F.lit, wikis)), F.col('wikiid'))) +.where(F.col('wikiid').isin(wikis)) # Drop requests from 'too busy' IP's. These are plausibly bots, or maybe just proxys. .where(F.col('q_by_ip_day') < 50) .drop('q_by_ip_day') -- To view, visit https://gerrit.wikimedia.org/r/391622 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I504492070c7cde4a4d93f2ff9c104b3f127b2757 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
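For reference, the built-in replacement looks like this in PySpark; a minimal sketch assuming an existing SparkSession named spark:

from pyspark.sql import functions as F

df = spark.createDataFrame(
    [('enwiki', 1), ('dewiki', 2), ('frwiki', 3)],
    ['wikiid', 'n'])

wikis = ['enwiki', 'dewiki']

# Same filter the removed _array_contains helper used to build by hand
df.where(F.col('wikiid').isin(wikis)).show()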
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port exact_quotes_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391439 ) Change subject: Port exact_quotes_api.feature to nodejs .. Port exact_quotes_api.feature to nodejs Change-Id: I537684d737132755d726a6f7dad4e3f84dbe7b7b --- A tests/integration/features/exact_quotes_api.feature M tests/integration/features/support/hooks.js 2 files changed, 109 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/39/391439/1 diff --git a/tests/integration/features/exact_quotes_api.feature b/tests/integration/features/exact_quotes_api.feature new file mode 100644 index 000..9af0958 --- /dev/null +++ b/tests/integration/features/exact_quotes_api.feature @@ -0,0 +1,94 @@ +@clean @exact_quotes @api +Feature: Searches that contain quotes + Scenario: Searching for a word in quotes disbles stemming (can still find plural with exact match) +When I api search for "pickles" +Then Two Words is the first api search result + + Scenario: Searching for a phrase in quotes disbles stemming (can't find plural with singular) +When I api search for "catapult pickle" +Then there are no api search results + + Scenario: Searching for a phrase in quotes disbles stemming (can still find plural with exact match) +When I api search for "catapult pickles" +Then Two Words is the first api search result + + Scenario: Quoted phrases have a default slop of 0 +When I api search for "ffnonesenseword pickles" +Then none is the first api search result +When I api search for "ffnonesenseword pickles"~1 +Then Two Words is the first api search result + + Scenario: Quoted phrases match stop words +When I api search for "Contains A Stop Word" +Then Contains A Stop Word is the first api search result + + Scenario: Adding a ~ to a phrase keeps stemming enabled +When I api search for "catapult pickle"~ +Then Two Words is the first api search result + + Scenario: Adding a ~ to a phrase switches the default slop to 0 +When I api search for "ffnonesenseword pickle"~ +Then none is the first api search result +When I api search for "ffnonesenseword pickle"~1~ +Then Two Words is the first api search result + + Scenario: Adding a ~ to a phrase stops it from matching stop words so long as there is enough slop +When I api search for "doesn't actually Contain A Stop Words"~1~ +Then Doesn't Actually Contain Stop Words is the first api search result + + Scenario: Adding a ~~ to a phrase keeps stemming enabled +When I api search for "catapult pickle"~0~ +Then Two Words is the first api search result + + Scenario: Adding a ~ to a phrase turns off because it is a proximity search +When I api search for "catapult pickle"~0 +Then there are no api search results + + Scenario: Searching for a quoted * actually searches for a * +When I api search with query independent profile empty for "pick*" +Then Pick* is the first api search result + + Scenario Outline: Searching for " "~ activates a proximity search +When I api search for "ffnonesenseword anotherword"~ +Then is the first api search result + Examples: +| proximity | result| +| 0 | none | +| 1 | none | +| 2 | Two Words | +| 3 | Two Words | +| 77| Two Words | + + Scenario Outline: Prefixing a quoted phrase with - or ! or NOT negates it +When I api search for catapult "two words" +Then Catapult is in the api search results + And Two Words is not in the api search results + Examples: +|negation| suffix | +| - || +| ! || +| NOT|| +| - | ~ | +| ! | ~ | +| NOT| ~ | +| - | ~1 | +| ! | ~1 | +| NOT| ~1 | +| - | ~7~| +| ! 
| ~7~| +| NOT| ~7~| + + Scenario: Can combine positive and negative phrase search +When I api search for catapult "catapult" -"two words" -"some stuff" +Then Catapult is in the api search results + And Two Words is not in the api search results + + Scenario: Can combine positive and negative phrase search (backwards) +When I api search for catapult -"asdf" "two words" +Then Two Words is in the api search results + And Catapult is not in the api search results + + @setup_main + Scenario: Searching for a word in quotes disbles stemming (can't find plural with singular) +When I api search for "pickle" +Then there are no api search results diff --git a/tests/integration/features/support/hooks.js b/tests/integration/features/support/hooks.js index e1e5082..c035dbc 100644 --- a/tests/integration/features/support/hooks.js +++
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port all cucumber hooks to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391441 ) Change subject: Port all cucumber hooks to nodejs .. Port all cucumber hooks to nodejs Port over all the hooks from ruby to nodejs and add the feature files that now pass. WIP because I'm suspicious from other feature files that didn't pass that there remains a problem with mwbot sessions. Change-Id: I01b8192c4e2ef5f0b3b720034aa38c4686ffbeb3 --- A tests/integration/features/combined_filters_api.feature A tests/integration/features/full_text_api.feature A tests/integration/features/fuzzy_api.feature A tests/integration/features/go_api.feature A tests/integration/features/hastemplate_api.feature A tests/integration/features/prefix_api.feature A tests/integration/features/removed_text_api.feature M tests/integration/features/support/hooks.js 8 files changed, 983 insertions(+), 190 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/41/391441/1 diff --git a/tests/integration/features/combined_filters_api.feature b/tests/integration/features/combined_filters_api.feature new file mode 100644 index 000..665dd58 --- /dev/null +++ b/tests/integration/features/combined_filters_api.feature @@ -0,0 +1,10 @@ +@clean @filters @api +Feature: Searches with combined filters + Scenario Outline: Filters can be combined +When I api search for +Then is the first api search result + Examples: +| term | first_result | +| incategory:twowords intitle:catapult| none | +| incategory:twowords intitle:"Two Words" | Two Words| +| incategory:alpha incategory:beta| AlphaBeta| diff --git a/tests/integration/features/full_text_api.feature b/tests/integration/features/full_text_api.feature new file mode 100644 index 000..9ebcab5 --- /dev/null +++ b/tests/integration/features/full_text_api.feature @@ -0,0 +1,145 @@ +@clean @api +Feature: Full text search + @headings + Scenario: Pages can be found by their headings +When I api search for incategory:HeadingsTest "I am a heading" +Then HasHeadings is the first api search result + + @headings + Scenario: Ignored headings aren't searched so text with the same word is wins +When I api search for incategory:HeadingsTest References +Then HasReferencesInText is the first api search result + + @setup_main + Scenario: Searching for a page using its title and another word not in the page's text doesn't find the page +When I api search for DontExistWord Two Words +Then there are no api search results + + @setup_main + Scenario: Searching for a page using its title and another word in the page's text does find it +When I api search for catapult Two Words +Then Two Words is the first api search result + + @setup_phrase_rescore + Scenario: Searching for an unquoted phrase finds the phrase first +When I api search for Words Test Rescore +Then Rescore Test Words Chaff is the first api search result + + @setup_phrase_rescore + Scenario: Searching for a quoted phrase finds higher scored matches before the whole query interpreted as a phrase +When I api search for Rescore "Test Words" +Then Test Words Rescore Rescore Test Words is the first api search result + + # Note that other tests will catch this situation as well but this test should be pretty specific + @setup_phrase_rescore + Scenario: Searching for an unquoted phrase still prioritizes titles over text +When I api search for Rescore Test TextContent +Then Rescore Test TextContent is the first api search result + + @setup_phrase_rescore + Scenario: Searching with a quoted word just treats the word 
as though it didn't have quotes +When I api search for "Rescore" Words Test +Then Test Words Rescore Rescore Test Words is the first api search result + + @programmer_friendly + Scenario Outline: Programmer friendly searches +When I api search for +Then is the first api search result + Examples: +|term |page | +| namespace aliases | $wgNamespaceAliases | +| namespaceAliases| $wgNamespaceAliases | +| $wgNamespaceAliases | $wgNamespaceAliases | +| namespace_aliases | $wgNamespaceAliases | +| NamespaceAliases| $wgNamespaceAliases | +| wgnamespacealiases | $wgNamespaceAliases | +| snake case | PFSC| +| snakeCase | PFSC| +| snake_case | PFSC| +| SnakeCase | PFSC| +| Pascal Case | PascalCase | +| pascalCase | PascalCase | +| pascal_case | PascalCase | +| PascalCase | PascalCase | +| pascalcase | PascalCase | +| numeric 7 | NumericCase7| +
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add did you mean api feature test to nodejs integ tests
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391438 ) Change subject: Add did you mean api feature test to nodejs integ tests .. Add did you mean api feature test to nodejs integ tests Change-Id: I943aedd0bc13111906a2aab7481250215c8dd2c9 --- A tests/integration/features/did_you_mean_api.feature M tests/integration/features/step_definitions/page_steps.js 2 files changed, 132 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/38/391438/1 diff --git a/tests/integration/features/did_you_mean_api.feature b/tests/integration/features/did_you_mean_api.feature new file mode 100644 index 000..66be262 --- /dev/null +++ b/tests/integration/features/did_you_mean_api.feature @@ -0,0 +1,100 @@ +@clean @api @suggestions +Feature: Did you mean + Scenario: Uncommon phrases spelled correctly don't get suggestions even if one of the words is very uncommon +When I api search for nobel prize +Then there is no api suggestion + + Scenario: No suggestions on pages that are not the first +When I api search with offset 20 for popular cultur +Then there is no api suggestion + + @stemming + Scenario: Suggestions do not show up when a full title matches but with stemming +When I api search for stemmingsingleword +Then there is no api suggestion + + @stemming + Scenario: Suggestions do not show up when a full multi word title matches but with stemming +When I api search for stemming multiword +Then there is no api suggestion + + @stemming + Scenario: Suggestions do not show up when a full multi word title matches but with apostrophe normalization +When I api search for stemming possessive's +Then there is no api suggestion + + Scenario: Suggestions don't come from redirect titles when it matches an actual title +When I api search for Noble Gasses +Then there is no api suggestion + + Scenario: Common phrases spelled incorrectly get suggestions +When I api search for popular cultur +Then popular *culture* is suggested by api + + Scenario Outline: Uncommon phrases spelled incorrectly get suggestions even if the typos is in the first 2 characters +When I api search for +Then is suggested by api + Examples: +|term | suggested | +| nabel prize | *nobel* prize | +| onbel prize | *nobel* prize | + + Scenario: Uncommon phrases spelled incorrectly get suggestions even if they contain words that are spelled correctly on their own +When I api search for noble prize +Then *nobel* prize is suggested by api + + Scenario: Suggestions can come from redirect titles when redirects are included in search +When I api search for Rrr Worrd +Then rrr *word* is suggested by api + + Scenario Outline: Special search syntax is preserved in suggestions (though sometimes moved around) +When I api search for +Then is suggested by api + Examples: +|term | suggested | +| prefer-recent:noble prize | prefer-recent:*nobel* prize | +| Template:nobel piep | Template:*noble pipe* | +| prefer-recent:noble prize | prefer-recent:*nobel* prize | +| incategory:prize noble prize | incategory:prize *nobel* prize | +| noble incategory:prize prize | incategory:prize *nobel* prize | +| hastemplate:prize noble prize | hastemplate:prize *nobel* prize | +| -hastemplate:prize noble prize| -hastemplate:prize *nobel* prize| +| boost-templates:"prize\|150%" noble prize | boost-templates:"prize\|150%" *nobel* prize | +| noble prize prefix:n | *nobel* prize prefix:n | + + Scenario: Customize prefix length of did you mean suggestions +When I set did you mean suggester 
option cirrusSuggPrefixLength to 5 +And I api search for noble prize +Then there is no api suggestion + + Scenario: Did you mean option suggests +When I api search for grammo awards +Then there is no api suggestion + + Scenario: Customize max term freq did you mean suggestions +When I set did you mean suggester option cirrusSuggMaxTermFreq to 0.4 +And I set did you mean suggester option cirrusSuggConfidence to 1 +And I api search for grammo +Then *grammy* is suggested by api + + Scenario: Customize prefix length of did you mean suggestions below the hard limit +When I reset did you mean suggester options +And I
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port highlighting feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391442 ) Change subject: Port highlighting feature to nodejs .. Port highlighting feature to nodejs Change-Id: I15dffe85e6366b21d0fd5742cb013589a12e24f6 --- A tests/integration/features/highlighting_api.feature M tests/integration/features/step_definitions/page_steps.js 2 files changed, 204 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/42/391442/1 diff --git a/tests/integration/features/highlighting_api.feature b/tests/integration/features/highlighting_api.feature new file mode 100644 index 000..87b07ee --- /dev/null +++ b/tests/integration/features/highlighting_api.feature @@ -0,0 +1,178 @@ +@clean @highlighting @api +Feature: Highlighting + @setup_main + Scenario Outline: Found words are highlighted +When I api search for +Then is the highlighted title of the first api search result + And is the highlighted snippet of the first api search result + Examples: +| term | highlighted_title| highlighted_text | +| two words | *Two* *Words*| ffnonesenseword catapult pickles anotherword | +| pickles| Two Words| ffnonesenseword catapult *pickles* anotherword | +| ffnonesenseword pickles| Two Words| *ffnonesenseword* catapult *pickles* anotherword | +| two words catapult pickles | *Two* *Words*| ffnonesenseword *catapult* *pickles* anotherword | +| template:test pickle | Template:Template *Test* | *pickles* | +# Verify highlighting the presence of accent squashing +| Africa test| *África* | for *testing* | +# Verify highlighting on large pages. +| "discuss problems of social and cultural importance" | Rashidun Caliphate | community centers as well where the faithful gathered to *discuss* *problems* *of* *social* *and* *cultural* *importance*. During the caliphate of Umar as many as four thousand | +| "discuss problems of social and cultural importance"~ | Rashidun Caliphate | community centers as well where the faithful gathered to *discuss* *problems* *of* *social* *and* *cultural* *importance*. During the caliphate of Umar as many as four thousand | +# Auxiliary text +| tallest alborz | Rashidun Caliphate | Mount Damavand, Iran's *tallest* mountain is located in *Alborz* mountain range. | + + Scenario: Even stopwords are highlighted +When I api search for the once and future king +Then *The* *Once* *and* *Future* *King* is the highlighted title of the first api search result + + Scenario: Found words are highlighted even if found by different analyzers +When I api search for "threatening the unity" community +Then Troubles emerged soon after Abu Bakr's succession, *threatening* *the* *unity* and stability of the new *community* and state. 
Apostasy had actually begun in the lifetime is the highlighted snippet of the first api search result + + @headings + Scenario: Found words are highlighted in headings +When I api search for "i am a heading" +Then *I* *am* *a* *heading* is the highlighted sectionsnippet of the first api search result + + @headings + Scenario: References are not included in headings +When I api search for "Reference in heading" +Then *Reference* *in* *heading* is the highlighted sectionsnippet of the first api search result + + Scenario: Found words are highlighted in headings even in large documents +When I api search for "Succession of Umar" +Then *Succession* *of* *Umar* is the highlighted sectionsnippet of the first api search result + + Scenario: Found words are highlighted in text even in large documents +When I api search for Allowance to non-Muslims +Then *Allowance* *to* *non*-*Muslims* is in the highlighted snippet of the first api search result + + Scenario: Found words are highlighted in text even in large documents +When I api search for "Allowance to non-Muslims" +Then *Allowance* *to* *non*-*Muslims* is in the highlighted snippet of the first api search result + + Scenario: Words are not found in image captions unless there are no matches in the page +When I api search for The Rose Trellis Egg +Then *The* *Rose* *Trellis* Faberge *Egg* is a jewelled enameled imperial Easter *egg* made in St. Petersburg, Russia under *the* supervision of *the* jeweler Peter Carl is the highlighted snippet of the first api search result + + @headings + Scenario: Found words are highlighted in headings even if they contain both a phrase and a non-phrase +When I api search for i "am a heading" +
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port filesearch_api.feature to nodejs
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391440 ) Change subject: Port filesearch_api.feature to nodejs .. Port filesearch_api.feature to nodejs Change-Id: I057e51ae4755244f29d2d7f789b5737695cc7e63 --- A tests/integration/features/filesearch_api.feature M tests/integration/features/step_definitions/page_steps.js M tests/integration/features/support/hooks.js 3 files changed, 150 insertions(+), 65 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/40/391440/1 diff --git a/tests/integration/features/filesearch_api.feature b/tests/integration/features/filesearch_api.feature new file mode 100644 index 000..2ea145e --- /dev/null +++ b/tests/integration/features/filesearch_api.feature @@ -0,0 +1,70 @@ +@clean @api @filesearch +Feature: Searches with the file size filters + + Scenario Outline: filesize finds files with given size +When I api search in namespace 6 for -intitle:frozen +Then there are api search results +And is in the api search results +And is not in the api search results + Examples: +| search| count | musthave | mustnot| +| filesize:>10 | 2 | File:Linux Distribution Timeline text version.pdf | File:OnCommons.svg | +| filesize:<10 | 4 | File:DuplicatedLocally.svg | File:Linux Distribution Timeline text version.pdf | +| filesize:10 | 2 | File:Linux Distribution Timeline text version.pdf | File:OnCommons.svg | +| filesize:5,20 | 1 | File:Savepage-greyed.png | File:Linux Distribution Timeline text version.pdf | + + Scenario Outline: filetype finds files with given internal type +When I api search in namespace 6 for -intitle:frozen +Then there are api search results +And is in the api search results +And is not in the api search results +Examples: + | search | count | musthave | mustnot | + | filetype:bitmap | 1 | File:Savepage-greyed.png | File:DuplicatedLocally.svg | + | filetype:office | 1 | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png | + | filetype:Drawing | 4 | File:DuplicatedLocally.svg | File:Savepage-greyed.png | + + Scenario Outline: filemime finds files with given MIME type +When I api search in namespace 6 for -intitle:frozen +Then there are api search results +And is in the api search results +And is not in the api search results +Examples: + | search | count | musthave | mustnot| + | filemime:image/PNG | 1 | File:Savepage-greyed.png | File:DuplicatedLocally.svg | + | filemime:image/svg+xml | 4 | File:DuplicatedLocally.svg | File:Savepage-greyed.png | + | filemime:application/pdf | 1 | File:Linux Distribution Timeline text version.pdf | File:OnCommons.svg | + + Scenario Outline: Resolution filters find files with given dimensions +When I api search in namespace 6 for -intitle:frozen +Then there are api search results +And is in the api search results +And is not in the api search results + Examples: +| search | count | musthave | mustnot | +| fileres:>1000 | 1| File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png | +| filew:>1000| 1| File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png | +| fileh:>1000| 1| File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png | +| filewidth:>1000| 1| File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png | +| fileheight:>1000 | 1| File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png | +| fileres:300,600| 1| File:Savepage-greyed.png | DuplicatedLocally.svg| +| fileres:<500 | 1| File:Savepage-greyed.png 
| File:Linux Distribution Timeline text version.pdf | +| filew:300,900 | 5| File:DuplicatedLocally.svg | File:Linux Distribution Timeline text version.pdf | +| filew:<500 | 1| File:Savepage-greyed.png | File:Linux Distribution Timeline text version.pdf | +| fileh:>200 | 6| File:Linux Distribution Timeline text version.pdf | anything | +| filew:300,600 fileh:200,300 | 1 | File:Savepage-greyed.png
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add integration feature files that already pass
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391437 ) Change subject: Add integration feature files that already pass .. Add integration feature files that already pass These feature files were 95% of the way to implemented, they needed only a single new definition each to pass. Change-Id: I546fe7a8a2b32b8e705cd27278cb9195105e1e49 --- M tests/integration/features/step_definitions/page_steps.js M tests/integration/features/suggest_api.feature A tests/integration/features/wildcard_api.feature 3 files changed, 95 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/37/391437/1 diff --git a/tests/integration/features/step_definitions/page_steps.js b/tests/integration/features/step_definitions/page_steps.js index 2e370d8..8e4a46c 100644 --- a/tests/integration/features/step_definitions/page_steps.js +++ b/tests/integration/features/step_definitions/page_steps.js @@ -216,6 +216,12 @@ } ); } ); + Then( /there are no api search results/, function () { + withApi( this, () => { + expect( this.apiResponse.query.search ).to.have.lengthOf( 0 ); + } ); + } ); + Then( /^(.+) is( not)? in the api search results$/, function( title, not ) { withApi( this, () => { let titles = this.apiResponse.query.search.map( res => res.title ); @@ -226,4 +232,10 @@ } } ); } ); + + Then( /^this error is reported by api: (.+)$/, function ( expected_error ) { + withApi( this, () => { + expect( this.apiError.info ).to.equal( expected_error.trim() ) + } ); + } ); }); diff --git a/tests/integration/features/suggest_api.feature b/tests/integration/features/suggest_api.feature index 79beb0c..0d70c7f 100644 --- a/tests/integration/features/suggest_api.feature +++ b/tests/integration/features/suggest_api.feature @@ -65,4 +65,31 @@ Examples: | term | first | other | | Ic | Iceman | Ice | -| Ice | Ice| Iceman | \ No newline at end of file +| Ice | Ice| Iceman | + + Scenario: Ordering & limit +When I ask suggestion API at most 1 item for x-m + Then the API should produce list starting with X-Men + And the API should produce list of length 1 + + Scenario Outline: Search fallback to prefix search if namespace is provided +When I ask suggestion API for + Then the API should produce list starting with + Examples: +| term |suggested| +| Special:| Special:ActiveUsers | +| Special:Act | Special:ActiveUsers | + + Scenario Outline: Search prefers main namespace over crossns redirects +When I ask suggestion API for + Then the API should produce list starting with + Examples: +| term |suggested | +| V | Venom | +| V: | V:N | +| Z | Zam Wilson| +| Z: | Z:Navigation | + + Scenario: Default sort can be used as search input +When I ask suggestion API for Wilson + Then the API should produce list starting with Sam Wilson diff --git a/tests/integration/features/wildcard_api.feature b/tests/integration/features/wildcard_api.feature new file mode 100644 index 000..fc86115 --- /dev/null +++ b/tests/integration/features/wildcard_api.feature @@ -0,0 +1,55 @@ +@clean @api @wildcard +Feature: Searches that contain wildcard matches + Scenario Outline: Wildcards match plain matches +When I api search for piles +Then Two Words is the first api search result + Examples: +| wildcard | +| *| +| \\?k | +| c\\? 
| + + Scenario Outline: Wildcards don't match stemmed matches +When I api search for pikle +Then there are no api search results + Examples: +| wildcard | +| *| +| \\?k | + + Scenario Outline: Wildcards in leading intitle: terms match +When I api search for intitle:functiona intitle:programming +Then Functional programming is the first api search result + Examples: +| wildcard | +| *| +| \\? | + + Scenario Outline: Wildcard suffixes in trailing intitle: terms match stemmed matches +When I api search for intitle:functional intitle:programmin +Then Functional programming is the first api search result + Examples: +| wildcard | +| *| +| \\? | + + Scenario Outline: Wildcards within trailing intitle: terms match stemmed matches +When I api search for intitle:functional intitle:progamming +Then Functional programming is the first api search result + Examples: +| wildcard | +| *| +| \\? | + +
[MediaWiki-commits] [Gerrit] search...deploy[master]: Don't recreate virtualenv unless necessary
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391296 ) Change subject: Don't recreate virtualenv unless necessary .. Don't recreate virtualenv unless necessary It appears that while the first run of the virtualenv.sh script will succeed on a debian jessie based system, future runs will attempt to overwrite the upgraded pip with the older system version of pip. This causes pip to completely break with a mismatch between versions. Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda --- M scap/checks/virtualenv.sh 1 file changed, 7 insertions(+), 3 deletions(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh index 2f8ebba..56da498 100644 --- a/scap/checks/virtualenv.sh +++ b/scap/checks/virtualenv.sh @@ -13,9 +13,13 @@ PIP="${VENV}/bin/pip" -# Ensure that the virtual environment exists -mkdir -p "$VENV" -virtualenv --never-download --python python2.7 $VENV || /bin/true +# Ensure that the virtual environment exists. Don't recreate if already +# existing, as this will try and downgrade pip on debian jessie from the one +# installed later which then breaks pip. +if [ ! -x "$PIP" ]; then +mkdir -p "$VENV" +virtualenv --never-download --python python2.7 $VENV || /bin/true +fi # Debian jessie based hosts need updated versions of pip and wheel or they will # fail to install some binary packages (numpy, scipy, maybe others) -- To view, visit https://gerrit.wikimedia.org/r/391296 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Dont recreate virtualenv unless necessary
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391296 ) Change subject: Dont recreate virtualenv unless necessary .. Dont recreate virtualenv unless necessary It appears that while the first run of the virtualenv.sh script will succeed on a debian jessie based system, future runs will attempt to overwrite the upgraded pip with the older system version of pip. This causes pip to completely break with a mismatch between versions. Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda --- M scap/checks/virtualenv.sh 1 file changed, 7 insertions(+), 3 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/96/391296/1 diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh index 2f8ebba..56da498 100644 --- a/scap/checks/virtualenv.sh +++ b/scap/checks/virtualenv.sh @@ -13,9 +13,13 @@ PIP="${VENV}/bin/pip" -# Ensure that the virtual environment exists -mkdir -p "$VENV" -virtualenv --never-download --python python2.7 $VENV || /bin/true +# Ensure that the virtual environment exists. Don't recreate if already +# existing, as this will try and downgrade pip on debian jessie from the one +# installed later which then breaks pip. +if [ ! -x "$PIP" ]; then +mkdir -p "$VENV" +virtualenv --never-download --python python2.7 $VENV || /bin/true +fi # Debian jessie based hosts need updated versions of pip and wheel or they will # fail to install some binary packages (numpy, scipy, maybe others) -- To view, visit https://gerrit.wikimedia.org/r/391296 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Bump mjolnir submodule
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391294 ) Change subject: Bump mjolnir submodule .. Bump mjolnir submodule This brings in the patch that made working_dir configurable. We need that so the deployment to stat1005 is able to use the new configuration file. Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572 --- M src 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/src b/src index 96337a0..0d7fdcf 16 --- a/src +++ b/src @@ -1 +1 @@ -Subproject commit 96337a0ab1931278f93b752ca3be5f30e8124762 +Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5 -- To view, visit https://gerrit.wikimedia.org/r/391294 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Bump mjolnir submodule
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391294 ) Change subject: Bump mjolnir submodule .. Bump mjolnir submodule This brings in the patch that made working_dir configurable. We need that so the deployment to stat1005 is able to use the new configuration file. Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572 --- M src 1 file changed, 1 insertion(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/94/391294/1 diff --git a/src b/src index 96337a0..0d7fdcf 16 --- a/src +++ b/src @@ -1 +1 @@ -Subproject commit 96337a0ab1931278f93b752ca3be5f30e8124762 +Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5 -- To view, visit https://gerrit.wikimedia.org/r/391294 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Deploy pip and wheel packages for jessie
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391286 ) Change subject: Deploy pip and wheel packages for jessie .. Deploy pip and wheel packages for jessie Debian jessie based hosts (relforge100*) fail to install the provided numpy and scipy packages. Included pip and wheel in the artifacts and explicitly install them into the deployed virtualenv. Also turn on verbose output for pip when installing, to give more information in `scap deploy-log` output for failures. Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275 --- A artifacts/pip-9.0.1-py2.py3-none-any.whl A artifacts/wheel-0.30.0-py2.py3-none-any.whl M make_wheels.sh M scap/checks/virtualenv.sh M upload_wheels.py 5 files changed, 25 insertions(+), 3 deletions(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/artifacts/pip-9.0.1-py2.py3-none-any.whl b/artifacts/pip-9.0.1-py2.py3-none-any.whl new file mode 100644 index 000..2b56f48 --- /dev/null +++ b/artifacts/pip-9.0.1-py2.py3-none-any.whl @@ -0,0 +1 @@ +#$# git-fat c70393185d27ae8b49a117e6dcc18bc5f8f3a1c3 1254803 diff --git a/artifacts/wheel-0.30.0-py2.py3-none-any.whl b/artifacts/wheel-0.30.0-py2.py3-none-any.whl new file mode 100644 index 000..4869eee --- /dev/null +++ b/artifacts/wheel-0.30.0-py2.py3-none-any.whl @@ -0,0 +1 @@ +#$# git-fat 11694b2cfb611fd4accb1135c7d0fef9db4cd92649751 diff --git a/make_wheels.sh b/make_wheels.sh index 7ad71c5..5c29233 100755 --- a/make_wheels.sh +++ b/make_wheels.sh @@ -23,7 +23,12 @@ virtualenv --python python2.7 $VENV || /bin/true $PIP install "${MJOLNIR}" $PIP freeze --local | grep -v mjolnir | grep -v pkg-resources > $REQUIREMENTS -$PIP install wheel +$PIP install pip wheel +# Debian jessie based hosts require updated pip and wheel packages or they will +# refuse to install some packages (numpy, scipy, maybe others) +$PIP wheel --find-links "${WHEEL_DIR}" \ +--wheel-dir "${WHEEL_DIR}" \ +pip wheel $PIP wheel --find-links "${WHEEL_DIR}" \ --wheel-dir "${WHEEL_DIR}" \ --requirement "${REQUIREMENTS}" diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh index 55c0a33..2f8ebba 100644 --- a/scap/checks/virtualenv.sh +++ b/scap/checks/virtualenv.sh @@ -17,15 +17,30 @@ mkdir -p "$VENV" virtualenv --never-download --python python2.7 $VENV || /bin/true +# Debian jessie based hosts need updated versions of pip and wheel or they will +# fail to install some binary packages (numpy, scipy, maybe others) +$PIP install \ +-vv \ +--no-index \ +--find-links "${WHEEL_DIR}" \ +--upgrade \ +--force-reinstall \ +pip wheel # Install or upgrade our packages $PIP install \ +-vv \ --no-index \ --find-links "${WHEEL_DIR}" \ --upgrade \ --force-reinstall \ -r "${REQUIREMENTS}" -$PIP install --upgrade --no-deps "${MJOLNIR_DIR}" +$PIP install \ +-vv \ +--no-index \ +--upgrade \ +--no-deps \ +"${MJOLNIR_DIR}" # Build a .zip of the virtualenv that can be shipped to spark workers cd "${VENV}" diff --git a/upload_wheels.py b/upload_wheels.py index e16453b..5d93792 100755 --- a/upload_wheels.py +++ b/upload_wheels.py @@ -3,7 +3,7 @@ Uploads python wheels to archiva Usage: -upload-wheels.py wheels/*.whl +upload-wheels.py artifacts/*.whl """ from __future__ import print_function -- To view, visit https://gerrit.wikimedia.org/r/391286 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275 Gerrit-PatchSet: 3 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: 
EBernhardsonGerrit-Reviewer: DCausse Gerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
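A sketch of the offline install sequence the updated check performs, written in Python only for brevity (paths are placeholders; the pip flags are the ones visible in the diff above): the newer pip and wheel are installed from the locally shipped wheels first, then the requirements.

import subprocess

PIP = "venv/bin/pip"          # hypothetical virtualenv pip
WHEEL_DIR = "artifacts"       # local wheel directory shipped with the deploy repo
REQUIREMENTS = "requirements.txt"

def install_offline():
    base = [PIP, "install", "-vv", "--no-index",
            "--find-links", WHEEL_DIR, "--upgrade", "--force-reinstall"]
    # Upgrade pip/wheel first so jessie's old pip can cope with binary wheels
    # such as numpy and scipy.
    subprocess.check_call(base + ["pip", "wheel"])
    subprocess.check_call(base + ["-r", REQUIREMENTS])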
[MediaWiki-commits] [Gerrit] search...deploy[master]: Deploy pip and wheel packages for jessie
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391286 ) Change subject: Deploy pip and wheel packages for jessie .. Deploy pip and wheel packages for jessie Debian jessie based hosts (relforge100*) fail to install the provided numpy and scipy packages. Included pip and wheel in the artifacts and explicitly install them into the deployed virtualenv. Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275 --- A artifacts/pip-9.0.1-py2.py3-none-any.whl A artifacts/wheel-0.30.0-py2.py3-none-any.whl M make_wheels.sh M scap/checks/virtualenv.sh M upload_wheels.py 5 files changed, 17 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/86/391286/1 diff --git a/artifacts/pip-9.0.1-py2.py3-none-any.whl b/artifacts/pip-9.0.1-py2.py3-none-any.whl new file mode 100644 index 000..2b56f48 --- /dev/null +++ b/artifacts/pip-9.0.1-py2.py3-none-any.whl @@ -0,0 +1 @@ +#$# git-fat c70393185d27ae8b49a117e6dcc18bc5f8f3a1c3 1254803 diff --git a/artifacts/wheel-0.30.0-py2.py3-none-any.whl b/artifacts/wheel-0.30.0-py2.py3-none-any.whl new file mode 100644 index 000..4869eee --- /dev/null +++ b/artifacts/wheel-0.30.0-py2.py3-none-any.whl @@ -0,0 +1 @@ +#$# git-fat 11694b2cfb611fd4accb1135c7d0fef9db4cd92649751 diff --git a/make_wheels.sh b/make_wheels.sh index 7ad71c5..5c29233 100755 --- a/make_wheels.sh +++ b/make_wheels.sh @@ -23,7 +23,12 @@ virtualenv --python python2.7 $VENV || /bin/true $PIP install "${MJOLNIR}" $PIP freeze --local | grep -v mjolnir | grep -v pkg-resources > $REQUIREMENTS -$PIP install wheel +$PIP install pip wheel +# Debian jessie based hosts require updated pip and wheel packages or they will +# refuse to install some packages (numpy, scipy, maybe others) +$PIP wheel --find-links "${WHEEL_DIR}" \ +--wheel-dir "${WHEEL_DIR}" \ +pip wheel $PIP wheel --find-links "${WHEEL_DIR}" \ --wheel-dir "${WHEEL_DIR}" \ --requirement "${REQUIREMENTS}" diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh index 55c0a33..000c22c 100644 --- a/scap/checks/virtualenv.sh +++ b/scap/checks/virtualenv.sh @@ -17,6 +17,14 @@ mkdir -p "$VENV" virtualenv --never-download --python python2.7 $VENV || /bin/true +# Debian jessie based hosts need updated versions of pip and wheel or they will +# fail to install some binary packages (numpy, scipy, maybe others) +$PIP install \ +--no-index \ +--find-links "${WHEEL_DIR}" \ +--upgrade \ +--force-reinstall \ +pip wheel # Install or upgrade our packages $PIP install \ --no-index \ diff --git a/upload_wheels.py b/upload_wheels.py index e16453b..5d93792 100755 --- a/upload_wheels.py +++ b/upload_wheels.py @@ -3,7 +3,7 @@ Uploads python wheels to archiva Usage: -upload-wheels.py wheels/*.whl +upload-wheels.py artifacts/*.whl """ from __future__ import print_function -- To view, visit https://gerrit.wikimedia.org/r/391286 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Remove --no-cache-dir from pip command
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391268 ) Change subject: Remove --no-cache-dir from pip command .. Remove --no-cache-dir from pip command --no-cache-dir is not available for the pip version available in debian jessie. We could perhaps ship an updated pip version with this deploy repo, but that doesn't seem necessary yet. Skipping --no-cache-dir should hopefully have no effect on building the virtualenv. It's mostly just a stricter guarantee that we really are installing only wheels from the artifacts directory Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658 --- M scap/checks/virtualenv.sh 1 file changed, 0 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/68/391268/1 diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh index eb05944..55c0a33 100644 --- a/scap/checks/virtualenv.sh +++ b/scap/checks/virtualenv.sh @@ -19,7 +19,6 @@ # Install or upgrade our packages $PIP install \ ---no-cache-dir \ --no-index \ --find-links "${WHEEL_DIR}" \ --upgrade \ -- To view, visit https://gerrit.wikimedia.org/r/391268 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Remove --no-cache-dir from pip command
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391268 ) Change subject: Remove --no-cache-dir from pip command .. Remove --no-cache-dir from pip command --no-cache-dir is not available for the pip version available in debian jessie. We could perhaps ship an updated pip version with this deploy repo, but that doesn't seem necessary yet. Skipping --no-cache-dir should hopefully have no effect on building the virtualenv. It's mostly just a stricter guarantee that we really are installing only wheels from the artifacts directory Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658 --- M scap/checks/virtualenv.sh 1 file changed, 0 insertions(+), 1 deletion(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh index eb05944..55c0a33 100644 --- a/scap/checks/virtualenv.sh +++ b/scap/checks/virtualenv.sh @@ -19,7 +19,6 @@ # Install or upgrade our packages $PIP install \ ---no-cache-dir \ --no-index \ --find-links "${WHEEL_DIR}" \ --upgrade \ -- To view, visit https://gerrit.wikimedia.org/r/391268 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658 Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
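The flag only exists in newer pip releases, which is why it breaks under the pip shipped with jessie. If it were ever wanted again, one option is to feature-gate it on the installed pip version, as in this sketch (the exact version cutoff is an assumption):

import subprocess

def pip_install_args(pip="pip"):
    # `pip --version` prints something like "pip 9.0.1 from ... (python 2.7)".
    version = subprocess.check_output([pip, "--version"]).split()[1].decode()
    args = [pip, "install", "--no-index"]
    if int(version.split(".")[0]) >= 6:  # assume --no-cache-dir arrived around pip 6
        args.append("--no-cache-dir")
    return args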
[MediaWiki-commits] [Gerrit] search...deploy[master]: Fix typo in checks.yaml
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391266 ) Change subject: Fix typo in checks.yaml .. Fix typo in checks.yaml Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e --- M scap/checks.yaml 1 file changed, 1 insertion(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/66/391266/1 diff --git a/scap/checks.yaml b/scap/checks.yaml index 3f0ed03..411e399 100644 --- a/scap/checks.yaml +++ b/scap/checks.yaml @@ -5,7 +5,7 @@ timeout: 300 group: relforge command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh -virtualenv_analytics +virtualenv_analytics: type: command stage: promote timeout: 300 -- To view, visit https://gerrit.wikimedia.org/r/391266 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Fix typo in checks.yaml
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391266 ) Change subject: Fix typo in checks.yaml .. Fix typo in checks.yaml Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e --- M scap/checks.yaml 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/scap/checks.yaml b/scap/checks.yaml index 3f0ed03..411e399 100644 --- a/scap/checks.yaml +++ b/scap/checks.yaml @@ -5,7 +5,7 @@ timeout: 300 group: relforge command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh -virtualenv_analytics +virtualenv_analytics: type: command stage: promote timeout: 300 -- To view, visit https://gerrit.wikimedia.org/r/391266 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
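The bug being fixed is a single missing colon, which either makes the file fail to load or silently mis-nests the second check. A small validation pass like the sketch below (PyYAML; required key names taken from the file itself) would catch both failure modes before a deploy:

import yaml

REQUIRED = {"type", "stage", "timeout", "group", "command"}

with open("scap/checks.yaml") as f:   # path as used in this repo
    config = yaml.safe_load(f)        # raises if the YAML is malformed

for name, check in config["checks"].items():
    missing = REQUIRED - set(check)
    assert not missing, "check %r is missing keys: %s" % (name, sorted(missing))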
[MediaWiki-commits] [Gerrit] search...deploy[master]: Setup scap checks for separate server groups
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391265 ) Change subject: Setup scap checks for separate server groups .. Setup scap checks for separate server groups Doesn't look like the 'default' checks run for the other groups, so define a check per group. Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea --- M scap/checks.yaml 1 file changed, 8 insertions(+), 2 deletions(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/scap/checks.yaml b/scap/checks.yaml index 825..3f0ed03 100644 --- a/scap/checks.yaml +++ b/scap/checks.yaml @@ -1,8 +1,14 @@ checks: -virtualenv: +virtualenv_relforge: type: command stage: promote timeout: 300 -group: default +group: relforge +command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh +virtualenv_analytics +type: command +stage: promote +timeout: 300 +group: analytics command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh -- To view, visit https://gerrit.wikimedia.org/r/391265 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: Setup scap checks for separate server groups
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391265 ) Change subject: Setup scap checks for separate server groups .. Setup scap checks for separate server groups Doesn't look like the 'default' checks run for the other groups, so define a check per group. Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea --- M scap/checks.yaml 1 file changed, 8 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/65/391265/1 diff --git a/scap/checks.yaml b/scap/checks.yaml index 825..3f0ed03 100644 --- a/scap/checks.yaml +++ b/scap/checks.yaml @@ -1,8 +1,14 @@ checks: -virtualenv: +virtualenv_relforge: type: command stage: promote timeout: 300 -group: default +group: relforge +command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh +virtualenv_analytics +type: command +stage: promote +timeout: 300 +group: analytics command: bash /srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh -- To view, visit https://gerrit.wikimedia.org/r/391265 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: scap.cfg: MjoLniR -> mjolnir
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391262 ) Change subject: scap.cfg: MjoLniR -> mjolnir .. scap.cfg: MjoLniR -> mjolnir Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e --- M scap/scap.cfg 1 file changed, 1 insertion(+), 1 deletion(-) Approvals: EBernhardson: Verified; Looks good to me, approved diff --git a/scap/scap.cfg b/scap/scap.cfg index 23f370e..8913d7a 100644 --- a/scap/scap.cfg +++ b/scap/scap.cfg @@ -1,5 +1,5 @@ [global] -git_repo: search/MjoLniR/deploy +git_repo: search/mjolnir/deploy ssh_user: deploy-service server_groups: analytics, relforge analytics_dsh_targets: discovery-analytics -- To view, visit https://gerrit.wikimedia.org/r/391262 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardsonGerrit-Reviewer: EBernhardson ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] search...deploy[master]: scap.cfg: MjoLniR -> mjolnir
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391262 ) Change subject: scap.cfg: MjoLniR -> mjolnir .. scap.cfg: MjoLniR -> mjolnir Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e --- M scap/scap.cfg 1 file changed, 1 insertion(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy refs/changes/62/391262/1 diff --git a/scap/scap.cfg b/scap/scap.cfg index 23f370e..8913d7a 100644 --- a/scap/scap.cfg +++ b/scap/scap.cfg @@ -1,5 +1,5 @@ [global] -git_repo: search/MjoLniR/deploy +git_repo: search/mjolnir/deploy ssh_user: deploy-service server_groups: analytics, relforge analytics_dsh_targets: discovery-analytics -- To view, visit https://gerrit.wikimedia.org/r/391262 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR/deploy Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.7]: Turn off AB test for DBN sizing on enwiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/390289 ) Change subject: Turn off AB test for DBN sizing on enwiki .. Turn off AB test for DBN sizing on enwiki Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab (cherry picked from commit 72028be976dc42a75b8934228070292a3f4dee7a) --- M modules/all/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 3 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/89/390289/1 diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js b/modules/all/ext.wikimediaEvents.searchSatisfaction.js index a48355d..7517df7 100644 --- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js @@ -114,9 +114,7 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = mw.config.get( 'wgDBname' ) === 'enwiki' ? - [ 'control', 'dbn20', 'dbn20-i', 'dbn35', 'dbn35-i' ] : - [], + validBuckets = [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -141,8 +139,8 @@ // .15 increases that to 810k per week. Giving // 160k sessions per bucket per week. enwiki: { - test: 0.15, - subTest: 0.996 + test: 2000, + subTest: null }, enwiktionary: { test: 40, -- To view, visit https://gerrit.wikimedia.org/r/390289 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: wmf/1.31.0-wmf.7 Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Turn off AB test for DBN sizing on enwiki
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/390286 ) Change subject: Turn off AB test for DBN sizing on enwiki .. Turn off AB test for DBN sizing on enwiki Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab --- M modules/ext.wikimediaEvents.searchSatisfaction.js 1 file changed, 3 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents refs/changes/86/390286/1 diff --git a/modules/ext.wikimediaEvents.searchSatisfaction.js b/modules/ext.wikimediaEvents.searchSatisfaction.js index 40b1cc1..d0697cf 100644 --- a/modules/ext.wikimediaEvents.searchSatisfaction.js +++ b/modules/ext.wikimediaEvents.searchSatisfaction.js @@ -113,9 +113,7 @@ function initialize( session ) { var sessionId = session.get( 'sessionId' ), - validBuckets = mw.config.get( 'wgDBname' ) === 'enwiki' ? - [ 'control', 'dbn20', 'dbn20-i', 'dbn35', 'dbn35-i' ] : - [], + validBuckets = [], sampleSize = ( function () { var dbName = mw.config.get( 'wgDBname' ), // Provides a place to handle wiki-specific sampling, @@ -140,8 +138,8 @@ // .15 increases that to 810k per week. Giving // 160k sessions per bucket per week. enwiki: { - test: 0.15, - subTest: 0.996 + test: 2000, + subTest: null }, enwiktionary: { test: 40, -- To view, visit https://gerrit.wikimedia.org/r/390286 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/WikimediaEvents Gerrit-Branch: master Gerrit-Owner: EBernhardson___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
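For context, the buckets above are chosen once per search session and then remembered for the rest of that session. The sketch below shows one generic way to do hash-based enrolment and bucketing; it is an illustration with made-up values, not the WikimediaEvents implementation.

import hashlib

def bucket_for(session_id, sample_one_in, buckets):
    h = int(hashlib.sha1(session_id.encode()).hexdigest(), 16)
    if not buckets or h % sample_one_in != 0:
        return None  # session is not enrolled in the test
    # Reuse the same hash so a session never changes bucket mid-test.
    return buckets[(h // sample_one_in) % len(buckets)]

# e.g. bucket_for("abc123", 2000, ["control", "dbn20", "dbn35"])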
[MediaWiki-commits] [Gerrit] wikimedia...analytics[master]: Fetch inner hits and only the first page
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/384989 ) Change subject: Fetch inner hits and only the first page .. Fetch inner hits and only the first page Change-Id: Ifc2dcb24111bfececa5c448f886f2db3a2b39aff --- M oozie/query_clicks/hourly/query_clicks_hourly.hql 1 file changed, 4 insertions(+), 2 deletions(-) Approvals: EBernhardson: Looks good to me, approved Bearloga: Looks good to me, but someone else must approve jenkins-bot: Verified Chelsyx: Checked; Looks good to me, but someone else must approve diff --git a/oozie/query_clicks/hourly/query_clicks_hourly.hql b/oozie/query_clicks/hourly/query_clicks_hourly.hql index 8b825b5..7cb82eb 100644 --- a/oozie/query_clicks/hourly/query_clicks_hourly.hql +++ b/oozie/query_clicks/hourly/query_clicks_hourly.hql @@ -138,7 +138,7 @@ csrs.identity, csrs.id AS request_set_token, csrs.ts AS timestamp, -csrs.hits +get_main_search_request(csrs.wikiid, csrs.requests).hits AS hits FROM ${source_cirrus_table} csrs JOIN @@ -156,9 +156,11 @@ -- Make sure we only extract from content index AND SIZE(get_main_search_request(csrs.wikiid, csrs.requests).indices) == 1 AND get_main_search_request(csrs.wikiid, csrs.requests).indices[0] LIKE '%_content' +-- Only fetch first page for simplicity +AND get_main_search_request(csrs.wikiid, csrs.requests).hitsoffset = 0 -- We only want 'normal' requests here. if the user requested more than -- the default 20 results filter them out -AND SIZE(csrs.hits) <= 20 +AND SIZE(get_main_search_request(csrs.wikiid, csrs.requests).hits) <= 20 ) INSERT OVERWRITE TABLE -- To view, visit https://gerrit.wikimedia.org/r/384989 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ifc2dcb24111bfececa5c448f886f2db3a2b39aff Gerrit-PatchSet: 2 Gerrit-Project: wikimedia/discovery/analytics Gerrit-Branch: master Gerrit-Owner: DCausseGerrit-Reviewer: Bearloga Gerrit-Reviewer: Chelsyx Gerrit-Reviewer: EBernhardson Gerrit-Reviewer: jenkins-bot <> ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
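In words: only the main full-text request against a content index counts, and only when it is the first page of a default-size result set. The same predicate restated over a plain Python dict purely for illustration (field names mirror the HQL above):

def is_first_default_page(main_request):
    indices = main_request["indices"]
    return (
        len(indices) == 1
        and indices[0].endswith("_content")
        and main_request["hitsoffset"] == 0     # first page only
        and len(main_request["hits"]) <= 20     # default result size
    )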
[MediaWiki-commits] [Gerrit] wikimedia...analytics[master]: Calculate click data for top queries
EBernhardson has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/317019 ) Change subject: Calculate click data for top queries .. Calculate click data for top queries Joins search request logs against the web requests table to locate click throughs on search requests on a daily basis. This is the first step in figuring out a way to judge engine relevance based on user click throughs. * Applies some of dcausse's approaches to limit the search logs to full text searches performed via web on Special:Search. * Takes +1 hour of webrequest data as compared to search log data, to ensure we collect the clicks against searches near the end of the last hour * Applies naive sessionization of queries, defining a new session as being more than 30 minutes without a search query from a user identity. * Identities performing more than 1k queries per day are filtered out of the daily table for performance reasons. It may be desirable to limit even further in sources that consume this data. Bug: T162054 Depends-On: I67d5f0e7674f970b353ab5992fec1431f4592256 Depends-On: I458e7ac724fefe813732b48fcfcef4728359fca9 Change-Id: I09f253849d8a1d28a3c26dc6b0f60233074d6a90 --- A hive/query_clicks/create_query_clicks_daily.hql A hive/query_clicks/create_query_clicks_hourly.hql M oozie/datasets.xml A oozie/query_clicks/daily/coordinator.properties A oozie/query_clicks/daily/coordinator.xml A oozie/query_clicks/daily/drop_query_clicks_hourly_partitions.hql A oozie/query_clicks/daily/query_clicks_daily.hql A oozie/query_clicks/daily/workflow.xml A oozie/query_clicks/datasets.xml A oozie/query_clicks/hourly/coordinator.properties A oozie/query_clicks/hourly/coordinator.xml A oozie/query_clicks/hourly/query_clicks_hourly.hql A oozie/query_clicks/hourly/workflow.xml 13 files changed, 1,178 insertions(+), 0 deletions(-) Approvals: jenkins-bot: Verified DCausse: Looks good to me, approved diff --git a/hive/query_clicks/create_query_clicks_daily.hql b/hive/query_clicks/create_query_clicks_daily.hql new file mode 100644 index 000..1c8ff68 --- /dev/null +++ b/hive/query_clicks/create_query_clicks_daily.hql @@ -0,0 +1,19 @@ +CREATE TABLE `discovery.query_clicks_daily`( + `query` string, + `q_by_ip_day` int, + `timestamp` bigint, + `wikiid` string, + `project` string, + `hits` array>, + `clicks` array >, + `session_id` string +) +PARTITIONED BY ( + `year` int, + `month` int, + `day` int +) +STORED AS PARQUET +LOCATION 'hdfs://analytics-hadoop/wmf/data/discovery/query_clicks/daily' +; + diff --git a/hive/query_clicks/create_query_clicks_hourly.hql b/hive/query_clicks/create_query_clicks_hourly.hql new file mode 100644 index 000..77f5971 --- /dev/null +++ b/hive/query_clicks/create_query_clicks_hourly.hql @@ -0,0 +1,20 @@ +CREATE TABLE `discovery.query_clicks_hourly` ( + `query` string, + `ip` string, + `identity` string, + `timestamp` bigint, + `wikiid` string, + `project` string, + `hits` array >, + `clicks` array > +) +PARTITIONED BY ( + `year` int, + `month` int, + `day` int, + `hour` int +) +STORED AS PARQUET +LOCATION 'hdfs://analytics-hadoop/wmf/data/discovery/query_clicks/hourly' +; + diff --git a/oozie/datasets.xml b/oozie/datasets.xml index 68c7ec2..e33f73d 100644 --- a/oozie/datasets.xml +++ b/oozie/datasets.xml @@ -31,4 +31,5 @@ ${popularity_score_data_directory}/agg_days=${days_aggregated}/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0} _SUCCESS + diff --git a/oozie/query_clicks/daily/coordinator.properties b/oozie/query_clicks/daily/coordinator.properties new file mode 100644 index 
000..3874b0b --- /dev/null +++ b/oozie/query_clicks/daily/coordinator.properties @@ -0,0 +1,63 @@ +# Configures a coordinator to manage automatically merging +# query_clicks_hourly into a daily table. +# +# Any of the following properties are overidable with -D. +# Usage: +# oozie job -Duser=$USER -Dstart_time=2016-12-01T00:00Z -submit \ +# -config oozie/query_clicks/daily/coordinator.properties +# +# NOTE: Both *_oozie_directory must be synced to HDFS so that all relevant +#.xml files exist there when this job is submitted. + +# Base path in HDFS to this repository oozie files. +# Other files will be used relative to this path. +discovery_oozie_directory = ${name_node}/wmf/discovery/current/oozie + +# Base path in HDFS to the analytics team oozie files. +# Other files will be used relative to this path +refinery_directory= ${name_node}/wmf/refinery/current +analytics_oozie_directory = ${refinery_directory}/oozie + +name_node =
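The "naive sessionization" described in the commit message is a 30-minute inactivity cutoff per identity. A minimal Python sketch of that rule (timestamps assumed to be unix seconds; helper names are illustrative):

SESSION_GAP = 30 * 60  # seconds without a query before a new session starts

def sessionize(timestamps):
    sessions, current = [], []
    for ts in sorted(timestamps):
        if current and ts - current[-1] > SESSION_GAP:
            sessions.append(current)
            current = []
        current.append(ts)
    if current:
        sessions.append(current)
    return sessions

# Identities issuing more than 1000 queries in a day are dropped from the
# daily table entirely, for performance reasons.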
[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[wmf/1.31.0-wmf.6]: Try to unify phrase rescore with RescoreBuilder
EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/390162 ) Change subject: Try to unify phrase rescore with RescoreBuilder .. Try to unify phrase rescore with RescoreBuilder - kill Searcher::installBoosts - single place to build rescore - all sort options handled in the same switch/case benefits: allow to customize positioning of the phrase rescore drawbacks: rescore profiles need to add a placeholder for the phrase rescore Bug: T178906 Change-Id: I438153c9fe52d8275868ddf3f0a0bd7a0cc5627f (cherry picked from commit bc5a8a63929c1e4cbec65ef16b5221c4c1264285) --- M includes/Query/FullTextQueryStringQueryBuilder.php M includes/Search/RescoreBuilders.php M includes/Search/SearchContext.php M includes/Searcher.php M profiles/RescoreProfiles.config.php M tests/unit/fixtures/searchText/ltr_001.query 6 files changed, 97 insertions(+), 80 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/62/390162/1 diff --git a/includes/Query/FullTextQueryStringQueryBuilder.php b/includes/Query/FullTextQueryStringQueryBuilder.php index d1883a5..0c727b3 100644 --- a/includes/Query/FullTextQueryStringQueryBuilder.php +++ b/includes/Query/FullTextQueryStringQueryBuilder.php @@ -223,19 +223,12 @@ $rescoreFields = $nonAllFields; } - $searchContext->addRescore( [ - 'window_size' => $this->config->get( 'CirrusSearchPhraseRescoreWindowSize' ), - 'query' => [ - 'rescore_query' => $this->buildPhraseRescoreQuery( + $searchContext->setPhraseRescoreQuery( $this->buildPhraseRescoreQuery( $searchContext, $rescoreFields, $this->queryStringQueryString, $this->config->getElement( 'CirrusSearchPhraseSlop', 'boost' ) - ), - 'query_weight' => 1.0, - 'rescore_query_weight' => $this->config->get( 'CirrusSearchPhraseRescoreBoost' ), - ] - ] ); + ) ); } if ( $showSuggestion ) { @@ -269,7 +262,6 @@ 'query' => $this->queryStringQueryString, 'default_operator' => 'AND', ] ] ) ); - $searchContext->clearRescore(); return true; } @@ -639,9 +631,7 @@ // Queries with the quote already contain a phrase query and we // can't build phrase queries out of phrase queries at this // point. - if ( $this->config->get( 'CirrusSearchPhraseRescoreBoost' ) > 0.0 && - $this->config->get( 'CirrusSearchPhraseRescoreWindowSize' ) && - !$searchContext->isSpecialKeywordUsed() && + if ( !$searchContext->isSpecialKeywordUsed() && strpos( $this->queryStringQueryString, '"' ) === false && ( $this->useTokenCountRouter || strpos( $this->queryStringQueryString, ' ' ) !== false ) ) { diff --git a/includes/Search/RescoreBuilders.php b/includes/Search/RescoreBuilders.php index f63826d..12d4054 100644 --- a/includes/Search/RescoreBuilders.php +++ b/includes/Search/RescoreBuilders.php @@ -52,6 +52,7 @@ const FUNCTION_SCORE_TYPE = "function_score"; const LTR_TYPE = "ltr"; + const PHRASE = "phrase"; /** * @var SearchContext @@ -82,11 +83,14 @@ $rescores = []; foreach ( $this->profile['rescore'] as $rescoreDef ) { $windowSize = $this->windowSize( $rescoreDef ); + if ( $windowSize <= 0 ) { + continue; + } $rescore = [ 'window_size' => $windowSize, ]; - $rescore['query'] = array_intersect_key( $rescoreDef, array_flip( self::$rescoreMainParams ) ); + $rescore['query'] = $this->prepareQueryParams( $rescoreDef ); $rescoreQuery = $this->buildRescoreQuery( $rescoreDef ); if ( $rescoreQuery === null ) { continue; @@ -111,6 +115,8 @@ return $funcChain->buildRescoreQuery(); case self::LTR_TYPE: return $this->buildLtrQuery( $rescoreDef['model'] ); + case self::PHRASE: +