EBernhardson has uploaded a new change for review. https://gerrit.wikimedia.org/r/318024
Change subject: Track reliability of query scores ...................................................................... Track reliability of query scores There can be problems where we have a large disagreement between users about what should or should not be relevant for a query. Try to resolve that by adding the query back into the scoring queue to get a couple more users to look at it. Also adds a 'reliable' column to the scores, so that relevance forge can filter on it and only pull in scores that have been deemed reliable. The current method is very naive, but might be "good enough". Bug: T146189 Change-Id: I36091436acb81b343a14ca0801ce77924055a654 --- M schema.mysql.sql M src/RelevanceScoring/QueriesManager.php M src/RelevanceScoring/RelevanceScoringProvider.php M src/RelevanceScoring/Repository/ScoresRepository.php M src/RelevanceScoring/Repository/ScoringQueueRepository.php 5 files changed, 102 insertions(+), 8 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/discernatron refs/changes/24/318024/1 diff --git a/schema.mysql.sql b/schema.mysql.sql index 72f3fb2..e3ff2f6 100644 --- a/schema.mysql.sql +++ b/schema.mysql.sql @@ -42,13 +42,14 @@ UNIQUE KEY `results_source_results_id_source` (`results_id`, `source`), KEY `results_sources_snippet_order` (`query_id`, `results_id`, `snippet_score`) ) CHARSET=utf8mb4; -CREATE TABLE IF NOT EXISTS scores ( +CREATE TABLE IF NOT EXISTS `scores` ( id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, user_id INTEGER UNSIGNED NOT NULL, result_id INTEGER UNSIGNED NOT NULL, query_id INTEGER UNSIGNED NOT NULL, score TINYINT UNSIGNED, created INTEGER UNSIGNED NOT NULL, + reliable TINYINT UNSIGNED NOT NULL, FOREIGN KEY `scores_user_id` (user_id) REFERENCES users(id), FOREIGN KEY `scores_result_id` (result_id) REFERENCES results(id), FOREIGN KEY `scores_query_id` (query_id) REFERENCES queries(id), diff --git a/src/RelevanceScoring/QueriesManager.php b/src/RelevanceScoring/QueriesManager.php index 
4a642a7..53a90f2 100644 --- a/src/RelevanceScoring/QueriesManager.php +++ b/src/RelevanceScoring/QueriesManager.php @@ -27,14 +27,30 @@ private $queriesRepo; /** @var UsersRepository */ private $usersRepo; + /** @var int */ + private $maxScoresPerQuery; + /** @var int */ + private $queuePriority; + /** + * @param User $user + * @param QueriesRepository $queriesRepo + * @param ResultsRepository $resultsRepo + * @param ScoresRepository $scoresRepo + * @param ScoringQueueRepository $scoringQueueRepo + * @param UsersRepository $usersRepo + * @param int $maxScoresPerQuery + * @param int $queuePriority + */ public function __construct( User $user, QueriesRepository $queriesRepo, ResultsRepository $resultsRepo, ScoresRepository $scoresRepo, ScoringQueueRepository $scoringQueueRepo, - UsersRepository $usersRepo + UsersRepository $usersRepo, + $maxScoresPerQuery, + $queuePriority ) { $this->user = $user; $this->resultsRepo = $resultsRepo; @@ -42,6 +58,8 @@ $this->scoringQueueRepo = $scoringQueueRepo; $this->queriesRepo = $queriesRepo; $this->usersRepo = $usersRepo; + $this->maxScoresPerQuery = $maxScoresPerQuery; + $this->queuePriority = $queuePriority; } public function nextQueryId() { @@ -73,6 +91,7 @@ public function saveScores($queryId, array $scores) { $this->scoresRepo->storeQueryScores($this->user, $queryId, $scores); $this->scoringQueueRepo->markScored($this->user, $queryId); + $this->updateReliability($queryId); } public function updateUserStorage() { @@ -119,4 +138,38 @@ return $array; } + + private function updateReliability($queryId) { + // If there are still pending scores do nothing + $pendingCount = $this->scoringQueueRepo->getNumberPending([$queryId]); + if ( isset( $pendingCount[$queryId] ) ) { + return; + } + + $reliable = $this->scoresRepo->checkReliability($queryId); + if ( $reliable ) { + $this->scoresRepo->markReliable($queryId); + return; + } + + // Query is unreliable and there are no pending scores. 
+ $numberOfScores = $this->scoringQueueRepo->getNumberOfScores([$queryId]); + if ( !isset( $numberOfScores[$queryId])) { + // what?!?! + return; + } + + if ( $numberOfScores[$queryId] > $this->maxScoresPerQuery ) { + // we have plenty of scores and this is still unreliable...just leave it. + // We should probably have some page that reports these so we can manually + // review/fix them. + return; + } + + // Request twice as many scores to be given. + $desiredScores = min($this->maxScoresPerQuery, 2 * $numberOfScores[$queryId] ); + $neededScores = $desiredScores - $numberOfScores[$queryId]; + $slots = array_pad([], $neededScores, $this->queuePriority); + $this->scoringQueueRepo->insert($queryId, $slots); + } } diff --git a/src/RelevanceScoring/RelevanceScoringProvider.php b/src/RelevanceScoring/RelevanceScoringProvider.php index 5421111..091cc39 100644 --- a/src/RelevanceScoring/RelevanceScoringProvider.php +++ b/src/RelevanceScoring/RelevanceScoringProvider.php @@ -246,6 +246,8 @@ private function registerControllers(Application $app) { // helper for queries controller + $app['search.queries_manager.max_scores_per_query'] = 5; + $app['search.queries_manager.queue_priority'] = 3; $app['search.queries_manager'] = function () use ($app) { return new QueriesManager( $app['session']->get('user'), @@ -253,7 +255,9 @@ $app['search.repository.results'], $app['search.repository.scores'], $app['search.repository.scoring_queue'], - $app['search.repository.users'] + $app['search.repository.users'], + $app['search.queries_manager.max_scores_per_query'], + $app['search.queries_manager.queue_priority'] ); }; $app['search.controller.queries'] = function () use ($app) { diff --git a/src/RelevanceScoring/Repository/ScoresRepository.php b/src/RelevanceScoring/Repository/ScoresRepository.php index 7200b97..bfa7c03 100644 --- a/src/RelevanceScoring/Repository/ScoresRepository.php +++ b/src/RelevanceScoring/Repository/ScoresRepository.php @@ -37,6 +37,7 @@ 'query_id' => $queryId, 'score' => 
$score, 'created' => time(), + 'reliable' => false, ]; $affected = $this->db->insert('scores', $row); if ($affected !== 1) { @@ -46,6 +47,39 @@ return $this->db->lastInsertId(); } + public function checkReliability($queryId) { + $sql = <<<EOD +SELECT AVG(diff) + FROM ( SELECT MAX(score)-MIN(score) as diff + FROM scores + WHERE query_id = :queryId + GROUP BY result_id + ) x +EOD; + + $res = $this->db->executeQuery($sql, [ + 'queryId' => $queryId + ]); + + if ($res === false) { + throw new RuntimeException('Query Failure'); + } + + return $res->fetchColumn(0); + } + + public function markReliable($queryId) { + $sql = <<<EOD +UPDATE scores +SET reliable=1 +WHERE query_id = :queryId +EOD; + + $stmt = $this->db->executeQuery($sql, [ + 'queryId' => $queryId + ]); + } + public function getNumberOfScores(array $queryIds) { $sql = <<<EOD diff --git a/src/RelevanceScoring/Repository/ScoringQueueRepository.php b/src/RelevanceScoring/Repository/ScoringQueueRepository.php index 1c9c06e..e55a216 100644 --- a/src/RelevanceScoring/Repository/ScoringQueueRepository.php +++ b/src/RelevanceScoring/Repository/ScoringQueueRepository.php @@ -73,15 +73,17 @@ * Mark a queryId as needing to be scored $numSlots times. * * @param int $queryId + * @param int[] $slots The list of scoring priorities to + * insert into the queue. */ - public function insert($queryId) + public function insert($queryId, array $slots = null) { $params = ['queryId' => $queryId]; $rows = []; - // very simple priority assignment from 1 to $numSlots. Note - // that 0 is the highest priority, and we create two items with - // priority 1. 
- foreach ($this->defaultSlots as $priority) { + if ( $slots === null ) { + $slots = $this->defaultSlots; + } + foreach ($slots as $priority) { $rows[] = "(:queryId, :priority$priority)"; $params["priority$priority"] = $priority; } -- To view, visit https://gerrit.wikimedia.org/r/318024 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I36091436acb81b343a14ca0801ce77924055a654 Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/discovery/discernatron Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits