EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/318024

Change subject: Track reliability of query scores
......................................................................

Track reliability of query scores

There can be problems where we have a large disagreement between users
about what should or should not be relevant for a query. Try to resolve
that by adding the query back into the scoring queue to get a couple
more users to look at it.

Also adds a 'reliable' column to the scores, so that relevance forge can
filter on it and only pull in scores that have been deemed reliable. The
current method is very naive, but might be "good enough".

Bug: T146189
Change-Id: I36091436acb81b343a14ca0801ce77924055a654
---
M schema.mysql.sql
M src/RelevanceScoring/QueriesManager.php
M src/RelevanceScoring/RelevanceScoringProvider.php
M src/RelevanceScoring/Repository/ScoresRepository.php
M src/RelevanceScoring/Repository/ScoringQueueRepository.php
5 files changed, 102 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/discernatron refs/changes/24/318024/1

diff --git a/schema.mysql.sql b/schema.mysql.sql
index 72f3fb2..e3ff2f6 100644
--- a/schema.mysql.sql
+++ b/schema.mysql.sql
@@ -42,13 +42,14 @@
     UNIQUE KEY `results_source_results_id_source` (`results_id`, `source`),
     KEY `results_sources_snippet_order` (`query_id`, `results_id`, `snippet_score`)
 ) CHARSET=utf8mb4;
-CREATE TABLE IF NOT EXISTS scores (
+CREATE TABLE IF NOT EXISTS `scores` (
     id INTEGER UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
     user_id INTEGER UNSIGNED NOT NULL,
     result_id INTEGER UNSIGNED NOT NULL,
     query_id INTEGER UNSIGNED NOT NULL,
     score TINYINT UNSIGNED,
     created INTEGER UNSIGNED NOT NULL,
+    reliable TINYINT UNSIGNED NOT NULL,
     FOREIGN KEY `scores_user_id` (user_id) REFERENCES users(id),
     FOREIGN KEY `scores_result_id` (result_id) REFERENCES results(id),
     FOREIGN KEY `scores_query_id` (query_id) REFERENCES queries(id),
diff --git a/src/RelevanceScoring/QueriesManager.php b/src/RelevanceScoring/QueriesManager.php
index 4a642a7..53a90f2 100644
--- a/src/RelevanceScoring/QueriesManager.php
+++ b/src/RelevanceScoring/QueriesManager.php
@@ -27,14 +27,30 @@
     private $queriesRepo;
     /** @var UsersRepository */
     private $usersRepo;
+    /** @var int */
+    private $maxScoresPerQuery;
+    /** @var int */
+    private $queuePriority;
 
+    /**
+     * @param User $user
+     * @param QueriesRepository $queriesRepo
+     * @param ResultsRepository $resultsRepo
+     * @param ScoresRepository $scoresRepo
+     * @param ScoringQueueRepository $scoringQueueRepo
+     * @param UsersRepository $usersRepo
+     * @param int $maxScoresPerQuery
+     * @param int $queuePriority
+     */
     public function __construct(
         User $user,
         QueriesRepository $queriesRepo,
         ResultsRepository $resultsRepo,
         ScoresRepository $scoresRepo,
         ScoringQueueRepository $scoringQueueRepo,
-        UsersRepository $usersRepo
+        UsersRepository $usersRepo,
+        $maxScoresPerQuery,
+        $queuePriority
     ) {
         $this->user = $user;
         $this->resultsRepo = $resultsRepo;
@@ -42,6 +58,8 @@
         $this->scoringQueueRepo = $scoringQueueRepo;
         $this->queriesRepo = $queriesRepo;
         $this->usersRepo = $usersRepo;
+        $this->maxScoresPerQuery = $maxScoresPerQuery;
+        $this->queuePriority = $queuePriority;
     }
 
     public function nextQueryId() {
@@ -73,6 +91,7 @@
     public function saveScores($queryId, array $scores) {
         $this->scoresRepo->storeQueryScores($this->user, $queryId, $scores);
         $this->scoringQueueRepo->markScored($this->user, $queryId);
+        $this->updateReliability($queryId);
     }
 
     public function updateUserStorage() {
@@ -119,4 +138,38 @@
 
         return $array;
     }
+
+    /**
+     * Re-evaluate whether the scores collected for a query can be trusted.
+     *
+     * Called after a user's scores for the query have been saved. While
+     * scoring slots for the query are still pending nothing happens.
+     * Otherwise the query's scores are marked reliable when
+     * ScoresRepository::checkReliability() reports a truthy result; if it
+     * does not, the query is put back into the scoring queue so more users
+     * look at it, never asking for more than $maxScoresPerQuery scores in
+     * total.
+     *
+     * @param int $queryId
+     */
+    private function updateReliability($queryId) {
+        // If there are still pending scores do nothing.
+        // NOTE(review): assumes getNumberPending() maps query id => pending
+        // count; !empty() keeps a reported count of 0 from looking pending.
+        $pendingCount = $this->scoringQueueRepo->getNumberPending([$queryId]);
+        if ( !empty( $pendingCount[$queryId] ) ) {
+            return;
+        }
+
+        if ( $this->scoresRepo->checkReliability($queryId) ) {
+            $this->scoresRepo->markReliable($queryId);
+            return;
+        }
+
+        // Query is unreliable and there are no pending scores.
+        // getNumberOfScores() is provided by the scores repository.
+        $numberOfScores = $this->scoresRepo->getNumberOfScores([$queryId]);
+        if ( !isset( $numberOfScores[$queryId] ) ) {
+            // No scores are recorded for the query, so there is nothing to
+            // base a re-queue on.
+            return;
+        }
+
+        // Request twice as many scores to be given, capped at the maximum.
+        $currentScores = (int)$numberOfScores[$queryId];
+        $desiredScores = min($this->maxScoresPerQuery, 2 * $currentScores);
+        $neededScores = $desiredScores - $currentScores;
+        if ( $neededScores <= 0 ) {
+            // We already have as many scores as we are willing to collect
+            // and this is still unreliable...just leave it. Also avoids
+            // handing insert() an empty slot list.
+            // We should probably have some page that reports these so we can
+            // manually review/fix them.
+            return;
+        }
+
+        $slots = array_pad([], $neededScores, $this->queuePriority);
+        $this->scoringQueueRepo->insert($queryId, $slots);
+    }
 }
diff --git a/src/RelevanceScoring/RelevanceScoringProvider.php b/src/RelevanceScoring/RelevanceScoringProvider.php
index 5421111..091cc39 100644
--- a/src/RelevanceScoring/RelevanceScoringProvider.php
+++ b/src/RelevanceScoring/RelevanceScoringProvider.php
@@ -246,6 +246,8 @@
     private function registerControllers(Application $app)
     {
         // helper for queries controller
+        $app['search.queries_manager.max_scores_per_query'] = 5;
+        $app['search.queries_manager.queue_priority'] = 3;
         $app['search.queries_manager'] = function () use ($app) {
             return new QueriesManager(
                 $app['session']->get('user'),
@@ -253,7 +255,9 @@
                 $app['search.repository.results'],
                 $app['search.repository.scores'],
                 $app['search.repository.scoring_queue'],
-                $app['search.repository.users']
+                $app['search.repository.users'],
+                $app['search.queries_manager.max_scores_per_query'],
+                $app['search.queries_manager.queue_priority']
             );
         };
         $app['search.controller.queries'] = function () use ($app) {
diff --git a/src/RelevanceScoring/Repository/ScoresRepository.php b/src/RelevanceScoring/Repository/ScoresRepository.php
index 7200b97..bfa7c03 100644
--- a/src/RelevanceScoring/Repository/ScoresRepository.php
+++ b/src/RelevanceScoring/Repository/ScoresRepository.php
@@ -37,6 +37,7 @@
             'query_id' => $queryId,
             'score' => $score,
             'created' => time(),
+            'reliable' => false,
         ];
         $affected = $this->db->insert('scores', $row);
         if ($affected !== 1) {
@@ -46,6 +47,39 @@
         return $this->db->lastInsertId();
     }
 
+    /**
+     * Decide whether users agree closely enough on a query's results.
+     *
+     * For every result of the query the spread between its highest and
+     * lowest score is computed. The query counts as reliable when the
+     * average of those spreads is at most $maxAverageSpread.
+     *
+     * @param int $queryId
+     * @param float $maxAverageSpread Largest average (max - min) spread that
+     *  is still considered reliable. NOTE(review): the default of 1.0 is a
+     *  placeholder and should be tuned against real scoring data.
+     * @return bool False when no spread could be computed for the query.
+     * @throws RuntimeException When the query could not be executed.
+     */
+    public function checkReliability($queryId, $maxAverageSpread = 1.0) {
+        $sql = <<<EOD
+SELECT AVG(diff)
+  FROM ( SELECT MAX(score)-MIN(score) as diff
+           FROM scores
+           WHERE query_id = :queryId
+           GROUP BY result_id
+       ) x
+EOD;
+
+        $res = $this->db->executeQuery($sql, [
+            'queryId' => $queryId
+        ]);
+
+        if ($res === false) {
+            throw new RuntimeException('Query Failure');
+        }
+
+        $averageSpread = $res->fetchColumn(0);
+        if ($averageSpread === null || $averageSpread === false) {
+            // AVG() over zero rows yields NULL: nothing to judge.
+            return false;
+        }
+
+        // The database driver may hand the average back as a string such as
+        // "0.0000", which PHP treats as truthy. Compare numerically so that
+        // a small spread means reliable and a large one does not.
+        return (float)$averageSpread <= $maxAverageSpread;
+    }
+
+    /**
+     * Flag every score stored for the given query as reliable.
+     *
+     * @param int $queryId
+     */
+    public function markReliable($queryId) {
+        $params = ['queryId' => $queryId];
+        $update = <<<EOD
+UPDATE scores
+SET reliable=1
+WHERE query_id = :queryId
+EOD;
+
+        $this->db->executeQuery($update, $params);
+    }
+
     public function getNumberOfScores(array $queryIds)
     {
         $sql = <<<EOD
diff --git a/src/RelevanceScoring/Repository/ScoringQueueRepository.php b/src/RelevanceScoring/Repository/ScoringQueueRepository.php
index 1c9c06e..e55a216 100644
--- a/src/RelevanceScoring/Repository/ScoringQueueRepository.php
+++ b/src/RelevanceScoring/Repository/ScoringQueueRepository.php
@@ -73,15 +73,17 @@
      * Mark a queryId as needing to be scored $numSlots times.
      *
      * @param int $queryId
+     * @param int[] $slots The list of scoring priorities to
+     *  insert into the queue.
      */
-    public function insert($queryId)
+    public function insert($queryId, array $slots = null)
     {
         $params = ['queryId' => $queryId];
         $rows = [];
-        // very simple priority assignment from 1 to $numSlots. Note
-        // that 0 is the highest priority, and we create two items with
-        // priority 1.
-        foreach ($this->defaultSlots as $priority) {
+        if ( $slots === null ) {
+            $slots = $this->defaultSlots;
+        }
+        foreach ($slots as $priority) {
             $rows[] = "(:queryId, :priority$priority)";
             $params["priority$priority"] = $priority;
         }

-- 
To view, visit https://gerrit.wikimedia.org/r/318024
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I36091436acb81b343a14ca0801ce77924055a654
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/discernatron
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to