EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/317541
Change subject: Calculate a bootstrapped confidence interval for engine scores
......................................................................

Calculate a bootstrapped confidence interval for engine scores

Change-Id: I25fdab358fd4aed1a73bb1312df7ccc398a7499a
---
M engineScore.ini
M engineScore.py
2 files changed, 92 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge refs/changes/41/317541/1

diff --git a/engineScore.ini b/engineScore.ini
index 1586d14..b47dc55 100644
--- a/engineScore.ini
+++ b/engineScore.ini
@@ -10,6 +10,14 @@
 labHost = relforge-search.search.eqiad.wmflabs
 ; Note that when doing nDCG calculations the limit here must be >= the p value (default 20).
 searchCommand = cd /srv/mediawiki-vagrant && mwvagrant ssh -- mwscript extensions/CirrusSearch/maintenance/runSearch.php --wiki wiki --server en-wp-bm25-inclinks-relforge.wmflabs.org --fork 16 --limit 20
+; Generate a confidence interval. This can take a good amount of time depending
+; on the size. For example the ClickOrdering scorer with 2k queries and 9k
+; constraints takes ~51ms per round. With the default 10k samples thats a little
+; over 8 minutes on my laptop.
+confidence = true
+; Display a histogram of the samples generated for the confidence interval. Not
+; sure if useful for anything, but it's pretty to look at.
+ci_histogram = true

 ;; brute force optimization. requires scipy and matplotlib. This section
 ;; may be omitted to calculate the engine score a single time.
diff --git a/engineScore.py b/engineScore.py
index 694bdc1..b3ca42b 100755
--- a/engineScore.py
+++ b/engineScore.py
@@ -169,10 +169,19 @@
         for scorer in self.scorers:
             scorer.report()

+    def prepare(self, results):
+        data = []
+        for scorer in self.scorers:
+            try:
+                data.append(scorer.prepare(results))
+            except NotImplementedError:
+                data.append(results)
+        return data
+
     def engine_score(self, results):
         scores = []
-        for scorer in self.scorers:
-            score = scorer.engine_score(results)
+        for scorer, data in zip(self.scorers, results):
+            score = scorer.engine_score(data)
             if type(score) == EngineScoreSet:
                 scores = scores + score.scores
             else:
@@ -215,8 +224,6 @@
             # generated, for a single user + query pair.
             clicks_and_hits = json.loads(encoded_clicks_and_hits)

-            # Set ensures if multiple clicks generate the same constraints, we
-            # only evaluate them once. But is that desired? Not sure...
             constraints = set()

             # The set of clicked page id's for this user + query
@@ -250,7 +257,7 @@
         return "Click > Skip Above, Skip Next"

     def report(self):
-        print("Loaded ClickOrdering with %d queries with %d constraints" % (
+        print("Loaded ClickOrdering with %d queries and %d constraints" % (
             len(self._constraints),
             sum([len(self._constraints[q]) for q in self._constraints])))

@@ -273,17 +280,13 @@
         in hits.
         """
         for page_id in hits:
-            if type(page_id) != type(prefer) or type(page_id) != type(over):
-                pprint.pprint(page_id)
-                pprint.pprint(prefer)
-                pprint.pprint(over)
             if page_id == prefer:
                 return True
             if page_id == over:
                 return False
         return False

-    def engine_score(self, results):
+    def prepare(self, results):
         query_agreements = []
         for query in results:
             hits = [int(r['docId']) for r in results[query]]
@@ -293,7 +296,10 @@
                 if self._evaluate_constraint(prefer, over, hits):
                     agreements += 1
             query_agreements.append(agreements / float(len(constraints)))
-        return EngineScore(self.name(), sum(query_agreements) / len(query_agreements))
+        return query_agreements
+
+    def engine_score(self, results):
+        return EngineScore(self.name(), sum(results) / len(results))


 # Discounted Cumulative Gain
@@ -429,7 +435,10 @@
                 print("Expected %d queries, but %d were missing" % (
                     len(self.dcg.dcgs), errors))
             scores.append(EngineScore(self.name(k), sum(ndcgs) / len(ndcgs)))
-        return EngineScoreSet(scores)
+        if len(scores) == 1:
+            return scores[0]
+        else:
+            return EngineScoreSet(scores)


 # http://olivier.chapelle.cc/pub/err.pdf
@@ -490,7 +499,10 @@
         for k in self.k:
             total_err = sum([self._query_score(k, q, results[q]) for q in results])
             scores.append(EngineScore(self.name(k), total_err / len(results)))
-        return EngineScoreSet(scores)
+        if len(scores) == 1:
+            return scores[0]
+        else:
+            return EngineScoreSet(scores)


 # Formula from talk given by Paul Nelson at ElasticON 2016
@@ -593,17 +605,23 @@
 class EngineScore(object):
     def __init__(self, name, score, histogram=None):
         self.name = name
-        self.score = score
+        self.engine_score = score
         self.histogram = histogram

     def name(self):
         return self.name

     def score(self):
-        return self.score
+        return self.engine_score

     def output(self, verbose=True):
-        print('%s: %0.2f' % (self.name, self.score))
+        print('%s: %0.4f' % (self.name, self.engine_score))
+        try:
+            pprint.pprint(self.confidence)
+            print('confidence: %f <= x <= %f' % (self.confidence[0], self.confidence[1]))
+        except AttributeError:
+            pass
+
         if verbose and self.histogram is not None:
             print('Histogram:')
             print(str(self.histogram))
@@ -630,6 +648,33 @@
         score.output(verbose)


+def confidence_interval(scorer, results, draw_histogram, n_samples=10000):
+    import scikits.bootstrap
+
+    scores = []
+
+    if type(results) == dict:
+        debug("Can't compute confidence interval with dicts, it doesn't " +
+              "allow for sampling with replacement")
+        return None
+
+    def f(data):
+        engine_score = scorer.engine_score(data)
+        if draw_histogram:
+            scores.append(engine_score.score())
+        return engine_score.score()
+
+    ci = scikits.bootstrap.ci(results, f, n_samples=n_samples)
+    if draw_histogram:
+        import matplotlib.pyplot as plt
+
+        n, bins, patches = plt.hist(scores, 50, normed=True)
+        plt.xlabel(scorer.name())
+        plt.ylabel('Probability')
+        plt.show()
+    return ci
+
+
 def score(scorer, config):
     # Run all the queries
     print('Running queries')
@@ -637,7 +682,29 @@
     results = load_results(results_dir)

     print('Calculating engine score')
-    return scorer.engine_score(results)
+
+    try:
+        results = scorer.prepare(results)
+    except NotImplementedError:
+        pass
+
+    engine_score = scorer.engine_score(results)
+    if config.has_option('test1', 'confidence') and config.getboolean('test1', 'confidence'):
+        if type(engine_score) == EngineScoreSet:
+            # We could, but it's a pain and probably not necessary...
+            print("Can't generate confidence intervals for multiple scoring implementations")
+        else:
+            histogram = config.has_option('test1', 'ci_histogram') and \
+                config.getboolean('test1', 'ci_histogram')
+            if config.has_option('test1', 'ci_samples'):
+                n_samples = config.getint('test1', 'ci_samples')
+            else:
+                n_samples = 10000
+            ci = confidence_interval(scorer, results, histogram, n_samples=n_samples)
+            if ci is not None:
+                engine_score.confidence = ci
+
+    return engine_score


 def make_search_config(config, x):
--
To view, visit https://gerrit.wikimedia.org/r/317541
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I25fdab358fd4aed1a73bb1312df7ccc398a7499a
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/relevanceForge
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits