EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/317541
Change subject: Calculate a bootstrapped confidence interval for engine scores
......................................................................

Calculate a bootstrapped confidence interval for engine scores

Change-Id: I25fdab358fd4aed1a73bb1312df7ccc398a7499a
---
M engineScore.ini
M engineScore.py
2 files changed, 92 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge refs/changes/41/317541/1

diff --git a/engineScore.ini b/engineScore.ini
index 1586d14..b47dc55 100644
--- a/engineScore.ini
+++ b/engineScore.ini
@@ -10,6 +10,14 @@
 labHost = relforge-search.search.eqiad.wmflabs
 ; Note that when doing nDCG calculations the limit here must be >= the p value (default 20).
 searchCommand = cd /srv/mediawiki-vagrant && mwvagrant ssh -- mwscript extensions/CirrusSearch/maintenance/runSearch.php --wiki wiki --server en-wp-bm25-inclinks-relforge.wmflabs.org --fork 16 --limit 20
+; Generate a confidence interval. This can take a good amount of time depending
+; on the size. For example the ClickOrdering scorer with 2k queries and 9k
+; constraints takes ~51ms per round. With the default 10k samples thats a little
+; over 8 minutes on my laptop.
+confidence = true
+; Display a histogram of the samples generated for the confidence interval. Not
+; sure if useful for anything, but it's pretty to look at.
+ci_histogram = true

 ;; brute force optimization. requires scipy and matplotlib. This section
 ;; may be omitted to calculate the engine score a single time.
diff --git a/engineScore.py b/engineScore.py
index 694bdc1..b3ca42b 100755
--- a/engineScore.py
+++ b/engineScore.py
@@ -169,10 +169,19 @@
         for scorer in self.scorers:
             scorer.report()

+    def prepare(self, results):
+        data = []
+        for scorer in self.scorers:
+            try:
+                data.append(scorer.prepare(results))
+            except NotImplementedError:
+                data.append(results)
+        return data
+
     def engine_score(self, results):
         scores = []
-        for scorer in self.scorers:
-            score = scorer.engine_score(results)
+        for scorer, data in zip(self.scorers, results):
+            score = scorer.engine_score(data)
             if type(score) == EngineScoreSet:
                 scores = scores + score.scores
             else:
@@ -215,8 +224,6 @@
             # generated, for a single user + query pair.
             clicks_and_hits = json.loads(encoded_clicks_and_hits)

-            # Set ensures if multiple clicks generate the same constraints, we
-            # only evaluate them once. But is that desired? Not sure...
             constraints = set()

             # The set of clicked page id's for this user + query
@@ -250,7 +257,7 @@
         return "Click > Skip Above, Skip Next"

     def report(self):
-        print("Loaded ClickOrdering with %d queries with %d constraints" % (
+        print("Loaded ClickOrdering with %d queries and %d constraints" % (
             len(self._constraints),
             sum([len(self._constraints[q]) for q in self._constraints])))

@@ -273,17 +280,13 @@
         in hits.
         """
         for page_id in hits:
-            if type(page_id) != type(prefer) or type(page_id) != type(over):
-                pprint.pprint(page_id)
-                pprint.pprint(prefer)
-                pprint.pprint(over)
             if page_id == prefer:
                 return True
             if page_id == over:
                 return False
         return False

-    def engine_score(self, results):
+    def prepare(self, results):
         query_agreements = []
         for query in results:
             hits = [int(r['docId']) for r in results[query]]
@@ -293,7 +296,10 @@
                 if self._evaluate_constraint(prefer, over, hits):
                     agreements += 1
             query_agreements.append(agreements / float(len(constraints)))
-        return EngineScore(self.name(), sum(query_agreements) / len(query_agreements))
+        return query_agreements
+
+    def engine_score(self, results):
+        return EngineScore(self.name(), sum(results) / len(results))


 # Discounted Cumulative Gain
@@ -429,7 +435,10 @@
                 print("Expected %d queries, but %d were missing" % (
                     len(self.dcg.dcgs), errors))
             scores.append(EngineScore(self.name(k), sum(ndcgs) / len(ndcgs)))
-        return EngineScoreSet(scores)
+        if len(scores) == 1:
+            return scores[0]
+        else:
+            return EngineScoreSet(scores)


 # http://olivier.chapelle.cc/pub/err.pdf
@@ -490,7 +499,10 @@
         for k in self.k:
             total_err = sum([self._query_score(k, q, results[q]) for q in results])
             scores.append(EngineScore(self.name(k), total_err / len(results)))
-        return EngineScoreSet(scores)
+        if len(scores) == 1:
+            return scores[0]
+        else:
+            return EngineScoreSet(scores)


 # Formula from talk given by Paul Nelson at ElasticON 2016
@@ -593,17 +605,23 @@
 class EngineScore(object):
     def __init__(self, name, score, histogram=None):
         self.name = name
-        self.score = score
+        self.engine_score = score
         self.histogram = histogram

     def name(self):
         return self.name

     def score(self):
-        return self.score
+        return self.engine_score

     def output(self, verbose=True):
-        print('%s: %0.2f' % (self.name, self.score))
+        print('%s: %0.4f' % (self.name, self.engine_score))
+        try:
+            pprint.pprint(self.confidence)
+            print('confidence: %f <= x <= %f' % (self.confidence[0], self.confidence[1]))
+        except AttributeError:
+            pass
+
         if verbose and self.histogram is not None:
             print('Histogram:')
             print(str(self.histogram))
@@ -630,6 +648,33 @@
         score.output(verbose)


+def confidence_interval(scorer, results, draw_histogram, n_samples=10000):
+    import scikits.bootstrap
+
+    scores = []
+
+    if type(results) == dict:
+        debug("Can't compute confidence interval with dicts, it doesn't " +
+              "allow for sampling with replacement")
+        return None
+
+    def f(data):
+        engine_score = scorer.engine_score(data)
+        if draw_histogram:
+            scores.append(engine_score.score())
+        return engine_score.score()
+
+    ci = scikits.bootstrap.ci(results, f, n_samples=n_samples)
+    if draw_histogram:
+        import matplotlib.pyplot as plt
+
+        n, bins, patches = plt.hist(scores, 50, normed=True)
+        plt.xlabel(scorer.name())
+        plt.ylabel('Probability')
+        plt.show()
+    return ci
+
+
 def score(scorer, config):
     # Run all the queries
     print('Running queries')
@@ -637,7 +682,29 @@
     results = load_results(results_dir)

     print('Calculating engine score')
-    return scorer.engine_score(results)
+
+    try:
+        results = scorer.prepare(results)
+    except NotImplementedError:
+        pass
+
+    engine_score = scorer.engine_score(results)
+    if config.has_option('test1', 'confidence') and config.getboolean('test1', 'confidence'):
+        if type(engine_score) == EngineScoreSet:
+            # We could, but it's a pain and probably not necessary...
+            print("Can't generate confidence intervals for multiple scoring implementations")
+        else:
+            histogram = config.has_option('test1', 'ci_histogram') and \
+                config.getboolean('test1', 'ci_histogram')
+            if config.has_option('test1', 'ci_samples'):
+                n_samples = config.getint('test1', 'ci_samples')
+            else:
+                n_samples = 10000
+            ci = confidence_interval(scorer, results, histogram, n_samples=n_samples)
+            if ci is not None:
+                engine_score.confidence = ci
+
+    return engine_score


 def make_search_config(config, x):
--
To view, visit https://gerrit.wikimedia.org/r/317541
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I25fdab358fd4aed1a73bb1312df7ccc398a7499a
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/relevanceForge
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits