EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/317540

Change subject: Potential new click-based metric
......................................................................

Potential new click-based metric

The problem with PaulScore, our only current click-based metric, is that
bringing new (potentially better) results the user has never seen before
into a result set can lower the score, making a change look worse even
when it is not.

This implements a metric based on the relative ordering preferences users
express through their click behavior. It follows the paper 'Accurately
Interpreting Clickthrough Data as Implicit Feedback' by Thorsten Joachims,
which is highly cited and seems to be commonly used in learning-to-rank
scenarios.
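
For illustration only (not part of the patch), here is a minimal sketch of how
those two rules, Click > Skip Above and Click > Skip Next, turn a single click
into pairwise preference constraints. The data shapes and the 'pageid' key are
assumptions that mirror the ClickOrdering class in the diff below:

    # Illustrative sketch, not part of the patch. Given the hits shown for one
    # query and the set of pages the user clicked, emit (preferred, skipped)
    # pairs: Click > Skip Above covers every unclicked hit ranked above the
    # click, Click > Skip Next covers the unclicked hit immediately below it.
    def constraints_for_click(hits, clicked_page_ids, click_page_id):
        positions = [h['pageid'] for h in hits]
        if click_page_id not in positions:
            return set()
        click_pos = positions.index(click_page_id)
        pairs = set()
        # Click > Skip Above
        for hit_page_id in positions[:click_pos]:
            if hit_page_id not in clicked_page_ids:
                pairs.add((click_page_id, hit_page_id))
        # Click > Skip Next
        if click_pos + 1 < len(positions):
            next_page_id = positions[click_pos + 1]
            if next_page_id not in clicked_page_ids:
                pairs.add((click_page_id, next_page_id))
        return pairs

    # Example: hits ranked [11, 22, 33, 44], user clicked only 33.
    hits = [{'pageid': 11}, {'pageid': 22}, {'pageid': 33}, {'pageid': 44}]
    print(constraints_for_click(hits, set([33]), 33))
    # constraints: (33, 11), (33, 22), (33, 44)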

One thing I couldn't find any reference to, though, was how to generate a
score from the preferences. I've taken the naive approach of scoring each
query as the percentage of constraints that were satisfied, and taking the
engine score as the average of the per-query scores. I don't know if this
is the best way, but it seems plausible.
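
As a rough illustration of that scoring (again a sketch, not part of the patch;
the query names and page ids are made up), each query's score is the fraction
of its constraints the new ordering satisfies, and the engine score is the mean
of those per-query scores:

    # Illustrative sketch, not part of the patch. A constraint (prefer, over)
    # is satisfied when prefer appears in the new ordering before over does;
    # if prefer never appears it counts as unsatisfied, mirroring
    # _evaluate_constraint in the diff below.
    def satisfied(prefer, over, ranked_page_ids):
        for page_id in ranked_page_ids:
            if page_id == prefer:
                return True
            if page_id == over:
                return False
        return False

    def engine_score(constraints_by_query, results_by_query):
        per_query = []
        for query, constraints in constraints_by_query.items():
            hits = results_by_query[query]
            good = sum(1 for prefer, over in constraints
                       if satisfied(prefer, over, hits))
            # Per-query score: fraction of constraints satisfied
            per_query.append(good / float(len(constraints)))
        # Engine score: average of the per-query scores
        return sum(per_query) / len(per_query)

    # Made-up example: two queries, with three and two constraints.
    constraints = {'q1': set([(33, 11), (33, 22), (33, 44)]),
                   'q2': set([(7, 8), (7, 9)])}
    results = {'q1': [33, 11, 22, 44], 'q2': [8, 7, 9]}
    print(engine_score(constraints, results))  # q1: 3/3, q2: 1/2 -> 0.75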

Change-Id: I610db0e574164d32e659259ba325d3e681f06f2f
---
M engineScore.py
A sql/hive_clicks.yaml
2 files changed, 156 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge refs/changes/40/317540/1

diff --git a/engineScore.py b/engineScore.py
index 92b5f43..694bdc1 100755
--- a/engineScore.py
+++ b/engineScore.py
@@ -44,6 +44,7 @@
         'PaulScore': PaulScore,
         'nDCG': nDCG,
         'ERR': ERR,
+        'ClickOrdering': ClickOrdering,
     }
 
     query = CachedQuery(settings)
@@ -179,6 +180,122 @@
         return EngineScoreSet(scores)
 
 
+# https://www.cs.cornell.edu/people/tj/publications/joachims_etal_05a.pdf
+#
+# Generates a set of constraints, matching Click > Skip Above along with
+# Click > Skip Next, for each set of query + clicks. Reports the average
+# % of constraints that are satisfied per query by the provided results.
+class ClickOrdering(object):
+    def __init__(self, sql_result, options):
+        # Map from query string to set of tuples (a,b) such that a is preferred
+        # over b
+        self._constraints = {}
+
+        # The hive output is a mess; we need to burn a few lines.
+        # First remove lines until we get to one that starts with
+        # a space, which is where the query is being provided.
+        while sql_result[0][0] != ' ':
+            sql_result.pop(0)
+        # Then burn lines until the line no longer starts with a space,
+        # which is where we will find the query header.
+        while sql_result[0][0] == ' ':
+            sql_result.pop(0)
+        # Burn one more line, which is the query header.
+        sql_result.pop(0)
+        # We also need to burn one line off the end, which is the command prompt
+        sql_result.pop()
+
+        for line in sql_result:
+            try:
+                query, encoded_clicks_and_hits = line.strip().split("\t")
+            except ValueError:
+                debug("Bad input line: %s" % (line))
+                continue
+            # The set of clicks, and the hits shown when that click was
+            # generated, for a single user + query pair.
+            clicks_and_hits = json.loads(encoded_clicks_and_hits)
+
+            # A set ensures that if multiple clicks generate the same
+            # constraint, we only evaluate it once. But is that desired?
+            # Not sure...
+            constraints = set()
+
+            # The set of clicked page id's for this user + query
+            clicks = set([int(c['page_id']) for c in clicks_and_hits])
+            for meta in clicks_and_hits:
+                hits = meta['hits']
+                click_page_id = int(meta['page_id'])
+                click_pos = self._find_click_pos(hits, click_page_id)
+                if click_pos is None:
+                    # Click is not in hits somehow ... some sort of data
+                    # collection error but not sure what yet.
+                    continue
+                for i in xrange(0, click_pos):
+                    hit_page_id = int(hits[i]['pageid'])
+                    if hit_page_id not in clicks:
+                        constraints.add((click_page_id, hit_page_id))
+                if click_pos + 1 < len(hits):
+                    hit_page_id = int(hits[click_pos+1]['pageid'])
+                    if hit_page_id not in clicks:
+                        constraints.add((click_page_id, hit_page_id))
+
+            # Add new constraints to the set of constraints for the query
+            if len(constraints) > 0:
+                try:
+                    self._constraints[query] = self._constraints[query].union(constraints)
+                except KeyError:
+                    self._constraints[query] = constraints
+        self.queries = self._constraints.keys()
+
+    def name(self):
+        return "Click > Skip Above, Skip Next"
+
+    def report(self):
+        print("Loaded ClickOrdering with %d queries with %d constraints" % (
+              len(self._constraints),
+              sum([len(self._constraints[q]) for q in self._constraints])))
+
+    def _find_click_pos(self, hits, page_id):
+        """
+        Returns 0-indexed position of page_id in hits
+        """
+        pos = 0
+        for hit in hits:
+            if hit['pageid'] == page_id:
+                return pos
+            pos += 1
+        debug("page_id(%d) not found in hits: %s" % (page_id,
+              ",".join([str(hit['pageid']) for hit in hits])))
+        return None
+
+    def _evaluate_constraint(self, prefer, over, hits):
+        """
+        Returns True if prefer is found before over in hits. Returns
+        False if over comes first, or if neither is found.
+        """
+        for page_id in hits:
+            if type(page_id) != type(prefer) or type(page_id) != type(over):
+                pprint.pprint(page_id)
+                pprint.pprint(prefer)
+                pprint.pprint(over)
+            if page_id == prefer:
+                return True
+            if page_id == over:
+                return False
+        return False
+
+    def engine_score(self, results):
+        query_agreements = []
+        for query in results:
+            hits = [int(r['docId']) for r in results[query]]
+            constraints = self._constraints[query]
+            agreements = 0
+            for prefer, over in constraints:
+                if self._evaluate_constraint(prefer, over, hits):
+                    agreements += 1
+            query_agreements.append(agreements / float(len(constraints)))
+        return EngineScore(self.name(), sum(query_agreements) / len(query_agreements))
+
+
 # Discounted Cumulative Gain
 class DCG(object):
     def __init__(self, sql_result, options):
diff --git a/sql/hive_clicks.yaml b/sql/hive_clicks.yaml
new file mode 100644
index 0000000..ac7f56a
--- /dev/null
+++ b/sql/hive_clicks.yaml
@@ -0,0 +1,39 @@
+scoring:
+    algorithm: ClickOrdering
+    options: {}
+
+servers:
+    - host: stat1002.eqiad.wmnet
+      cmd: hive
+
+variables:
+    limit: 2000
+    num_searches: 10
+    project: en.wikipedia
+
+query: >
+    SELECT
+        query,
+        collect_list(named_struct('page_id', meta.page_id, 'hits', meta.hits)) as clicks
+    FROM (
+        SELECT
+            query,
+            collect_list(named_struct('identity', 'identity', 'page_id', page_id, 'hits', hits)) as collected
+        FROM 
+            ebernhardson.top_query_clicks
+        WHERE
+            num_searches > {num_searches}
+            AND project = '{project}'
+        GROUP BY
+            query
+        ORDER BY
+            RAND()
+        LIMIT
+            {limit}
+        ) x
+    LATERAL VIEW
+        EXPLODE(collected) m as meta
+    GROUP BY
+        query,
+        meta.identity;
+

-- 
To view, visit https://gerrit.wikimedia.org/r/317540
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I610db0e574164d32e659259ba325d3e681f06f2f
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/relevanceForge
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>
