Source code for lerot.comparison.ProbabilisticInterleave

# This file is part of Lerot.
#
# Lerot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lerot is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Lerot.  If not, see <http://www.gnu.org/licenses/>.

# KH, 2012/06/19

import argparse

from numpy import asarray, e, log, where, exp
from random import randint

from .AbstractInterleavedComparison import AbstractInterleavedComparison
from ..utils import split_arg_str


[docs]class ProbabilisticInterleave(AbstractInterleavedComparison): """Probabilistic interleaving, marginalizes over assignments""" def __init__(self, arg_str=None): if arg_str: parser = argparse.ArgumentParser(description="Parse arguments for " "interleaving method.", prog=self.__class__.__name__) parser.add_argument("-a", "--aggregate", choices=["expectation", "log-likelihood-ratio", "likelihood-ratio", "log-ratio", "binary"]) parser.add_argument("-d", "--det_interleave", type=bool, help="If true, use deterministic interleaving, regardless " "of the ranker type used for comparison.") parser.add_argument("-t", "--compare_td", type=bool, help="If true, compare rankers using observed assignments " "instead of marginalizing over possible assignments.") args = vars(parser.parse_known_args(split_arg_str(arg_str))[0]) if "aggregate" in args and args["aggregate"]: self.aggregate = args["aggregate"] if "det_interleave" in args and args["det_interleave"]: self.det_interleave = True if "compare_td" in args and args["compare_td"]: self.compare_td = True if not hasattr(self, "aggregate") or not self.aggregate: self.aggregate = "expectation" if not hasattr(self, "det_interleave"): self.det_interleave = False if not hasattr(self, "compare_td"): self.compare_td = False
[docs] def interleave(self, r1, r2, query, length): r1.init_ranking(query) r2.init_ranking(query) length = min(r1.document_count(), r2.document_count(), length) # start with empty document list l = [] # random bits indicate which r to use at each rank a = asarray([randint(0, 1) for _ in range(length)]) for next_a in a: # flip coin - which r contributes doc (pre-computed in a) select = r1 if (next_a == 0) else r2 other = r2 if (next_a == 0) else r1 # draw doc if self.det_interleave: pick = select.next_det() else: pick = select.next() l.append(pick) # let other ranker know that we removed this document try: other.rm_document(pick) except: pass return (asarray(l), (a, r1, r2))
[docs] def infer_outcome(self, l, a, c, query): (td_a, r1, r2) = a # for comparisons with TD, use naive comparison if self.compare_td: c1 = sum([1 if val_a == 0 and val_c == 1 else 0 for val_a, val_c in zip(td_a, c)]) c2 = sum([1 if val_a == 1 and val_c == 1 else 0 for val_a, val_c in zip(td_a, c)]) return -1 if c1 > c2 else 1 if c2 > c1 else 0 # comparison with marginalization # are there any clicks? (otherwise it's a tie) click_ids = where(asarray(c) == 1) if not len(click_ids[0]): # no clicks, will be a tie return 0, 0 r1.init_ranking(query) r2.init_ranking(query) # enumerate all possible assignments that go with l, add their # outcomes weighted by probabilities # original outcome is not needed in this case, only clicks and probs root = SimpleBinaryTree(None, 0.0, 0) # root nextLevel = [root] currentLevel = [] # traverse possible assignments breath-first log_p_a = len(l) * log(0.5) log_p_l = len(l) * log(0.5) for n in range(len(l)): currentLevel = nextLevel nextLevel = [] p_r1 = r1.get_document_probability(l[n]) p_r2 = r2.get_document_probability(l[n]) # zero probability: observed list is not possible (e.g., with # deterministic rankers and historical data) if p_r1 == 0 and p_r2 == 0: return .0 r1.rm_document(l[n]) try: r2.rm_document(l[n]) except: pass log_p_l += log(p_r1 + p_r2) for node in currentLevel: # expand children and add new nodes to queue for nextLevel # left child: r1 is selected, only expand if p > 0 if p_r1 > 0: p_left = node.prob + log(0.5 * p_r1) o_left = node.outcome if c[n] == 1: o_left += -1 node.left = SimpleBinaryTree(node, p_left, o_left) nextLevel.append(node.left) # right child: r2 is selected, only expand if p > 0 if p_r2 > 0: p_right = node.prob + log(0.5 * p_r2) o_right = node.outcome if c[n] == 1: o_right += 1 node.right = SimpleBinaryTree(node, p_right, o_right) nextLevel.append(node.right) # we have all log probabilities for outcomes =! 0 o1 = 0.0 o2 = 0.0 for node in nextLevel: if node.outcome != 0: # log_p_a and log_p_l cancel out if we turn the outcome into a # ratio for now, keep them for clarity if node.outcome < 0: o1 += e ** (node.prob + log_p_a - log_p_l) else: o2 += e ** (node.prob + log_p_a - log_p_l) # return -1 if o1 > o2 else 1 if o2 > o1 else 0 if o1 == o2: outcome = 0 elif self.aggregate == "expectation": outcome = o2 - o1 elif self.aggregate == "log-likelihood-ratio": if o1 > o2: outcome = log(o2 / o1) else: outcome = log(o1 / o2) elif self.aggregate == "likelihood-ratio": if o1 > o2: outcome = (float(o2) / o1) - 1 else: outcome = 1 - (float(o1) / o2) elif self.aggregate == "log-ratio": if o1 > o2: outcome = float(log(o1)) / log(o2) - 1 else: outcome = 1 - float(log(o2)) / log(o1) elif self.aggregate == "binary": outcome = -1 if o1 > o2 else 1 if o2 > o1 else 0 else: raise ValueError("Unknown aggregation method: %s", self.aggregate) return outcome, exp(log_p_l)
[docs] def get_probability_of_list(self, result_list, context, query): # P(l) = \prod_{doc in result_list} 1/2 P_1(doc) + 1/2 P_2(doc) p_l = 1.0 (_, r1, r2) = context r1.init_ranking(query) r2.init_ranking(query) for _, doc in enumerate(result_list): p_r1 = r1.get_document_probability(doc) p_r2 = r2.get_document_probability(doc) r1.rm_document(doc) r2.rm_document(doc) p_l *= 0.5 * (p_r1 + p_r2) return p_l
class SimpleBinaryTree: """tree that keeps track of outcome, probability of arriving at this outcome""" parent, left, right, prob, outcome = None, None, None, 0.0, 0 def __init__(self, parent, prob, outcome): self.parent = parent self.prob = prob self.outcome = outcome