Source code for lerot.comparison.OptimizedInterleave

# This file is part of Lerot.
#
# Lerot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lerot is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Lerot.  If not, see <http://www.gnu.org/licenses/>.

import random
import argparse
import numpy as np
import math
import time

from .AbstractInterleavedComparison import AbstractInterleavedComparison
from ..utils import split_arg_str


# Maximum number of documents (in A and B) that we can feed to OI* alggorithms
MAX_NUMBER_OF_DOCS = 20

[docs]class OptimizedInterleave(AbstractInterleavedComparison): """ An implementation of Optimized Interleave as described in: @see: Radlinski, F., & Craswell, N. (2013, February). Optimized interleaving for online retrieval evaluation. In Proceedings of the sixth ACM international conference on Web search and data mining (pp. 245-254). @author: Anne Schuth @contact: anne.schuth@uva.nl @since: February 2013 @requires: Gurobi from http://www.gurobi.com/ """ def __init__(self, arg_str=""): parser = argparse.ArgumentParser(description=self.__doc__, prog=self.__class__.__name__) parser.add_argument("-c", "--credit", choices=["linear_credit", "binary_credit", "inverse_credit", "negative_credit"], default="linear_credit") parser.add_argument("--allowed_leavings", choices=["prefix_constraint", "sample_prefix_constraint", "sample_prefix_constraint_constructive"], default="prefix_constraint") parser.add_argument("--sample_size", type=int, default=-1) parser.add_argument("--prefix_bound", type=int, default=-1) parser.add_argument("--verbose", action="store_true", default=False) args = vars(parser.parse_known_args(split_arg_str(arg_str))[0]) self.credit = getattr(self, args["credit"]) self.allowed_leavings = getattr(self, args["allowed_leavings"]) self.verbose = args["verbose"] self.prefix_bound = args["prefix_bound"] self.sample_size = args["sample_size"]
[docs] def f(self, i): # Implemented as footnote 4 suggests return 1. / i
[docs] def precompute_rank(self, R): rank = {} for i in xrange(len(R)): rank[R[i]] = i + 1 return rank
[docs] def rank(self, li, R): # Implemented as in d'.1 if li in R: return R[li] return len(R) + 1
[docs] def binary_credit(self, li, rankA, rankB): if self.rank(li, rankA) < self.rank(li, rankB): return 1 elif self.rank(li, rankA) > self.rank(li, rankB): return -1 return 0
[docs] def linear_credit(self, li, rankA, rankB): # Equation (14) return self.rank(li, rankA) - self.rank(li, rankB)
[docs] def inverse_credit(self, li, rankA, rankB): # Equation (15) return 1. / self.rank(li, rankB) - 1. / self.rank(li, rankA)
[docs] def prefix_constraint_bound(self, rankings, length, prefix_bound): currentlevel = [([], [0] * len(rankings), 1)] nextlevel = [] for _ in range(length): for prefix, indexes, indexk in currentlevel: addedthislevel = [] for i in range(len(rankings)): index = indexes[i] ranking = rankings[i] d = None if index < len(ranking) and index <= indexk + prefix_bound: d = ranking[index] while d in prefix: d = None index += 1 if index < len(ranking) and index <= indexk + prefix_bound: d = ranking[index] else: break if d in addedthislevel: continue if d is not None: addedthislevel.append(d) branchindexes = indexes[:] branchindexes[i] = index + 1 if min(branchindexes) > indexk - prefix_bound: branchindexk = indexk + 1 else: branchindexk = indexk branch = (prefix + [d], branchindexes, branchindexk) nextlevel.append(branch) currentlevel = nextlevel nextlevel = [] # L contains allowed multileavings, according to equation (5) L = [n for n, _, _ in currentlevel] del currentlevel del nextlevel return L
[docs] def prefix_constraint(self, rankings, length): prefix_bound = length if self.prefix_bound < 0 else self.prefix_bound L = [] while len(L) == 0 and prefix_bound <= length: L = self.prefix_constraint_bound(rankings, length, prefix_bound) prefix_bound += 1 return L
[docs] def perm_given_index(self, alist, apermindex): """ See http://stackoverflow.com/questions/5602488/random-picks-from-permutation-generator """ alist = alist[:] for i in range(len(alist) - 1): apermindex, j = divmod(apermindex, len(alist) - i) alist[i], alist[i + j] = alist[i + j], alist[i] return alist
[docs] def sample(self, docs, length): r = random.randint(0, math.factorial(len(docs))) l = self.perm_given_index(docs, r) l = l[:length] return l
[docs] def reject(self, l, rankings): indexes = [0] * len(rankings) def update(i, l, k): if rankings[i][indexes[i]] == l[k]: indexes[i] += 1 while k >= indexes[i] and len(rankings[i]) > indexes[i]: found = False for m in range(k): if rankings[i][indexes[i]] == l[m]: indexes[i] += 1 found = True break if not found: break return True return False for k in range(len(l)): found = False for i in range(len(rankings)): if update(i, l, k): found = True if not found: return True return False
[docs] def sample_prefix_constraint(self, rankings, length): docs = list(set().union(*rankings)) L = [] start = time.time() reject1 = reject2 = reject3 = 0 rejects = {} accepts = {} while len(L) < 1000 and time.time() < start + 3: l = self.sample(docs, length) tl = tuple(l) if tl in accepts: reject1 += 1 continue if tl in rejects: reject2 += 1 continue if self.reject(l, rankings): reject3 += 1 rejects[tl] = 1 continue accepts[tl] = 1 L.append(l) if self.verbose: print "perms", math.factorial(len(docs)) print "reject1", reject1 print "reject2", reject2 print "reject3", reject3 print "l", len(L) print "time", time.time() - start return L
[docs] def sample_prefix_constraint_constructive(self, rankings, length): L = [] start = time.time() while len(L) < self.sample_size and time.time() < start + 10: l = [] indexes = [0] * len(rankings) while len(l) < length: r = random.randint(0, len(rankings) - 1) if indexes[r] >= length: continue d = rankings[r][indexes[r]] indexes[r] += 1 while d in l and indexes[r] < length: d = rankings[r][indexes[r]] indexes[r] += 1 if not d in l: l.append(d) if not l in L: L.append(l) return L
[docs] def interleave(self, r1, r2, query, length, bias=0): return self.interleave_n(r1, r2, query, length, 1, bias=0)[0]
[docs] def interleave_n(self, r1, r2, query, length, num_repeat, bias=0): import gurobipy r1.init_ranking(query) r2.init_ranking(query) rA, rB = (r.getDocs() for r in [r1, r2]) length = min(len(rA), len(rB), length) assert length <= MAX_NUMBER_OF_DOCS # We may need longer rA and rB for interleaving, so this is not a bug. rA = rA[:length] rB = rB[:length] L = self.allowed_leavings([rA, rB], length) assert len(L) > 0, (rA, rB, length) # Pre-compute credit for each list l in L rankA = self.precompute_rank(rA) rankB = self.precompute_rank(rB) credit = [[self.credit(li, rankA, rankB) for li in l] for l in L] # Construct a set of constraints m = gurobipy.Model("system") m.params.outputFlag = 0 P = [] # Add a parameter Pi for each list that adheres to equation (6) for i in range(len(L)): P.append(m.addVar(lb=0.0, ub=1.0, name='p%d' % i)) m.update() # Constraint for equation (7) m.addConstr(gurobipy.quicksum(P) == 1, 'sum') # Constraints for equation(8) for each k for k in xrange(length): m.addConstr(gurobipy.quicksum([P[n] * credit[n][k] for n in xrange(len(L))]) == 0, "c%d" % k) # Add sensitivity as an objective to the optimization, equation (13) S = [] for i in range(len(L)): # Equation (9) wa = sum([self.f(k + 1) for k in range(len(L[i])) if credit[i][k] > 0]) # Equation (10) wb = sum([self.f(k + 1) for k in range(len(L[i])) if credit[i][k] < 0]) # Equation (11) wt = sum([self.f(k + 1) for k in range(len(L[i])) if credit[i][k] == 0]) if wa + wb > 0: s = -((1 - wt) / (wa + wb)) * math.log(((wa ** wa) * (wb ** wb)) / ((wa + wb) ** (wa + wb)), 2) S.append(P[i] * s) m.setObjective(gurobipy.quicksum(S), gurobipy.GRB.MAXIMIZE) # Optimize the system and if it is infeasible, relax the constraints self.relaxed = False m.optimize() if m.status == gurobipy.GRB.INFEASIBLE: self.relaxed = True m.feasRelaxS(1, False, False, True) # Restore constraint for equation (7) m.addConstr(gurobipy.quicksum(P) == 1, 'sum') m.addConstr(gurobipy.quicksum( [gurobipy.quicksum([P[n] * credit[n][k] for n in xrange(len(L))]) for k in xrange(length)] ) == 0, "c%d" % k) m.optimize() assert m.status != gurobipy.GRB.INFEASIBLE if self.verbose: print rA print rB for i in range(len(L)): print L[i], credit[i], P[i].x m.printStats() # Sample n lists from L using the computed probabilities problist = sorted([(P[i].x, L[i], credit[i]) for i in range(len(L)) if P[i].x > 0]) result = [] for i in xrange(num_repeat): cumprob = 0.0 randsample = random.random() for (p, l, cr) in problist: cumprob += p if randsample <= cumprob: result.append((np.asarray(l), cr)) break assert len(result) == num_repeat return result
[docs] def infer_outcome(self, l, credit, clicks, query): return sum(cr for (cr, c) in zip(credit, clicks) if c > 0)