Source code for lerot.comparison.VaTdi

# This file is part of Lerot.
#
# Lerot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lerot is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Lerot.  If not, see <http://www.gnu.org/licenses/>.

import random
import itertools

from numpy import asarray

from .TeamDraft import TeamDraft


class CannotInterleave(Exception):
    pass


NUM_RETRIES = 1000


[docs]class VaTdi(TeamDraft): """ Algorithm described in https://bitbucket.org/varepsilon/tois2013-interleaving """ def __init__(self, arg_str=None): pass @staticmethod
[docs] def sampleSmoothly(a, b, maxVal): if a > b: a, b = b, a if b > maxVal: b = maxVal if a > 0 and b < maxVal: randVal = random.randint(a, b + 1) if randVal == b + 1: return a - 1 if random.randint(0, 1) == 0 else b + 1 else: return randVal elif a == 0 and b == maxVal: return random.randint(a, b) else: # a > 0 or b < maxVal randVal = random.randint(0, 2 * (b - a) + 2) if randVal == 2 * (b - a) + 2: return (a - 1) if a > 0 else b + 1 else: return a + randVal // 2
[docs] def interleave(self, r1, r2, query, length=None): for i in xrange(NUM_RETRIES): try: return self._interleave(r1, r2, query, length) except CannotInterleave as e: pass raise Exception("Unable to interleave after %d attempts" % NUM_RETRIES)
def _interleave(self, r1, r2, query, length1=None): r1.init_ranking(query) r2.init_ranking(query) A, B = (X.getDocs() for X in [r1, r2]) length = min(len(A), len(B)) if length1 is not None: length = min(length, length1) allVertTypes = set((d.get_type() for d in itertools.chain(A, B) if d.get_type() != 'Web')) # Size of different vertical blocks. Initially set to zero for all verts. sizeL = dict(((t, 0) for t in allVertTypes)) for t in sizeL.iterkeys(): At = set(d for d in A if d.get_type() == t) Bt = set(d for d in B if d.get_type() == t) sizeA = len(At) sizeB = len(Bt) maxSize = min(length, len(At | Bt), 2 * sizeA + 1, 2 * sizeB + 1) sizeL[t] = self.sampleSmoothly(sizeA, sizeB, maxSize) def _addNextDocFrom(X, ranking, currentVert, vLeft): assert (X is A) or (X is B) if currentVert != 'Web': X_left = [x for x in X if x.get_type() == currentVert \ and (x not in ranking)] else: X_left = [x for x in X if x.get_type() in vLeft and (x not in ranking)] if len(X_left) == 0: raise CannotInterleave("No more documents of type %s. " "sizeL = %s, A = %s, B = %s, L = %s" % ( currentVert, str(sizeL), str(A), str(B), str(L))) return X_left[0] # Start with an empty document list and assignments. L, assignment = [], [] teamA, teamB = 0, 0 # All the verticals are not used in the beginning (except for 0-sized). vLeft = set((k for (k, v) in sizeL.iteritems() if v != 0)) vLeft.add('Web') # web document should always be available currentVert = 'Web' while len(L) < length: doc = None if teamA < teamB + random.randint(0, 1): doc = _addNextDocFrom(A, L, currentVert, vLeft) assignment.append(0) teamA += 1 else: doc = _addNextDocFrom(B, L, currentVert, vLeft) assignment.append(1) teamB += 1 L.append(doc) currentVert = doc.get_type() if currentVert != 'Web' and sum(1 for d in L if \ d.get_type() == currentVert) == sizeL[currentVert]: vLeft.remove(currentVert) currentVert = 'Web' return (asarray(L), assignment)