Source code for lerot.comparison.VaTdi

# This file is part of Lerot.
#
# Lerot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lerot is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Lerot.  If not, see <http://www.gnu.org/licenses/>.

import random
import itertools

from numpy import asarray

from .TeamDraft import TeamDraft


class CannotInterleave(Exception):
    pass


NUM_RETRIES = 1000


[docs]class VaTdi(TeamDraft):
    """ Algorithm described in
    https://bitbucket.org/varepsilon/tois2013-interleaving """
    def __init__(self, arg_str=None):
        pass

    @staticmethod
[docs]    def sampleSmoothly(a, b, maxVal):
        if a > b:
            a, b = b, a
        if b > maxVal:
            b = maxVal
        if a > 0 and b < maxVal:
            randVal = random.randint(a, b + 1)
            if randVal == b + 1:
                return a - 1 if random.randint(0, 1) == 0 else b + 1
            else:
                return randVal
        elif a == 0 and b == maxVal:
            return random.randint(a, b)
        else:   # a > 0 or b < maxVal
            randVal = random.randint(0, 2 * (b - a) + 2)
            if randVal == 2 * (b - a) + 2:
                return (a - 1) if a > 0 else b + 1
            else:
                return a + randVal // 2

[docs]    def interleave(self, r1, r2, query, length=None):
        for i in xrange(NUM_RETRIES):
            try:
                return self._interleave(r1, r2, query, length)
            except CannotInterleave as e:
                pass
        raise Exception("Unable to interleave after %d attempts" % NUM_RETRIES)


    def _interleave(self, r1, r2, query, length1=None):
        r1.init_ranking(query)
        r2.init_ranking(query)
        A, B = (X.getDocs() for X in [r1, r2])
        length = min(len(A), len(B))
        if length1 is not None:
            length = min(length, length1)

        allVertTypes = set((d.get_type() for d in itertools.chain(A, B)
                            if d.get_type() != 'Web'))

        # Size of different vertical blocks. Initially set to zero for all verts.
        sizeL = dict(((t, 0) for t in allVertTypes))
        for t in sizeL.iterkeys():
            At = set(d for d in A if d.get_type() == t)
            Bt = set(d for d in B if d.get_type() == t)
            sizeA = len(At)
            sizeB = len(Bt)
            maxSize = min(length, len(At | Bt), 2 * sizeA + 1, 2 * sizeB + 1)
            sizeL[t] = self.sampleSmoothly(sizeA, sizeB, maxSize)

        def _addNextDocFrom(X, ranking, currentVert, vLeft):
            assert (X is A) or (X is B)
            if currentVert != 'Web':
                X_left = [x for x in X if x.get_type() == currentVert \
                          and (x not in ranking)]
            else:
                X_left = [x for x in X if x.get_type() in vLeft and (x not in ranking)]
            if len(X_left) == 0:
                raise CannotInterleave("No more documents of type %s. "
                                       "sizeL = %s, A = %s, B = %s, L = %s" % (
                                           currentVert,
                                           str(sizeL),
                                           str(A), str(B), str(L)))
            return X_left[0]

        # Start with an empty document list and assignments.
        L, assignment = [], []
        teamA, teamB = 0, 0
        # All the verticals are not used in the beginning (except for 0-sized).
        vLeft = set((k for (k, v) in sizeL.iteritems() if v != 0))
        vLeft.add('Web')       # web document should always be available
        currentVert = 'Web'
        while len(L) < length:
            doc = None
            if teamA < teamB + random.randint(0, 1):
                doc = _addNextDocFrom(A, L, currentVert, vLeft)
                assignment.append(0)
                teamA += 1
            else:
                doc = _addNextDocFrom(B, L, currentVert, vLeft)
                assignment.append(1)
                teamB += 1
            L.append(doc)
            currentVert = doc.get_type()
            if currentVert != 'Web' and sum(1 for d in L if \
                    d.get_type() == currentVert) == sizeL[currentVert]:
                vLeft.remove(currentVert)
                currentVert = 'Web'
        return (asarray(L), assignment)