File: ResultSet.py

##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
# 
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
# 
##############################################################################

"""
ResultSet

$Id: ResultSet.py,v 1.22 2004/02/26 17:50:36 ajung Exp $
"""

from math import log, sqrt

from BTrees.IIBTree import IISet, IIBTree
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import intersection as IntIntersection, union as IntUnion, difference as IntDifference
from BTrees.OOBTree import OOBTree, OOSet, intersection as ObjIntersection, union as ObjUnion
from Products.ZCTextIndex.NBest import NBest

class ResultSet:
    """ class to keep results for TextIndexNG queries """

    def __init__(self, mapping, words):
        """ 'mapping' is either an IOBTree mapping a documentId to the
            positions of a wid/word within that document, or an IISet
            of documentIds.

            'words' is a mapping word -> query weight for the words
            of the query
        """
        # create a new IOBTree with the documentIds as keys
        if isinstance(mapping, IISet):
            self._mapping = IOBTree()

            for k in mapping:
                self._mapping[k] = IISet() 
        else:
            self._mapping   = mapping

        self._result = None
        self._docids = IISet(mapping.keys())
        self._words  = OOBTree(words)

        self.items   = self._mapping.items
        self.keys    = self._mapping.keys
        self.values  = self._mapping.values
        self.has_key = self._mapping.has_key

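    # Hedged usage sketch (illustrative; the docids and weights are made up):
    # a ResultSet for a single-word query could be built from an IISet of
    # matching documentIds plus a word -> query weight mapping, e.g.
    #
    #   rs = ResultSet(IISet([1, 5, 9]), {'zope': 1.0})
    #   rs.docIds()     # -> IISet([1, 5, 9])
    #   len(rs)         # -> 3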

    def docIds(self):  return self._docids
    def mapping(self): return self._mapping
    def words(self):   return self._words  # mapping word -> query weight
    def result(self):  return self._result # mapping docid -> score
    def __len__(self): return len(self._mapping)

    def __str__(self):
        return 'ResultSet(%s, %s)' % \
               (list(self.words().keys()), str(self._mapping))
    __repr__ = __str__

    def cosine_ranking(self, index, hits=250):
        """ Calculate the ranking of the found documents based on the
            cosine rule and keep the 'hits' best ones.
        """

        IDF = {}                # mapping term -> inverse document frequency
        cache = {}              # mapping term -> found docids
        wid_cache = {}          # mapping term -> wid
        N = len(index)          # length of collection
        nbest = NBest(hits)

        for term in self.words().keys():

            wid_cache[term] = wid = index.getLexicon().getWordId(term)                         
            docids = index.getStorage().getDocumentIdsForWordId(wid)
            cache[term] = docids

            # document frequency: the number of documents the term
            # appears in
            tf = len(docids)

            # calculate and store the inverse document frequency, given as
            # log(1 + N/TF); use float division to avoid integer truncation
            if tf == 0: IDF[term] = 0
            else:       IDF[term] = log(1.0 + float(N) / tf)
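            # worked example: with N = 1000 documents and a term occurring
            # in 10 of them, IDF = log(1 + 1000/10.0) = log(101) ~= 4.615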

        terms = list(self.words().keys())
        num_terms = len(terms)
        get_frequency = index.getStorage().getWordFrequency
        for docid in self.docIds():   # iterate over all found documents

            rank = 0.0                # ranking
            total_dwt = 0.0           # document weight

            for term in terms:
                if docid not in cache[term]: continue

                # document term frequency = the number of times a term
                # appears within a particular document
                
                dtf = get_frequency(docid, wid_cache[term])

                # document term weight = the weight of a term within a
                # document and is calculated as:
                dtw = (1.0 + log(dtf)) * IDF[term] 
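                # worked example: dtf = 4 and IDF[term] ~= 4.615 give
                # dtw = (1 + log(4)) * 4.615 ~= 11.01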

                # query term frequency and query max frequency are set
                # to 1 by default
                qtf = qmf = 1    

                # query term weight is the weight given to each term in the
                # query and is calculated as:        
                qtw = (0.5 + (0.5 * qtf/qmf)) * IDF[term] * self.words()[term]
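                # with the defaults qtf == qmf == 1 the expression above
                # reduces to IDF[term] multiplied by the query weight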

                # add this stuff to the ranking
                rank += (qtw * dtw) 
                total_dwt += (dtw * dtw)
#                print 'q:%12d/%10s: dtf=%8.5f dtw=%8.5f rank=%8.5f totaldtw=%8.5f' % (docid, term.encode('iso-8859-15'),dtf, dtw,rank, total_dwt)

            total_dwt = sqrt(total_dwt)
            if total_dwt == 0:
                rank = 0
            else:
#                print "\t",rank, total_dwt, rank/total_dwt
#                rank = rank / total_dwt     # normalization
                rank = rank / num_terms        # average over the query terms
                rank = int(rank * 1000 + 0.5)   # scale rank to be an integer

            nbest.add(docid, rank)

        self._result = IIBTree()
        for docid, score in nbest.getbest():
            self._result[docid] = score


#################################################################
# ResultSet functions
#################################################################

def intersectResultSets(sets):
    """ perform intersection of ResultSets """
                                    
    docids = None
    words = OOBTree()
    sets = list(sets)
    # process the smallest result sets first to keep intersections cheap
    sets.sort(lambda s1,s2: cmp(len(s1.docIds()),len(s2.docIds())))

    for set in sets:
        docids = IntIntersection(docids, set.docIds())
        words.update(set.words())

    return ResultSet(docids, words)       

def unionResultSets(sets):
    """ perform union of ResultSets """

    docids = None
    words = OOBTree()
    sets = list(sets)
    sets.sort(lambda s1,s2: cmp(len(s1.docIds()),len(s2.docIds())))

    for set in sets:
        docids = IntUnion(docids, set.docIds())
        words.update(set.words())
    return ResultSet(docids, words)

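# Hedged example (illustrative; rs1 and rs2 stand for ResultSet instances
# built for two query words):
#
#   both   = intersectResultSets([rs1, rs2])  # documents containing both words
#   either = unionResultSets([rs1, rs2])      # documents containing either word
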
def phraseResultSets(sets, index):
    """ a phrase search is a special case of a near search where the
        distance is one and we only search to the right (unidirectional)
    """
    return nearResultSets(sets, index, distance=1, bidirectional=0)

def nearResultSets(sets, index, distance=5, bidirectional=1):
    """ perform near search on results sets """
    
    # One resultset consists of an IISet() or documentIds and 
    # tuple whose first element is the word (from LexiconLookup())
    # First we perform an intersection to get the documentIds of
    # those documents that contain all the words

    docids =  intersectResultSets(sets).docIds()

    # Now we determine for every document the positions of all
    # the words inside the document. Then we compare all the positions
    # to determine neighbourship
    
    words = []
    for set in sets:
        for word in set.words().keys():
            words.append(word)

    res_docids = IISet()

    for docId in docids:
        # posMap holds (word, IISet of positions) tuples for this document
        posMap = index.positionsFromDocumentLookup(docId, words)

        if bidirectional:
            matches = posMap.checkPositionMapBidirectional(distance)
        else:
            matches = posMap.checkPositionMapUnidirectional(distance)
        if len(matches) > 0:
            res_docids.insert(docId)

    # give every word a default query weight of 1.0
    d = {}
    for w in words: d[w] = 1.0

    return ResultSet(res_docids, d)       

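# Hedged example (illustrative): for the phrase query "foo bar", with rs1
# and rs2 being the per-word ResultSets, only documents in which 'bar'
# directly follows 'foo' survive:
#
#   rs = phraseResultSets([rs1, rs2], index)
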
def inverseResultSet(set, index):
    """ compute inverse ResultSet (for NotNode) """

    docids = IntDifference(
                 IISet(index.getStorage().getDocIds()), 
                 set.docIds()
                )

    return ResultSet(docids, set.words())
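
# Hedged example (illustrative): negating a query ('NOT foo'):
#
#   rest = inverseResultSet(rs_foo, index)
#
# yields a ResultSet over every stored documentId not contained in rs_foo.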