File: buzhug_algos.py

package info (click to toggle)
python-buzhug 1.8-2
  • links: PTS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 144 kB
  • ctags: 197
  • sloc: python: 1,653; makefile: 32
file content (175 lines) | stat: -rw-r--r-- 6,634 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""Implementation of fast search algorithms
Used in select when one of the search fields has a fixed-length size

Instead of taking all the blocks in field files one after the other, a big
number of blocks are read and the search is made on the buffer
"""

from datetime import date, datetime
import itertools

# compatibility with Python 2.3
try:
    set([1])
except NameError:
    from sets import Set as set

def rev(s):
    """ function used to compare strings in decreasing order"""
    return ''.join([chr(255-ord(c)) for c in s])

def make_search_func(db,field,value):
    """Return the search function on a field
    If value is a pair of values (v1,v2), all blocks between v1 and v2
    will be returned ; if value is a single value then all blocks with
    this value will be returned
    """
    bl = db._file[field].block_len  # block length
    if isinstance(value,(list,tuple)):
        value = list(value)
        if not len(value)==2:
            raise ValueError,"If argument is a list, only 2 values \
                should be passed (found %s)" %len(value)
        if not db.fields[field] in [int,float,date,datetime]:
            raise TypeError,"Search between values is only allowed for " \
                "int, float, date and datetime (found %s)" %db.fields[field]
        db._validate(field,value[0])
        db._validate(field,value[1])
        value.sort()
        # convert values in blocks (strings representation in field files)
        s1,s2 = [ db.f_encode[db.fields[field]](v) for v in value ]
        # search the common leading characters in s1 and s2
        common = ''
        for i in range(len(s1)):
            if s1[i] == s2[i]:
                common += s1[i]
            else:
                break
        lc = len(common)
        Min = s1[lc:] # part of s1 not common with s2
        Max = s2[lc:] # part of s2 not common with s1

        def _search(buf):
            """Function searching blocks in the buffer such that
            s1 <= block <= s2
            Return a dictionary mapping rank of the block to the block
            
            The algorithm searches occurences of 'common', then checks
            that the rest of the block is between Min and Max
            """
            ranks = {}
            pos = 0
            while True:
                # search occurences of the common part between s1 and s2
                pos = buf.find(common,pos)
                if pos == -1:
                    break
                if pos % bl == 0:
                    # pos is a block start
                    block = buf[pos:pos+bl]
                    # rest of the block
                    rest = buf[pos+lc:pos+bl]
                    # compare rest of block to Min and Max
                    if Min <= rest <= Max:
                        ranks[pos/bl] = block
                pos += 1
            return ranks

    else:
        v = db.f_encode[db.fields[field]](value)

        def _search(buf):
            """Function searching blocks in the buffer such that
            block == v
            Return a dictionary mapping rank of the block to the block
            
            The algorithm searches occurences of the block v in the 
            buffer
            """
            ranks = {}
            pos = 0
            while True:
                pos = buf.find(v,pos)
                if pos>-1:
                    if pos % bl == 0:
                        # pos is a block start
                        ranks[pos/bl] = buf[pos:pos+bl]
                    pos += 1
                else:
                    break
            return ranks

    return _search

def fast_select(db,names,**args):
    """Handles requests like select(['name'],age=23,name='pierre') when
    one of the arg keys is fixed length type ; uses a fast search algo
    instead of browsing all the records
    
    The search functions are defined for all fixed-length arguments and
    used to select a subset of record rows in field files
    """
    # fixed and variable length fields
    f_args = [ (k,v) for k,v in args.iteritems() 
        if hasattr(db._file[k],'block_len') ]
    v_args = [ (k,v) for (k,v) in args.iteritems() 
        if not hasattr(db._file[k],'block_len') ]
    arg_names = [ k for k,v in f_args + v_args ]
    no_args = [ n for n in names if not n in arg_names ]
    names = arg_names + no_args

    [ db._file[k].seek(0) for k in names + args.keys() ]
    max_len = max([ db._file[k[0]].block_len for k in f_args ])
    num_blocks = db.BLOCKSIZE / max_len
    funcs = dict([(k,make_search_func(db,k,v)) 
                    for (k,v) in f_args])
    fl_ranks = [] # absolute ranks in fixed length files
    bl_offset = 0 # offset of current chunck
    res = {}
    while True:
        buf = {}
        ranks = {}
        # read a chunk of num_blocks blocks in each fixed-length file
        for i,(k,v) in enumerate(f_args):
            # rank[field] stores the rank of found values in
            # the buffer, between 0 and num_blocks-1
            bl = db._file[k].block_len
            buf[k] = db._file[k].read(num_blocks*bl)
            ranks[k] = funcs[k](buf[k])
        # test end of files
        if not buf[f_args[0][0]]:
            break
        # valid records are those with the same rank in all files
        rank_set=set(ranks[f_args[0][0]].keys())
        if len(f_args)>1:
            for (k,v) in f_args[1:]:
                rank_set = rank_set.intersection(set(ranks[k].keys()))
        for c in rank_set:
            res[bl_offset+c] = [ ranks[k][c] for k,v in f_args ]
        bl_offset += num_blocks

    fl_ranks = res.keys()
    fl_ranks.sort()

    # The field files for the other arguments are browsed ; if their
    # row is in the subset, test if the value for variable length arguments
    # is equal to the keyword value
    vl_files = [ db._file[k] for k,v in v_args ]
    nbvl = len(vl_files)
    vl_values = tuple([ db._file[k].to_block(v) for (k,v) in v_args ])
    no_args_files = [ db._file[k] for k in no_args ]
    other_files = vl_files + no_args_files
    for f in other_files:
        f.seek(0)

    for i,lines in enumerate(itertools.izip(*other_files)):
        try:
            if i == fl_ranks[0]:
                fl_ranks.pop(0)
                if lines[:nbvl] == vl_values:
                    res[i]+=list(lines)
                else:
                    del res[i]
        except IndexError:
            break
    return res,names