"""Implementation of fast search algorithms
Used in select when one of the search fields has a fixed-length size
Instead of taking all the blocks in field files one after the other, a big
number of blocks are read and the search is made on the buffer
"""
from datetime import date, datetime
import itertools

# compatibility with Python 2.3
try:
    set([1])
except NameError:
    from sets import Set as set

def rev(s):
    """Function used to compare strings in decreasing order"""
    return ''.join([chr(255-ord(c)) for c in s])
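# Illustration: rev maps each character c to chr(255-ord(c)), which reverses
# the usual string order ; e.g. 'abc' < 'abd' but rev('abc') > rev('abd'),
# so sorting on rev(s) sorts the original strings in decreasing order.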

def make_search_func(db,field,value):
    """Return the search function on a field
    If value is a pair of values (v1,v2), all blocks between v1 and v2
    will be returned ; if value is a single value then all blocks with
    this value will be returned
    """
    bl = db._file[field].block_len # block length
    if isinstance(value,(list,tuple)):
        value = list(value)
        if not len(value)==2:
            raise ValueError,"If argument is a list, only 2 values " \
                "should be passed (found %s)" %len(value)
        if not db.fields[field] in [int,float,date,datetime]:
            raise TypeError,"Search between values is only allowed for " \
                "int, float, date and datetime (found %s)" %db.fields[field]
        db._validate(field,value[0])
        db._validate(field,value[1])
        value.sort()
        # convert the values into blocks (string representation in the
        # field files)
        s1,s2 = [ db.f_encode[db.fields[field]](v) for v in value ]
        # find the common leading characters of s1 and s2
        common = ''
        for i in range(len(s1)):
            if s1[i] == s2[i]:
                common += s1[i]
            else:
                break
        lc = len(common)
        Min = s1[lc:] # part of s1 not common with s2
        Max = s2[lc:] # part of s2 not common with s1
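        # Illustration with hypothetical encoded values (the actual field
        # encoding may differ): if s1 == '0000000245' and s2 == '0000000267',
        # then common == '00000002', lc == 8, Min == '45' and Max == '67' ;
        # a block is selected if it starts with common and its remaining
        # characters compare between Min and Max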
        def _search(buf):
            """Function searching blocks in the buffer such that
            s1 <= block <= s2
            Return a dictionary mapping the rank of the block to the block
            The algorithm searches occurrences of 'common', then checks
            that the rest of the block is between Min and Max
            """
            ranks = {}
            pos = 0
            while True:
                # search occurrences of the common part of s1 and s2
                pos = buf.find(common,pos)
                if pos == -1:
                    break
                if pos % bl == 0:
                    # pos is a block start
                    block = buf[pos:pos+bl]
                    # rest of the block, after the common prefix
                    rest = buf[pos+lc:pos+bl]
                    # compare the rest of the block to Min and Max
                    if Min <= rest <= Max:
                        ranks[pos/bl] = block
                # move past this occurrence so that find() makes progress
                pos += 1
            return ranks
    else:
        v = db.f_encode[db.fields[field]](value)
        def _search(buf):
            """Function searching blocks in the buffer such that
            block == v
            Return a dictionary mapping the rank of the block to the block
            The algorithm searches occurrences of the block v in the
            buffer
            """
            ranks = {}
            pos = 0
            while True:
                pos = buf.find(v,pos)
                if pos>-1:
                    if pos % bl == 0:
                        # pos is a block start
                        ranks[pos/bl] = buf[pos:pos+bl]
                    # move past this occurrence so that find() makes progress
                    pos += 1
                else:
                    break
            return ranks
    return _search
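
# Standalone sketch of the block-boundary scan performed by the _search
# closures above (made-up block length and encoded values, independent of
# any real field file):
#
#   bl = 4                           # block length
#   v = '0007'                       # encoded value searched for
#   buf = 'ab00' + '07cd' + '0007'   # three consecutive blocks
#   ranks, pos = {}, 0
#   while True:
#       pos = buf.find(v, pos)
#       if pos == -1:
#           break
#       # the hit at pos == 2 straddles two blocks and is discarded ;
#       # only matches starting at a multiple of bl are block starts
#       if pos % bl == 0:
#           ranks[pos/bl] = buf[pos:pos+bl]
#       pos += 1
#   # ranks == {2: '0007'}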

def fast_select(db,names,**args):
    """Handles requests like select(['name'],age=23,name='pierre') when
    one of the argument keys is of a fixed-length type ; uses a fast
    search algorithm instead of browsing all the records
    The search functions are defined for all fixed-length arguments and
    used to select a subset of record rows in the field files
    """
    # split the arguments into fixed-length and variable-length fields
    f_args = [ (k,v) for k,v in args.iteritems()
                if hasattr(db._file[k],'block_len') ]
    v_args = [ (k,v) for (k,v) in args.iteritems()
                if not hasattr(db._file[k],'block_len') ]
    arg_names = [ k for k,v in f_args + v_args ]
    no_args = [ n for n in names if not n in arg_names ]
    names = arg_names + no_args
    # rewind all the files that will be read
    [ db._file[k].seek(0) for k in names + args.keys() ]
    max_len = max([ db._file[k[0]].block_len for k in f_args ])
    num_blocks = db.BLOCKSIZE / max_len
    # build a search function for each fixed-length argument
    funcs = dict([(k,make_search_func(db,k,v))
                   for (k,v) in f_args])
    fl_ranks = [] # absolute ranks in the fixed-length files
    bl_offset = 0 # block offset of the current chunk
    res = {}
    while True:
        buf = {}
        ranks = {}
        # read a chunk of num_blocks blocks in each fixed-length file
        for i,(k,v) in enumerate(f_args):
            # ranks[field] stores the ranks of the values found in
            # the buffer, between 0 and num_blocks-1
            bl = db._file[k].block_len
            buf[k] = db._file[k].read(num_blocks*bl)
            ranks[k] = funcs[k](buf[k])
        # test end of files
        if not buf[f_args[0][0]]:
            break
        # valid records are those with the same rank in all files
        rank_set = set(ranks[f_args[0][0]].keys())
        if len(f_args)>1:
            for (k,v) in f_args[1:]:
                rank_set = rank_set.intersection(set(ranks[k].keys()))
        for c in rank_set:
            res[bl_offset+c] = [ ranks[k][c] for k,v in f_args ]
        bl_offset += num_blocks
    fl_ranks = res.keys()
    fl_ranks.sort()
    # The field files for the other arguments are browsed ; if a row is
    # in the selected subset, test whether the values of the
    # variable-length arguments are equal to the keyword values
    vl_files = [ db._file[k] for k,v in v_args ]
    nbvl = len(vl_files)
    vl_values = tuple([ db._file[k].to_block(v) for (k,v) in v_args ])
    no_args_files = [ db._file[k] for k in no_args ]
    other_files = vl_files + no_args_files
    for f in other_files:
        f.seek(0)
    for i,lines in enumerate(itertools.izip(*other_files)):
        try:
            if i == fl_ranks[0]:
                fl_ranks.pop(0)
                if lines[:nbvl] == vl_values:
                    res[i] += list(lines)
                else:
                    del res[i]
        except IndexError:
            # no more candidate ranks to test
            break
    return res,names
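
# Usage sketch (hypothetical base and argument values, following the
# docstring of fast_select above): the function is meant to be called by the
# select machinery of the database, roughly as
#
#   rows, names = fast_select(db, ['name'], age=23, name='pierre')
#
# where 'age' is stored in a fixed-length field file ; rows maps the rank of
# each matching record to the blocks/lines read for it, in the order given
# by names.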