File: millions.py

package info (click to toggle)
metakit 2.4.3-3
  • links: PTS
  • area: main
  • in suites: woody
  • size: 6,468 kB
  • ctags: 3,548
  • sloc: xml: 29,455; cpp: 23,339; sh: 9,051; tcl: 1,195; python: 577; makefile: 254; ansic: 14
file content (207 lines) | stat: -rwxr-xr-x 5,117 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""
Storing 25.000.000 rows in a Metakit file.

(C) Christian Tismer, Professional Net Service
  first version from 990822
  update: improved, faster spreading.

This implementation is hereby donated to JCW, therefore
(C) Jean-Claude Wippler (Equi4 Software) 1999

Data structure:
We split the main view by some number of subviews.
This gives us one level of indirection.

First simple test:
10 fields of tiny integers.
Addressing is done only by row number.

In order to allow for deletes and inserts, we keep
a list of block sizes and do a little B-tree like
juggling.

"""

import Mk4py
mk=Mk4py

import whrandom, string, sys, bisect

class big_mk:
  def __init__(self, dbpath, rw):
    self.db = mk.storage(dbpath, rw)
    
  def getas(self, struc_str):
    parts = string.split(struc_str, "[", 1)
    if len(parts) < 2:
      self.db.getas(parts[0])
      return   # this was a delete
    self.main_name, rest = parts
    ret= big_view(self.db.getas("%s[data[%s]" % (self.main_name, rest)))
    ret.big_db = self
    return ret
    
  def commit(self): return self.db.commit()
  def rollback(self): return self.db.rollback()
  def description(self): return self.db.description()
    
class big_view:
  def __init__(self, view):
    self.view = view
    if len(self.view) == 0:
      self.view.append()
    self.calc_recnos()
    self.blocksize = blocksize
    self.lower = self.blocksize *2/3
    self.upper = self.blocksize *2 - self.lower
    self.bisect = bisect.bisect
    self.names = []
    for prop in self.view[-1].data.structure():
      self.names.append(prop.name)
      
  def __len__(self):
    rn = self.recnos
    if not rn: return 0
    return rn[-1]+len(self.view[-1].data)
    
  def __getitem__(self, idx):
    main, sub = self._seek(idx)
    return self.view[main].data[sub]
    
  def __setitem__(self, idx, rec):
    main, sub = self._seek(idx)
    self.view[main].data[sub] = rec
    
  """
  we can't do slices yet, since I have no idea
  if this is necessary, and I don't see exactly
  how this should work
  """
    
  def append(self, record = None):
    v = self.view
    if not self.recnos or len(v[-1].data) >= self.blocksize:
      v.append()
      self.calc_recnos()
    return self.recnos[-1] + v[-1].data.append(record)
    
  def insert(self, idx, rec=None):
    main, sub = self._seek(idx)
    view = self.view[main].data
    view.insert(sub, rec)
    if len(view) > self.upper:
      self._balance(main)
    self.calc_recnos()

  def delete(self, idx):
    main, sub = self._seek(idx)
    view = self.view[main].data
    view.delete(sub)
    if len(view) <= self.lower:
      self._balance(main)
    self.calc_recnos()

  def _seek(self, idx):
    rn = self.recnos
    pos = self.bisect(rn, idx)-1
    base = rn[pos]
    return pos, idx-base
  
  def calc_recnos(self):
    v = self.view
    res = [None] * len(v)
    recno = 0
    for i in range(len(res)):
      res[i] = recno
      recno = recno + len(v[i].data)
    self.recnos = res
    
  def _balance(self, spot):
    """
    very simple approach: we merge about three 
    blocks and spread them again.
    """
    v = self.view
    if spot < 0 or spot > len(v) or len(v)==1 : return
    if spot > 0:
      spot = spot-1
    self._merge(spot)
    if spot < len(v)-1:
      self._merge(spot)
    self._spread(spot)
      
  def _merge(self, spot):
    """
    merge this block and the next one.
    Delete the then empty next one
    """
    v = self.view
    v[spot].data = v[spot].data + v[spot+1].data
    v.delete(spot+1)
    
  def _spread(self, spot):
    """
    Spread this block into equally sized ones.
    """
    v = self.view
    source = v[spot]
    bs = self.blocksize
    nblocks = (len(source.data)+bs-1) / bs
    chunk = len(source.data) / nblocks +1
    if chunk >= self.upper-10 or chunk < self.lower+10: chunk = bs
    chunk, nextchunk = len(source.data) % chunk, chunk
    if chunk < self.lower:
      chunk = chunk + nextchunk
    while len(source.data) > chunk:
      self.view.insert(spot+1)
      self.view[spot+1].data=source.data[-chunk:]
      source.data = source.data[:-chunk]
      chunk = nextchunk
    return

  def _getrec(self, subview):
    res = []
    ga = getattr
    for name in self.names:
      res.append(ga(subview, name))
    return res
    
  def __del__(self):
    del self.view

view_struc = "big_test[A:I,B:I,C:I,D:I,E:I,F:I,G:I,H:I,J:I,K:I]"

dbpath = "_bigfile.mk"

n_fields = 10

rand_len = 4096 + n_fields

random_ints = map(lambda x:whrandom.randint(0,256), range(rand_len))

blocksize = 10000 # default size for append, but not mandatory

ds = None

db = big_mk(dbpath, 1)

ds = db.getas(view_struc)

def make_rec(idx):
  return random_ints[idx:idx+n_fields]
  
def add_recs(n=1000):
  for i in range(n):
    idx = len(ds) % rand_len
    ds.append(make_rec(idx))

if __name__ == '__main__':
    # expect this to take hours (1000..2000 recs/sec on modern PII)
    import sys
    for i in xrange(25000):
        add_recs()
        sys.stdout.write(".")
        sys.stdout.flush()
        if i % 50 == 49:
            db.commit()
            sys.stdout.write("\n")