File: utils.py

# -*- coding: UTF-8 -*-
"""

Copyright © 2009, Muayyad Alsadi <alsadi@ojuba.org>

    Released under terms of Waqf Public License.
    This program is free software; you can redistribute it and/or modify
    it under the terms of the latest version Waqf Public License as
    published by Ojuba.org.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

    The Latest version of the license can be found on
    "http://waqf.ojuba.org/license"

"""
import sys, time, hashlib, random
import bisect
from itertools import groupby
import re

# fall back to UTF-8 when Python reports an ANSI_* filesystem encoding (e.g. under the POSIX/C locale)
fs_encoding=sys.getfilesystemencoding()
if fs_encoding.startswith('ANSI'): fs_encoding='UTF-8'
dD_re=re.compile(r'(\d+|\D+)') # splits a string into alternating digit and non-digit runs

def stringOrFloat(s):
  try: return float(s)
  except ValueError: return s

def stringOrInt(s):
  try: return int(s)
  except ValueError: return s


def strverscmp(a,b):
  "something like GNU strverscmp"
  return cmp([stringOrInt(i) for i in dD_re.findall(a)],
    [stringOrInt(i) for i in dD_re.findall(b)])

def strverscmp_old(a,b):
  "something like GNU strverscmp"
  return cmp([stringOrFloat(i) for i in dD_re.findall(a)],
    [stringOrFloat(i) for i in dD_re.findall(b)])
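
# Hedged usage sketch (added for illustration, not part of the original module):
# strverscmp orders embedded digit runs numerically, so 'file10' sorts after 'file9'.
def _strverscmp_example():
  assert strverscmp('file9', 'file10') < 0  # 9 < 10 numerically
  assert strverscmp('1.2.10', '1.2.9') > 0  # 10 > 9 in the last component
  assert strverscmp('abc', 'abc') == 0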

def fromFs(filenameblob):
  """
  receives a blob encoded in the filesystem encoding and converts it into a unicode object
  """
  if type(filenameblob)==str: # a byte string: decode it
    return filenameblob.decode(fs_encoding)
  return filenameblob # already unicode

def toFs(filename):
  """
  receives a unicode object and encodes it into the filesystem encoding
  """
  if type(filename)==unicode: # a unicode object: encode it
    return filename.encode(fs_encoding)
  return filename # already a byte string
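
# Hedged usage sketch (added for illustration): round-tripping a non-ASCII name,
# assuming fs_encoding resolved to 'UTF-8' on this system.
def _fs_encoding_example():
  blob=toFs(u'caf\xe9.txt')           # unicode -> byte string
  assert fromFs(blob)==u'caf\xe9.txt' # byte string -> unicode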


def randomblob(m,M):
  "returns a random byte string whose length is in range(m,M)"
  return ''.join([chr(random.randrange(0,256)) for i in range(random.randrange(m,M))]) # bytes 0..255


def safeHash(stringSeed, o):
  """
  a URL-safe hash; returns a 22-character string based on the md5 digest
  """
  if isinstance(o,unicode): o=o.encode('utf8') # md5 needs a byte string
  return hashlib.md5(stringSeed+o).digest().encode('base64').replace('+','-').replace('/','_')[:22]
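
# Hedged usage sketch (added for illustration): the result is 22 characters and
# uses '-' and '_' instead of base64's '+' and '/', so it is safe inside URLs.
def _safeHash_example():
  h=safeHash('some-seed', u'hello world')
  assert len(h)==22
  assert '+' not in h and '/' not in h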

# TODO: is this needed by anything?
def unixUniq(l):
  """
  Unix-like uniq; the iterable argument should be sorted first to get unique elements.
  """
  return [k for k,g in groupby(l)]

def unixUniqAndCount(l):
  """
  Unix-like uniq -c; returns a list of (count, unique_entry) tuples
  """
  return [(len(list(g)),k) for k,g in groupby(l)]
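
# Hedged usage sketch (added for illustration): together these behave like the
# shell pipelines `sort | uniq` and `sort | uniq -c`.
def _uniq_example():
  assert unixUniq(sorted('banana'))==['a','b','n']
  assert unixUniqAndCount(sorted('banana'))==[(3,'a'),(1,'b'),(2,'n')]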

class ObjectsCacheObject:
  def __init__(self, objId, obj):
    self.objId=objId
    self.obj=obj
    self.atime=self.ctime=time.time()
    self.freq=1 # how many times it was accessed
    self.i=0 # shifted index

  def __cmp__(self, b):
    # passing cmp as an argument is twice as fast as overloading __cmp__
    if isinstance(b,ObjectsCacheObject): return cmp(self.atime,b.atime) # order by access time
    if b<=0: return cmp(self.i,-b) # a non-positive number: compare the shifted index against -b
    return cmp(self.atime,b) # a positive number: compare the access time against b

def cmp_bisect_right(ccmp, a, x, lo=0, hi=None):
    """
    same as bisect.bisect_right but uses a custom cmp function
    """
    if lo < 0:
        raise ValueError('lo must be non-negative')
    if hi is None:
        hi = len(a)
    while lo < hi:
        mid = (lo+hi)>>1
        if ccmp(a[mid],x)>0: hi = mid # i.e. if x < a[mid]
        else: lo = mid+1
    return lo

def cmp_bisect_left(ccmp, a, x, lo=0, hi=None):
    """
    same as bisect.bisect_left but uses a custom cmp function
    """
    if lo < 0:
        raise ValueError('lo must be non-negative')
    if hi is None:
        hi = len(a)
    while lo < hi:
        mid = (lo+hi)>>1
        if ccmp(x, a[mid])>0: lo = mid+1 # i.e. if a[mid] < x
        else: hi = mid
    return lo
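
# Hedged usage sketch (added for illustration): with the builtin cmp these match
# bisect.bisect_left and bisect.bisect_right on an already sorted list.
def _cmp_bisect_example():
  a=[1,3,3,5]
  assert cmp_bisect_left(cmp,a,3)==1  # leftmost position of 3
  assert cmp_bisect_right(cmp,a,3)==3 # just past the rightmost 3
  assert cmp_bisect_left(cmp,a,4)==3  # insertion point that keeps a sorted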

class fakeLock:
  "a no-op stand-in used when no real lock is supplied"
  def acquire(self, blocking=True): pass
  def release(self): pass

class ObjectsCache:
  _feasible_bisect_limit=4
  def __init__(self, lock=None, minCount=10, maxCount=100, maxTime=3600):
    """
    minCount is the minimum number of cached objects below which no cached object will be freed, use 0 to set no lower limit
    maxCount is the maximum number of cached objects above which no cached object will be kept, use 0 to set no upper limit
    maxTime is positive time to live in seconds, all objects older than this will be removed when free is called, use 0 to discard time checking
    
    example:
      setting minCount and maxTime to 0 will keep all cached objects no matter how many or for how long
      setting maxCount to 0 will disable caching (all objects will be freed when free is called)
    """
    if not lock: self._lock=fakeLock()
    else: self._lock=lock
    self.maxTime=maxTime
    self.maxCount=maxCount
    self.minCount=minCount
    self._hash={} # a mapping between ids and shifted array index
    self._shift=0 # the current shift in array index
    self._shifts=0 # number of tailing shifts done
    self.objs=[] # objects list from older to most recent
    # when freeing old objects we don't need to update the mapping between ids and array index, we only adjust the shift

  def _get(self, objId):
    i=self._hash.get(objId,None)
    if i is None: return None
    si=min(i+self._shift,len(self.objs)-1)
    # NOTE: without tailing shifts it would simply be o=self.objs[si]
    # because of tailing shifts we need to search within objs[si-shifts:si]
    # when shifts==0 the object is exactly at si
    # if shifts>0 the worst case is that our very object was the target of all past tailing shifts, i.e. it is at objs[si-shifts]
    # NOTE: the worst case happens when one keeps accessing the least recently used object, which can only happen if the user accesses all objects uniformly
    cb=si-self._shifts # candidates lower bound
    if cb<0: cb=0
    if si-cb<self._feasible_bisect_limit:
      sh=si
      while True: # could be "while sh>=cb" but termination is guaranteed without it
        o=self.objs[sh]
        if o.i==i: break
        sh-=1
    else:
      sh=cb+cmp_bisect_left(lambda a,b: cmp(a,b.i), self.objs[cb:si+1],i)
      o=self.objs[sh]
    o.atime=time.time()
    o.freq+=1
    # tailing shifts should be done if o is not the last one
    # change position from: oldest, older, *accessed*, old, recent
    # to be something like: oldest, older, old, recent, *accessed*
    # this will cause the shifted index returned from the hash for old and recent to be greater than its real value
    if sh<len(self.objs)-1:
      self._shifts+=1
      del self.objs[sh]
      if self.objs: i=self.objs[-1].i+1
      else: i=0
      self._hash[objId]=i
      o.i=i
      self.objs.append(o)
    self._free()
    return o.obj

  def get(self, objId):
    self._lock.acquire()
    try: o=self._get(objId)
    finally: self._lock.release()
    return o

  def append(self, objId, obj):
    self._lock.acquire()
    try:
      if objId in self._hash:
        # replace it
        self._get(objId) # this will make it the last object
        # remove it
        del self.objs[-1]
        del self._hash[objId]
      else: self._free()
      # add it
      o=ObjectsCacheObject(objId, obj)
      if self.objs: i=self.objs[-1].i+1
      else: i=0
      self._hash[objId]=i
      o.i=i
      self.objs.append(o)
    finally: self._lock.release()

  def free(self):
    """
    free old objects, return number of freed objects
    """
    self._lock.acquire()
    try:
      r=self._free()
    finally: self._lock.release()
    return r

  def _free(self):
    """
    the real internal free is done here without locks, locks are done in free()
    """
    # if there are too many tailing shifts then reconstruction would be feasible
    if len(self.objs)>self._feasible_bisect_limit and (self._shifts>>1)>len(self.objs): self._reconstruct()
    # TODO: also if the oldest element is older than some limit, an expensive reconstruction based on frequency may be feasible
    l=len(self.objs)
    if self.minCount>0 and l<=self.minCount: return 0
    k=l-self.minCount # max number of objs to remove
    j=max(l-self.maxCount,0) # number of objs to remove by count
    if self.maxTime>0:
      c=time.time()-self.maxTime # objects last accessed before this time should be removed
      i=cmp_bisect_right(lambda a,b: cmp(a.atime,b),self.objs,c) # number of objs to remove by time
      i=max(i,j)
    else: i=j
    if self.minCount>0: i=min(i,k) # never remove so many that fewer than minCount remain
    # this can also be done by deleting hash entries whose values are < -newshift
    for o in self.objs[:i]: del self._hash[o.objId]
    del self.objs[:i]
    self._shift-=i
    return i

  def _reconstruct(self):
    """
    reconstruct the objects list so that no shifting is used;
    this expensive O(n) operation might speed up subsequent retrievals
    """
    for i,o in enumerate(self.objs):
      o.i=i; self._hash[o.objId]=i
    self._shift=0; self._shifts=0
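
# Hedged usage sketch (added for illustration, names are hypothetical): a small
# cache capped at 3 objects with no time-based expiry; calling free() after a
# 4th append drops the least recently used entry.
def _objects_cache_example():
  c=ObjectsCache(minCount=0, maxCount=3, maxTime=0)
  for n in ('a','b','c','d'): c.append(n, n.upper())
  c.free()                # over maxCount: the oldest entry ('a') is freed
  assert c.get('a')==None # 'a' is gone
  assert c.get('d')=='D'  # recent entries survive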