File: databases.py

package info (click to toggle)
serpento 0.4.1-0.2
links: PTS
area: main
in suites: etch, etch-m68k
size: 360 kB
ctags: 391
sloc: python: 1,762; ansic: 669; perl: 157; sh: 127; makefile: 73
file content (308 lines) | stat: -rw-r--r-- 8,586 bytes
parent folder | download | duplicates (2)
import string, mmap, os, cPickle
from utils import *
from searching import loop_in_C
import dictzip, sgmllib, cStringIO, struct
import md5

from stat import ST_MTIME

execfile("config.py")

def unique_strings(l):
    """Return the distinct items of l.

    Duplicates are collapsed by using the items as dictionary keys, so the
    items must be hashable; the result is whatever the dictionary's keys()
    yields, with no ordering promised.
    """
    seen = dict.fromkeys(l, 1)
    return seen.keys()


class EmptyDb:
    """Stub database without any entries: every query comes back empty."""

    def __init__(self, name):
        self.name = name
        self.info = "empty database"

    def match(self, strat, word):
        """Return an empty list, whatever the strategy and word are."""
        return list()

    def define(self, word):
        """Return an empty list, whatever the word is."""
        return list()

class FullDb:
    """Stub database that answers every query with fixed canned data."""

    def __init__(self, name):
        self.name = name
        self.info = "full database"

    def match(self, strat, word):
        """Return two canned (database name, headword) pairs."""
        return [(self.name, headword) for headword in ("one", "two")]

    def define(self, word):
        """Return one canned (database name, definition text) pair."""
        canned = (self.name, "bla bla blablabla")
        return [canned]

def sort_index(original, sorted):
    """Write a lower-cased, byte-sorted copy of an index file.

    Every line of `original` is split at the first TAB into a headword and
    the rest of the line.  The headword is decoded as UTF-8 (falling back to
    ISO-8859-1 when it is not valid UTF-8), lower-cased and re-encoded as
    UTF-8.  The resulting lines are sorted as plain byte strings and written
    to `sorted`.

    Lines without a TAB are reported on stdout and dropped.

    original -- path of the index file to read
    sorted   -- path of the file to (over)write with the sorted index
    """
    src = open(original)
    try:
        lines = src.readlines()
    finally:
        src.close()

    normalized = []
    for raw in lines:
        fields = raw.split(TAB, 1)
        if len(fields) != 2:
            print("corrupted index entry %s" % repr(raw))
            continue
        entry, rest = fields
        try:
            entry = unicode(entry, 'utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: assume ISO-8859-1, which accepts any byte.
            entry = unicode(entry, 'ISO-8859-1')
        entry = entry.lower().encode('utf-8')
        # The last line of the input may lack its newline; once sorted it can
        # land anywhere, so terminate it to keep it from merging with the
        # entry written after it.
        if rest[-1:] != "\n":
            rest = rest + "\n"
        normalized.append(entry + TAB + rest)
    normalized.sort()

    out = open(sorted, "w")
    try:
        out.writelines(normalized)
    finally:
        out.close()

def getcachenames(orig_inx):
    """Return the cache file names belonging to the index path orig_inx.

    Both names live in `cachedir` and share the MD5 hex digest of the index
    path as their stem.  The result is the pair
    (sorted index cache, positions cache).
    """
    stem = md5.new(orig_inx).hexdigest()
    return (os.path.join(cachedir, stem + ".index"),
            os.path.join(cachedir, stem + ".positions"))

def _cache_is_fresh(inx, inx_cachename, positions_cachename):
    """Tell whether both cache files exist and are newer than the index inx."""
    if not (os.path.isfile(inx_cachename)
            and os.path.isfile(positions_cachename)):
        return 0
    indextimestamp = os.stat(inx)[ST_MTIME]
    return (os.stat(inx_cachename)[ST_MTIME] > indextimestamp
            and os.stat(positions_cachename)[ST_MTIME] > indextimestamp)


def _map_readonly(path):
    """Return (read-only mmap of the whole file at path, its size in bytes)."""
    fo = open(path, "rb")
    try:
        fo.seek(0, 2)
        size = fo.tell()
        return mmap.mmap(fo.fileno(), size, prot=mmap.PROT_READ), size
    finally:
        # The mapping stays usable for reading after the file object is
        # closed; the original code dropped the file object at return too.
        fo.close()


def _write_positions(index_map, positions_cachename):
    """Write the start offset of every line of index_map, packed as 'i'."""
    out = open(positions_cachename, "wb")
    try:
        while 1:
            offset = index_map.tell()
            if not index_map.readline():
                break
            out.write(struct.pack('i', offset))
    finally:
        out.close()


def dbargs(name, data=None, inx=None):
    """Open the files behind database `name` and return its lookup handles.

    name -- database name; supplies the default file names
    data -- data file, default name+".dict"
    inx  -- index file, default name+".index"

    A file name that contains no path separator is looked up in
    /usr/share/dictd/.  A data file ending in ".dz" is opened with
    dictzip.DictzipFile; otherwise it is opened as a plain file, and when
    that fails with IOError the ".dz" variant is tried.

    The index is not used directly.  A sorted copy (see sort_index) and a
    table of line offsets are kept in the cache files named by
    getcachenames(); both are rebuilt unless they exist and are newer than
    the index.

    Returns a 4-tuple:
      positions -- read-only mmap of the offsets table (one struct 'i' value
                   per line of the sorted index)
      nrwords   -- size of the offsets table divided by INT_SIZE
      index     -- read-only mmap of the sorted index, positioned at 0
      datafo    -- open file-like object for the data file
    """
    if not data:
        data = name + ".dict"
    if not inx:
        inx = name + ".index"
    sep = os.sep
    sep1 = os.altsep or os.sep
    if not ((sep in data) or (sep1 in data)):
        data = "/usr/share/dictd/%s" % data
    if not ((sep in inx) or (sep1 in inx)):
        inx = os.path.normpath("/usr/share/dictd/" + inx)

    if data[-3:] == '.dz':
        datafo = dictzip.DictzipFile(data)
    else:
        try:
            datafo = open(data)
        except IOError:
            datafo = dictzip.DictzipFile(data + ".dz")

    inx_cachename, positions_cachename = getcachenames(inx)
    cache_is_good = _cache_is_fresh(inx, inx_cachename, positions_cachename)

    if not cache_is_good:
        sort_index(inx, inx_cachename)

    index_map, indexlen = _map_readonly(inx_cachename)
    index_map.seek(0)

    if not cache_is_good:
        # Stream the offsets straight to disk; the headwords themselves are
        # not needed for the table.
        _write_positions(index_map, positions_cachename)
        index_map.seek(0)

    positions, positions_len = _map_readonly(positions_cachename)
    return positions, positions_len // INT_SIZE, index_map, datafo


class FileDb:
    """Database read from an index file and a data file.

    Entries in index are in UTF8, sorted byte-after-byte.

    Nothing is opened by the constructor; the files are opened through
    dbargs() on the first call to define() or match().
    """

    def __init__(self, name, data=None, inx=None, info=None):
        """Remember the file names; `info` defaults to `name`."""
        if not info:
            info = name
        self.name = name
        self.info = info
        self.data = data
        self.datafo = None
        self.index = None
        self.inx = inx
        self.initialized = 0

    def initialize(self):
        """Open the database files once; later calls do nothing."""
        if not self.initialized:
            (self.positions, self.nrwords,
             self.index, self.datafo) = dbargs(self.name, self.data, self.inx)
            self.initialized = 1

    def transformentry(self, s):
        # transforms read entry into plain text
        # or, in the future, into mime/html/sgml/whatever
        # to be overriden
        return s

    def readentry(self, arg):
        """Read one entry from the data file.

        arg is an (entry, start, length) tuple; only start and length are
        used.  Returns (database name, transformed entry text).
        """
        entry, st, ln = arg
        self.datafo.seek(st)
        return self.name, self.transformentry(self.datafo.read(ln))

    def _index_line(self, i):
        """Return line number i of the index, without trailing whitespace."""
        packed = self.positions[INT_SIZE * i:INT_SIZE * (i + 1)]
        self.index.seek(struct.unpack('i', packed)[0])
        return self.index.readline().rstrip()

    def define(self, word):
        """Return a list of (database name, entry text) for `word`.

        NOTE(review): loop_in_C is taken to return line numbers of the
        matching index entries -- confirm against the searching module.
        """
        self.initialize()
        found = []
        for i in loop_in_C(self.index, self.positions, self.nrwords,
                           word, 0, 0):
            entry, st, ln = self._index_line(i).split(TAB)
            found.append((entry, b64_decode(st), b64_decode(ln)))
        return [self.readentry(item) for item in found]

    def match(self, strategy, word):
        """Return the distinct (database name, headword) pairs for `word`.

        strategy is a key of `strategies`; '.' stands for 'lev'.  An unknown
        strategy yields an empty list.  Malformed index lines are reported
        on stdout and skipped.
        """
        self.initialize()
        if strategy == '.':
            strategy = 'lev'
        if strategy not in strategies:
            return []
        strategy = strategies[strategy][0]

        hits = []
        # mmap pointer, list of index entries, word, strat, max nr
        for i in loop_in_C(self.index, self.positions, self.nrwords,
                           word, strategy, 20):
            l = self._index_line(i)
            fields = l.split(TAB, 2)
            if len(fields) != 3:
                print("bad index file: l= %s i= %s" % (repr(l), repr(i)))
                # Skip the line; falling through would reuse the previous
                # iteration's values (or fail on the first one).
                continue
            # Only the headword is reported, so offset and length are not
            # decoded here.
            hits.append((self.name, fields[0]))
        return unique_strings(hits)

    def __del__(self):
        # getattr: __init__ may not have completed, and positions only
        # exists once initialize() has run.
        for attr in ('datafo', 'index', 'positions'):
            handle = getattr(self, attr, None)
            if handle is not None:
                handle.close()



class FileDbDict(FileDb):
    """Entries in index are in UTF8, sorted byte-after-byte
       Database file is raw dict file (with %h, %d)
    """

    def transformentry(self, s):
        """Transform a raw entry into plain text.

        The entry is expected to start with zero or more "%h" lines
        (headwords), followed by a "%d" line and then the body.  The result
        is the headwords, a "-----" separator line and the body.  When the
        "%d" line is missing, the text 'wrong entry, please check' is
        returned instead.
        """
        lines = s.split('\n')
        reply = []
        i = 0
        # Collect the leading %h lines; the bound check keeps an entry that
        # consists of %h lines only from running off the end of the list.
        while i < len(lines):
            stripped = lines[i].lstrip()
            if stripped[:2] != '%h':
                break
            reply.append(stripped[2:])
            i += 1
        if i >= len(lines) or lines[i].strip() != '%d':
            return 'wrong entry, please check'
        reply.append("-----") # separating entry and body
        reply.extend(lines[i + 1:])
        return '\n'.join(reply)



class Parser(sgmllib.SGMLParser):
    """SGML parser that collects the <orth> and <tr> texts of an entry.

    Text inside <orth> is appended to self.header, text inside <tr> to
    self.translations.  At </entry> both lists are passed to
    process_entry() and the outcome is stored in self.result.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.inorth = self.intr = 0
        # Start with empty lists so that an entry lacking <form> or <trans>
        # does not fail with AttributeError in handle_data()/end_entry().
        self.header = []
        self.translations = []
        self.result = ""

    def start_entry(self, a):
        pass

    def end_entry(self):
        self.result = process_entry(self.header, self.translations)

    def start_form(self, a):
        # A new <form> starts a fresh list of headwords.
        self.header = []

    def end_form(self):
        pass

    def start_orth(self, a):
        self.inorth = 1

    def end_orth(self):
        self.inorth = 0

    def start_tr(self, a):
        self.intr = 1

    def end_tr(self):
        self.intr = 0

    def start_trans(self, a):
        # A new <trans> starts a fresh list of translations.
        self.translations = []

    def end_trans(self):
        pass

    def handle_data(self, d):
        # Text outside <orth> and <tr> is ignored.
        if self.inorth:
            self.header.append(d)
        elif self.intr:
            self.translations.append(d)
        

def process_entry(orths, trs):
    """Format one entry as two lines of text.

    The first line is the items of orths joined with `seporth`; the second
    line is two spaces followed by the items of trs joined with `septrans`.
    Both lines end with a newline.
    """
    head_line = seporth.join(orths) + "\n"
    body_line = "  " + septrans.join(trs) + "\n"
    return head_line + body_line

class FileDbTeit(FileDb):
    """Entries in index are in UTF8, sorted byte-after-byte
       Database file is raw tei file
    """

    def transformentry(self, s):
        """Transform a raw TEI entry into plain text.

        The entry is fed line by line to a fresh Parser; the text that the
        parser built for the entry is returned ("" when the parser never saw
        a closing entry tag).
        """
        p = Parser()
        for line in s.split('\n'):
            # Feed the current line; the earlier code passed the whole list
            # of lines on every iteration.
            p.feed(line)
        # Let the parser process anything it is still buffering.
        p.close()
        return p.result