File: sendump.py

package info (click to toggle)
sphinxtrain 1.0.8%2B5prealpha%2B1-6
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 31,356 kB
  • sloc: ansic: 36,168; sh: 11,156; perl: 8,175; python: 5,620; cpp: 2,037; makefile: 1,552
file content (82 lines) | stat: -rw-r--r-- 2,793 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python

import sys
import s3mixw
import struct
import numpy


class Sendump(object):
    """Reader for a binary ``sendump`` file holding quantized mixture weights.

    Constructing an instance immediately loads *filename*.  Afterwards:

    * ``integer_format`` is the ``struct`` format string that was used to
      decode the 32-bit unsigned integers in the file.
    * ``opdf`` is a float array shaped
      ``(mixture_count, feature_count, model_count)`` (also returned by
      :meth:`mixw`).
    """

    # Header keys recognised by load(), paired with the value used when the
    # file does not supply them.  Kept as byte strings so they compare
    # directly against what readstr() returns from a binary-mode file.
    _HEADER_DEFAULTS = (
        (b"mixture_count", 0),
        (b"model_count", 0),
        (b"mixw_shift", 10),
        (b"cluster_bits", 8),
        (b"feature_count", 0),
        (b"cluster_count", 0),
    )

    def __init__(self, filename):
        """Load the sendump file at path *filename*."""
        self.integer_format = None
        self.load(filename)

    def unpack_endian(self, s):
        """Decode the 4-byte string *s* as an unsigned 32-bit integer.

        The first call assumes native byte order.  Any value above 65536 is
        taken as evidence that the file was written with the opposite byte
        order; the format is then switched (and stays switched for later
        calls) and the value is decoded again.

        Raises ``struct.error`` if *s* is not exactly four bytes long, which
        is what happens when the file ends prematurely.
        """
        if self.integer_format is None:
            self.integer_format = 'I'
        nbytes = struct.unpack(self.integer_format, s)[0]
        if nbytes > 65536:
            # Fall back to the byte order opposite to the host's.  A fixed
            # network-order fallback would be a no-op on big-endian hosts.
            self.integer_format = '>I' if sys.byteorder == 'little' else '<I'
            nbytes = struct.unpack(self.integer_format, s)[0]
        return nbytes

    def readstr(self, fh):
        """Read one length-prefixed string from the open binary file *fh*.

        Returns ``None`` when the length prefix is zero; otherwise returns
        the payload with its final byte dropped, as a byte string.
        """
        nbytes = self.unpack_endian(fh.read(4))
        if nbytes == 0:
            return None
        return fh.read(nbytes)[:-1]

    def mixw(self):
        """Return the array built by :meth:`load`."""
        return self.opdf

    def load(self, filename):
        """Parse the file at *filename* and store the result in ``self.opdf``.

        The file is read as: a title string, then header strings of the form
        ``"<key> <int>"`` until a zero-length string, then (only when
        ``cluster_count`` is 0) two integers giving ``model_count`` and
        ``mixture_count``, then ``feature_count * model_count`` runs of
        ``mixture_count`` unsigned bytes.

        Each byte ``b`` is stored as ``1.0001 ** -(b << mixw_shift)``.
        A one-line summary of the dimensions is printed to standard output.
        """
        params = dict(self._HEADER_DEFAULTS)

        # Binary mode: the payload is raw bytes and must not go through
        # newline translation or text decoding.  The context manager makes
        # sure the handle is released even if parsing fails.
        with open(filename, 'rb') as sendump:
            self.readstr(sendump)  # title string; its content is not used
            while True:
                header = self.readstr(sendump)
                if header is None:
                    break
                for key in params:
                    if header.startswith(key):
                        params[key] = int(header.split()[1])

            mixture_count = params[b"mixture_count"]
            model_count = params[b"model_count"]
            mixw_shift = params[b"mixw_shift"]
            feature_count = params[b"feature_count"]
            # cluster_bits is parsed from the header but not used below.

            if params[b"cluster_count"] == 0:
                # Number of codewords and pdfs follow the header directly.
                model_count = self.unpack_endian(sendump.read(4))
                mixture_count = self.unpack_endian(sendump.read(4))
            print("rows (model_count): %d, columns (mixture_count): %d, features (feature_count): %d" % (model_count, mixture_count, feature_count))

            # Now read the stuff
            self.opdf = numpy.empty((mixture_count, feature_count, model_count))
            for i in range(feature_count):
                for j in range(model_count):
                    # Read bytes, expand to ints, shift them up
                    mixw = numpy.fromfile(sendump, 'B', mixture_count).astype('i') << mixw_shift
                    # Negate, exponentiate, and untranspose
                    self.opdf[:, i, j] = numpy.power(1.0001, -mixw)

def usage():
    """Print a one-line command-line synopsis to standard output."""
    # A single parenthesised argument prints the same text whether this is
    # parsed as a Python 2 print statement or a Python 3 print() call.
    print("Usage: %s IN_SENDUMP OUT_MIXW" % sys.argv[0])

if __name__ == '__main__':
    # Script entry point: convert IN_SENDUMP (argv[1]) to OUT_MIXW (argv[2]).
    arguments = sys.argv
    if not len(arguments) >= 3:
        # Too few arguments: show the synopsis and exit with status 2.
        usage()
        sys.exit(2)
    # The output is opened before the input is loaded, then the loaded
    # weights are written to it.
    destination = s3mixw.open(arguments[2], 'wb')
    destination.writeall(Sendump(arguments[1]).mixw())