File: dict_spd.py

package info (click to toggle)
sphinxtrain 5.0.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 32,572 kB
  • sloc: ansic: 94,052; perl: 8,939; python: 6,702; cpp: 2,044; makefile: 6
file content (55 lines) | stat: -rwxr-xr-x 1,697 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
#
# Copyright (c) 2009 Carnegie Mellon University
#
# You may copy and modify this freely under the same terms as
# Sphinx-III
"""
Generate a single-pronunciation dictionary from an input dictionary
and the output of force alignment.
"""

__author__ = "David Huggins-Daines <dhdaines@gmail.com>"
__version__ = "$Revision $"

from collections import defaultdict
from cmusphinx import s3dict
import sys

def count_aligned_words(lines, known):
    """Count how often each known dictionary entry occurs in alignment output.

    Each line is split on whitespace and its final token is skipped
    (presumably the utterance ID appended by the aligner -- confirm
    against the force-alignment output format).  Remaining tokens are
    counted only when ``token in known`` is true.

    :param lines: iterable of text lines (e.g. an open file).
    :param known: any container supporting ``in`` for entry names such
        as ``"WORD"`` or ``"WORD(2)"``.
    :returns: a ``defaultdict(int)`` mapping entry name to its count.
    """
    counts = defaultdict(int)
    for line in lines:
        for token in line.split()[:-1]:
            if token in known:
                counts[token] += 1
    return counts


def best_pronunciation(word, nalts, counts):
    """Return the entry name of the most frequently aligned pronunciation.

    Alternative 1 is named ``word`` itself; alternative ``k`` (k >= 2) is
    named ``"word(k)"``.  The base entry is the starting choice, and a
    later alternative replaces it only with a strictly greater count, so
    ties go to the lowest-numbered alternative and a word that never
    appears in the alignment keeps its base pronunciation.

    :param word: base word as listed in the dictionary.
    :param nalts: number of pronunciations the dictionary has for *word*.
    :param counts: mapping from entry name to occurrence count; missing
        names are treated as zero and the mapping is not modified.
    :returns: the entry name to look up in the dictionary.
    """
    best = word
    best_count = counts.get(word, 0)
    for alt in range(2, nalts + 1):
        name = "%s(%d)" % (word, alt)
        alt_count = counts.get(name, 0)
        if alt_count > best_count:
            best, best_count = name, alt_count
    return best


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: %s INDICT FALIGNOUT [OUTDICT]" % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)
    indict = s3dict.open(sys.argv[1])
    # Close the alignment file as soon as the counts have been gathered.
    with open(sys.argv[2]) as falignout:
        counts = count_aligned_words(falignout, indict)
    words = sorted(indict.words())
    # Write to OUTDICT when given, otherwise to standard output.
    outfh = open(sys.argv[3], "w") if len(sys.argv) > 3 else sys.stdout
    try:
        for w in words:
            # indict.alts() is only iterated to count the pronunciations.
            nalts = sum(1 for _ in indict.alts(w))
            chosen = best_pronunciation(w, nalts, counts)
            print("%s\t\t%s" % (w, " ".join(indict[chosen])), file=outfh)
    finally:
        # Only close a file we opened ourselves; never close sys.stdout.
        if outfh is not sys.stdout:
            outfh.close()