1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
|
# Copyright (c) 2007 Carnegie Mellon University
#
# You may copy and modify this freely under the same terms as
# Sphinx-III
"""Corpus classes for acoustic model training.
This module provides classes for representing a corpus of utterances
for acoustic modeling. The Corpus class implements the iterator
protocol, acting as a list of Utterance objects.
"""
__author__ = "David Huggins-Daines <dhdaines@gmail.com>"
__version__ = "$Revision$"
import os
class Resource(object):
    """Base class for a resource attached to utterances in a speech corpus.

    An utterance may carry an arbitrary set of resources: waveforms,
    acoustic feature files, transcriptions, other forms of
    supervision, and so on.  This class defines no behaviour of its
    own.
    """
class FileResourceIterator(object):
    """
    Iterator over items in a FileResource.

    For every entry of the underlying control file a path of the form
    ``base_dir/<entry><file_ext>`` is built.  If the resource has a
    data_type, the path is passed to it and the result is returned;
    otherwise the path itself is returned.
    """

    def __init__(self, resource):
        """
        Start iterating over a resource.

        @param resource: Resource to iterate over.  Its ctl_file,
                         base_dir, file_ext and data_type attributes
                         are read.
        """
        self.res = resource
        self.ctl = iter(resource.ctl_file)

    def __iter__(self):
        """Return self, so the iterator can itself be passed to
        iter(), list() or a for loop."""
        return self

    def __next__(self):
        """
        Return the item for the next control entry.

        @raise StopIteration: When the control file is exhausted.
        """
        # This will raise StopIteration for us at EOF
        entry = next(self.ctl)
        # Control entries may be CtlEntry objects or plain strings.
        fileid = entry.fileid if isinstance(entry, CtlEntry) else entry
        path = os.path.join(self.res.base_dir, fileid + self.res.file_ext)
        if self.res.data_type:
            return self.res.data_type(path)
        return path
class FileResource(Resource):
    """
    Corpus resource with one file per control-file entry.

    Iterating yields, for each control entry, either the file path or
    an object built from that path by data_type.
    """

    def __init__(self, ctl_file, base_dir, file_ext, data_type=None):
        """
        Set up a resource backed by files.

        @param ctl_file: Control file resource that supplies the entries
        @type ctl_file: iterable of CtlEntry or string
        @param base_dir: Directory prepended to every control entry
        @param file_ext: Filename extension appended to every control entry
        @param data_type: Optional class (or callable) applied to each
                          path; when omitted the path is returned as is.
        @type data_type: type
        """
        self.data_type = data_type
        self.file_ext = file_ext
        self.base_dir = base_dir
        self.ctl_file = ctl_file

    def __iter__(self):
        """Return a fresh FileResourceIterator over this resource."""
        iterator = FileResourceIterator(self)
        return iterator
class CtlEntry(object):
    """Entry in a control file.

    Attributes set by the constructor: fileid, sf, ef and uttid.
    (sf / ef are presumably the start and end frame -- confirm against
    the control file format.)
    """

    def __init__(self, str):
        """
        Parse one control file line.

        A line with exactly four whitespace-separated fields is read
        as ``fileid sf ef uttid`` with sf and ef converted to int.
        Any other line is used whole as both fileid and uttid, with
        sf = 0 and ef = -1.

        @param str: Text of the control file line
        @type str: string
        """
        parts = str.split()
        if len(parts) != 4:
            # Not the four-field form: the whole line names the utterance.
            self.fileid = str
            self.uttid = str
            self.sf, self.ef = 0, -1
            return
        fileid, start, end, uttid = parts
        self.fileid = fileid
        self.uttid = uttid
        self.sf = int(start)
        self.ef = int(end)
class ListResourceIterator(object):
    """
    Iterator over items in a ListResource.

    Reads the resource's file one line at a time, strips trailing
    whitespace, and returns each line either as a string or converted
    by the resource's data_type.  The file is closed as soon as the end
    is reached, or at the latest when the iterator is destroyed.
    """

    def __init__(self, resource):
        """
        Open the file behind a resource.

        @param resource: Resource to iterate over.  Its file_name and
                         data_type attributes are read.
        """
        # Define fh before open() so that __del__ stays safe even when
        # open() raises and construction is abandoned half-way.
        self.fh = None
        self.data_type = resource.data_type
        self.fh = open(resource.file_name)

    def __del__(self):
        self._close()

    def __iter__(self):
        """Return self, so the iterator can itself be passed to
        iter(), list() or a for loop."""
        return self

    def _close(self):
        """Close the file handle if it is still open."""
        fh = getattr(self, "fh", None)
        if fh is not None:
            fh.close()
            self.fh = None

    def __next__(self):
        """
        Return the next item of the list.

        @raise StopIteration: At end of file, and on every call after.
        """
        if self.fh is None:
            # Already exhausted and closed.
            raise StopIteration
        line = self.fh.readline()
        if line == "":
            # readline() returns "" only at EOF; release the file now
            # rather than waiting for garbage collection.
            self._close()
            raise StopIteration
        item = line.rstrip()
        if self.data_type:
            return self.data_type(item)
        return item
class ListResource(Resource):
    """
    Corpus resource whose items are the lines of a text file,
    optionally converted to some data type.  Control files and
    transcript files are examples of this.
    """

    def __init__(self, file_name, data_type=None):
        """
        Set up a resource backed by a listing file.

        Without a data_type argument every item of the list is
        returned as a string.

        @param file_name: File the resource is read from
        @type file_name: string
        @param data_type: Class implementing the data type of each item
        @type data_type: type
        """
        self.file_name, self.data_type = file_name, data_type

    def __iter__(self):
        """Return a fresh ListResourceIterator over this resource."""
        iterator = ListResourceIterator(self)
        return iterator
class CorpusIterator(object):
    """
    Iterator over elements in a Corpus.

    Each element is a dictionary mapping every resource name of the
    corpus to that resource's next item.  Iteration ends as soon as
    any one resource is exhausted.
    """

    def __init__(self, corpus, part=1, npart=1):
        """
        Start iterating over a corpus, or over one part of it.

        When npart is greater than 1 the corpus is divided into npart
        contiguous runs and only run number part (1-based) is
        iterated.  Every run has total // npart elements except the
        last one, which also takes the remainder.
        NOTE(review): this contiguous split is an assumption about the
        intended meaning of part/npart -- confirm before relying on it.

        Partitioning has to count the corpus first, so it iterates one
        resource (the 'ctl' resource when present) an extra time;
        resources must therefore be re-iterable when npart > 1.

        @param corpus: Object whose resources attribute maps names to
                       iterable resources
        @param part: 1-based index of the part to iterate over;
                     ignored unless npart > 1
        @type part: int
        @param npart: Number of parts the corpus is divided into
        @type npart: int
        @raise ValueError: If npart > 1 and part is not in 1..npart
        """
        self.corpus = corpus
        self.iters = {}
        resources = corpus.resources
        if npart > 1:
            from itertools import islice

            if not 1 <= part <= npart:
                raise ValueError("part must be between 1 and %r, got %r"
                                 % (npart, part))
            if not resources:
                return
            # Prefer the control file for counting: iterating it is
            # cheap, whereas other resources may load data per item.
            if 'ctl' in resources:
                counted = resources['ctl']
            else:
                counted = next(iter(resources.values()))
            total = sum(1 for _ in counted)
            run_len = total // npart
            skip = (part - 1) * run_len
            if part == npart:
                # The last part absorbs whatever the division left over.
                run_len = total - skip
            for k, v in resources.items():
                # islice still pulls the skipped items from v, it just
                # does not hand them out.
                self.iters[k] = islice(iter(v), skip, skip + run_len)
        else:
            for k, v in resources.items():
                self.iters[k] = iter(v)

    def __iter__(self):
        """Return self, so the iterator can itself be passed to
        iter(), list() or a for loop."""
        return self

    def __next__(self):
        """
        Return the next utterance as a dict of resource name -> item.

        @raise StopIteration: When any resource is exhausted, or when
                              there are no resources at all.
        """
        if not self.iters:
            # Without this guard an empty resource table would produce
            # an endless stream of empty dictionaries.
            raise StopIteration
        utt = {}
        for k, v in self.iters.items():
            utt[k] = next(v)
        return utt
class Corpus(object):
    """Corpus of speech data.

    Holds a table of named resources.  The control file is always
    present under the name 'ctl'; iterating the corpus returns a
    CorpusIterator over all of them.
    """

    def __init__(self, ctl_file):
        """
        Create a corpus from a control file.

        @param ctl_file: Name of the control file; each of its lines
                         becomes a CtlEntry
        @type ctl_file: string
        """
        control = ListResource(ctl_file, CtlEntry)
        self.ctl = control
        self.resources = {'ctl': control}

    def __iter__(self):
        """Return a new CorpusIterator over this corpus."""
        iterator = CorpusIterator(self)
        return iterator

    def add_resource(self, name, res):
        """
        Register a resource under a name, replacing any resource
        already stored under that name.

        @param name: Key the resource is stored under
        @param res: The resource to store
        """
        self.resources[name] = res
|