File: sig2fv_main.cc

package info (click to toggle)
speech-tools 1%3A2.5.0-11
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 9,988 kB
  • sloc: cpp: 67,350; ansic: 12,174; sh: 4,055; java: 3,748; makefile: 1,111; lisp: 711; perl: 396; awk: 85; xml: 9
file content (413 lines) | stat: -rw-r--r-- 14,190 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1995,1996                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                      Authors: Paul Taylor and Simon King              */
/*                       Date  : April 1995                              */
/*-----------------------------------------------------------------------*/
/*                     Generate feature vectors                          */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include "EST_speech_class.h"
#include "EST_string_aux.h"
#include "EST_cmd_line.h"
#include "EST_cmd_line_options.h"
#include "sigpr/EST_sigpr_utt.h"
#include "sigpr/EST_filter.h"

// NOTE(review): EPSILON is not referenced by any function visible in this file.
#define EPSILON (0.0001)

// Built-in analysis defaults. set_options() stores these in the feature set
// first and then replaces them with any values given on the command line.
// DEFAULT_FRAME_SIZE is stored under the key "frame_shift" (the -shift option,
// documented in the help text as a frame spacing in seconds).
#define DEFAULT_FRAME_SIZE 0.01
#define DEFAULT_FRAME_FACTOR 2.0
#define DEFAULT_LPC_ORDER 16
#define DEFAULT_REF_ORDER 16
#define DEFAULT_CEP_ORDER 12
#define DEFAULT_FBANK_ORDER 20
#define DEFAULT_MELCEP_ORDER 12
// Kept as a string-literal macro so it can be used wherever a literal is expected.
#define DEFAULT_WINDOW "hamming"
#define DEFAULT_PREEMPH 0
#define DEFAULT_LIFTER 0


// sane values for pitchmarks (in seconds)
// set_options() stores these under "min_period", "max_period" and "def_period".

#define MINIMUM_PITCH_PERIOD (0.0033) // roughly 300 Hz
#define MAXIMUM_PITCH_PERIOD (0.02)   // 50 Hz
#define DEFAULT_PITCH_PERIOD (0.01)   // 100 Hz

// Defined later in this file: copies per-coefficient orders from olist into op.
void calculate_orders(EST_StrList &clist, EST_IList &olist,
		      EST_Option &op);

// Called from main() once each for the base (0), delta (1) and acceleration (2)
// coefficient lists; no definition is visible in this file.
void add_channels_to_map(EST_StrList &map, EST_StrList &types, 
			 EST_Features &op, int order);

// Defined later in this file: fills op with defaults plus command-line overrides.
void set_options(EST_Features &op, EST_Option &al);

// Returns the help-text listing of the coefficient types that may be named
// with the -coefs, -delta and -acc options (one indented line per type).
EST_String sigpr_options_supported(void)
{
    static const char *const supported_types =
	"    lpc      linear predictive coding\n"
	"    cep      cepstrum coding from lpc coefficients\n"
	"    melcep   Mel scale cepstrum coding via fbank\n"
	"    fbank    Mel scale log filterbank analysis\n"
	"    lsf      line spectral frequencies\n"
	"    ref      Linear prediction reflection coefficients\n"
	"    power\n"
	"    f0\n"
	"    energy: root mean square energy\n";

    return EST_String(supported_types);
}



/** @name <command>sig2fv</command> <emphasis>Generate signal processing coefficients from waveforms</emphasis>
  * @id sigfv-manual
  * @toc
 */

//@{

/**@name Synopsis
  */
//@{

//@synopsis

/**
sig2fv performs signal processing analysis on speech waveforms and produces
feature vectors.
The following types of analysis are provided:

<itemizedlist>
<listitem><para>Linear prediction (LPC)</para></listitem>
<listitem><para>Cepstrum coding from lpc coefficients</para></listitem>
<listitem><para>Mel scale cepstrum coding via fbank</para></listitem>
<listitem><para>Mel scale log filterbank analysis</para></listitem>
<listitem><para>Line spectral frequencies</para></listitem>
<listitem><para>Linear prediction reflection coefficients</para></listitem>
<listitem><para>Root mean square energy</para></listitem>
<listitem><para>Power</para></listitem>
<listitem><para>fundamental frequency (pitch)</para></listitem>
<listitem><para>calculation of delta and acceleration coefficients of all of the 
above</para></listitem>
</itemizedlist>

The -coefs option is used to specify a list of the names of what sort
of basic processing is required, and -delta and -acc are used for
delta and acceleration coefficients respectively.

*/

//@}

/**@name Options
  */
//@{

//@options

//@}


// sig2fv entry point.
//
// Parses the command line, reads the input waveform, builds the time axis of
// the output track (from the pitchmark file when -pm is given, otherwise at
// the fixed "frame_shift" spacing), allocates the channels requested by
// -coefs / -delta / -acc, runs the base, delta and acceleration analyses and
// saves the resulting track to the output file ("-" by default).
//
// Returns 0 on success. Exits with a non-zero status when the waveform or the
// pitchmark track cannot be read, or when the output cannot be written.
int main(int argc, char *argv[])
{
    EST_String out_file("-");
    EST_StrList files;
    EST_Option al;
    EST_Features op;
    EST_Wave sig;
    EST_Track full;
    EST_StrList coef_list, delta_list, acc_list, tlist, map;
    EST_IList olist;

    // DEFAULT_WINDOW is a string-literal macro, so in the -window_type help
    // below it is pasted between adjacent literals; writing its name inside
    // the quotes would print the text "DEFAULT_WINDOW" instead of its value.
    parse_command_line
	(argc, argv, 
	 EST_String("[input file] -o [output file]\n")+
	 "Summary: generate acoustic feature vectors for a waveform file \n"
	 "use \"-\" to make input and output files stdin/out \n"
	 "-h   Options help \n\n" +
	 options_wave_input() + 
	 options_track_output() + " \n"
	 "-shift <float> frame spacing in seconds for fixed frame analysis. This \n"
	 "    doesn't have to be the same as the output file spacing - the \n"
	 "    S option can be used to resample the track before saving \n"
	 "    default: "+ftoString(DEFAULT_FRAME_SIZE) +"\n\n"
	 "-factor <float> Frames lengths will be FACTOR times the \n"
	 "    local pitch period. \n"
	 "    default: "+ftoString(DEFAULT_FRAME_FACTOR) +"\n\n"
	 "-pm <ifile>  Pitch mark file name. This is used to \n"
	 "    specify the positions of the analysis frames for pitch \n"
	 "    synchronous analysis. Pitchmark files are just standard \n"
	 "    track files, but the channel information is ignored and \n"
	 "    only the time positions are used\n"
	 "-size <float> If specified with pm, size is used as the \n"
	 "    fixed window size (times factor) rather than size within \n"
	 "    each the pms.\n\n"

	 "-coefs <string> list of basic types of processing required. \n"
	 "    Permissible types are: \n" + sigpr_options_supported()+" \n"
	 "-delta <string> list of delta types of processing required. Basic \n"
	 "    processing does not need to be specified for this option to work. \n"
	 "    Permissible types are: \n" + sigpr_options_supported()+" \n"
	 "-acc <string>  list of acceleration (delta delta) processing \n"
	 "    required. Basic processing does not need to be specified for \n"
	 "    this option to work. \n" 
	 "    Permissible types are: \n" 
	 + sigpr_options_supported()+"\n"
	 "-window_type <string> Type of window used on waveform. \n"
	 "    Permissible types are: \n" +
	 EST_Window::options_supported() + 
	 "    default: \"" DEFAULT_WINDOW "\"\n\n"
	 "-lpc_order <int>      Order of lpc analysis. \n\n"
	 "-ref_order <int>      Order of lpc reflection coefficient analysis. \n\n"
	 "-cep_order <int>      Order of lpc cepstral analysis.\n\n"
	 "-melcep_order <int>   Order of Mel cepstral analysis.\n\n"
	 "-fbank_order <int>    Order of filter bank analysis.\n\n"
	 "-preemph <float>      Perform pre-emphasis with this factor.\n\n"
	 "-lifter <float>       lifter coefficient.\n\n"
	 "-usepower             use power rather than energy in filter bank \n"
	 "    analysis\n\n"+
	 "-include_c0           include cepstral coefficient 0\n\n"
	 "-order <string>       order of analyses\n", files, al);

    out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";
    set_options(op, al);

    StringtoStrList(al.val("-coefs"), coef_list);
    StringtoStrList(al.val("-delta"), delta_list);
    StringtoStrList(al.val("-acc"), acc_list);

    // NOTE(review): the -order list is parsed into olist, but nothing in this
    // function consumes olist afterwards (calculate_orders() is never called
    // here) -- confirm whether -order is meant to have an effect.
    StringtoStrList(al.val("-order"), tlist);
    StrListtoIList(tlist, olist);
    
    if (read_wave(sig, files.first(), al) != read_ok)
	exit(-1);

    // allocate and fill time axis
    if (al.present("-pm"))
    {
	// pitch synchronous: frame positions come from the pitchmark track
	if (read_track(full, al.val("-pm"), al))
	    exit(1);
    }
    else
    {
	// fixed frame: one frame every "frame_shift" up to the end of the wave
	full.resize((int)ceil(sig.end() / op.F("frame_shift")), 0);
	full.fill_time(op.F("frame_shift"));
    }

    // allocate channels: base (0), delta (1) and acceleration (2) coefficients
    add_channels_to_map(map, coef_list, op, 0);
    add_channels_to_map(map, delta_list, op, 1);
    add_channels_to_map(map, acc_list, op, 2);

    //cerr << "MAP " << map << endl;

    // keep the number of frames, set the channels to those named in map
    full.resize(EST_CURRENT, map);

    if (al.present("-preemph"))
	pre_emphasis(sig, al.fval("-preemph"));

    if (al.present("-usepower"))
	cerr << "sig2fv: -usepower currently not supported" << endl;

    sigpr_base(sig, full, op, coef_list);
    sigpr_delta(sig, full, op, delta_list);
    sigpr_acc(sig, full, op, acc_list);

    if (al.present("-S"))
    {
	// Diagnostic goes to stderr: when the output file is "-" the track
	// itself is written to stdout and must not be mixed with this line.
	cerr << "-S " << al.fval("-S") << endl;
	full.sample(al.fval("-S"));
    }

    if (full.save(out_file, al.val("-otype", 0)) != write_ok)
    {
	cerr << "sig2fv: failed to write output to \"" << out_file 
	    << "\"" << endl;
	exit(-1);
    }
    return 0;
}



// Walks clist and olist in step: for each coefficient name in clist the
// integer at the same position in olist is passed to op.override_ival()
// under the key "<name>_order". The walk stops as soon as either list ends.
void calculate_orders(EST_StrList &clist, EST_IList &olist,
		      EST_Option &op)
{
    EST_Litem *name_item = clist.head();
    EST_Litem *order_item = olist.head();

    while (name_item && order_item)
    {
	EST_String key = clist(name_item) + "_order";
	int order = olist(order_item);
	op.override_ival(key, order);

	name_item = name_item->next();
	order_item = order_item->next();
    }
}

// Fills op with the settings used by the analysis routines: the built-in
// defaults are stored first, then every option present on the command line
// (al) replaces the corresponding default. For the table-driven groups the
// command-line flag is the feature key with a leading "-".
void set_options(EST_Features &op, EST_Option &al)
{
    static const char *const period_keys[] =
	{ "max_period", "min_period", "def_period" };
    static const char *const float_keys[] =
	{ "preemph", "lifter" };
    static const char *const order_keys[] =
	{ "lpc_order", "ref_order", "cep_order", "fbank_order", "melcep_order" };
    static const char *const switch_keys[] =
	{ "usepower", "include_c0" };
    unsigned int i;

    // --- built-in defaults ---
    op.set("frame_shift", DEFAULT_FRAME_SIZE);
    op.set("frame_factor", DEFAULT_FRAME_FACTOR);
    op.set("window_type", DEFAULT_WINDOW);

    op.set("preemph", DEFAULT_PREEMPH);
    op.set("lifter", DEFAULT_LIFTER);

    op.set("lpc_order", DEFAULT_LPC_ORDER);
    op.set("ref_order", DEFAULT_REF_ORDER);
    op.set("cep_order", DEFAULT_CEP_ORDER);
    op.set("fbank_order", DEFAULT_FBANK_ORDER);
    op.set("melcep_order", DEFAULT_MELCEP_ORDER);

    op.set("max_period", MAXIMUM_PITCH_PERIOD);
    op.set("min_period", MINIMUM_PITCH_PERIOD);
    op.set("def_period", DEFAULT_PITCH_PERIOD);

    // --- pitch period limits ---
    for (i = 0; i < sizeof(period_keys) / sizeof(period_keys[0]); ++i)
    {
	const EST_String flag = EST_String("-") + period_keys[i];
	if (al.present(flag))
	    op.set(period_keys[i], al.fval(flag, 0));
    }

    // --- windowing and framing ---
    if (al.present("-window_type"))
    {
	op.set("window_type", al.sval("-window_type", 1));
    }

    if (al.present("-shift"))
    {
	op.set("frame_shift", al.fval("-shift", 1));
    }
    if (al.present("-factor"))
    {
	op.set("frame_factor", al.fval("-factor", 1));
    }
    if (al.present("-size"))
    {
	// the current factor is negated and scaled by the -size value
	const double negated_factor = op.F("frame_factor") * -1.0;
	op.set("frame_factor", negated_factor * al.fval("-size"));
    }
    if (al.present("-length"))
    {
	// factor becomes the -length value divided by the frame shift
	const float length = al.fval("-length", est_errors_allowed);
	const float shift = op.F("frame_shift", est_errors_allowed);
	op.set("frame_factor", length / shift);
    }

    // --- pre-emphasis and liftering ---
    for (i = 0; i < sizeof(float_keys) / sizeof(float_keys[0]); ++i)
    {
	const EST_String flag = EST_String("-") + float_keys[i];
	if (al.present(flag))
	    op.set(float_keys[i], al.fval(flag, 1));
    }

    // --- analysis orders ---
    for (i = 0; i < sizeof(order_keys) / sizeof(order_keys[0]); ++i)
    {
	const EST_String flag = EST_String("-") + order_keys[i];
	if (al.present(flag))
	    op.set(order_keys[i], al.ival(flag, 1));
    }

    // --- switches, stored with whatever value the option list holds ---
    for (i = 0; i < sizeof(switch_keys) / sizeof(switch_keys[0]); ++i)
    {
	const EST_String flag = EST_String("-") + switch_keys[i];
	if (al.present(flag))
	    op.set(switch_keys[i], al.val(flag, 1));
    }
}

/**@name Examples


Fixed frame basic linear prediction:

To produce a set of linear prediction coefficients every 10ms, using
pre-emphasis and saving in EST format:

<para>
<screen>
$ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "lpc" -otype est -shift 0.01 -preemph 0.5
</screen>
</para>
<formalpara><title>
Pitch Synchronous linear prediction</title><para>The following uses the set of pitchmarks
in kdt_010.pm as the centres of the analysis windows.
</para>
</formalpara>

<para>
<screen>
$ sig2fv kdt_010.wav -pm kdt_010.pm -o kdt_010.lpc -coefs "lpc" -otype est -shift 0.01 -preemph 0.5
</screen>
</para>

<para>
F0, Linear prediction and cepstral coefficients:

<screen>
$ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "f0 lpc cep" -otype est -shift 0.01
</screen>

Note that pitchtracking can also be done with the
<command>pda</command> program. Both use the same underlying
technique, but the pda program offers much finer control over the
pitch track specific processing parameters.

</para>

<para>F0, Linear Prediction and Cepstral coefficients, with a 10ms frame shift
during analysis but a 5ms frame shift in the output file:</para>

<para>
<screen>
$ sig2fv kdt_010.wav -o kdt_010.lpc -coefs "f0 lpc cep" -otype est -S 0.005
      -shift 0.01
</screen>
</para>

<para>Delta and acc coefficients can be calculated even if their base form is not
required. This produces normal energy coefficients and cepstral delta coefficients:</para>

<para>
<screen>
$ sig2fv ../kdt_010.wav -o kdt_010.lpc -coefs "energy" -delta "cep" -otype est
</screen>
</para>

<para>Mel-scaled cepstra, Delta and acc coefficients, as is common in speech
recognition:</para>
<para>
<screen>
$ sig2fv ../kdt_010.wav -o kdt_010.lpc -coefs "melcep" -delta "melcep" -acc "melcep" -otype est -preemph 0.96
</screen>
</para>

*/
//@{
//@}



//@}