1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
|
from __future__ import division
import numpy as np
from . import processing
from scipy.fftpack import dct
from . import functions
try:
from functools import lru_cache
except ImportError:
from backports.functools_lru_cache import lru_cache
@lru_cache()
def filterbanks(num_filter, fftpoints, sampling_freq, low_freq=None, high_freq=None):
    """Build a bank of triangular Mel-spaced filters, one filter per row.

    Args:
        num_filter (int): number of filters (rows) in the bank.
        fftpoints (int): number of columns of the returned matrix. It is
            also used, as ``fftpoints + 1``, to scale a frequency in Hz to a
            column index.
        sampling_freq (float): sampling rate of the signal. Affects mel spacing.
        low_freq (float): lowest band edge of the mel filters in Hz. Any
            falsy value (``None`` or ``0``) is replaced by 300 Hz.
        high_freq (float): highest band edge of the mel filters in Hz. Any
            falsy value is replaced by ``sampling_freq / 2``.

    Returns:
        array: A numpy array of shape (num_filter, fftpoints); row ``i``
        holds the weights of the i-th triangular filter.

    Raises:
        AssertionError: if the upper edge exceeds ``sampling_freq / 2`` or
            the lower edge is negative.

    Note:
        Results are memoized with ``lru_cache``: all arguments must be
        hashable, and repeated calls with equal arguments return the very
        same array object, so callers should not modify it in place.
    """
    nyquist = sampling_freq / 2
    if not high_freq:
        high_freq = nyquist
    if not low_freq:
        # NOTE(review): an explicit lower edge of 0 Hz is also falsy and is
        # therefore turned into 300 Hz. Confirm whether that is intended
        # before changing it; every caller passing 0 would be affected.
        low_freq = 300
    assert high_freq <= nyquist, "High frequency cannot be greater than half of the sampling frequency!"
    assert low_freq >= 0, "low frequency cannot be less than zero!"

    # num_filter triangles share their corners, so num_filter + 2 corner
    # points, evenly spaced on the mel scale, are needed.
    mel_low = functions.frequency_to_mel(low_freq)
    mel_high = functions.frequency_to_mel(high_freq)
    mel_points = np.linspace(mel_low, mel_high, num_filter + 2)

    # Back to Hertz, then down to the integer column each corner falls into.
    hz_points = functions.mel_to_frequency(mel_points)
    bin_edges = np.floor((fftpoints + 1) * hz_points / sampling_freq).astype(int)

    bank = np.zeros([num_filter, fftpoints])
    for row in range(num_filter):
        # Three consecutive corners delimit one triangle.
        left, middle, right = (int(edge) for edge in bin_edges[row:row + 3])
        support = np.linspace(left, right, num=right - left + 1)
        bank[row, left:right + 1] = functions.triangle(support, left=left, middle=middle, right=right)
    return bank
def mfcc(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, num_cepstral=13,
         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None, dc_elimination=True):
    """Compute MFCC features from an audio signal.

    The Mel-filterbank energies returned by ``mfe`` are log-compressed and
    decorrelated with an orthonormal type-II DCT; the leading
    ``num_cepstral`` coefficients of every frame are kept.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array
        sampling_frequency (int): the sampling frequency of the signal.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s
        num_cepstral (int): number of cepstral coefficients. Default is 13.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters, in Hz.
            Default is 0; forwarded unchanged to ``mfe``.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is None; forwarded unchanged to ``mfe``.
        dc_elimination (bool): if True, the first cepstral coefficient of
            each frame is replaced by the log of that frame's energy.

    Returns:
        array: A numpy array of size (num_frames x num_cepstral) containing
        mfcc features. When ``mfe`` yields no frames, an empty array of
        shape (0, num_cepstral) is returned.
    """
    filterbank_energies, frame_energies = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency)

    # Nothing to transform when no frame could be extracted.
    if not len(filterbank_energies):
        return np.empty((0, num_cepstral))

    cepstra = dct(np.log(filterbank_energies), type=2, axis=-1, norm='ortho')
    cepstra = cepstra[:, :num_cepstral]

    # Replace the first cepstral coefficient with the log of the frame energy.
    if dc_elimination:
        cepstra[:, 0] = np.log(frame_energies)
    return cepstra
def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
        num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
    """Compute Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array
        sampling_frequency (int): the sampling frequency of the signal.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters, in Hz.
            Default is 0. Note that ``filterbanks`` replaces a falsy lower
            edge with 300 Hz, so the default effectively yields 300 Hz.
        high_frequency (float): highest band edge of mel filters, in Hz.
            A falsy value (the default, None) means ``sampling_frequency / 2``.

    Returns:
        tuple: ``(features, frame_energies)`` where ``features`` holds the
        filterbank energies (num_frames x num_filters) and
        ``frame_energies`` holds the total energy of each frame (one value
        per frame). Both are passed through ``functions.zero_handling``.
    """
    # Work on a float copy of the input samples.
    samples = signal.astype(float)

    # Cut the signal into frames; the window function is all ones.
    frames = processing.stack_frames(
        samples,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        filter=lambda x: np.ones((x,)),
        zero_padding=False)

    if not high_frequency:
        high_frequency = sampling_frequency / 2

    spectrum = processing.power_spectrum(frames, fft_length)
    num_bins = spectrum.shape[1]

    # Total energy per frame, with zeros handled so callers can take logs.
    energies = functions.zero_handling(np.sum(spectrum, 1))

    # NOTE(review): the number of spectrum columns (not fft_length) is what
    # filterbanks() receives as ``fftpoints`` and uses to map Hz to bins --
    # confirm this mapping against processing.power_spectrum's output layout.
    bank = filterbanks(num_filters, num_bins, sampling_frequency, low_frequency, high_frequency)

    # Project every frame's spectrum onto the filters.
    band_energies = functions.zero_handling(np.dot(spectrum, bank.T))
    return band_energies, energies
def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
    """Compute log Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array
        sampling_frequency (int): the sampling frequency of the signal.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s
        frame_stride (float): the step between successive frames in seconds.
            Default is 0.01s
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters, in Hz.
            Default is 0; forwarded unchanged to ``mfe``.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is None; forwarded unchanged to ``mfe``.

    Returns:
        array: the natural log of the filterbank energies,
        num_frames x num_filters. Only this array is returned; the per-frame
        energies computed by ``mfe`` are discarded.
    """
    # The second value returned by mfe (per-frame energies) is not needed here.
    energies, _ = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency)
    return np.log(energies)
def extract_derivative_feature(feature):
    """Stack a feature matrix with its first and second temporal derivatives.

    Args:
        feature (array): The feature matrix, of size N x M.

    Returns:
        array: The feature cube of size N x M x 3 whose last axis holds, in
        order, the static features, the first derivative and the second
        derivative (assuming ``processing.derivative_extraction`` preserves
        the input shape).
    """
    delta = processing.derivative_extraction(feature, DeltaWindows=2)
    # The second derivative is the derivative of the first one.
    delta_delta = processing.derivative_extraction(delta, DeltaWindows=2)

    # Give every matrix a trailing axis, then join them along it to form
    # the feature cube.
    layers = [layer[:, :, None] for layer in (feature, delta, delta_delta)]
    return np.concatenate(layers, axis=2)
|