feature-functions.h
8.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// feat/feature-functions.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_
#define KALDI_FEAT_FEATURE_FUNCTIONS_H_
#include <string>
#include <vector>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
// functions in matrix/matrix-functions.h), and converts it into
// a power spectrum. If the complex FFT is a vector of size n (representing
// half the complex FFT of a real signal of size n, as described there),
// this function computes in the first (n/2) + 1 elements of it, the
// energies of the fft bins from zero to the Nyquist frequency. Contents of the
// remaining (n/2) - 1 elements are undefined at output.
void ComputePowerSpectrum(VectorBase<BaseFloat> *complex_fft);
struct DeltaFeaturesOptions {
int32 order;
int32 window; // e.g. 2; controls window size (window size is 2*window + 1)
// the behavior at the edges is to replicate the first or last frame.
// this is not configurable.
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
order(order), window(window) { }
void Register(OptionsItf *opts) {
opts->Register("delta-order", &order, "Order of delta computation");
opts->Register("delta-window", &window,
"Parameter controlling window for delta computation (actual window"
" size for each delta order is 1 + 2*delta-window-size)");
}
};
class DeltaFeatures {
public:
// This class provides a low-level function to compute delta features.
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size (original-feature-dimension) * (opts.order+1).
// This is not the most efficient way to do the computation, but it's
// state-free and thus easier to understand
explicit DeltaFeatures(const DeltaFeaturesOptions &opts);
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const;
private:
DeltaFeaturesOptions opts_;
std::vector<Vector<BaseFloat> > scales_; // a scaling window for each
// of the orders, including zero: multiply the features for each
// dimension by this window.
};
struct ShiftedDeltaFeaturesOptions {
int32 window, // The time delay and advance
num_blocks,
block_shift; // Distance between consecutive blocks
ShiftedDeltaFeaturesOptions():
window(1), num_blocks(7), block_shift(3) { }
void Register(OptionsItf *opts) {
opts->Register("delta-window", &window, "Size of delta advance and delay.");
opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance"
" of each frame to be concatenated");
opts->Register("block-shift", &block_shift, "Distance between each block");
}
};
class ShiftedDeltaFeatures {
public:
// This class provides a low-level function to compute shifted
// delta cesptra (SDC).
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size original-feature-dimension + (1 * num_blocks).
explicit ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts);
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const;
private:
ShiftedDeltaFeaturesOptions opts_;
Vector<BaseFloat> scales_; // a scaling window for each
};
// ComputeDeltas is a convenience function that computes deltas on a feature
// file. If you want to deal with features coming in bit by bit you would have
// to use the DeltaFeatures class directly, and do the computation frame by
// frame. Later we will have to come up with a nice mechanism to do this for
// features coming in.
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// ComputeShiftedDeltas computes deltas from a feature file by applying
// ShiftedDeltaFeatures over the frames. This function is provided for
// convenience, however, ShiftedDeltaFeatures can be used directly.
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// SpliceFrames will normally be used together with LDA.
// It splices frames together to make a window. At the
// start and end of an utterance, it duplicates the first
// and last frames.
// Will throw if input features are empty.
// left_context and right_context must be nonnegative.
// these both represent a number of frames (e.g. 4, 4 is
// a good choice).
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features);
// ReverseFrames reverses the frames in time (used for backwards decoding)
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out);
// This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which
// is online CMN with no latency, for online speech recognition.
struct SlidingWindowCmnOptions {
int32 cmn_window;
int32 min_window;
int32 max_warnings;
bool normalize_variance;
bool center;
SlidingWindowCmnOptions():
cmn_window(600),
min_window(100),
max_warnings(5),
normalize_variance(false),
center(false) { }
void Register(OptionsItf *opts) {
opts->Register("cmn-window", &cmn_window, "Window in frames for running "
"average CMN computation");
opts->Register("min-cmn-window", &min_window, "Minimum CMN window "
"used at start of decoding (adds latency only at start). "
"Only applicable if center == false, ignored if center==true");
opts->Register("max-warnings", &max_warnings, "Maximum warnings to report "
"per utterance. 0 to disable, -1 to show all.");
opts->Register("norm-vars", &normalize_variance, "If true, normalize "
"variance to one."); // naming this as in apply-cmvn.cc
opts->Register("center", ¢er, "If true, use a window centered on the "
"current frame (to the extent possible, modulo end effects). "
"If false, window is to the left.");
}
void Check() const;
};
/// Applies sliding-window cepstral mean and/or variance normalization. See the
/// strings registering the options in the options class for information on how
/// this works and what the options are. input and output must have the same
/// dimension.
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_