// online2/online-nnet2-feature-pipeline.h
// Copyright 2013-2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_
#define KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_

#include <string>
#include <vector>
#include <deque>

#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "feat/online-feature.h"
#include "feat/pitch-functions.h"
#include "online2/online-ivector-feature.h"

namespace kaldi {

/// @addtogroup onlinefeat OnlineFeatureExtraction
/// @{
/// @file
/// This file contains a different version of the feature-extraction pipeline in
/// \ref online-feature-pipeline.h, specialized for use in neural network
/// decoding with iVectors. Our recipe is that we extract iVectors that will
/// be used as an additional input to the neural network, in addition to
/// a window of several frames of spliced raw features (MFCC, PLP or filterbanks).
/// The iVectors are extracted on top of a (splice+LDA+MLLT) feature pipeline,
/// with the added complication that the GMM posteriors used for the iVector
/// extraction are obtained with a version of the features that has online
/// cepstral mean (and optionally variance) normalization, whereas the stats
/// for the iVector are accumulated with a non-mean-normalized version of the
/// features.
/// The idea here is that we want the iVector to learn the mean offset, but
/// we want the posteriors to be somewhat invariant to mean offsets.
///
/// Most of the logic for the actual iVector estimation is in \ref
/// online-ivector-feature.h; this header mostly contains glue.
///
/// Although the name of this header mentions nnet2, the code is also used
/// for online decoding with nnet3.


/// This configuration class is used to set up OnlineNnet2FeaturePipelineInfo,
/// which in turn is the configuration class for OnlineNnet2FeaturePipeline.
/// Instead of taking the options for the parts of the feature pipeline
/// directly, it reads in the names of configuration files.
struct OnlineNnet2FeaturePipelineConfig {
  std::string feature_type;  // "plp" or "mfcc" or "fbank"
  std::string mfcc_config;
  std::string plp_config;
  std::string fbank_config;

  // Note: if we do add pitch, it will not be added to the features we give to
  // the iVector extractor but only to the features we give to the neural
  // network, after the base features but before the iVector. We don't think
  // the iVector will be particularly helpful in normalizing the pitch
  // features, and we wanted to avoid complications with things like online
  // CMVN.
  bool add_pitch;

  // The following contains the types of options that you could give to
  // compute-and-process-kaldi-pitch-feats.
  std::string online_pitch_config;

  // The configuration variables in ivector_extraction_config relate to the
  // iVector extractor and options related to it; see type
  // OnlineIvectorExtractionConfig.
  std::string ivector_extraction_config;

  // Config that relates to how we weight silence for (iVector) adaptation.
  // This is registered directly on the command line, as you might want to
  // experiment with it at test time.
  OnlineSilenceWeightingConfig silence_weighting_config;

  OnlineNnet2FeaturePipelineConfig():
      feature_type("mfcc"), add_pitch(false) { }

  void Register(OptionsItf *opts) {
    opts->Register("feature-type", &feature_type,
                   "Base feature type [mfcc, plp, fbank]");
    opts->Register("mfcc-config", &mfcc_config, "Configuration file for "
                   "MFCC features (e.g. conf/mfcc.conf)");
    opts->Register("plp-config", &plp_config, "Configuration file for "
                   "PLP features (e.g. conf/plp.conf)");
    opts->Register("fbank-config", &fbank_config, "Configuration file for "
                   "filterbank features (e.g. conf/fbank.conf)");
    opts->Register("add-pitch", &add_pitch, "Append pitch features to raw "
                   "MFCC/PLP/filterbank features [but not for iVector "
                   "extraction]");
    opts->Register("online-pitch-config", &online_pitch_config,
                   "Configuration file for online pitch features, if "
                   "--add-pitch=true (e.g. conf/online_pitch.conf)");
    opts->Register("ivector-extraction-config", &ivector_extraction_config,
                   "Configuration file for online iVector extraction; "
                   "see class OnlineIvectorExtractionConfig in the code");
    silence_weighting_config.RegisterWithPrefix("ivector-silence-weighting",
                                                opts);
  }
};
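
// A minimal usage sketch (illustrative, not part of the original header):
// hooking this config up to the command line in the style of the online2
// example binaries. The usage string and variable names are hypothetical.
//
//   #include "util/parse-options.h"
//   #include "online2/online-nnet2-feature-pipeline.h"
//
//   int main(int argc, char *argv[]) {
//     using namespace kaldi;
//     ParseOptions po("Hypothetical usage: my-online-decoder [options] ...");
//     OnlineNnet2FeaturePipelineConfig feature_opts;
//     feature_opts.Register(&po);  // adds --feature-type, --mfcc-config, etc.
//     po.Read(argc, argv);
//     // The "info" object below is constructed from the parsed config.
//     OnlineNnet2FeaturePipelineInfo feature_info(feature_opts);
//     return 0;
//   }
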
/// This class is responsible for storing configuration variables, objects and
/// options for OnlineNnet2FeaturePipeline (including the actual LDA and
/// CMVN-stats matrices, and the iVector extractor, which is a member of
/// ivector_extractor_info). This class does not register options on the
/// command line; instead, it is initialized from class
/// OnlineNnet2FeaturePipelineConfig, which reads the options from the command
/// line. The reason for structuring it this way is to make it easier to
/// configure from code as well as from the command line, and also to make
/// multithreaded operation easier.
struct OnlineNnet2FeaturePipelineInfo {
  OnlineNnet2FeaturePipelineInfo():
      feature_type("mfcc"), add_pitch(false) { }

  OnlineNnet2FeaturePipelineInfo(
      const OnlineNnet2FeaturePipelineConfig &config);

  BaseFloat FrameShiftInSeconds() const;

  std::string feature_type;  // "mfcc" or "plp" or "fbank"

  MfccOptions mfcc_opts;    // options for MFCC computation,
                            // if feature_type == "mfcc"
  PlpOptions plp_opts;      // options for PLP computation,
                            // if feature_type == "plp"
  FbankOptions fbank_opts;  // options for filterbank computation,
                            // if feature_type == "fbank"

  bool add_pitch;
  PitchExtractionOptions pitch_opts;  // options for pitch extraction, if done
  ProcessPitchOptions pitch_process_opts;  // options for pitch post-processing

  // If the user specified --ivector-extraction-config, we assume we're using
  // iVectors as an extra input to the neural net. Actually, we don't
  // anticipate running this setup without iVectors.
  bool use_ivectors;
  OnlineIvectorExtractionInfo ivector_extractor_info;

  // Config for weighting silence in iVector adaptation. We declare this
  // outside of ivector_extractor_info because it was easier to set up the
  // code that way, and also because it's the kind of thing you might want to
  // play with directly on the command line instead of inside sub-config files.
  OnlineSilenceWeightingConfig silence_weighting_config;

  int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
};
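
// A minimal sketch (illustrative) of configuring the pipeline from code
// rather than from the command line; the config file paths are hypothetical:
//
//   OnlineNnet2FeaturePipelineConfig config;
//   config.feature_type = "mfcc";
//   config.mfcc_config = "conf/mfcc.conf";                   // hypothetical
//   config.ivector_extraction_config = "conf/ivector.conf";  // hypothetical
//   OnlineNnet2FeaturePipelineInfo info(config);  // reads the config files
//   KALDI_ASSERT(info.use_ivectors);
//   int32 ivector_dim = info.IvectorDim();
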
/// OnlineNnet2FeaturePipeline is a class that's responsible for putting
/// together the various parts of the feature-processing pipeline for neural
/// networks, in an online setting. The recipe here does not include fMLLR;
/// instead, it assumes we're giving raw features such as MFCC or PLP or
/// filterbank (with no CMVN) to the neural network, and optionally augmenting
/// these with an iVector that describes the speaker characteristics. The
/// iVector is extracted using class OnlineIvectorFeature (see that class for
/// more info on how it's done).
/// No splicing is done in this code, as we currently only support the nnet2
/// neural network, in which the splicing is done inside the network.
/// Our strategy for nnet1 network conversion would probably be to convert to
/// nnet2 and just add layers to do the splicing.
class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
 public:
  /// Constructor from the "info" object. After calling this for a
  /// non-initial utterance of a speaker, you may want to call
  /// SetAdaptationState().
  explicit OnlineNnet2FeaturePipeline(
      const OnlineNnet2FeaturePipelineInfo &info);

  /// Member functions from OnlineFeatureInterface:

  /// Dim() will return the base-feature dimension (e.g. 13 for normal MFCC);
  /// plus the pitch-feature dimension (e.g. 3), if used; plus the iVector
  /// dimension, if used. Any frame-splicing happens inside the neural-network
  /// code.
  virtual int32 Dim() const;

  virtual bool IsLastFrame(int32 frame) const;
  virtual int32 NumFramesReady() const;
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  /// If you are downweighting silence, you can call
  /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this
  /// class using UpdateFrameWeights(). The reason why this call happens
  /// outside this class, rather than this class pulling in the weights
  /// itself, relates to multi-threaded operation and also to not wanting this
  /// class to have excessive dependencies.
  ///
  /// You must either always call this as soon as new data becomes available,
  /// ideally just after calling AcceptWaveform(), or never call it for the
  /// lifetime of this object.
  void UpdateFrameWeights(
      const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
      int32 frame_offset = 0);
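
  // A sketch of the intended calling pattern during decoding, following the
  // online2 example binaries (the decoder, trans_model, feature_info, and
  // feature_pipeline objects are assumed to exist):
  //
  //   OnlineSilenceWeighting silence_weighting(
  //       trans_model, feature_info.silence_weighting_config);
  //   std::vector<std::pair<int32, BaseFloat> > delta_weights;
  //   ...
  //   if (silence_weighting.Active() &&
  //       feature_pipeline.IvectorFeature() != NULL) {
  //     silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
  //     silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
  //                                       &delta_weights);
  //     feature_pipeline.UpdateFrameWeights(delta_weights);
  //   }
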
  /// Set the adaptation state to a particular value, e.g. reflecting previous
  /// utterances of the same speaker; this will generally be called after
  /// Copy().
  void SetAdaptationState(
      const OnlineIvectorExtractorAdaptationState &adaptation_state);

  /// Get the adaptation state; you may want to call this before destroying
  /// this object, to get adaptation state that can be used to improve decoding
  /// of later utterances of this speaker. You might not want to do this,
  /// though, if you have reason to believe that something went wrong in the
  /// recognition (e.g., low confidence).
  void GetAdaptationState(
      OnlineIvectorExtractorAdaptationState *adaptation_state) const;

  /// Accept more data to process. It won't actually process the data until
  /// you call GetFrame() [probably indirectly via (decoder).AdvanceDecoding()];
  /// when you call this function it will just copy the data. sampling_rate is
  /// needed only to assert that it matches what's in the config.
  void AcceptWaveform(BaseFloat sampling_rate,
                      const VectorBase<BaseFloat> &waveform);

  BaseFloat FrameShiftInSeconds() const { return info_.FrameShiftInSeconds(); }

  /// If you call InputFinished(), it tells the class you won't be providing
  /// any more waveform. This will help flush out the last few frames of delta
  /// or LDA features, and finalize the pitch features (making them more
  /// accurate)... although since in neural-net decoding we don't anticipate
  /// rescoring the lattices, this may not be much of an issue.
  void InputFinished();

  // This function returns the iVector-extracting part of the feature pipeline
  // (or NULL if iVectors are not being used); the pointer ownership is
  // retained by this object and not transferred to the caller. This function
  // is used in nnet3, and also in the silence-weighting code used to exclude
  // silence from the iVector estimation.
  OnlineIvectorFeature *IvectorFeature() {
    return ivector_feature_;
  }

  // A const accessor for the iVector extractor. Returns NULL if iVectors are
  // not being used.
  const OnlineIvectorFeature *IvectorFeature() const {
    return ivector_feature_;
  }

  // This function returns the part of the feature pipeline that would be given
  // as the primary (non-iVector) input to the neural network in nnet3
  // applications.
  OnlineFeatureInterface *InputFeature() {
    return feature_plus_optional_pitch_;
  }
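
  // For nnet3 applications, these two accessors are typically handed to the
  // decodable object; a sketch following the online2 example binaries
  // (trans_model and decodable_info are assumed to exist):
  //
  //   nnet3::DecodableAmNnetLoopedOnline decodable(
  //       trans_model, decodable_info,
  //       feature_pipeline.InputFeature(),
  //       feature_pipeline.IvectorFeature());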

  virtual ~OnlineNnet2FeaturePipeline();

 private:
  const OnlineNnet2FeaturePipelineInfo &info_;

  OnlineBaseFeature *base_feature_;    // MFCC/PLP/filterbank
  OnlinePitchFeature *pitch_;          // raw pitch, if used
  OnlineProcessPitch *pitch_feature_;  // processed pitch, if pitch is used

  // feature_plus_optional_pitch_ is base_feature_ appended
  // (via OnlineAppendFeature) with pitch_feature_, if pitch is used;
  // otherwise, it points to the same address as base_feature_.
  OnlineFeatureInterface *feature_plus_optional_pitch_;

  OnlineIvectorFeature *ivector_feature_;  // iVector feature, if used

  // final_feature_ is feature_plus_optional_pitch_ appended
  // (via OnlineAppendFeature) with ivector_feature_, if ivector_feature_ is
  // used; otherwise, it points to the same address as
  // feature_plus_optional_pitch_.
  OnlineFeatureInterface *final_feature_;

  // We cache the feature dimension, to save time when calling Dim().
  int32 dim_;
};
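
// A minimal end-to-end sketch (illustrative) of driving the pipeline for one
// utterance; the chunk size, sample rate, "wave" vector, and the surrounding
// decoder are hypothetical:
//
//   OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
//   OnlineIvectorExtractorAdaptationState adaptation_state(
//       feature_info.ivector_extractor_info);
//   feature_pipeline.SetAdaptationState(adaptation_state);
//
//   BaseFloat samp_rate = 16000.0;  // must match the feature config
//   int32 chunk = 1600;             // e.g. 0.1-second chunks at 16 kHz
//   for (int32 offset = 0; offset < wave.Dim(); offset += chunk) {
//     int32 num_samp = std::min(chunk, wave.Dim() - offset);
//     SubVector<BaseFloat> piece(wave, offset, num_samp);
//     feature_pipeline.AcceptWaveform(samp_rate, piece);
//     // ... decoder.AdvanceDecoding() pulls frames via GetFrame() ...
//   }
//   feature_pipeline.InputFinished();  // flush the remaining frames
//
//   // Save the adaptation state for later utterances of this speaker:
//   feature_pipeline.GetAdaptationState(&adaptation_state);
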
/// @} End of "addtogroup onlinefeat"
} // namespace kaldi
#endif // KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_