// online2/online-nnet2-feature-pipeline.h

// Copyright 2013-2014   Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_
#define KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_

#include <string>
#include <vector>
#include <deque>

#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "feat/online-feature.h"
#include "feat/pitch-functions.h"
#include "online2/online-ivector-feature.h"

namespace kaldi {
/// @addtogroup  onlinefeat OnlineFeatureExtraction
/// @{

/// @file
/// This file contains a different version of the feature-extraction pipeline in
/// \ref online-feature-pipeline.h, specialized for use in neural network
/// decoding with iVectors.  Our recipe is that we extract iVectors that will
/// be used as an additional input to the neural network, in addition to
/// a window of several frames of spliced raw features (MFCC, PLP or filterbanks).
/// The iVectors are extracted on top of a (splice+LDA+MLLT) feature pipeline,
/// with the added complication that the GMM posteriors used for the iVector
/// extraction are obtained with a version of the features that has online
/// cepstral mean (and optionally variance) normalization, whereas the stats for
/// iVector are accumulated with a non-mean-normalized version of the features.
/// The idea here is that we want the iVector to learn the mean offset, but
/// we want the posteriors to be somewhat invariant to mean offsets.
///
/// Most of the logic for the actual iVector estimation is in \ref
/// online-ivector-feature.h; this header contains mostly glue.
///
/// Although the name of this header mentions nnet2, the code is also used
/// in online decoding with nnet3.
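///
/// A minimal usage sketch of the classes declared below (the config-file
/// paths, the 16 kHz sampling rate and the waveform source are illustrative
/// assumptions, not taken from this file):
///
/// \code
///   OnlineNnet2FeaturePipelineConfig config;
///   config.feature_type = "mfcc";
///   config.mfcc_config = "conf/mfcc.conf";                    // assumed path
///   config.ivector_extraction_config = "conf/ivector.conf";   // assumed path
///
///   OnlineNnet2FeaturePipelineInfo info(config);   // built once, read-only
///   OnlineNnet2FeaturePipeline pipeline(info);     // one per utterance
///
///   Vector<BaseFloat> waveform;                    // filled by your audio source
///   pipeline.AcceptWaveform(16000.0, waveform);
///   pipeline.InputFinished();                      // no more audio coming
///
///   Vector<BaseFloat> frame(pipeline.Dim());
///   for (int32 t = 0; t < pipeline.NumFramesReady(); t++)
///     pipeline.GetFrame(t, &frame);                // consume the features
/// \endcode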


/// This configuration class is used to set up OnlineNnet2FeaturePipelineInfo,
/// which in turn is the configuration class for OnlineNnet2FeaturePipeline.
/// Instead of taking the options for the parts of the feature pipeline
/// directly, it reads in the names of configuration files.
struct OnlineNnet2FeaturePipelineConfig {
  std::string feature_type;  // "plp" or "mfcc" or "fbank"
  std::string mfcc_config;
  std::string plp_config;
  std::string fbank_config;

  // Note: if we do add pitch, it will not be added to the features we give to
  // the iVector extractor but only to the features we give to the neural
  // network, after the base features but before the iVector.  We don't think
  // the iVector will be particularly helpful in normalizing the pitch features,
  // and we wanted to avoid complications with things like online CMVN.
  bool add_pitch;

  // The following config file contains the kind of options that you could
  // give to compute-and-process-kaldi-pitch-feats.
  std::string online_pitch_config;

  // The configuration variables in ivector_extraction_config relate to the
  // iVector extractor and options related to it, see type
  // OnlineIvectorExtractionConfig.
  std::string ivector_extraction_config;

  // Config that relates to how we weight silence for (iVector) adaptation;
  // this is registered directly on the command line, as you might want to
  // play with it at test time.
  OnlineSilenceWeightingConfig silence_weighting_config;

  OnlineNnet2FeaturePipelineConfig():
      feature_type("mfcc"), add_pitch(false) { }


  void Register(OptionsItf *opts) {
    opts->Register("feature-type", &feature_type,
                   "Base feature type [mfcc, plp, fbank]");
    opts->Register("mfcc-config", &mfcc_config, "Configuration file for "
                   "MFCC features (e.g. conf/mfcc.conf)");
    opts->Register("plp-config", &plp_config, "Configuration file for "
                   "PLP features (e.g. conf/plp.conf)");
    opts->Register("fbank-config", &fbank_config, "Configuration file for "
                   "filterbank features (e.g. conf/fbank.conf)");
    opts->Register("add-pitch", &add_pitch, "Append pitch features to raw "
                   "MFCC/PLP/filterbank features [but not for iVector extraction]");
    opts->Register("online-pitch-config", &online_pitch_config, "Configuration "
                   "file for online pitch features, if --add-pitch=true (e.g. "
                   "conf/online_pitch.conf)");
    opts->Register("ivector-extraction-config", &ivector_extraction_config,
                   "Configuration file for online iVector extraction, "
                   "see class OnlineIvectorExtractionConfig in the code");
    silence_weighting_config.RegisterWithPrefix("ivector-silence-weighting", opts);
  }
};
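
// A sketch of how this config is typically hooked up to the command line,
// using Kaldi's ParseOptions (from util/parse-options.h); the usage string
// is illustrative:
//
//   ParseOptions po("Usage: some-online-decoder [options] <args>");
//   OnlineNnet2FeaturePipelineConfig feature_config;
//   feature_config.Register(&po);
//   po.Read(argc, argv);
//   OnlineNnet2FeaturePipelineInfo feature_info(feature_config);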


/// This class is responsible for storing configuration variables, objects and
/// options for OnlineNnet2FeaturePipeline (including the actual LDA and
/// CMVN-stats matrices, and the iVector extractor, which is a member of
/// ivector_extractor_info).  This class does not register options on the
/// command line; instead, it is initialized from class
/// OnlineNnet2FeaturePipelineConfig, which reads the options from the command
/// line.  The reason for structuring it this way is to make it easier to
/// configure from code as well as from the command line, and to make
/// multithreaded operation easier.
struct OnlineNnet2FeaturePipelineInfo {
  OnlineNnet2FeaturePipelineInfo():
      feature_type("mfcc"), add_pitch(false) { }

  OnlineNnet2FeaturePipelineInfo(
      const OnlineNnet2FeaturePipelineConfig &config);

  BaseFloat FrameShiftInSeconds() const;

  std::string feature_type;  // "mfcc" or "plp" or "fbank"

  MfccOptions mfcc_opts;  // options for MFCC computation,
                          // if feature_type == "mfcc"
  PlpOptions plp_opts;  // Options for PLP computation, if feature_type == "plp"
  FbankOptions fbank_opts;  // Options for filterbank computation, if
                            // feature_type == "fbank"

  bool add_pitch;
  PitchExtractionOptions pitch_opts;  // Options for pitch extraction, if done.
  ProcessPitchOptions pitch_process_opts;  // Options for pitch post-processing


  // If the user specified --ivector-extraction-config, we assume we're using
  // iVectors as an extra input to the neural net.  Actually, we don't
  // anticipate running this setup without iVectors.
  bool use_ivectors;
  OnlineIvectorExtractionInfo ivector_extractor_info;

  // Config for weighting silence in iVector adaptation.
  // We declare this outside of ivector_extractor_info; it was just easier to
  // set up the code that way, and we also think it's the kind of thing you
  // might want to play with directly on the command line instead of inside
  // sub-config-files.
  OnlineSilenceWeightingConfig silence_weighting_config;

  int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
};
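
// Since OnlineNnet2FeaturePipelineInfo is non-copyable and is not modified
// after construction, the intended pattern (e.g. in a multi-threaded server)
// is to construct one Info object and share it among many
// OnlineNnet2FeaturePipeline instances.  A sketch (the loop is pseudocode):
//
//   OnlineNnet2FeaturePipelineInfo feature_info(feature_config);  // once
//   // ... for each utterance (possibly on different threads):
//   OnlineNnet2FeaturePipeline pipeline(feature_info);  // cheap, per-utterance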



/// OnlineNnet2FeaturePipeline is a class that's responsible for putting
/// together the various parts of the feature-processing pipeline for neural
/// networks, in an online setting.  The recipe here does not include fMLLR;
/// instead, it assumes we're giving raw features such as MFCC or PLP or
/// filterbank (with no CMVN) to the neural network, and optionally augmenting
/// these with an iVector that describes the speaker characteristics.  The
/// iVector is extracted using class OnlineIvectorFeature (see that class for
/// more info on how it's done).
/// No splicing is currently done in this code, as we only support the nnet2
/// neural network, in which the splicing is done inside the network.
/// Probably our strategy for nnet1 network conversion would be to convert to
/// nnet2 and just add layers to do the splicing.
class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
 public:
  /// Constructor from the "info" object.  After calling this for a
  /// non-initial utterance of a speaker, you may want to call
  /// SetAdaptationState().
  explicit OnlineNnet2FeaturePipeline(
      const OnlineNnet2FeaturePipelineInfo &info);

  /// Member functions from OnlineFeatureInterface:

  /// Dim() will return the base-feature dimension (e.g. 13 for normal MFCC);
  /// plus the pitch-feature dimension (e.g. 3), if used; plus the iVector
  /// dimension, if used.  Any frame-splicing happens inside the neural-network
  /// code.
  virtual int32 Dim() const;

  virtual bool IsLastFrame(int32 frame) const;
  virtual int32 NumFramesReady() const;
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  /// If you are downweighting silence, you can call
  /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this
  /// class using UpdateFrameWeights().  The reason why this call happens
  /// outside this class, rather than this class pulling in the data weights,
  /// relates to multi-threaded operation, and also to our not wanting this
  /// class to have excessive dependencies.
  ///
  /// You must either always call this as soon as new data becomes available,
  /// ideally just after calling AcceptWaveform(), or never call it for the
  /// lifetime of this object.
  void UpdateFrameWeights(
      const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
      int32 frame_offset = 0);
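
  // A sketch of the intended calling pattern for UpdateFrameWeights(),
  // following Kaldi's online2 decoding binaries (here `decoder` is assumed
  // to be e.g. a SingleUtteranceNnet3Decoder, and `silence_weighting` an
  // OnlineSilenceWeighting built from the transition model):
  //
  //   feature_pipeline.AcceptWaveform(samp_freq, chunk);
  //   if (silence_weighting.Active() &&
  //       feature_pipeline.IvectorFeature() != NULL) {
  //     silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
  //     silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
  //                                       &delta_weights);
  //     feature_pipeline.UpdateFrameWeights(delta_weights);
  //   }
  //   decoder.AdvanceDecoding();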

  /// Set the adaptation state to a particular value, e.g. reflecting previous
  /// utterances of the same speaker; this will generally be called just after
  /// constructing the pipeline for a non-initial utterance of a speaker.
  void SetAdaptationState(
      const OnlineIvectorExtractorAdaptationState &adaptation_state);


  /// Get the adaptation state; you may want to call this before destroying this
  /// object, to get adaptation state that can be used to improve decoding of
  /// later utterances of this speaker.  You might not want to do this, though,
  /// if you have reason to believe that something went wrong in the recognition
  /// (e.g., low confidence).
  void GetAdaptationState(
      OnlineIvectorExtractorAdaptationState *adaptation_state) const;
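
  // A sketch of carrying adaptation state across utterances of one speaker,
  // mirroring the pattern in the online2 example binaries:
  //
  //   OnlineIvectorExtractorAdaptationState adaptation_state(
  //       feature_info.ivector_extractor_info);
  //   // ... for each utterance of this speaker:
  //   OnlineNnet2FeaturePipeline pipeline(feature_info);
  //   pipeline.SetAdaptationState(adaptation_state);
  //   // ... feed audio and decode ...
  //   pipeline.GetAdaptationState(&adaptation_state);  // keep if decoding was OK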


  /// Accept more data to process.  It won't actually process the data until
  /// you call GetFrame() [probably indirectly via (decoder).AdvanceDecoding()];
  /// when you call this function it will just copy it.  sampling_rate is
  /// needed just to assert that it equals what's in the config.
  void AcceptWaveform(BaseFloat sampling_rate,
                      const VectorBase<BaseFloat> &waveform);

  BaseFloat FrameShiftInSeconds() const { return info_.FrameShiftInSeconds(); }

  /// If you call InputFinished(), it tells the class you won't be providing any
  /// more waveform.  This will help flush out the last few frames of delta or
  /// LDA features, and finalize the pitch features (making them more
  /// accurate)... although since in neural-net decoding we don't anticipate
  /// rescoring the lattices, this may not be much of an issue.
  void InputFinished();
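
  // A sketch of feeding audio in chunks, as a real online application would
  // (the 180 ms chunk size is an arbitrary illustrative choice):
  //
  //   int32 chunk_length = static_cast<int32>(samp_freq * 0.18);
  //   for (int32 offset = 0; offset < wave.Dim(); offset += chunk_length) {
  //     int32 num_samp = std::min(chunk_length, wave.Dim() - offset);
  //     SubVector<BaseFloat> chunk(wave, offset, num_samp);
  //     feature_pipeline.AcceptWaveform(samp_freq, chunk);
  //     // ... decode the frames that are now ready ...
  //   }
  //   feature_pipeline.InputFinished();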

  // This function returns the iVector-extracting part of the feature pipeline
  // (or NULL if iVectors are not being used); the pointer ownership is retained
  // by this object and not transferred to the caller.  This function is used in
  // nnet3, and also in the silence-weighting code used to exclude silence from
  // the iVector estimation.
  OnlineIvectorFeature *IvectorFeature() {
    return ivector_feature_;
  }

  // A const accessor for the iVector extractor. Returns NULL if iVectors are
  // not being used.
  const OnlineIvectorFeature *IvectorFeature() const {
    return ivector_feature_;
  }

  // This function returns the part of the feature pipeline that would be given
  // as the primary (non-iVector) input to the neural network in nnet3
  // applications.
  OnlineFeatureInterface *InputFeature() {
    return feature_plus_optional_pitch_;
  }

  virtual ~OnlineNnet2FeaturePipeline();
 private:

  const OnlineNnet2FeaturePipelineInfo &info_;

  OnlineBaseFeature *base_feature_;        // MFCC/PLP/filterbank

  OnlinePitchFeature *pitch_;              // Raw pitch, if used
  OnlineProcessPitch *pitch_feature_;  // Processed pitch, if pitch used.


  // feature_plus_optional_pitch_ is base_feature_ appended
  // (OnlineAppendFeature) with pitch_feature_, if pitch is used; otherwise,
  // it points to the same address as base_feature_.
  OnlineFeatureInterface *feature_plus_optional_pitch_;

  OnlineIvectorFeature *ivector_feature_;  // iVector feature, if used.

  // final_feature_ is feature_plus_optional_pitch_ appended
  // (OnlineAppendFeature) with ivector_feature_, if ivector_feature_ is used;
  // otherwise, points to the same address as feature_plus_optional_pitch_.
  OnlineFeatureInterface *final_feature_;

  // we cache the feature dimension, to save time when calling Dim().
  int32 dim_;
};




/// @} End of "addtogroup onlinefeat"
}  // namespace kaldi



#endif  // KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_