// online2/online-nnet2-feature-pipeline.h

// Copyright 2013-2014  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_
#define KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_

#include <string>
#include <vector>
#include <deque>

#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "feat/online-feature.h"
#include "feat/pitch-functions.h"
#include "online2/online-ivector-feature.h"

namespace kaldi {
/// @addtogroup  onlinefeat OnlineFeatureExtraction
/// @{

/// @file
/// This file contains a different version of the feature-extraction pipeline
/// in \ref online-feature-pipeline.h, specialized for use in neural network
/// decoding with iVectors.  Our recipe is that we extract iVectors that will
/// be used as an additional input to the neural network, in addition to a
/// window of several frames of spliced raw features (MFCC, PLP or
/// filterbanks).  The iVectors are extracted on top of a (splice+LDA+MLLT)
/// feature pipeline, with the added complication that the GMM posteriors used
/// for the iVector extraction are obtained with a version of the features
/// that has online cepstral mean (and optionally variance) normalization,
/// whereas the stats for iVector extraction are accumulated with a
/// non-mean-normalized version of the features.  The idea here is that we
/// want the iVector to learn the mean offset, but we want the posteriors to
/// be somewhat invariant to mean offsets.
///
/// Most of the logic for the actual iVector estimation is in \ref
/// online-ivector-feature.h; this header contains mostly glue.
///
/// Although the name of this header mentions nnet2, the code is actually also
/// used in online decoding with nnet3.


/// This configuration class is to set up OnlineNnet2FeaturePipelineInfo, which
/// in turn is the configuration class for OnlineNnet2FeaturePipeline.
/// Instead of taking the options for the parts of the feature pipeline
/// directly, it reads in the names of configuration files.
struct OnlineNnet2FeaturePipelineConfig {
  std::string feature_type;  // "plp" or "mfcc" or "fbank"

  std::string mfcc_config;
  std::string plp_config;
  std::string fbank_config;

  // Note: if we do add pitch, it will not be added to the features we give to
  // the iVector extractor but only to the features we give to the neural
  // network, after the base features but before the iVector.  We don't think
  // the iVector will be particularly helpful in normalizing the pitch
  // features, and we wanted to avoid complications with things like online
  // CMVN.
  bool add_pitch;

  // The following contains the options that you could give to
  // compute-and-process-kaldi-pitch-feats.
  std::string online_pitch_config;

  // The configuration variables in ivector_extraction_config relate to the
  // iVector extractor and options related to it; see type
  // OnlineIvectorExtractionConfig.
  std::string ivector_extraction_config;

  // Config that relates to how we weight silence for (iVector) adaptation;
  // this is registered directly to the command line, as you might want to
  // play with it at test time.
  OnlineSilenceWeightingConfig silence_weighting_config;

  OnlineNnet2FeaturePipelineConfig():
      feature_type("mfcc"), add_pitch(false) { }

  void Register(OptionsItf *opts) {
    opts->Register("feature-type", &feature_type,
                   "Base feature type [mfcc, plp, fbank]");
    opts->Register("mfcc-config", &mfcc_config, "Configuration file for "
                   "MFCC features (e.g. conf/mfcc.conf)");
    opts->Register("plp-config", &plp_config, "Configuration file for "
                   "PLP features (e.g. conf/plp.conf)");
    opts->Register("fbank-config", &fbank_config, "Configuration file for "
                   "filterbank features (e.g. conf/fbank.conf)");
    opts->Register("add-pitch", &add_pitch, "Append pitch features to raw "
                   "MFCC/PLP/filterbank features [but not for iVector "
                   "extraction]");
    opts->Register("online-pitch-config", &online_pitch_config,
                   "Configuration file for online pitch features, if "
                   "--add-pitch=true (e.g. conf/online_pitch.conf)");
    opts->Register("ivector-extraction-config", &ivector_extraction_config,
                   "Configuration file for online iVector extraction; "
                   "see class OnlineIvectorExtractionConfig in the code");
    silence_weighting_config.RegisterWithPrefix("ivector-silence-weighting",
                                                opts);
  }
};
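
// The following is a minimal sketch (not part of the original header) of how
// these options are typically read from the command line and turned into the
// "info" object from which the pipeline is constructed.  The usage string and
// config filenames are hypothetical; ParseOptions is from util/parse-options.h.
//
//   kaldi::ParseOptions po("Usage: my-online-decoder [options] <...>");
//   kaldi::OnlineNnet2FeaturePipelineConfig feature_config;
//   feature_config.Register(&po);
//   po.Read(argc, argv);
//   // e.g. invoked with --mfcc-config=conf/mfcc.conf
//   //                   --ivector-extraction-config=conf/ivector.conf
//   kaldi::OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
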
/// This class is responsible for storing configuration variables, objects and
/// options for OnlineNnet2FeaturePipeline (including the actual LDA and
/// CMVN-stats matrices, and the iVector extractor, which is a member of
/// ivector_extractor_info).  This class does not register options on the
/// command line; instead, it is initialized from class
/// OnlineNnet2FeaturePipelineConfig, which reads the options from the command
/// line.  The reason for structuring it this way is to make it easier to
/// configure from code as well as from the command line, and also to ease
/// multithreaded operation.
struct OnlineNnet2FeaturePipelineInfo {
  OnlineNnet2FeaturePipelineInfo():
      feature_type("mfcc"), add_pitch(false) { }

  OnlineNnet2FeaturePipelineInfo(
      const OnlineNnet2FeaturePipelineConfig &config);

  BaseFloat FrameShiftInSeconds() const;

  std::string feature_type;  // "mfcc" or "plp" or "fbank"

  MfccOptions mfcc_opts;    // options for MFCC computation,
                            // if feature_type == "mfcc"
  PlpOptions plp_opts;      // options for PLP computation,
                            // if feature_type == "plp"
  FbankOptions fbank_opts;  // options for filterbank computation,
                            // if feature_type == "fbank"

  bool add_pitch;
  PitchExtractionOptions pitch_opts;  // options for pitch extraction, if done
  ProcessPitchOptions pitch_process_opts;  // options for pitch post-processing

  // If the user specified --ivector-extraction-config, we assume we're using
  // iVectors as an extra input to the neural net.  Actually, we don't
  // anticipate running this setup without iVectors.
  bool use_ivectors;
  OnlineIvectorExtractionInfo ivector_extractor_info;

  // Config for weighting silence in iVector adaptation.
  // We declare this outside of ivector_extractor_info... it was
  // just easier to set up the code that way, and also we think
  // it's the kind of thing you might want to play with directly
  // on the command line instead of inside sub-config-files.
  OnlineSilenceWeightingConfig silence_weighting_config;

  int32 IvectorDim() { return ivector_extractor_info.extractor.IvectorDim(); }

 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineNnet2FeaturePipelineInfo);
};
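
// A minimal sketch (not part of the original header) of the "configure from
// code" route mentioned above: the fields are public, so they can be set
// directly instead of going through OnlineNnet2FeaturePipelineConfig.  The
// particular option values are made up for illustration.
//
//   kaldi::OnlineNnet2FeaturePipelineInfo info;  // defaults: "mfcc", no pitch
//   info.feature_type = "fbank";
//   info.fbank_opts.mel_opts.num_bins = 40;  // assumes FbankOptions layout
//   info.add_pitch = true;
//   info.use_ivectors = false;  // set explicitly; no iVector extractor here
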
/// OnlineNnet2FeaturePipeline is a class that's responsible for putting
/// together the various parts of the feature-processing pipeline for neural
/// networks, in an online setting.  The recipe here does not include fMLLR;
/// instead, it assumes we're giving raw features such as MFCC or PLP or
/// filterbank (with no CMVN) to the neural network, and optionally augmenting
/// these with an iVector that describes the speaker characteristics.  The
/// iVector is extracted using class OnlineIvectorFeature (see that class for
/// more info on how it's done).
/// No splicing is currently done in this code, as we currently only support
/// the nnet2 neural network, in which the splicing is done inside the
/// network; our likely strategy for converting nnet1 networks would be to
/// convert them to nnet2 and add layers to do the splicing.
class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
 public:
  /// Constructor from the "info" object.  After calling this for a
  /// non-initial utterance of a speaker, you may want to call
  /// SetAdaptationState().
  explicit OnlineNnet2FeaturePipeline(
      const OnlineNnet2FeaturePipelineInfo &info);

  /// Member functions from OnlineFeatureInterface:

  /// Dim() will return the base-feature dimension (e.g. 13 for normal MFCC),
  /// plus the pitch-feature dimension (e.g. 3), if used, plus the iVector
  /// dimension, if used.  Any frame-splicing happens inside the
  /// neural-network code.
  virtual int32 Dim() const;
  virtual bool IsLastFrame(int32 frame) const;
  virtual int32 NumFramesReady() const;
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  /// If you are downweighting silence, you can call
  /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this
  /// class using UpdateFrameWeights().  The reason why this call happens
  /// outside this class, rather than this class pulling in the data weights,
  /// relates to multi-threaded operation and also to not wanting this class
  /// to have excessive dependencies.
  ///
  /// You must either always call this as soon as new data becomes available,
  /// ideally just after calling AcceptWaveform(), or never call it for the
  /// lifetime of this object.
  void UpdateFrameWeights(
      const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
      int32 frame_offset = 0);
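
  // A minimal sketch (not part of the original header) of a chunk-by-chunk
  // decoding loop that feeds UpdateFrameWeights() from OnlineSilenceWeighting;
  // it mirrors the pattern in the online2 decoding programs rather than being
  // a definitive recipe.  "decoder", "trans_model", "samp_freq" and "chunk"
  // are assumed to exist.
  //
  //   OnlineNnet2FeaturePipeline pipeline(info);
  //   OnlineSilenceWeighting silence_weighting(
  //       trans_model, info.silence_weighting_config);
  //   std::vector<std::pair<int32, BaseFloat> > delta_weights;
  //   while (/* more audio is available */) {
  //     pipeline.AcceptWaveform(samp_freq, chunk);
  //     if (silence_weighting.Active() && pipeline.IvectorFeature() != NULL) {
  //       silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
  //       silence_weighting.GetDeltaWeights(pipeline.NumFramesReady(),
  //                                         &delta_weights);
  //       pipeline.UpdateFrameWeights(delta_weights);
  //     }
  //     decoder.AdvanceDecoding();
  //   }
  //   pipeline.InputFinished();
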
  /// Set the adaptation state to a particular value, e.g. reflecting previous
  /// utterances of the same speaker; this will generally be called after
  /// Copy().
  void SetAdaptationState(
      const OnlineIvectorExtractorAdaptationState &adaptation_state);

  /// Get the adaptation state; you may want to call this before destroying
  /// this object, to get adaptation state that can be used to improve decoding
  /// of later utterances of this speaker.  You might not want to do this,
  /// though, if you have reason to believe that something went wrong in the
  /// recognition (e.g., low confidence).
  void GetAdaptationState(
      OnlineIvectorExtractorAdaptationState *adaptation_state) const;

  /// Accept more data to process.  It won't actually process it until you
  /// call GetFrame() [probably indirectly via (decoder).AdvanceDecoding()];
  /// when you call this function it will just copy it.  sampling_rate is
  /// necessary just to assert it equals what's in the config.
  void AcceptWaveform(BaseFloat sampling_rate,
                      const VectorBase<BaseFloat> &waveform);

  BaseFloat FrameShiftInSeconds() const { return info_.FrameShiftInSeconds(); }

  /// If you call InputFinished(), it tells the class you won't be providing
  /// any more waveform.  This will help flush out the last few frames of delta
  /// or LDA features, and finalize the pitch features (making them more
  /// accurate)... although since in neural-net decoding we don't anticipate
  /// rescoring the lattices, this may not be much of an issue.
  void InputFinished();

  // This function returns the iVector-extracting part of the feature pipeline
  // (or NULL if iVectors are not being used); the pointer ownership is
  // retained by this object and not transferred to the caller.  This function
  // is used in nnet3, and also in the silence-weighting code used to exclude
  // silence from the iVector estimation.
  OnlineIvectorFeature *IvectorFeature() {
    return ivector_feature_;
  }

  // A const accessor for the iVector extractor.  Returns NULL if iVectors are
  // not being used.
  const OnlineIvectorFeature *IvectorFeature() const {
    return ivector_feature_;
  }

  // This function returns the part of the feature pipeline that would be given
  // as the primary (non-iVector) input to the neural network in nnet3
  // applications.
  OnlineFeatureInterface *InputFeature() {
    return feature_plus_optional_pitch_;
  }

  virtual ~OnlineNnet2FeaturePipeline();

 private:
  const OnlineNnet2FeaturePipelineInfo &info_;

  OnlineBaseFeature *base_feature_;    // MFCC/PLP/filterbank
  OnlinePitchFeature *pitch_;          // raw pitch, if used
  OnlineProcessPitch *pitch_feature_;  // processed pitch, if pitch used

  // feature_plus_optional_pitch_ is base_feature_ appended
  // (OnlineAppendFeature) with pitch_feature_, if pitch is used; otherwise it
  // points to the same address as base_feature_.
  OnlineFeatureInterface *feature_plus_optional_pitch_;

  OnlineIvectorFeature *ivector_feature_;  // iVector feature, if used

  // final_feature_ is feature_plus_optional_pitch_ appended
  // (OnlineAppendFeature) with ivector_feature_, if ivector_feature_ is used;
  // otherwise it points to the same address as feature_plus_optional_pitch_.
  OnlineFeatureInterface *final_feature_;

  // We cache the feature dimension, to save time when calling Dim().
  int32 dim_;
};


/// @} End of "addtogroup onlinefeat"
}  // namespace kaldi

#endif  // KALDI_ONLINE2_ONLINE_NNET2_FEATURE_PIPELINE_H_