// nnet3/decodable-simple-looped.h // Copyright 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_ #define KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_ #include #include "base/kaldi-common.h" #include "gmm/am-diag-gmm.h" #include "hmm/transition-model.h" #include "itf/decodable-itf.h" #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-compute.h" #include "nnet3/am-nnet-simple.h" namespace kaldi { namespace nnet3 { // See also nnet-am-decodable-simple.h, which is a decodable object that's based // on breaking up the input into fixed chunks. The decodable object defined here is based on // 'looped' computations, which naturally handle infinite left-context (but are // only ideal for systems that have only recurrence in the forward direction, // i.e. not BLSTMs... because there isn't a natural way to enforce extra right // context for each chunk.) // Note: the 'simple' in the name means it applies to networks for which // IsSimpleNnet(nnet) would return true. 'looped' means we use looped // computations, with a kGotoLabel statement at the end of it. struct NnetSimpleLoopedComputationOptions { int32 extra_left_context_initial; int32 frame_subsampling_factor; int32 frames_per_chunk; BaseFloat acoustic_scale; bool debug_computation; NnetOptimizeOptions optimize_config; NnetComputeOptions compute_config; NnetSimpleLoopedComputationOptions(): extra_left_context_initial(0), frame_subsampling_factor(1), frames_per_chunk(20), acoustic_scale(0.1), debug_computation(false) { } void Check() const { KALDI_ASSERT(extra_left_context_initial >= 0 && frame_subsampling_factor > 0 && frames_per_chunk > 0 && acoustic_scale > 0.0); } void Register(OptionsItf *opts) { opts->Register("extra-left-context-initial", &extra_left_context_initial, "Extra left context to use at the first frame of an utterance (note: " "this will just consist of repeats of the first frame, and should not " "usually be necessary."); opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Required if the frame-rate of the output (e.g. in 'chain' " "models) is less than the frame-rate of the original " "alignment."); opts->Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic log-likelihoods"); opts->Register("frames-per-chunk", &frames_per_chunk, "Number of frames in each chunk that is separately evaluated " "by the neural net. Measured before any subsampling, if the " "--frame-subsampling-factor options is used (i.e. counts " "input frames. This is only advisory (may be rounded up " "if needed."); opts->Register("debug-computation", &debug_computation, "If true, turn on " "debug for the actual computation (very verbose!)"); // register the optimization options with the prefix "optimization". ParseOptions optimization_opts("optimization", opts); optimize_config.Register(&optimization_opts); // register the compute options with the prefix "computation". ParseOptions compute_opts("computation", opts); compute_config.Register(&compute_opts); } }; /** When you instantiate class DecodableNnetSimpleLooped, you should give it a const reference to this class, that has been previously initialized. */ class DecodableNnetSimpleLoopedInfo { public: // The constructor takes a non-const pointer to 'nnet' because it may have to // modify it to be able to take multiple iVectors. DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet); // This constructor takes the priors from class AmNnetSimple (so it can divide by // them). DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, AmNnetSimple *nnet); // this constructor is for use in testing. DecodableNnetSimpleLoopedInfo(const NnetSimpleLoopedComputationOptions &opts, const Vector &priors, Nnet *nnet); void Init(const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet); const NnetSimpleLoopedComputationOptions &opts; const Nnet &nnet; // the log priors (or the empty vector if the priors are not set in the model) CuVector log_priors; // frames_left_context equals the model left context plus the value of the // --extra-left-context-initial option. int32 frames_left_context; // frames_right_context is the same as the right-context of the model. int32 frames_right_context; // The frames_per_chunk_ equals the number of input frames we need for each // chunk (except for the first chunk). This divided by // opts_.frame_subsampling_factor gives the number of output frames. int32 frames_per_chunk; // The output dimension of the neural network. int32 output_dim; // True if the neural net accepts iVectors. If so, the neural net will have been modified // to accept the iVectors bool has_ivectors; // The 3 computation requests that are used to create the looped // computation are stored in the class, as we need them to work out // exactly shich iVectors are needed. ComputationRequest request1, request2, request3; // The compiled, 'looped' computation. NnetComputation computation; }; /* This class handles the neural net computation; it's mostly accessed via other wrapper classes. It can accept just input features, or input features plus iVectors. */ class DecodableNnetSimpleLooped { public: /** This constructor takes features as input, and you can either supply a single iVector input, estimated in batch-mode ('ivector'), or 'online' iVectors ('online_ivectors' and 'online_ivector_period', or none at all. Note: it stores references to all arguments to the constructor, so don't delete them till this goes out of scope. @param [in] info This helper class contains all the static pre-computed information this class needs, and contains a pointer to the neural net. @param [in] feats The input feature matrix. @param [in] ivector If you are using iVectors estimated in batch mode, a pointer to the iVector, else NULL. @param [in] ivector If you are using iVectors estimated in batch mode, a pointer to the iVector, else NULL. @param [in] online_ivectors If you are using iVectors estimated 'online' a pointer to the iVectors, else NULL. @param [in] online_ivector_period If you are using iVectors estimated 'online' (i.e. if online_ivectors != NULL) gives the periodicity (in frames) with which the iVectors are estimated. */ DecodableNnetSimpleLooped(const DecodableNnetSimpleLoopedInfo &info, const MatrixBase &feats, const VectorBase *ivector = NULL, const MatrixBase *online_ivectors = NULL, int32 online_ivector_period = 1); // returns the number of frames of likelihoods. The same as feats_.NumRows() // in the normal case (but may be less if opts_.frame_subsampling_factor != // 1). inline int32 NumFrames() const { return num_subsampled_frames_; } inline int32 OutputDim() const { return info_.output_dim; } // Gets the output for a particular frame, with 0 <= frame < NumFrames(). // 'output' must be correctly sized (with dimension OutputDim()). Note: // you're expected to call this, and GetOutput(), in an order of increasing // frames. If you deviate from this, one of these calls may crash. void GetOutputForFrame(int32 subsampled_frame, VectorBase *output); // Gets the output for a particular frame and pdf_id, with // 0 <= subsampled_frame < NumFrames(), // and 0 <= pdf_id < OutputDim(). inline BaseFloat GetOutput(int32 subsampled_frame, int32 pdf_id) { KALDI_ASSERT(subsampled_frame >= current_log_post_subsampled_offset_ && "Frames must be accessed in order."); while (subsampled_frame >= current_log_post_subsampled_offset_ + current_log_post_.NumRows()) AdvanceChunk(); return current_log_post_(subsampled_frame - current_log_post_subsampled_offset_, pdf_id); } private: KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnetSimpleLooped); // This function does the computation for the next chunk. void AdvanceChunk(); void AdvanceChunkInternal(const MatrixBase &input_feats, const VectorBase &ivector); // Gets the iVector for the specified frame., if we are // using iVectors (else does nothing). void GetCurrentIvector(int32 input_frame, Vector *ivector); // returns dimension of the provided iVectors if supplied, or 0 otherwise. int32 GetIvectorDim() const; const DecodableNnetSimpleLoopedInfo &info_; NnetComputer computer_; const MatrixBase &feats_; // note: num_subsampled_frames_ will equal feats_.NumRows() in the normal case // when opts_.frame_subsampling_factor == 1. int32 num_subsampled_frames_; // ivector_ is the iVector if we're using iVectors that are estimated in batch // mode. const VectorBase *ivector_; // online_ivector_feats_ is the iVectors if we're using online-estimated ones. const MatrixBase *online_ivector_feats_; // online_ivector_period_ helps us interpret online_ivector_feats_; it's the // number of frames the rows of ivector_feats are separated by. int32 online_ivector_period_; // The current log-posteriors that we got from the last time we // ran the computation. Matrix current_log_post_; // The number of chunks we have computed so far. int32 num_chunks_computed_; // The time-offset of the current log-posteriors, equals // (num_chunks_computed_ - 1) * // (info_.frames_per_chunk_ / info_.opts_.frame_subsampling_factor). int32 current_log_post_subsampled_offset_; }; class DecodableAmNnetSimpleLooped: public DecodableInterface { public: /** This constructor takes features as input, and you can either supply a single iVector input, estimated in batch-mode ('ivector'), or 'online' iVectors ('online_ivectors' and 'online_ivector_period', or none at all. Note: it stores references to all arguments to the constructor, so don't delete them till this goes out of scope. @param [in] info This helper class contains all the static pre-computed information this class needs, and contains a pointer to the neural net. If you want prior subtraction to be done, you should have initialized this with the constructor that takes class AmNnetSimple. @param [in] trans_model The transition model to use. This takes care of the mapping from transition-id (which is an arg to LogLikelihood()) to pdf-id (which is used internally). @param [in] feats A pointer to the input feature matrix; must be non-NULL. We @param [in] ivector If you are using iVectors estimated in batch mode, a pointer to the iVector, else NULL. @param [in] ivector If you are using iVectors estimated in batch mode, a pointer to the iVector, else NULL. @param [in] online_ivectors If you are using iVectors estimated 'online' a pointer to the iVectors, else NULL. @param [in] online_ivector_period If you are using iVectors estimated 'online' (i.e. if online_ivectors != NULL) gives the periodicity (in frames) with which the iVectors are estimated. */ DecodableAmNnetSimpleLooped(const DecodableNnetSimpleLoopedInfo &info, const TransitionModel &trans_model, const MatrixBase &feats, const VectorBase *ivector = NULL, const MatrixBase *online_ivectors = NULL, int32 online_ivector_period = 1); virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id); virtual inline int32 NumFramesReady() const { return decodable_nnet_.NumFrames(); } virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } virtual bool IsLastFrame(int32 frame) const { KALDI_ASSERT(frame < NumFramesReady()); return (frame == NumFramesReady() - 1); } private: KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmNnetSimpleLooped); DecodableNnetSimpleLooped decodable_nnet_; const TransitionModel &trans_model_; }; } // namespace nnet3 } // namespace kaldi #endif // KALDI_NNET3_DECODABLE_SIMPLE_LOOPED_H_