Blame view
src/online/online-faster-decoder.h
5.68 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
// online/online-faster-decoder.h // Copyright 2012 Cisco Systems (author: Matthias Paulik) // Modifications to the original contribution by Cisco Systems made by: // Vassil Panayotov // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_ONLINE_ONLINE_FASTER_DECODER_H_ #define KALDI_ONLINE_ONLINE_FASTER_DECODER_H_ #include "util/stl-utils.h" #include "decoder/faster-decoder.h" #include "hmm/transition-model.h" namespace kaldi { // Extends the definition of FasterDecoder's options to include additional // parameters. The meaning of the "beam" option is also redefined as // the _maximum_ beam value allowed. struct OnlineFasterDecoderOpts : public FasterDecoderOptions { BaseFloat rt_min; // minimum decoding runtime factor BaseFloat rt_max; // maximum decoding runtime factor int32 batch_size; // number of features decoded in one go int32 inter_utt_sil; // minimum silence (#frames) to trigger end of utterance int32 max_utt_len_; // if utt. is longer, we accept shorter silence as utt. separators int32 update_interval; // beam update period in # of frames BaseFloat beam_update; // rate of adjustment of the beam BaseFloat max_beam_update; // maximum rate of beam adjustment OnlineFasterDecoderOpts() : rt_min(.7), rt_max(.75), batch_size(27), inter_utt_sil(50), max_utt_len_(1500), update_interval(3), beam_update(.01), max_beam_update(0.05) {} void Register(OptionsItf *opts, bool full) { FasterDecoderOptions::Register(opts, full); opts->Register("rt-min", &rt_min, "Approximate minimum decoding run time factor"); opts->Register("rt-max", &rt_max, "Approximate maximum decoding run time factor"); opts->Register("update-interval", &update_interval, "Beam update interval in frames"); opts->Register("beam-update", &beam_update, "Beam update rate"); opts->Register("max-beam-update", &max_beam_update, "Max beam update rate"); opts->Register("inter-utt-sil", &inter_utt_sil, "Maximum # of silence frames to trigger new utterance"); opts->Register("max-utt-length", &max_utt_len_, "If the utterance becomes longer than this number of frames, " "shorter silence is acceptable as an utterance separator"); } }; class OnlineFasterDecoder : public FasterDecoder { public: // Codes returned by Decode() to show the current state of the decoder enum DecodeState { kEndFeats = 1, // No more scores are available from the Decodable kEndUtt = 2, // End of utterance, caused by e.g. a sufficiently long silence kEndBatch = 4 // End of batch - end of utterance not reached yet }; // "sil_phones" - the IDs of all silence phones OnlineFasterDecoder(const fst::Fst<fst::StdArc> &fst, const OnlineFasterDecoderOpts &opts, const std::vector<int32> &sil_phones, const TransitionModel &trans_model) : FasterDecoder(fst, opts), opts_(opts), silence_set_(sil_phones), trans_model_(trans_model), max_beam_(opts.beam), effective_beam_(FasterDecoder::config_.beam), state_(kEndFeats), frame_(0), utt_frames_(0) {} DecodeState Decode(DecodableInterface *decodable); // Makes a linear graph, by tracing back from the last "immortal" token // to the previous one bool PartialTraceback(fst::MutableFst<LatticeArc> *out_fst); // Makes a linear graph, by tracing back from the best currently active token // to the last immortal token. This method is meant to be invoked at the end // of an utterance in order to get the last chunk of the hypothesis void FinishTraceBack(fst::MutableFst<LatticeArc> *fst_out); // Returns "true" if the best current hypothesis ends with long enough silence bool EndOfUtterance(); int32 frame() { return frame_; } private: void ResetDecoder(bool full); // Returns a linear fst by tracing back the last N frames, beginning // from the best current token void TracebackNFrames(int32 nframes, fst::MutableFst<LatticeArc> *out_fst); // Makes a linear "lattice", by tracing back a path delimited by two tokens void MakeLattice(const Token *start, const Token *end, fst::MutableFst<LatticeArc> *out_fst) const; // Searches for the last token, ancestor of all currently active tokens void UpdateImmortalToken(); const OnlineFasterDecoderOpts opts_; const ConstIntegerSet<int32> silence_set_; // silence phones IDs const TransitionModel &trans_model_; // needed for trans-id -> phone conversion const BaseFloat max_beam_; // the maximum allowed beam BaseFloat &effective_beam_; // the currently used beam DecodeState state_; // the current state of the decoder int32 frame_; // the next frame to be processed int32 utt_frames_; // # frames processed from the current utterance Token *immortal_tok_; // "immortal" token means it's an ancestor of ... Token *prev_immortal_tok_; // ... all currently active tokens KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineFasterDecoder); }; } // namespace kaldi #endif // KALDI_ONLINE_ONLINE_FASTER_DECODER_H_ |