Blame view

src/online/online-faster-decoder.h 5.68 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  // online/online-faster-decoder.h
  
  // Copyright 2012 Cisco Systems (author: Matthias Paulik)
  
  //   Modifications to the original contribution by Cisco Systems made by:
  //   Vassil Panayotov
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  #ifndef KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
  #define KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
  
  #include "util/stl-utils.h"
  #include "decoder/faster-decoder.h"
  #include "hmm/transition-model.h"
  
  namespace kaldi {
  
  // Extends the definition of FasterDecoder's options to include additional
  // parameters. The meaning of the "beam" option is also redefined as
  // the _maximum_ beam value allowed.
  struct OnlineFasterDecoderOpts : public FasterDecoderOptions {
    // Minimum decoding runtime factor.
    BaseFloat rt_min;
    // Maximum decoding runtime factor.
    BaseFloat rt_max;
    // Number of features decoded in one go.
    int32 batch_size;
    // Minimum silence (in frames) needed to trigger the end of an utterance.
    int32 inter_utt_sil;
    // Once an utterance is longer than this, shorter silence is accepted as an
    // utterance separator.
    int32 max_utt_len_;
    // Beam update period, in frames.
    int32 update_interval;
    // Rate of adjustment of the beam.
    BaseFloat beam_update;
    // Maximum rate of beam adjustment.
    BaseFloat max_beam_update;

    // Sets the default value of every option added by this struct; the
    // inherited options are default-constructed by FasterDecoderOptions.
    OnlineFasterDecoderOpts()
        : rt_min(0.7),
          rt_max(0.75),
          batch_size(27),
          inter_utt_sil(50),
          max_utt_len_(1500),
          update_interval(3),
          beam_update(0.01),
          max_beam_update(0.05) {}

    // Registers the inherited FasterDecoder options first ("full" is forwarded
    // unchanged to the base class), then the options specific to the online
    // decoder. Note that batch_size is not exposed as an option here.
    void Register(OptionsItf *opts, bool full) {
      FasterDecoderOptions::Register(opts, full);

      // Runtime-factor bounds.
      opts->Register("rt-min",
                     &rt_min,
                     "Approximate minimum decoding run time factor");
      opts->Register("rt-max",
                     &rt_max,
                     "Approximate maximum decoding run time factor");

      // Beam adaptation.
      opts->Register("update-interval",
                     &update_interval,
                     "Beam update interval in frames");
      opts->Register("beam-update",
                     &beam_update,
                     "Beam update rate");
      opts->Register("max-beam-update",
                     &max_beam_update,
                     "Max beam update rate");

      // Utterance segmentation.
      opts->Register("inter-utt-sil",
                     &inter_utt_sil,
                     "Maximum # of silence frames to trigger new utterance");
      opts->Register("max-utt-length",
                     &max_utt_len_,
                     "If the utterance becomes longer than this number of frames, "
                     "shorter silence is acceptable as an utterance separator");
    }
  };
  
  // Decoder built on top of FasterDecoder that adds the state needed for
  // online use: a frame counter, a per-utterance frame counter, a set of
  // silence phones and the "immortal" tokens used for partial tracebacks.
  class OnlineFasterDecoder : public FasterDecoder {
   public:
    // Codes returned by Decode() to show the current state of the decoder
    enum DecodeState {
      kEndFeats = 1, // No more scores are available from the Decodable
      kEndUtt = 2, // End of utterance, caused by e.g. a sufficiently long silence
      kEndBatch = 4 // End of batch - end of utterance not reached yet
    };

    // Parameters:
    //   "fst"         - the decoding graph, forwarded to FasterDecoder
    //   "opts"        - decoder options; a copy is stored in the object, and
    //                   opts.beam is remembered as the maximum allowed beam
    //   "sil_phones"  - the IDs of all silence phones
    //   "trans_model" - stored by reference, so it must outlive the decoder
    //
    // The decoder starts in state kEndFeats with both frame counters at zero.
    // The immortal-token pointers are set to NULL so that they hold a
    // well-defined value until the decoder assigns them.
    OnlineFasterDecoder(const fst::Fst<fst::StdArc> &fst,
                        const OnlineFasterDecoderOpts &opts,
                        const std::vector<int32> &sil_phones,
                        const TransitionModel &trans_model)
        : FasterDecoder(fst, opts), opts_(opts),
          silence_set_(sil_phones), trans_model_(trans_model),
          max_beam_(opts.beam), effective_beam_(FasterDecoder::config_.beam),
          state_(kEndFeats), frame_(0), utt_frames_(0),
          immortal_tok_(NULL), prev_immortal_tok_(NULL) {}

    // Runs decoding on the scores supplied by "decodable" and returns one of
    // the DecodeState codes above to describe where the decoder stopped.
    DecodeState Decode(DecodableInterface *decodable);

    // Makes a linear graph, by tracing back from the last "immortal" token
    // to the previous one
    bool PartialTraceback(fst::MutableFst<LatticeArc> *out_fst);

    // Makes a linear graph, by tracing back from the best currently active token
    // to the last immortal token. This method is meant to be invoked at the end
    // of an utterance in order to get the last chunk of the hypothesis
    void FinishTraceBack(fst::MutableFst<LatticeArc> *fst_out);

    // Returns "true" if the best current hypothesis ends with long enough silence
    bool EndOfUtterance();

    // Returns the index of the next frame to be processed. Declared const
    // because it only reads the counter.
    int32 frame() const { return frame_; }

   private:
    // Resets the decoder state; the exact effect of "full" is defined in the
    // implementation file.
    void ResetDecoder(bool full);

    // Returns a linear fst by tracing back the last N frames, beginning
    // from the best current token
    void TracebackNFrames(int32 nframes, fst::MutableFst<LatticeArc> *out_fst);

    // Makes a linear "lattice", by tracing back a path delimited by two tokens
    void MakeLattice(const Token *start,
                     const Token *end,
                     fst::MutableFst<LatticeArc> *out_fst) const;

    // Searches for the last token, ancestor of all currently active tokens
    void UpdateImmortalToken();

    const OnlineFasterDecoderOpts opts_; // private copy of the options
    const ConstIntegerSet<int32> silence_set_; // silence phones IDs
    const TransitionModel &trans_model_; // needed for trans-id -> phone conversion
    const BaseFloat max_beam_; // the maximum allowed beam
    // The currently used beam. This is a reference bound to the base class'
    // config_.beam, so writing to it changes the beam FasterDecoder uses.
    BaseFloat &effective_beam_;
    DecodeState state_; // the current state of the decoder
    int32 frame_; // the next frame to be processed
    int32 utt_frames_; // # frames processed from the current utterance
    Token *immortal_tok_;      // "immortal" token means it's an ancestor of ...
    Token *prev_immortal_tok_; // ... all currently active tokens
    KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineFasterDecoder);
  };
  
  } // namespace kaldi
  #endif // KALDI_ONLINE_ONLINE_FASTER_DECODER_H_