online-faster-decoder.h
5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// online/online-faster-decoder.h
// Copyright 2012 Cisco Systems (author: Matthias Paulik)
// Modifications to the original contribution by Cisco Systems made by:
// Vassil Panayotov
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
#define KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
#include <cstddef>

#include "util/stl-utils.h"
#include "decoder/faster-decoder.h"
#include "hmm/transition-model.h"
namespace kaldi {
// Option set for OnlineFasterDecoder.  It inherits every option of
// FasterDecoderOptions and adds the parameters declared below.  The inherited
// "beam" option is reinterpreted: here it is the _maximum_ beam value allowed.
struct OnlineFasterDecoderOpts : public FasterDecoderOptions {
  // Minimum decoding runtime factor.
  BaseFloat rt_min;
  // Maximum decoding runtime factor.
  BaseFloat rt_max;
  // Number of features decoded in one go.  No command-line option is
  // registered for this field in Register() below.
  int32 batch_size;
  // Minimum silence (in frames) needed to trigger the end of an utterance.
  int32 inter_utt_sil;
  // Once the utterance is longer than this, shorter silence is accepted as an
  // utterance separator.
  int32 max_utt_len_;
  // Beam update period, in frames.
  int32 update_interval;
  // Rate of adjustment of the beam.
  BaseFloat beam_update;
  // Maximum rate of beam adjustment.
  BaseFloat max_beam_update;

  // Fills in the default value of every option added by this struct; the
  // inherited options get whatever FasterDecoderOptions' constructor sets.
  OnlineFasterDecoderOpts()
      : rt_min(0.7),
        rt_max(0.75),
        batch_size(27),
        inter_utt_sil(50),
        max_utt_len_(1500),
        update_interval(3),
        beam_update(0.01),
        max_beam_update(0.05) {}

  // Registers the base-class options first (forwarding "full" unchanged), then
  // the options specific to the online decoder.
  void Register(OptionsItf *opts, bool full) {
    FasterDecoderOptions::Register(opts, full);

    // Runtime-factor bounds.
    opts->Register("rt-min", &rt_min,
                   "Approximate minimum decoding run time factor");
    opts->Register("rt-max", &rt_max,
                   "Approximate maximum decoding run time factor");

    // Beam adaptation.
    opts->Register("update-interval", &update_interval,
                   "Beam update interval in frames");
    opts->Register("beam-update", &beam_update, "Beam update rate");
    opts->Register("max-beam-update", &max_beam_update,
                   "Max beam update rate");

    // Utterance segmentation.
    opts->Register("inter-utt-sil", &inter_utt_sil,
                   "Maximum # of silence frames to trigger new utterance");
    opts->Register("max-utt-length", &max_utt_len_,
                   "If the utterance becomes longer than this number of frames, "
                   "shorter silence is acceptable as an utterance separator");
  }
};
// Extension of FasterDecoder that is driven through repeated calls to
// Decode(), reports utterance/batch boundaries via DecodeState, and offers
// traceback relative to the "immortal" token (the common ancestor of all
// currently active tokens).  Only declarations live in this header; the
// method definitions are elsewhere.
class OnlineFasterDecoder : public FasterDecoder {
 public:
  // Codes returned by Decode() to show the current state of the decoder
  enum DecodeState {
    kEndFeats = 1,  // No more scores are available from the Decodable
    kEndUtt = 2,    // End of utterance, caused by e.g. a sufficiently long silence
    kEndBatch = 4   // End of batch - end of utterance not reached yet
  };

  // "fst"         - decoding graph, forwarded to the FasterDecoder base
  // "opts"        - decoder options; forwarded to the base and also copied
  //                 into this object
  // "sil_phones"  - the IDs of all silence phones
  // "trans_model" - kept by reference (not copied), so the caller must keep
  //                 it alive for as long as this decoder is used
  OnlineFasterDecoder(const fst::Fst<fst::StdArc> &fst,
                      const OnlineFasterDecoderOpts &opts,
                      const std::vector<int32> &sil_phones,
                      const TransitionModel &trans_model)
      : FasterDecoder(fst, opts),
        opts_(opts),
        silence_set_(sil_phones),
        trans_model_(trans_model),
        max_beam_(opts.beam),
        // Alias of the base class's beam setting, so writes through
        // effective_beam_ change the beam stored in FasterDecoder::config_.
        effective_beam_(FasterDecoder::config_.beam),
        state_(kEndFeats),
        frame_(0),
        utt_frames_(0),
        // Nothing else in this header assigns these two pointers; start them
        // as NULL so they never hold an indeterminate value.
        immortal_tok_(NULL),
        prev_immortal_tok_(NULL) {}

  // Runs decoding on the scores supplied by "decodable" and returns a
  // DecodeState code describing the state the decoder ended up in.
  DecodeState Decode(DecodableInterface *decodable);

  // Makes a linear graph, by tracing back from the last "immortal" token
  // to the previous one
  // NOTE(review): the meaning of the bool result is defined by the
  // implementation file - confirm there before relying on it.
  bool PartialTraceback(fst::MutableFst<LatticeArc> *out_fst);

  // Makes a linear graph, by tracing back from the best currently active token
  // to the last immortal token. This method is meant to be invoked at the end
  // of an utterance in order to get the last chunk of the hypothesis
  void FinishTraceBack(fst::MutableFst<LatticeArc> *fst_out);

  // Returns "true" if the best current hypothesis ends with long enough silence
  bool EndOfUtterance();

  // Returns the index of the next frame to be processed.  Read-only accessor,
  // hence const.
  int32 frame() const { return frame_; }

 private:
  // Resets decoder state; the exact effect of "full" is defined in the
  // implementation file.
  void ResetDecoder(bool full);

  // Returns a linear fst by tracing back the last N frames, beginning
  // from the best current token
  void TracebackNFrames(int32 nframes, fst::MutableFst<LatticeArc> *out_fst);

  // Makes a linear "lattice", by tracing back a path delimited by two tokens
  void MakeLattice(const Token *start,
                   const Token *end,
                   fst::MutableFst<LatticeArc> *out_fst) const;

  // Searches for the last token, ancestor of all currently active tokens
  void UpdateImmortalToken();

  const OnlineFasterDecoderOpts opts_;         // private copy of the options
  const ConstIntegerSet<int32> silence_set_;   // silence phones IDs
  const TransitionModel &trans_model_;  // needed for trans-id -> phone conversion
  const BaseFloat max_beam_;            // the maximum allowed beam
  BaseFloat &effective_beam_;           // the currently used beam
  DecodeState state_;                   // the current state of the decoder
  int32 frame_;                         // the next frame to be processed
  int32 utt_frames_;          // # frames processed from the current utterance
  Token *immortal_tok_;       // "immortal" token means it's an ancestor of ...
  Token *prev_immortal_tok_;  // ... all currently active tokens

  KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineFasterDecoder);
};
} // namespace kaldi
#endif // KALDI_ONLINE_ONLINE_FASTER_DECODER_H_