online-faster-decoder.h 5.68 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133


// online/online-faster-decoder.h

// Copyright 2012 Cisco Systems (author: Matthias Paulik)

//   Modifications to the original contribution by Cisco Systems made by:
//   Vassil Panayotov

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_ONLINE_ONLINE_FASTER_DECODER_H_
#define KALDI_ONLINE_ONLINE_FASTER_DECODER_H_

#include <cstddef>
#include <vector>

#include "util/stl-utils.h"
#include "decoder/faster-decoder.h"
#include "hmm/transition-model.h"

namespace kaldi {

// Options for OnlineFasterDecoder. Extends FasterDecoderOptions with the
// additional parameters listed below. The meaning of the inherited "beam"
// option is also redefined as the _maximum_ beam value allowed.
struct OnlineFasterDecoderOpts : public FasterDecoderOptions {
  BaseFloat rt_min; // minimum decoding runtime factor
  BaseFloat rt_max; // maximum decoding runtime factor
  int32 batch_size; // number of features decoded in one go
  int32 inter_utt_sil; // minimum silence (#frames) to trigger end of utterance
  int32 max_utt_len_; // if utt. is longer, we accept shorter silence as utt. separators
  int32 update_interval; // beam update period in # of frames
  BaseFloat beam_update; // rate of adjustment of the beam
  BaseFloat max_beam_update; // maximum rate of beam adjustment

  // Sets the defaults for the options declared in this struct; the inherited
  // options are left to FasterDecoderOptions' own default constructor.
  OnlineFasterDecoderOpts() :
    rt_min(.7), rt_max(.75), batch_size(27),
    inter_utt_sil(50), max_utt_len_(1500),
    update_interval(3), beam_update(.01),
    max_beam_update(0.05) {}

  // Registers the inherited options (forwarding "full" to the base class)
  // and then the options declared in this struct on "opts".
  // NOTE(review): batch_size is not registered here, so it cannot be set
  // through this method -- confirm that this is intentional.
  void Register(OptionsItf *opts, bool full) {
    FasterDecoderOptions::Register(opts, full);
    opts->Register("rt-min", &rt_min,
                   "Approximate minimum decoding run time factor");
    opts->Register("rt-max", &rt_max,
                   "Approximate maximum decoding run time factor");
    opts->Register("update-interval", &update_interval,
                   "Beam update interval in frames");
    opts->Register("beam-update", &beam_update, "Beam update rate");
    opts->Register("max-beam-update", &max_beam_update, "Max beam update rate");
    // The value is a lower bound on the silence length (see inter_utt_sil
    // above), so the help text says "Minimum".
    opts->Register("inter-utt-sil", &inter_utt_sil,
                   "Minimum # of silence frames to trigger new utterance");
    opts->Register("max-utt-length", &max_utt_len_,
                   "If the utterance becomes longer than this number of frames, "
                   "shorter silence is acceptable as an utterance separator");
  }
};

// A FasterDecoder extension configured through OnlineFasterDecoderOpts.
// Decode() reports its progress with a DecodeState code, and the traceback
// methods below emit the hypothesis as linear graphs delimited by
// "immortal" tokens (tokens that are ancestors of all currently active
// tokens).
class OnlineFasterDecoder : public FasterDecoder {
 public:
  // Codes returned by Decode() to show the current state of the decoder
  enum DecodeState {
    kEndFeats = 1, // No more scores are available from the Decodable
    kEndUtt = 2, // End of utterance, caused by e.g. a sufficiently long silence
    kEndBatch = 4 // End of batch - end of utterance not reached yet
  };

  // "fst" and "opts" are forwarded to the FasterDecoder base class; a copy
  // of "opts" is also kept in this object.
  // "sil_phones" - the IDs of all silence phones
  // "trans_model" is stored by reference, so it must outlive the decoder.
  // The decoder starts in the kEndFeats state with both frame counters at
  // zero and no immortal tokens.
  OnlineFasterDecoder(const fst::Fst<fst::StdArc> &fst,
                      const OnlineFasterDecoderOpts &opts,
                      const std::vector<int32> &sil_phones,
                      const TransitionModel &trans_model)
      : FasterDecoder(fst, opts), opts_(opts),
        silence_set_(sil_phones), trans_model_(trans_model),
        max_beam_(opts.beam), effective_beam_(FasterDecoder::config_.beam),
        state_(kEndFeats), frame_(0), utt_frames_(0),
        // Give the token pointers a defined value so that they are never
        // read while indeterminate.
        // NOTE(review): presumably ResetDecoder() assigns the real tokens
        // before they are used -- confirm in the implementation file.
        immortal_tok_(NULL), prev_immortal_tok_(NULL) {}

  // Consumes scores from "decodable" and returns a DecodeState code
  // describing why it returned.
  DecodeState Decode(DecodableInterface *decodable);

  // Makes a linear graph, by tracing back from the last "immortal" token
  // to the previous one
  bool PartialTraceback(fst::MutableFst<LatticeArc> *out_fst);

  // Makes a linear graph, by tracing back from the best currently active token
  // to the last immortal token. This method is meant to be invoked at the end
  // of an utterance in order to get the last chunk of the hypothesis
  void FinishTraceBack(fst::MutableFst<LatticeArc> *fst_out);

  // Returns "true" if the best current hypothesis ends with long enough silence
  bool EndOfUtterance();

  // Returns the index of the next frame to be processed.
  int32 frame() const { return frame_; }

 private:
  // NOTE(review): the effect of "full" is defined in the implementation
  // file, which is not visible from this header.
  void ResetDecoder(bool full);

  // Returns a linear fst by tracing back the last N frames, beginning
  // from the best current token
  void TracebackNFrames(int32 nframes, fst::MutableFst<LatticeArc> *out_fst);

  // Makes a linear "lattice", by tracing back a path delimited by two tokens
  void MakeLattice(const Token *start,
                   const Token *end,
                   fst::MutableFst<LatticeArc> *out_fst) const;

  // Searches for the last token, ancestor of all currently active tokens
  void UpdateImmortalToken();

  const OnlineFasterDecoderOpts opts_; // private copy of the options
  const ConstIntegerSet<int32> silence_set_; // silence phones IDs
  const TransitionModel &trans_model_; // needed for trans-id -> phone conversion
  const BaseFloat max_beam_; // the maximum allowed beam
  // The currently used beam; this is a reference to the base class'
  // config_.beam, so assigning to it changes that option in place.
  BaseFloat &effective_beam_;
  DecodeState state_; // the current state of the decoder
  int32 frame_; // the next frame to be processed
  int32 utt_frames_; // # frames processed from the current utterance
  Token *immortal_tok_;      // "immortal" token means it's an ancestor of ...
  Token *prev_immortal_tok_; // ... all currently active tokens
  KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineFasterDecoder);
};

} // namespace kaldi
#endif // KALDI_ONLINE_ONLINE_FASTER_DECODER_H_