online-nnet2-decodable.h 4.52 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122


// nnet2/online-nnet2-decodable.h

// Copyright  2014  Johns Hopkins Universithy (author: Daniel Povey)


// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_
#define KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_

#include "itf/online-feature-itf.h"
#include "itf/decodable-itf.h"
#include "nnet2/am-nnet.h"
#include "nnet2/nnet-compute.h"
#include "hmm/transition-model.h"

namespace kaldi {
namespace nnet2 {

// Note: see also nnet-compute-online.h, which provides a different
// (lower-level) interface and more efficient for progressive evaluation of an
// nnet throughout an utterance, with re-use of already-computed activations.

struct DecodableNnet2OnlineOptions {
  BaseFloat acoustic_scale;
  bool pad_input;
  int32 max_nnet_batch_size;
  
  DecodableNnet2OnlineOptions():
      acoustic_scale(0.1),
      pad_input(true),
      max_nnet_batch_size(256) { }

  void Register(OptionsItf *opts) {
    opts->Register("acoustic-scale", &acoustic_scale,
                   "Scaling factor for acoustic likelihoods");
    opts->Register("pad-input", &pad_input,
                   "If true, pad acoustic features with required acoustic context "
                   "past edges of file.");
    opts->Register("max-nnet-batch-size", &max_nnet_batch_size,
                   "Maximum batch size we use in neural-network decodable object, "
                   "in cases where we are not constrained by currently available "
                   "frames (this will rarely make a difference)");
                 
  }
};


/**
   This Decodable object for class nnet2::AmNnet takes feature input from class
   OnlineFeatureInterface, unlike, say, class DecodableAmNnet which takes
   feature input from a matrix.
*/

class DecodableNnet2Online: public DecodableInterface {
 public:
  DecodableNnet2Online(const AmNnet &nnet,
                       const TransitionModel &trans_model,
                       const DecodableNnet2OnlineOptions &opts,
                       OnlineFeatureInterface *input_feats);
  
  
  /// Returns the scaled log likelihood
  virtual BaseFloat LogLikelihood(int32 frame, int32 index);
  
  virtual bool IsLastFrame(int32 frame) const;

  virtual int32 NumFramesReady() const;  
  
  /// Indices are one-based!  This is for compatibility with OpenFst.
  virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
  
 private:

  /// If the neural-network outputs for this frame are not cached, it computes
  /// them (and possibly for some succeeding frames)
  void ComputeForFrame(int32 frame);
  
  OnlineFeatureInterface *features_;
  const AmNnet &nnet_;
  const TransitionModel &trans_model_;
  DecodableNnet2OnlineOptions opts_;
  CuVector<BaseFloat> log_priors_;  // log-priors taken from the model.
  int32 feat_dim_;  // dimensionality of the input features.
  int32 left_context_;  // Left context of the network (cached here)
  int32 right_context_;  // Right context of the network (cached here)
  int32 num_pdfs_;  // Number of pdfs, equals output-dim of the network (cached
                    // here)
  
  int32 begin_frame_;  // First frame for which scaled_loglikes_ is valid
                       // (i.e. the first frame of the batch of frames for
                       // which we've computed the output).
  
  // scaled_loglikes_ contains the neural network pseudo-likelihoods: the log of
  // (prob divided by the prior), scaled by opts.acoustic_scale).  We may
  // compute this using the GPU, but we transfer it back to the system memory
  // when we store it here.  These scores are only kept for a subset of frames,
  // starting at begin_frame_, whose length depends how many frames were ready
  // at the time we called LogLikelihood(), and will never exceed
  // opts_.max_nnet_batch_size.
  Matrix<BaseFloat> scaled_loglikes_;

  KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableNnet2Online);
};

} // namespace nnet2
} // namespace kaldi

#endif // KALDI_NNET2_ONLINE_NNET2_DECODABLE_H_