Blame view

src/online2/online-speex-wrapper.h 4.62 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
  // online2/online-speex-wrapper.h
  
  // Copyright   2014  IMSL, PKU-HKUST (author: Wei Shi)
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  
  #ifndef KALDI_ONLINE2_ONLINE_SPEEX_WRAPPER_H_
  #define KALDI_ONLINE2_ONLINE_SPEEX_WRAPPER_H_
  
  // Select the type used for the speex_bits_ members declared below: the real
  // SpeexBits struct when building with Speex, otherwise a plain char
  // placeholder so that this header still compiles without the library.
  #if defined(HAVE_SPEEX)
    #include <speex/speex.h>
    typedef SpeexBits SPEEXBITS;
  #else
    typedef char SPEEXBITS;
  #endif
  
  #include <vector>

  #include "matrix/kaldi-vector.h"
  #include "itf/options-itf.h"
  
  namespace kaldi {
  
  /// Options taken by the constructors of OnlineSpeexEncoder and
  /// OnlineSpeexDecoder.
  struct SpeexOptions {
    /// Sample frequency of the waveform; it determines which Speex mode is
    /// used. Typically 8kHz -> narrow band, 16kHz -> wide band and
    /// 32kHz -> ultra wide band.
    BaseFloat sample_rate;

    /// Ranges from 0 to 10; higher means better quality. In the original
    /// author's preliminary tests with the RM recipe, a value of 8 increased
    /// the WER by 0.1%, while a value of 10 left the WER almost unchanged.
    int32 speex_quality;

    /// Size of one compressed frame, in bytes.
    /// Must be chosen to match speex_quality. A few wideband examples:
    ///     quality            size(in bytes)
    ///        8                  70
    ///        9                  86
    ///        10                 106
    int32 speex_bits_frame_size;

    /// Size of one waveform frame, in samples.
    /// The Speex toolkit uses a 20ms long window by default.
    int32 speex_wave_frame_size;

    /// Defaults: 16kHz input, quality 10, 106-byte compressed frames and
    /// 320-sample waveform frames (320 samples is 20ms at 16kHz).
    SpeexOptions() {
      sample_rate = 16000.0;
      speex_quality = 10;
      speex_bits_frame_size = 106;
      speex_wave_frame_size = 320;
    }

    /// Registers every field above as a command-line option on *opts.
    void Register(OptionsItf *opts) {
      opts->Register("sample-rate", &sample_rate,
                     "Sample frequency of the waveform.");
      opts->Register("speex-quality", &speex_quality,
                     "Speex speech quality.");
      opts->Register("speex-bits-frame-size", &speex_bits_frame_size,
                     "#bytes of each Speex compressed frame.");
      opts->Register("speex-wave-frame-size", &speex_wave_frame_size,
                     "#samples of each waveform frame.");
    }
  };
  
  /// Online Speex encoder: waveform is pushed in with AcceptWaveform() and the
  /// encoded bytes accumulated so far are collected with GetSpeexBits().
  ///
  /// NOTE(review): when HAVE_SPEEX is defined this class holds a raw
  /// speex_state_ pointer and declares a destructor, but copying is not
  /// disabled in this header. Avoid copying instances unless the .cc file
  /// shows that doing so is safe.
  class OnlineSpeexEncoder {
    public:
      /// @param config  Encoder options (see SpeexOptions).
      OnlineSpeexEncoder(const SpeexOptions &config);
      ~OnlineSpeexEncoder();

      /// Supplies the next chunk of waveform to the encoder.
      /// @param sample_rate  Sample rate of `waveform`; presumably expected to
      ///                     agree with the configured rate -- confirm in the
      ///                     .cc file.
      /// @param waveform     The samples to encode.
      void AcceptWaveform(int32 sample_rate,
                          const VectorBase<BaseFloat> &waveform);

      /// Declares that no further waveform will be supplied (defined in the
      /// .cc file; presumably sets input_finished_).
      void InputFinished();

      /// Hands the bytes encoded so far to the caller and empties the internal
      /// buffer. Call it after AcceptWaveform().
      /// @param spx_bits  Output; its previous contents are discarded.
      void GetSpeexBits(std::vector<char> *spx_bits) {
        // Swapping transfers the buffer in O(1) instead of deep-copying it;
        // the clear() then drops whatever the caller's vector used to hold,
        // so the observable result matches "copy, then clear".
        spx_bits->swap(speex_encoded_char_bits_);
        speex_encoded_char_bits_.clear();
      }
    private:
      int32 speex_frame_size_;  // in bytes, will be different according to the quality
      int32 speex_encoded_frame_size_;  // in samples, typically 320 in wideband mode, 16kHz
  #ifdef HAVE_SPEEX
      void *speex_state_;  // Holds the state of the speex encoder
  #endif
      SPEEXBITS speex_bits_;

      // Holds the waveform samples that have not been processed yet.
      Vector<BaseFloat> waveform_remainder_;

      // Holds the Speex-encoded bytes until they are fetched (and removed) by
      // GetSpeexBits(). A vector is used rather than a raw char pointer
      // because it is easier to grow.
      std::vector<char> speex_encoded_char_bits_;

      BaseFloat sample_rate_;
      bool input_finished_;

      // Encodes `wave` and writes the result to *speex_encoder_bits
      // (defined in the .cc file).
      void Encode(const VectorBase<BaseFloat> &wave,
                  std::vector<char> *speex_encoder_bits);
  };
  
  /// Online Speex decoder: encoded bytes are pushed in with AcceptSpeexBits()
  /// and the waveform decoded so far is collected with GetWaveform().
  ///
  /// NOTE(review): when HAVE_SPEEX is defined this class holds a raw
  /// speex_state_ pointer and declares a destructor, but copying is not
  /// disabled in this header. Avoid copying instances unless the .cc file
  /// shows that doing so is safe.
  class OnlineSpeexDecoder {
    public:
      /// @param config  Decoder options (see SpeexOptions).
      OnlineSpeexDecoder(const SpeexOptions &config);
      ~OnlineSpeexDecoder();

      /// Supplies the next chunk of Speex-encoded bytes to the decoder.
      /// @param spx_enc_bits  Encoded bytes; presumably any incomplete
      ///                      trailing frame is kept in speex_bits_remainder_
      ///                      -- confirm in the .cc file.
      void AcceptSpeexBits(const std::vector<char> &spx_enc_bits);

      /// Hands the waveform decoded so far to the caller and empties the
      /// internal buffer. Call it after AcceptSpeexBits().
      /// @param waveform  Output; its previous contents are discarded.
      void GetWaveform(Vector<BaseFloat> *waveform) {
        // Swap transfers the buffer without copying the samples; Resize(0)
        // then releases whatever the caller's vector used to hold, so the
        // observable result matches "copy, then resize to zero".
        waveform->Swap(&waveform_);
        waveform_.Resize(0);
      }
    private:
      int32 speex_frame_size_;  // in bytes, will be different according to the quality
      int32 speex_decoded_frame_size_;  // in samples, typically 320 in wideband mode, 16kHz

  #ifdef HAVE_SPEEX
      void *speex_state_;  // Holds the state of the speex decoder
  #endif
      SPEEXBITS speex_bits_;

      // Holds the waveform decoded from speex bits until GetWaveform() takes it.
      Vector<BaseFloat> waveform_;
      // NOTE(review): presumably the encoded bytes left over from the previous
      // AcceptSpeexBits() call that did not form a whole frame -- confirm.
      std::vector<char> speex_bits_remainder_;

      // Decodes `speex_char_bits` and writes the result to *decoded_wav
      // (defined in the .cc file).
      void Decode(const std::vector<char> &speex_char_bits,
                  Vector<BaseFloat> *decoded_wav);
  };
  
  }  // namespace kaldi
  
  #endif  // KALDI_ONLINE2_ONLINE_SPEEX_WRAPPER_H_