process-pitch-feats.cc
6.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// featbin/process-pitch-feats.cc
// Copyright 2013 Bagher BabaAli
// Johns Hopkins University (author: Daniel Povey)
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
namespace kaldi {
// Transforms the probability-of-voicing feature, which is read from column 0
// of every row of *mat, in place:
//     p  ->  log((p + 0.0001) / (1.0001 - p))
// Left untransformed the values clump up near the edges of the range [0, 1];
// this mapping makes them more Gaussian.  Every input value is asserted to
// lie in [0, 1].
void ProcessPovFeatures(Matrix<BaseFloat> *mat) {
  Matrix<BaseFloat> &feats = *mat;
  const int32 num_rows = feats.NumRows();
  for (int32 row = 0; row != num_rows; ++row) {
    const BaseFloat p = feats(row, 0);
    KALDI_ASSERT(p >= 0.0 && p <= 1.0);
    // The 0.0001 offsets keep both the ratio and its log finite at p == 0
    // and p == 1.
    const double numerator = p + 0.0001;
    const double denominator = 1.0001 - p;
    feats(row, 0) = Log(numerator / denominator);
  }
}
// Replaces the value in column 1 of every row of *mat (the pitch, per the
// usage message of this program) by its logarithm, in place.  Each value is
// asserted to be strictly positive before the log is taken.
void TakeLogOfPitch(Matrix<BaseFloat> *mat) {
  for (int32 i = 0; i < mat->NumRows(); ++i) {
    KALDI_ASSERT((*mat)(i, 1) > 0.0);
    const BaseFloat pitch = (*mat)(i, 1);
    (*mat)(i, 1) = Log(pitch);
  }
}
// Subtracts from column 1 of *mat a moving average of that column taken over
// a largish window (e.g. 151 frames), in place.
//
//   normalization_window_size  number of frames in the window; must be a
//                              positive odd number so that the window can be
//                              centred on a frame.
//   mat                        feature matrix; only column 1 is read/written.
//
// Behaviour:
//  - If the matrix has no more rows than the window, the mean of the whole
//    column is subtracted from every frame.
//  - Otherwise frame i has the mean of frames [i - mid_win, i + mid_win]
//    subtracted, where mid_win = (normalization_window_size - 1) / 2.  Near
//    the two ends, where a centred window would run off the data, the mean of
//    the first (respectively last) full window is used instead.
//  - An empty matrix is left untouched.
void SubtractMovingAverage(int32 normalization_window_size,
                           Matrix<BaseFloat> *mat) {
  KALDI_ASSERT(normalization_window_size > 0 &&
               normalization_window_size % 2 == 1);
  int32 num_frames = mat->NumRows();
  if (num_frames == 0)
    return;  // Nothing to normalize; also avoids computing 0/0 below.

  Matrix<BaseFloat> &features = *mat;

  // Keep an unmodified copy of the column: the sliding update below must read
  // original values, while `features` is being overwritten as we go.
  Vector<BaseFloat> temp_pitch(num_frames);
  int32 i;
  for (i = 0; i < num_frames; i++)
    temp_pitch(i) = features(i, 1);

  // Mean over the first window (or over all the frames if there are fewer
  // frames than the window size).
  BaseFloat mean = 0.0;
  int32 mid_win = (normalization_window_size - 1) / 2;
  for (i = 0; (i < num_frames) && (i < normalization_window_size); i++) {
    mean += temp_pitch(i);
  }
  mean /= i;  // i >= 1 here because num_frames > 0 and the window size > 0.

  if (num_frames <= normalization_window_size) {
    for (i = 0; i < num_frames; i++) {
      features(i, 1) -= mean;
    }
  } else {
    // The first full window is centred on frame mid_win; all the frames up to
    // and including that one share its mean.
    for (i = 0; i <= mid_win; i++) {
      features(i, 1) -= mean;
    }
    for (i = (mid_win + 1); i < num_frames; i++) {
      // Slide the window so that it is centred on frame i: frame
      // i - mid_win - 1 drops out on the left and frame i + mid_win comes in
      // on the right.  Once the right edge would pass the last frame the mean
      // is frozen at that of the last full window.
      if (i + mid_win < num_frames)
        mean -= (temp_pitch(i - (mid_win + 1)) -
                 temp_pitch(i + mid_win)) / normalization_window_size;
      features(i, 1) -= mean;
    }
  }
}
// Replaces column 1 of *mat by its moving average over a small centred
// window (e.g. 5 frames), in place.  With width = (average_window_size - 1)
// / 2, frame t becomes the average of frames [t - width, t + width].  Only
// frames for which that whole window lies inside the matrix are changed; the
// first and last `width` frames keep their original values.
void SetToMovingAverage(int32 average_window_size,
                        Matrix<BaseFloat> *mat) {
  Matrix<BaseFloat> &feats = *mat;
  const int32 num_frames = feats.NumRows();
  // Number of neighbours on each side, e.g. 2 for a window of 5 frames.
  const int32 width = (average_window_size - 1) / 2;
  // One past the last frame whose full window fits inside the matrix.
  const int32 end = num_frames - width;

  // The averages are collected in a scratch vector first, so that every
  // average is computed from the original, unsmoothed values.
  Vector<BaseFloat> smoothed(num_frames);
  for (int32 t = width; t < end; ++t) {
    BaseFloat sum = feats(t, 1);
    for (int32 k = 1; k <= width; ++k)
      sum += (feats(t - k, 1) + feats(t + k, 1));
    smoothed(t) = sum / (2 * width + 1);
  }

  for (int32 t = width; t < end; ++t)
    feats(t, 1) = smoothed(t);
}
} // namespace kaldi
// Entry point.  Reads feature matrices from the rspecifier given as the first
// positional argument, runs ProcessPovFeatures, TakeLogOfPitch,
// SubtractMovingAverage and SetToMovingAverage on each of them (in that
// order), and writes the result under the same key to the wspecifier given as
// the second positional argument.
//
// Matrices that are empty or do not have exactly 2 columns are skipped with a
// warning and counted as errors.
//
// Returns 0 if at least one utterance was processed, 1 if none was, and -1 if
// an exception was caught.  Exits with status 1 if the number of positional
// arguments is not 2.
int main(int argc, char *argv[]) {
  try {
    using namespace kaldi;
    const char *usage =
        "This is a rather special-purpose program which processes 2-dimensional\n"
        "features consisting of (prob-of-voicing, pitch) into something suitable\n"
        "to put into a speech recognizer. First use interpolate-feats\n"
        "Usage: process-pitch-feats [options...] <feats-rspecifier> <feats-wspecifier>\n";

    // construct all the global objects
    ParseOptions po(usage);

    int32 normalization_window_size = 151;  // should be odd number
    int32 average_window_size = 5;

    // Register the options
    po.Register("normalization-window-size",
                &normalization_window_size, "Size of window used for "
                "moving window nomalization (must be odd).");
    po.Register("average-window-size",
                &average_window_size,
                "Size of moving average window (must be odd).");

    // parse options (+filling the registered variables)
    po.Read(argc, argv);

    if (po.NumArgs() != 2) {
      po.PrintUsage();
      exit(1);
    }

    // Both windows are centred on a frame, so their sizes have to be odd.
    KALDI_ASSERT(average_window_size > 0 && average_window_size % 2 == 1 &&
                 "--average-window-size option must be an odd positive number.");
    KALDI_ASSERT(normalization_window_size > 0 && normalization_window_size % 2 == 1 &&
                 "--normalization-window-size option must be an odd positive number.");

    std::string input_rspecifier = po.GetArg(1);
    std::string output_wspecifier = po.GetArg(2);

    SequentialBaseFloatMatrixReader reader(input_rspecifier);
    BaseFloatMatrixWriter kaldi_writer;  // typedef to TableWriter<something>.

    if (!kaldi_writer.Open(output_wspecifier))
      KALDI_ERR << "Could not initialize output with wspecifier "
                << output_wspecifier;

    int32 num_done = 0, num_err = 0;
    for (; !reader.Done(); reader.Next()) {
      std::string utt = reader.Key();
      // Work on a copy, since the processing functions modify the matrix in
      // place.
      Matrix<BaseFloat> features = reader.Value();
      int num_frames = features.NumRows();

      // Reject the matrix if EITHER condition holds: the functions below
      // index column 1 of every row, so the matrix must have 2 columns, and
      // an empty matrix has nothing to process.
      if (num_frames == 0 || features.NumCols() != 2) {
        KALDI_WARN << "Feature file has bad size "
                   << features.NumRows() << " by " << features.NumCols();
        num_err++;
        continue;
      }

      ProcessPovFeatures(&features);
      TakeLogOfPitch(&features);
      SubtractMovingAverage(normalization_window_size, &features);
      SetToMovingAverage(average_window_size, &features);

      kaldi_writer.Write(utt, features);
      num_done++;

      if (num_done % 10 == 0)
        KALDI_LOG << "Processed " << num_done << " utterances";
      KALDI_VLOG(2) << "Processed features for key " << utt;
    }
    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
              << " with errors.";
    // Report failure if nothing at all was processed.
    return (num_done != 0 ? 0 : 1);
  } catch(const std::exception &e) {
    std::cerr << e.what();
    return -1;
  }
}