interpolate-pitch.cc
14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
// featbin/interpolate-pitch.cc
// Copyright 2013 Bagher BabaAli
// Johns Hopkins University (author: Daniel Povey)
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
namespace kaldi {
struct PitchInterpolatorOptions {
BaseFloat pitch_interval; // Discretization interval [affects efficiency]
BaseFloat interpolator_factor; // This affects the tendency of the algorithm to
// follow the observed pitch contours versus pick its own path which will tend
// to be closer to a straight line.
BaseFloat max_voicing_prob; // p(voicing) we use at the end of the range when it was observed
// at one. (probably 0.9 is suitable; allows to not follow observed pitch even if p(voicing)=1.
BaseFloat max_pitch_change_per_frame;
PitchInterpolatorOptions(): pitch_interval(4.0),
interpolator_factor(1.0e-05),
max_voicing_prob(0.9),
max_pitch_change_per_frame(10.0) { }
void Register(OptionsItf *opts) {
opts->Register("pitch-interval", &pitch_interval, "Frequency interval in Hz, used "
"for the pitch interpolation and smoothing algorithm.");
opts->Register("interpolator-factor", &interpolator_factor, "Factor affecting the "
"interpolation algorithm; setting it closer to zero will cause "
"it to follow the measured pitch more faithfully but less "
"smoothly");
opts->Register("max-voicing-prob", &max_voicing_prob, "Probability of voicing the "
"algorithm uses as the observed p(voicing) approaches 1; having "
"value <1 allows it to interpolate even if p(voicing) = 1");
opts->Register("max-pitch-change-per-frame", &max_pitch_change_per_frame,
"This value should be set large enough to no longer affect the "
"results, but the larger it is the slower the algorithm will be.");
}
void Check() const {
KALDI_ASSERT(pitch_interval > 0.0 && pitch_interval < 20.0 &&
interpolator_factor > 0.0 && interpolator_factor < 1.0 &&
max_voicing_prob <= 1.0 && max_voicing_prob >= 0.5 &&
max_pitch_change_per_frame > 2.0 * pitch_interval);
}
};
struct PitchInterpolatorStats {
int64 num_frames_tot;
int64 num_frames_zero; // #frames that were zero in original pitch.
int64 num_frames_changed; // #frames that were not zero originally, but
// which the algorithm changed.
PitchInterpolatorStats(): num_frames_tot(0), num_frames_zero(0),
num_frames_changed(0) { }
void Print() {
BaseFloat zero_percent = num_frames_zero * 100.0 / num_frames_tot,
changed_percent = num_frames_changed * 100.0 / num_frames_tot;
KALDI_LOG << "Over " << num_frames_tot << " frames, "
<< zero_percent << "% were zero at input, and "
<< changed_percent << "% were not zero but were changed.";
}
};
class PitchInterpolator {
public:
PitchInterpolator(const PitchInterpolatorOptions &opts,
Matrix<BaseFloat> *mat,
PitchInterpolatorStats *stats):
opts_(opts) {
opts.Check();
InitValues(*mat);
Forward();
Backtrace(mat, stats);
}
private:
void InitValues(const Matrix<BaseFloat> &mat) {
BaseFloat pitch_interval = opts_.pitch_interval;
num_frames_ = mat.NumRows();
KALDI_ASSERT(mat.NumCols() == 2);
BaseFloat min_pitch = 1.0e+10, max_pitch = 0.0;
pitch_.resize(num_frames_);
p_voicing_.resize(num_frames_);
for (int32 f = 0; f < num_frames_; f++) {
BaseFloat p_voicing = mat(f, 0), pitch = mat(f, 1);
p_voicing *= opts_.max_voicing_prob;
if (pitch == 0.0) {
p_voicing = 0.0; // complete uncertainty about real pitch.
} else {
if (pitch < min_pitch) min_pitch = pitch;
if (pitch > max_pitch) max_pitch = pitch;
}
p_voicing_[f] = p_voicing;
}
if (max_pitch == 0.0) { // No voiced frames at all.
min_pitch = 100.0;
max_pitch = 100.0;
}
if (max_pitch <= min_pitch + (2.0 * pitch_interval)) {
max_pitch = min_pitch + 2.0 * pitch_interval;
} // avoid crashes.
// Note: the + 2 here is for edge effects.
num_pitches_ = floor((max_pitch - min_pitch) / pitch_interval + 0.5) + 2;
KALDI_ASSERT(num_pitches_ >= 3);
min_pitch_.resize(num_frames_);
for (int32 f = 0; f < num_frames_; f++) {
min_pitch_[f] = min_pitch - pitch_interval * RandUniform(); // bottom of
// discretization range for each frame is randomly different.
BaseFloat pitch = mat(f, 1);
if (pitch == 0.0) {
pitch_[f] = 0; // This will actually be a don't-care value; we just put in
// some value that won't crash the algorithm.
} else {
int32 int_pitch = floor((pitch - min_pitch_[f]) / pitch_interval + 0.5);
KALDI_ASSERT(int_pitch >= 0 && int_pitch < num_pitches_);
pitch_[f] = int_pitch;
}
}
}
void MultiplyObsProb(int32 t) {
// For the forward computation:
// Multiplies the observation probabilities into alpha at time t.
// constant_prob is the constant part that does not depend on the pitch value:
BaseFloat constant_prob = (1.0 - p_voicing_[t]) * opts_.interpolator_factor,
specified_prob = p_voicing_[t] + constant_prob;
// specified_prob adds in the extra probability mass at the observed pitch value.
BaseFloat log_constant_prob = Log(constant_prob),
log_ratio = Log(specified_prob / constant_prob);
log_alpha_.Add(log_constant_prob); // add log_constant_prob to all pitches at this time.
log_alpha_(pitch_[t]) += log_ratio; // corrects this to be like adding
// log(specified_prob) to the observed pitch at this time. Note: if pitch_[t] == 0,
// this won't have any effect because log_ratio will be zero too.
Vector<BaseFloat> temp_rand(num_pitches_);
temp_rand.SetRandn(); // Set to Gaussian noise. Type of noise doesn't really matter.
log_alpha_.AddVec(0.01, temp_rand); // We add a small amount of noise to the
// observation probabilities; this has the effect of breaking symmetries in
// a more random way to overcome certain weirdnesses that could otherwise
// happen due to the discretization.
}
// This function updates log_alpha_, as a function of prev_log_alpha_; it also
// updates back_pointers_[t];
void ComputeTransitionProb(int32 t) {
KALDI_ASSERT(t > 0);
BaseFloat pitch_interval = opts_.pitch_interval;
back_pointers_[t].resize(num_pitches_);
// Transition probability between pitch p and p' on times t-1 and t
// is (p - p')^2, with the pitch measured in Hz. We're doing Viterbi,
// so always pick the max over the previous frame's t.
KALDI_ASSERT(t > 0 && t < num_frames_);
int32 K = floor(opts_.max_pitch_change_per_frame / pitch_interval + 0.5);
// K is max #bins we can move; a kind of pruning, for speed.
for (int32 p = 0; p < num_pitches_; p++) {
int32 min_prev_p = p - K, max_prev_p = p + K;
if (min_prev_p < 0) min_prev_p = 0;
if (max_prev_p >= num_pitches_) max_prev_p = num_pitches_ - 1;
BaseFloat best_logprob = -1.0e+10;
int32 best_prev_p = -1;
for (int32 prev_p = min_prev_p; prev_p <= max_prev_p; prev_p++) {
BaseFloat delta_pitch = (min_pitch_[t-1] + prev_p * pitch_interval) -
(min_pitch_[t] + p * pitch_interval);
BaseFloat this_logprob = prev_log_alpha_(prev_p)
- 0.5 * delta_pitch * delta_pitch;
if (this_logprob > best_logprob) {
best_logprob = this_logprob;
best_prev_p = prev_p;
}
}
back_pointers_[t][p] = best_prev_p;
log_alpha_(p) = best_logprob;
}
}
void Forward() {
// Viterbi in a discrete model of the pitch, in which the observation
// probability of a pitch is p(voicing) at the observed pitch, and
// interpolator_factor_ * 1.0 - p(voicing) at all other pitches. the
// transition log-probability is -0.5 times the squared difference in pitch.
// [We measure this in Hz, not in integer values, to make it more invariant
// to the discretization interval].
back_pointers_.resize(num_frames_);
log_alpha_.Resize(num_pitches_);
prev_log_alpha_.Resize(num_pitches_);
log_alpha_.Set(0.0);
MultiplyObsProb(0);
for (int32 t = 1; t < num_frames_; t++) {
log_alpha_.Swap(&prev_log_alpha_);
ComputeTransitionProb(t);
MultiplyObsProb(t);
}
}
void Backtrace(Matrix<BaseFloat> *mat, PitchInterpolatorStats *stats) {
const BaseFloat pitch_interval = opts_.pitch_interval;
BaseFloat *p_begin = log_alpha_.Data(), *p_end = p_begin + num_pitches_,
*p_best = std::max_element(p_begin, p_end);
std::vector<int32> best_pitch(num_frames_);
int32 best_p = p_best - p_begin; // best discrete pitch p at time T-1.
for (int32 t = num_frames_ - 1; t >= 0; t--) {
{ // Update stats:
stats->num_frames_tot++;
if (pitch_[t] == 0) stats->num_frames_zero++;
else if (best_p != pitch_[t]) stats->num_frames_changed++;
}
BaseFloat pitch = min_pitch_[t] + pitch_interval * best_p;
(*mat)(t, 1) = pitch;
KALDI_ASSERT(best_p >= 0 && best_p < num_pitches_);
if (t > 0)
best_p = back_pointers_[t][best_p];
}
}
const PitchInterpolatorOptions &opts_;
std::vector<BaseFloat> min_pitch_; // Bottom of discretization range...
// previously this was a BaseFloat, but for better pseudo-randomization we
// have a slightly perturbed value for each frame now, so it's a vector.
int32 num_frames_; // number of frames;
int32 num_pitches_; // Number of discrete pitch intervals.
std::vector<int32> pitch_; // observed pitch, discretized; [it's don't-care if algorithm had no
// observation (0)]
std::vector<BaseFloat> p_voicing_; // p(voicing) times max_voicing_prob_; or zero if
// pitch was 0.0 for this frame.
std::vector<std::vector<int32> > back_pointers_; // at each t, points to best pitch
// on time t-1.
Vector<BaseFloat> log_alpha_;
Vector<BaseFloat> prev_log_alpha_;
};
// Linear Interpolation for places where the pitch value is zero
void LinearlyInterpolatePitch(Matrix<BaseFloat> *mat) {
int32 num_frames = mat->NumRows();
int i = 0;
Matrix<BaseFloat> &features = *mat;
while (i < num_frames) {
if(features(i, 1) == 0.0) {
int start = i - 1;
int end = i;
while( (features(end, 1)) == 0.0 && (end < num_frames))
end++;
BaseFloat end_value = -1, start_value = -1;
if (end < num_frames) end_value = features(end, 1);
if (start > 0) start_value = features(start, 1);
if (start_value < 0 && end_value < 0) {
// the whole file is unvoiced -> just put an arbitrary value,
// it will all be normalized out anyway.
start_value = 1.0;
end_value = 1.0;
}
// If we don't have a value for one end of the range, i.e. at the start or
// end, set it to 0.9 times the pitch value that we have at the other end
// of the range. The reason we don't set it to that value itself, is that
// then over this segment we would have zero time-derivative, so if we
// took time derivatives we would have an artificial spike at zero.
if (start_value < 0.0) start_value = 0.9 * end_value;
if (end_value < 0.0) end_value = 0.9 * start_value;
for(int k = start + 1; k < end; k++)
features(k, 1) = start_value +
(end_value - start_value) / (end - start) * (k - start);
i = end;
}
i++;
}
}
} // namespace kaldi
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
const char *usage =
"This is a rather special-purpose program which processes 2-dimensional\n"
"features consisting of (prob-of-voicing, pitch). By default we do model-based\n"
"pitch smoothing and interpolation (see code), or if --linear-interpolation=true,\n"
"just linear interpolation across gaps where pitch == 0 (not predicted).\n"
"Usage: interpolate-pitch [options...] <feats-rspecifier> <feats-wspecifier>\n";
// construct all the global objects
ParseOptions opts(usage);
bool linear_interpolation = false;
PitchInterpolatorOptions interpolate_opts;
opts.Register("linear-interpolation",
&linear_interpolation, "If true, just do simple linear "
"interpolation across gaps (else, model-based)");
interpolate_opts.Register(&opts);
// parse options (+filling the registered variables)
opts.Read(argc, argv);
if (opts.NumArgs() != 2) {
opts.PrintUsage();
exit(1);
}
std::string input_rspecifier = opts.GetArg(1);
std::string output_wspecifier = opts.GetArg(2);
SequentialBaseFloatMatrixReader reader(input_rspecifier);
BaseFloatMatrixWriter kaldi_writer; // typedef to TableWriter<something>.
if (!kaldi_writer.Open(output_wspecifier))
KALDI_ERR << "Could not initialize output with wspecifier "
<< output_wspecifier;
int32 num_done = 0, num_err = 0;
PitchInterpolatorStats stats;
for (; !reader.Done(); reader.Next()) {
std::string utt = reader.Key();
Matrix<BaseFloat> features = reader.Value();
int num_frames = features.NumRows();
if (num_frames == 0 && features.NumCols() != 2) {
KALDI_WARN << "Feature file has bad size "
<< features.NumRows() << " by " << features.NumCols();
num_err++;
continue;
}
if (linear_interpolation) LinearlyInterpolatePitch(&features);
else {
// work happens in constructor of this class.
PitchInterpolator pi(interpolate_opts, &features, &stats);
}
kaldi_writer.Write(utt, features);
num_done++;
if (num_done % 10 == 0)
KALDI_LOG << "Processed " << num_done << " utterances";
KALDI_VLOG(2) << "Processed features for key " << utt;
}
if (!linear_interpolation) stats.Print();
KALDI_LOG << "Done " << num_done << " utterances, " << num_err
<< " with errors.";
return (num_done != 0 ? 0 : 1);
} catch(const std::exception &e) {
std::cerr << e.what();
return -1;
}
}