
// feat/pitch-functions.h

// Copyright     2013  Pegah Ghahremani
//               2014  IMSL, PKU-HKUST (author: Wei Shi)
//               2014  Yanqing Sun, Junjie Wang,
//                     Daniel Povey, Korbinian Riedhammer
//                     Xin Lei

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
#define KALDI_FEAT_PITCH_FUNCTIONS_H_

#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>

#include "base/kaldi-error.h"
#include "feat/mel-computations.h"
#include "itf/online-feature-itf.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"

namespace kaldi {
/// @addtogroup  feat FeatureExtraction
/// @{

struct PitchExtractionOptions {
  // FrameExtractionOptions frame_opts;
  BaseFloat samp_freq;          // sample frequency in hertz
  BaseFloat frame_shift_ms;     // in milliseconds.
  BaseFloat frame_length_ms;    // in milliseconds.
  BaseFloat preemph_coeff;      // Preemphasis coefficient. [use is deprecated.]
  BaseFloat min_f0;             // min f0 to search (Hz)
  BaseFloat max_f0;             // max f0 to search (Hz)
  BaseFloat soft_min_f0;        // Minimum f0, applied in soft way, must not
                                // exceed min-f0
  BaseFloat penalty_factor;     // cost factor for FO change
  BaseFloat lowpass_cutoff;     // cutoff frequency for Low pass filter
  BaseFloat resample_freq;      // Integer that determines filter width when
                                // upsampling NCCF
  BaseFloat delta_pitch;        // the pitch tolerance in pruning lags
  BaseFloat nccf_ballast;       // Increasing this factor reduces NCCF for
                                // quiet frames, helping ensure pitch
                                // continuity in unvoiced region
  int32 lowpass_filter_width;   // Integer that determines filter width of
                                // lowpass filter
  int32 upsample_filter_width;  // Integer that determines filter width when
                                // upsampling NCCF

  // Below are newer config variables, not present in the original paper,
  // that relate to the online pitch extraction algorithm.

  // The maximum number of frames of latency that we allow the pitch-processing
  // to introduce, for online operation. If you set this to a large value,
  // there would be no inaccuracy from the Viterbi traceback (but it might make
  // you wait to see the pitch). This is not very relevant for the online
  // operation: normalization-right-context is more relevant, you
  // can just leave this value at zero.
  int32 max_frames_latency;

  // Only relevant for the function ComputeKaldiPitch which is called by
  // compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
  // this size. This affects the energy normalization which has a small effect
  // on the resulting features, especially at the beginning of a file. For best
  // compatibility with online operation (e.g. if you plan to train models for
  // the online-deocding setup), you might want to set this to a small value,
  // like one frame.
  int32 frames_per_chunk;

  // Only relevant for the function ComputeKaldiPitch which is called by
  // compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
  // nonzero. If true, it will query the features as soon as they are
  // available, which simulates the first-pass features you would get in online
  // decoding. If false, the features you will get will be the same as those
  // available at the end of the utterance, after InputFinished() has been
  // called: e.g. during lattice rescoring.
  bool simulate_first_pass_online;

  // Only relevant for online operation or when emulating online operation
  // (e.g. when setting frames_per_chunk). This is the frame-index on which we
  // recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
  // segment ends before this we do it when the segment ends. We do this by
  // re-computing the signal average energy, which affects the NCCF via the
  // "ballast term", scaling the resampled NCCF by a factor derived from the
  // average change in the "ballast term", and re-doing the backtrace
  // computation. Making this infinity would be the most exact, but would
  // introduce unwanted latency at the end of long utterances, for little
  // benefit.
  int32 recompute_frame;

  // This is a "hidden config" used only for testing the online pitch
  // extraction. If true, we compute the signal root-mean-squared for the
  // ballast term, only up to the current frame, rather than the end of the
  // current chunk of signal. This makes the output insensitive to the
  // chunking, which is useful for testing purposes.
  bool nccf_ballast_online;
  bool snip_edges;
  PitchExtractionOptions():
      samp_freq(16000),
      frame_shift_ms(10.0),
      frame_length_ms(25.0),
      preemph_coeff(0.0),
      min_f0(50),
      max_f0(400),
      soft_min_f0(10.0),
      penalty_factor(0.1),
      lowpass_cutoff(1000),
      resample_freq(4000),
      delta_pitch(0.005),
      nccf_ballast(7000),
      lowpass_filter_width(1),
      upsample_filter_width(5),
      max_frames_latency(0),
      frames_per_chunk(0),
      simulate_first_pass_online(false),
      recompute_frame(500),
      nccf_ballast_online(false),
      snip_edges(true) { }

  void Register(OptionsItf *opts) {
    opts->Register("sample-frequency", &samp_freq,
                   "Waveform data sample frequency (must match the waveform "
                   "file, if specified there)");
    opts->Register("frame-length", &frame_length_ms, "Frame length in "
                   "milliseconds");
    opts->Register("frame-shift", &frame_shift_ms, "Frame shift in "
                   "milliseconds");
    opts->Register("preemphasis-coefficient", &preemph_coeff,
                   "Coefficient for use in signal preemphasis (deprecated)");
    opts->Register("min-f0", &min_f0,
                   "min. F0 to search for (Hz)");
    opts->Register("max-f0", &max_f0,
                   "max. F0 to search for (Hz)");
    opts->Register("soft-min-f0", &soft_min_f0,
                   "Minimum f0, applied in soft way, must not exceed min-f0");
    opts->Register("penalty-factor", &penalty_factor,
                   "cost factor for FO change.");
    opts->Register("lowpass-cutoff", &lowpass_cutoff,
                   "cutoff frequency for LowPass filter (Hz) ");
    opts->Register("resample-frequency", &resample_freq,
                   "Frequency that we down-sample the signal to.  Must be "
                   "more than twice lowpass-cutoff");
    opts->Register("delta-pitch", &delta_pitch,
                   "Smallest relative change in pitch that our algorithm "
                   "measures");
    opts->Register("nccf-ballast", &nccf_ballast,
                   "Increasing this factor reduces NCCF for quiet frames");
    opts->Register("nccf-ballast-online", &nccf_ballast_online,
                   "This is useful mainly for debug; it affects how the NCCF "
                   "ballast is computed.");
    opts->Register("lowpass-filter-width", &lowpass_filter_width,
                   "Integer that determines filter width of "
                   "lowpass filter, more gives sharper filter");
    opts->Register("upsample-filter-width", &upsample_filter_width,
                   "Integer that determines filter width when upsampling NCCF");
    opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
                   "offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
                   "you can set it to a small nonzero value, such as 10, for "
                   "better feature compatibility with online decoding (affects "
                   "energy normalization in the algorithm)");
    opts->Register("simulate-first-pass-online", &simulate_first_pass_online,
                   "If true, compute-kaldi-pitch-feats will output features "
                   "that correspond to what an online decoder would see in the "
                   "first pass of decoding-- not the final version of the "
                   "features, which is the default.  Relevant if "
                   "--frames-per-chunk > 0");
    opts->Register("recompute-frame", &recompute_frame, "Only relevant for "
                   "online pitch extraction, or for compatibility with online "
                   "pitch extraction.  A non-critical parameter; the frame at "
                   "which we recompute some of the forward pointers, after "
                   "revising our estimate of the signal energy.  Relevant if"
                   "--frames-per-chunk > 0");
    opts->Register("max-frames-latency", &max_frames_latency, "Maximum number "
                   "of frames of latency that we allow pitch tracking to "
                   "introduce into the feature processing (affects output only "
                   "if --frames-per-chunk > 0 and "
                   "--simulate-first-pass-online=true");
    opts->Register("snip-edges", &snip_edges, "If this is set to false, the "
                   "incomplete frames near the ending edge won't be snipped, "
                   "so that the number of frames is the file size divided by "
                   "the frame-shift. This makes different types of features "
                   "give the same number of frames.");
  }
  /// Returns the window-size in samples, after resampling.  This is the
  /// "basic window size", not the full window size after extending by max-lag.
  // Because of floating point representation, it is more reliable to divide
  // by 1000 instead of multiplying by 0.001, but it is a bit slower.
  int32 NccfWindowSize() const {
    return static_cast<int32>(resample_freq * frame_length_ms / 1000.0);
  }
  /// Returns the window-shift in samples, after resampling.
  int32 NccfWindowShift() const {
    return static_cast<int32>(resample_freq * frame_shift_ms / 1000.0);
  }
};

struct ProcessPitchOptions {
  BaseFloat pitch_scale;  // the final normalized-log-pitch feature is scaled
                          // with this value
  BaseFloat pov_scale;    // the final POV feature is scaled with this value
  BaseFloat pov_offset;   // An offset that can be added to the final POV
                          // feature (useful for online-decoding, where we don't
                          // do CMN to the pitch-derived features.

  BaseFloat delta_pitch_scale;
  BaseFloat delta_pitch_noise_stddev;  // stddev of noise we add to delta-pitch
  int32 normalization_left_context;    // left-context used for sliding-window
                                       // normalization
  int32 normalization_right_context;   // this should be reduced in online
                                       // decoding to reduce latency

  int32 delta_window;
  int32 delay;

  bool add_pov_feature;
  bool add_normalized_log_pitch;
  bool add_delta_pitch;
  bool add_raw_log_pitch;

  ProcessPitchOptions() :
      pitch_scale(2.0),
      pov_scale(2.0),
      pov_offset(0.0),
      delta_pitch_scale(10.0),
      delta_pitch_noise_stddev(0.005),
      normalization_left_context(75),
      normalization_right_context(75),
      delta_window(2),
      delay(0),
      add_pov_feature(true),
      add_normalized_log_pitch(true),
      add_delta_pitch(true),
      add_raw_log_pitch(false) { }


  void Register(ParseOptions *opts) {
    opts->Register("pitch-scale", &pitch_scale,
                   "Scaling factor for the final normalized log-pitch value");
    opts->Register("pov-scale", &pov_scale,
                   "Scaling factor for final POV (probability of voicing) "
                   "feature");
    opts->Register("pov-offset", &pov_offset,
                   "This can be used to add an offset to the POV feature. "
                   "Intended for use in online decoding as a substitute for "
                   " CMN.");
    opts->Register("delta-pitch-scale", &delta_pitch_scale,
                   "Term to scale the final delta log-pitch feature");
    opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
                   "Standard deviation for noise we add to the delta log-pitch "
                   "(before scaling); should be about the same as delta-pitch "
                   "option to pitch creation.  The purpose is to get rid of "
                   "peaks in the delta-pitch caused by discretization of pitch "
                   "values.");
    opts->Register("normalization-left-context", &normalization_left_context,
                   "Left-context (in frames) for moving window normalization");
    opts->Register("normalization-right-context", &normalization_right_context,
                   "Right-context (in frames) for moving window normalization");
    opts->Register("delta-window", &delta_window,
                   "Number of frames on each side of central frame, to use for "
                   "delta window.");
    opts->Register("delay", &delay,
                   "Number of frames by which the pitch information is "
                   "delayed.");
    opts->Register("add-pov-feature", &add_pov_feature,
                   "If true, the warped NCCF is added to output features");
    opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
                   "If true, the log-pitch with POV-weighted mean subtraction "
                   "over 1.5 second window is added to output features");
    opts->Register("add-delta-pitch", &add_delta_pitch,
                   "If true, time derivative of log-pitch is added to output "
                   "features");
    opts->Register("add-raw-log-pitch", &add_raw_log_pitch,
                   "If true, log(pitch) is added to output features");
  }
};


// We don't want to expose the pitch-extraction internals here as it's
// quite complex, so we use a private implementation.
class OnlinePitchFeatureImpl;


/// Online feature class producing the raw two-dimensional pitch feature
/// (NCCF, pitch) for each frame.  The actual extraction logic lives in the
/// private implementation class OnlinePitchFeatureImpl, which this class
/// holds by pointer.
// Note: to start on a new waveform, just construct a new version
// of this object.
class OnlinePitchFeature: public OnlineBaseFeature {
 public:
  /// Constructs the extractor from the given pitch-extraction options.
  explicit OnlinePitchFeature(const PitchExtractionOptions &opts);

  /// The raw feature always has two dimensions.
  virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ }

  virtual int32 NumFramesReady() const;

  virtual BaseFloat FrameShiftInSeconds() const;

  virtual bool IsLastFrame(int32 frame) const;

  /// Outputs the two-dimensional feature consisting of (NCCF, pitch), i.e. in
  /// the same order as documented for Dim().  You should probably post-process
  /// this using class OnlineProcessPitch.
  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual void AcceptWaveform(BaseFloat sampling_rate,
                              const VectorBase<BaseFloat> &waveform);

  virtual void InputFinished();

  virtual ~OnlinePitchFeature();

 private:
  // Pointer to the private implementation.
  // NOTE(review): presumably allocated in the constructor and released in the
  // destructor; confirm in the .cc file.  Copying this object is not guarded
  // against in this header.
  OnlinePitchFeatureImpl *impl_;
};


/// Online post-processor for pitch features.  It reads the raw
/// two-dimensional (nccf, pitch) frames from a source feature and converts
/// them into the feature types selected in ProcessPitchOptions.  With the
/// default options the output is (pov-feature, normalized-log-pitch,
/// delta-log-pitch).
class OnlineProcessPitch: public OnlineFeatureInterface {
 public:
  /// Output dimension, i.e. the number of feature types that were requested.
  virtual int32 Dim() const { return dim_; }

  /// Answers the last-frame query by delegating to the source, after
  /// compensating for opts_.delay.
  virtual bool IsLastFrame(int32 frame) const {
    // Negative frame indices are always asked of the source as frame -1.
    if (frame <= -1)
      return src_->IsLastFrame(-1);

    // From frame opts_.delay onwards, shift the index back by the delay.
    if (frame >= opts_.delay)
      return src_->IsLastFrame(frame - opts_.delay);

    // Here 0 <= frame < opts_.delay: the answer is false whenever the source
    // reports -1 as its last frame, otherwise it is whatever the source says
    // about its frame 0.
    return !src_->IsLastFrame(-1) && src_->IsLastFrame(0);
  }

  /// The frame shift is inherited unchanged from the source feature.
  virtual BaseFloat FrameShiftInSeconds() const {
    return src_->FrameShiftInSeconds();
  }

  virtual int32 NumFramesReady() const;

  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);

  virtual ~OnlineProcessPitch() {  }

  /// Note: "src" is only borrowed, this class does not take ownership of it.
  OnlineProcessPitch(const ProcessPitchOptions &opts,
                     OnlineFeatureInterface *src);

 private:
  // Constant defined through an anonymous enum: the dimension of the input
  // feature, which is (nccf, pitch).
  enum { kRawFeatureDim = 2 };

  ProcessPitchOptions opts_;
  OnlineFeatureInterface *src_;  // source of raw features; not owned.
  int32 dim_;  // Dimension of the output feature; set in the initializer.

  /// Statistics cached for the normalization of the log-pitch.
  struct NormalizationStats {
    int32 cur_num_frames;      // src_->NumFramesReady() as it was when
                               // "mean_pitch" was set.
    bool input_finished;       // whether the input data had finished when
                               // "mean_pitch" was computed.
    double sum_pov;            // pov summed over the relevant range
    double sum_log_pitch_pov;  // log(pitch) * pov summed over the relevant
                               // range

    NormalizationStats()
        : cur_num_frames(-1),
          input_finished(false),
          sum_pov(0.0),
          sum_log_pitch_pov(0.0) { }
  };

  // NOTE(review): presumably the noise values used for the delta feature
  // (see ProcessPitchOptions::delta_pitch_noise_stddev); confirm in the .cc.
  std::vector<BaseFloat> delta_feature_noise_;

  // NOTE(review): presumably one entry per frame (see
  // UpdateNormalizationStats); confirm in the .cc.
  std::vector<NormalizationStats> normalization_stats_;

  /// Returns the POV feature computed for this frame; GetFrame() calls this.
  inline BaseFloat GetPovFeature(int32 frame) const;

  /// Returns the delta-log-pitch feature computed for this frame; GetFrame()
  /// calls this.
  inline BaseFloat GetDeltaPitchFeature(int32 frame);

  /// Returns the raw log-pitch feature computed for this frame; GetFrame()
  /// calls this.
  inline BaseFloat GetRawLogPitchFeature(int32 frame) const;

  /// Returns the mean-subtracted log-pitch feature computed for this frame;
  /// GetFrame() calls this.
  inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);

  /// Works out the sizes of the normalization window.
  inline void GetNormalizationWindow(int32 frame,
                                     int32 src_frames_ready,
                                     int32 *window_begin,
                                     int32 *window_end) const;

  /// Brings the normalization_stats_ entry for this frame up to date;
  /// GetNormalizedLogPitchFeature calls this.
  inline void UpdateNormalizationStats(int32 frame);
};


/// This function extracts (NCCF, pitch) per frame, using the pitch extraction
/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech
/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian
/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014.  The output will
/// have as many rows as there are frames, and two columns corresponding to
/// (NCCF, pitch), in that order.
/// @param opts    Pitch-extraction configuration.
/// @param wave    The input waveform.
/// @param output  Receives the result: one row per frame, two columns.
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
                       const VectorBase<BaseFloat> &wave,
                       Matrix<BaseFloat> *output);

/// This function processes the raw (NCCF, pitch) quantities computed by
/// ComputeKaldiPitch, and processes them into features.  By default it will
/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch,
/// delta-of-raw-pitch), but this is configurable in the options.  The number of
/// rows of "output" will be the number of frames (rows) in "input", and the
/// number of columns will be the number of different types of features
/// requested (by default, 3; 4 is the max).  The four config variables
/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch,
/// --add-raw-log-pitch determine which features we create; by default we create
/// the first three.
/// @param opts    Post-processing configuration, including the four flags
///                that select the output feature types.
/// @param input   Raw (NCCF, pitch) features, one row per frame.
/// @param output  Receives the processed features, one row per input frame.
void ProcessPitch(const ProcessPitchOptions &opts,
                  const MatrixBase<BaseFloat> &input,
                  Matrix<BaseFloat> *output);

/// This function combines ComputeKaldiPitch and ProcessPitch.  The reason
/// why we need a separate function to do this is in order to be able to
/// accurately simulate the online pitch-processing, for testing and for
/// training models matched to the "first-pass" features.  It is sensitive to
/// the variables in pitch_opts that relate to online processing,
/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online,
/// recompute_frame.
/// @param pitch_opts    Pitch-extraction configuration, including the
///                      online-related variables listed above.
/// @param process_opts  Post-processing configuration.
/// @param wave          The input waveform.
/// @param output        Receives the processed pitch features.
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts,
                                 const ProcessPitchOptions &process_opts,
                                 const VectorBase<BaseFloat> &wave,
                                 Matrix<BaseFloat> *output);


/// @} End of "addtogroup feat"
}  // namespace kaldi
#endif  // KALDI_FEAT_PITCH_FUNCTIONS_H_